In [1]:
import os
import re

import pandas as pd

# %pip install youtube-comment-downloader
from youtube_comment_downloader import YoutubeCommentDownloader, SORT_BY_RECENT

# %pip install wordcloud
import wordcloud
from wordcloud import WordCloud, STOPWORDS

# %pip install matplotlib
import matplotlib.pyplot as plt

# %pip install nltk
import nltk
# nltk.download('all')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# %pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

import matplotlib.pyplot as plt

In [8]:
from concurrent.futures import ThreadPoolExecutor

YOUTUBES_VIDEO_ID = ["-6ZRL6AhPWk"]


def process_video(video_id):
    """Run the whole pipeline for one video.

    Downloads the comments, then renders the word cloud and the
    word-frequency chart into data/<video_id>/.

    The three helpers it calls are defined in other cells of this notebook,
    so those cells must have been executed before this one.
    """
    download_comments(video_id)
    generate_wordcloud(video_id)
    generate_word_frequency(video_id)


# video id -> exception raised while processing it (empty when all succeeded).
failed_videos = {}

# NOTE(review): Matplotlib documents itself as not thread-safe and the plotting
# helpers run inside these worker threads -- check the generated images when
# more than one video id is listed.
with ThreadPoolExecutor(max_workers=4) as executor:
    submitted = [
        (video_id, executor.submit(process_video, video_id))
        for video_id in YOUTUBES_VIDEO_ID
    ]

    # Future.result() re-raises whatever the worker raised. Without asking for
    # the results, every failure (including a NameError caused by running this
    # cell before the helper cells) would be dropped silently.
    for video_id, future in submitted:
        try:
            future.result()
        except Exception as exc:
            # Best effort: one failing video must not stop the others.
            failed_videos[video_id] = exc
            print(f"Failed to process video {video_id}: {exc!r}")

# Download Comments

In [3]:
def download_comments(video_id):
    """Download the comments of a YouTube video and store them as Feather.

    Args:
        video_id: YouTube video id handed to
            YoutubeCommentDownloader.get_comments (sorted by most recent).

    Returns:
        pandas.DataFrame built from the downloaded comments. It is also
        written to data/<video_id>/comments.feather.
    """
    downloader = YoutubeCommentDownloader()
    comments = pd.DataFrame(
        downloader.get_comments(video_id, sort_by=SORT_BY_RECENT)
    )

    print(f"Total Comments: {len(comments)}")

    os.makedirs(f"data/{video_id}", exist_ok=True)

    # Save to Feather
    comments.to_feather(f"data/{video_id}/comments.feather")

    # Preview the first comment; a video without comments has no row 0 and
    # iloc[0] would raise IndexError.
    if len(comments) > 0:
        print(comments.iloc[0])
    return comments

# Word Cloud

### Get All Comments

In [4]:
def get_comments(video_id):
    """Return all comments of a video as one cleaned, lower-case string.

    If data/<video_id>/all_comments.txt exists its content is returned as-is
    (files written by earlier runs are not re-cleaned). Otherwise the 'text'
    column of data/<video_id>/comments.feather is cleaned, joined, written to
    that text file and returned.

    Cleaning per comment: lower-case, drop everything except ASCII letters and
    whitespace, collapse whitespace runs to a single space.

    Args:
        video_id: id of the video whose data directory is read.

    Returns:
        str: the cleaned comments separated by single spaces.
    """
    cache_path = f"data/{video_id}/all_comments.txt"

    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            return f.read()

    comments = pd.read_feather(f"data/{video_id}/comments.feather")

    cleaned_comments = []
    # dropna/astype guard against missing or non-string cells.
    for text in comments["text"].dropna().astype(str):
        letters_only = re.sub(r"[^a-zA-Z\s]", "", text.lower())
        # split()/join collapses whitespace runs and trims both ends.
        normalized = " ".join(letters_only.split())
        if normalized:
            cleaned_comments.append(normalized)

    # Join with a space: plain concatenation would glue the last word of one
    # comment to the first word of the next and distort the word counts.
    all_comments = " ".join(cleaned_comments)

    with open(cache_path, "w", encoding="utf-8") as f:
        f.write(all_comments)

    return all_comments


### Tokenize, Remove Stopwords & Stem

In [5]:
def tokenize_stopwords_stemming(comments, stopwords_extend=None):
    """Tokenize text, drop Indonesian stopwords and stem the remaining words.

    Args:
        comments: text to process (a single string).
        stopwords_extend: optional iterable of extra words to treat as
            stopwords in addition to NLTK's "indonesian" list. It is only
            read, never modified.

    Returns:
        list[str]: the stemmed tokens, in their original order.
    """
    # Tokenize
    tokens = word_tokenize(comments)

    # Stopwords: build the lookup set once, not once per token.
    stop_words = set(stopwords.words("indonesian"))
    if stopwords_extend:
        stop_words.update(stopwords_extend)

    # Stemming (Sastrawi) of every token that survived the stopword filter.
    stemmer = StemmerFactory().create_stemmer()
    return [stemmer.stem(word) for word in tokens if word not in stop_words]

### Generate Word Frequency


In [6]:
def generate_word_frequency(video_id, n=20):
    """Save a line chart of the n most frequent words in a video's comments.

    The comments are loaded with get_comments, processed with
    tokenize_stopwords_stemming and counted with nltk.FreqDist. The chart is
    written to data/<video_id>/word_frequency.png.

    Args:
        video_id: id of the video whose comments are analysed.
        n: number of most frequent words to plot (default 20).
    """
    all_comments = get_comments(video_id)
    tokens = tokenize_stopwords_stemming(all_comments)

    top_words = nltk.FreqDist(tokens).most_common(n)
    words = [word for word, _ in top_words]
    counts = [count for _, count in top_words]

    # Draw on an explicit figure and save that object. Saving pyplot's
    # implicit "current figure" after FreqDist.plot() gives a blank image if
    # the plot call shows/closes the figure, and may pick up another thread's
    # figure when several videos are processed concurrently.
    fig, ax = plt.subplots()
    positions = range(len(words))
    ax.grid(True, color="silver")
    ax.plot(positions, counts, linewidth=2)
    ax.set_xticks(positions)
    ax.set_xticklabels(words, rotation=90)
    ax.set_xlabel("Samples")
    ax.set_ylabel("Counts")

    fig.savefig(f"data/{video_id}/word_frequency.png", bbox_inches='tight', dpi=300)
    plt.close(fig)

### Generate Word Cloud

In [7]:
def generate_wordcloud(video_id):
    """Render a word cloud of a video's comments to data/<video_id>/wordcloud.png.

    The comments are loaded with get_comments and processed with
    tokenize_stopwords_stemming before being handed to WordCloud. When no
    usable word is left, a message is printed and no image is written.

    Args:
        video_id: id of the video whose comments are visualised.
    """
    all_comments = get_comments(video_id)
    tokens = tokenize_stopwords_stemming(all_comments)

    # `cloud`, not `wordcloud`: avoid shadowing the imported wordcloud module.
    try:
        cloud = WordCloud(width=800, height=400, background_color="white").generate(" ".join(tokens))
    except ValueError as exc:
        # WordCloud raises ValueError when it is given no words to draw.
        print(f"Skipping word cloud for video {video_id}: {exc}")
        return

    # Explicit figure/axes instead of pyplot's implicit current figure, so the
    # saved image is always the one drawn here.
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")

    fig.savefig(f"data/{video_id}/wordcloud.png", bbox_inches='tight', dpi=300)
    plt.close(fig)

# Word Frequency
