In [1]:
!pip install snscrape
!pip install youtube-comment-downloader
!pip install gensim
!pip install Sastrawi
!pip install tensorflow
!pip install scikit-learn



In [2]:
import pandas as pd
import os, re, json, math, time, joblib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from youtube_comment_downloader import YoutubeCommentDownloader

# gensim Word2Vec
from gensim.models import Word2Vec

# TensorFlow / Keras for LSTM
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.callbacks import EarlyStopping

# NLP: NLTK + Sastrawi (Indonesian stemming)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [3]:
from google.colab import files
uploaded = files.upload()

Saving youtube_comments.csv to youtube_comments (1).csv


In [5]:
df = pd.read_csv('youtube_comments.csv')

In [6]:
df.head()

Unnamed: 0,comment,video
0,Bangkeeee üòÇüòÖü§£ü§£ü§£,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
1,47:53 .. MasyaAllah ..... üéâ,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
2,.,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
3,"MasyaAllah bg mael, lihat tu cara minum nya, m...",https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
4,"Pecah emang , sumpah , kompor gass, bag pras d...",https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40


In [7]:
print(df.shape)

(5000, 2)


In [8]:
df = df.dropna(subset=['comment']).reset_index(drop=True)

In [9]:
df = df[df['comment'].str.strip() != ""].reset_index(drop=True)
print("After cleaning empty:", df.shape)

After cleaning empty: (5000, 2)


In [10]:
df.sample(5)


Unnamed: 0,comment,video
4277,03:23 kejadian yg begitu cepat sampai tidak bi...,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
3896,Endingnya keren parah‚ù§,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
434,Ngga lama lgi maele bakaln bikn konten maulid ...,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
907,Bang mael cocok nih untuk co host,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40
932,fajar kocak abisüòÇ,https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40


In [11]:
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    # hapus URL, mention, emoji dasar, tanda baca, dan angka
    text = re.sub(r'http\S+|www\S+', ' ', text)
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'[^a-z0-9\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [12]:
INDO_STOPWORDS = set(stopwords.words('indonesian'))
stemmer = StemmerFactory().create_stemmer()

In [13]:
def stem_and_remove_stopwords(text):
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in INDO_STOPWORDS and len(t) > 1]
    stemmed = [stemmer.stem(t) for t in tokens]
    return " ".join(stemmed)

In [14]:
df['clean'] = df['comment'].astype(str).apply(clean_text)
df['clean'] = df['clean'].apply(stem_and_remove_stopwords)
print(df.head())

                                             comment  \
0                                    Bangkeeee üòÇüòÖü§£ü§£ü§£   
1                        47:53 .. MasyaAllah ..... üéâ   
2                                                  .   
3  MasyaAllah bg mael, lihat tu cara minum nya, m...   
4  Pecah emang , sumpah , kompor gass, bag pras d...   

                                              video  \
0  https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40   
1  https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40   
2  https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40   
3  https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40   
4  https://youtu.be/NIr72bFGFts?si=YHGohGGVPQ91ZM40   

                                               clean  
0                                          bangkeeee  
1                                   47 53 masyaallah  
2                                                     
3  masyaallah bg mael lihat tu minum nya dalam ka...  
4  pecah emang sumpah kompor gass bag p

In [15]:
!pip install --quiet textblob
from textblob import TextBlob

In [16]:
def tb_label(text):
    try:
        score = TextBlob(text).sentiment.polarity
    except:
        score = 0.0
    if score > 0.05:
        return 'positif'
    elif score < -0.05:
        return 'negatif'
    else:
        return 'netral'

In [17]:
df['label'] = df['clean'].apply(tb_label)

In [18]:
print(df['label'].value_counts())

label
netral     4670
positif     275
negatif      55
Name: count, dtype: int64


In [19]:
print(df['label'].value_counts())
df['label'].value_counts(normalize=True).mul(100).round(2)

label
netral     4670
positif     275
negatif      55
Name: count, dtype: int64


Unnamed: 0_level_0,proportion
label,Unnamed: 1_level_1
netral,93.4
positif,5.5
negatif,1.1


In [20]:
label_map = {'negatif': 0, 'netral': 1, 'positif': 2}
df['label_id'] = df['label'].map(label_map)
SEED = 42

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_tfidf = tfidf.fit_transform(df['clean'])

In [22]:
from gensim.models import Word2Vec
import numpy as np

sentences = [s.split() for s in df['clean']]
w2v_model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    seed=SEED
)

def w2v_avg_vector(doc):
    words = doc.split()
    vecs = [w2v_model.wv[w] for w in words if w in w2v_model.wv]
    if len(vecs) == 0:
        return np.zeros(w2v_model.vector_size)
    return np.mean(vecs, axis=0)

X_w2v = np.vstack(df['clean'].apply(w2v_avg_vector).values)


In [23]:
MAX_VOCAB = 20000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean'])
sequences = tokenizer.texts_to_sequences(df['clean'])
X_seq = pad_sequences(sequences, maxlen=MAX_LEN, padding='post', truncating='post')

In [24]:
y = df['label_id'].values

In [25]:
from sklearn.model_selection import train_test_split

X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=SEED, stratify=y
)

X_w2v_train, X_w2v_test, _, _ = train_test_split(
    X_w2v, y, test_size=0.2, random_state=SEED, stratify=y
)

In [26]:
X_seq_train, X_seq_test, _, _ = train_test_split(
    X_seq, y, test_size=0.2, random_state=SEED, stratify=y
)

In [27]:
print("TF-IDF train/test:", X_tfidf_train.shape, X_tfidf_test.shape)
print("W2V train/test:", X_w2v_train.shape, X_w2v_test.shape)
print("Seq train/test:", X_seq_train.shape, X_seq_test.shape)

TF-IDF train/test: (4000, 20000) (1000, 20000)
W2V train/test: (4000, 100) (1000, 100)
Seq train/test: (4000, 100) (1000, 100)


In [28]:
svm = LinearSVC(random_state=SEED)
svm.fit(X_tfidf_train, y_train)

In [29]:
y_pred_svm = svm.predict(X_tfidf_test)

In [30]:
print("SVM (TF-IDF) Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=['negatif', 'netral', 'positif']))

SVM (TF-IDF) Accuracy: 0.963
              precision    recall  f1-score   support

     negatif       1.00      0.36      0.53        11
      netral       0.96      1.00      0.98       934
     positif       1.00      0.45      0.62        55

    accuracy                           0.96      1000
   macro avg       0.99      0.61      0.71      1000
weighted avg       0.96      0.96      0.96      1000



In [31]:
rf = RandomForestClassifier(n_estimators=200, random_state=SEED)
rf.fit(X_w2v_train, y_train)

In [32]:
y_pred_rf = rf.predict(X_w2v_test)

In [33]:
print("RF (Word2Vec) Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=['negatif', 'netral', 'positif']))

RF (Word2Vec) Accuracy: 0.942
              precision    recall  f1-score   support

     negatif       1.00      0.09      0.17        11
      netral       0.94      1.00      0.97       934
     positif       1.00      0.13      0.23        55

    accuracy                           0.94      1000
   macro avg       0.98      0.41      0.45      1000
weighted avg       0.95      0.94      0.92      1000



In [34]:
EMBED_DIM = 128

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=EMBED_DIM),
    Bidirectional(LSTM(128, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(3, activation='softmax')
])

model.build(input_shape=(None, MAX_LEN))

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

In [35]:
es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(
    X_seq_train, y_train,
    epochs=10,
    batch_size=64,
    validation_split=0.1,
    callbacks=[es]
)

Epoch 1/10
[1m57/57[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m34s[0m 442ms/step - accuracy: 0.8539 - loss: 0.5152 - val_accuracy: 0.9500 - val_loss: 0.2107
Epoch 2/10
[1m57/57[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m25s[0m 443ms/step - accuracy: 0.9251 - loss: 0.2758 - val_accuracy: 0.9500 - val_loss: 0.1859
Epoch 3/10
[1m57/57[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m40s[0m 429ms/step - accuracy: 0.9515 - loss: 0.1533 - val_accuracy: 0.9725 - val_loss: 0.1442
Epoch 4/10
[1m57/57[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m26s[0m 454ms/step - accuracy: 0.9818 - loss: 0.0555 - val_accuracy: 0.9750 - val_loss: 0.1657
Epoch 5/10
[1m57/57[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m25s[0m 436ms/step - accuracy: 0.9821 - loss: 0.0441 - val_accuracy: 0.9650 - va

In [36]:
loss, acc = model.evaluate(X_seq_test, y_test, verbose=0)
print(f"Akurasi LSTM di data uji: {acc * 100:.2f}%")

Akurasi LSTM di data uji: 95.90%


In [37]:
y_pred_lstm = np.argmax(model.predict(X_seq_test), axis=1)
print(classification_report(y_test, y_pred_lstm, target_names=['negatif', 'netral', 'positif']))

[1m32/32[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 98ms/step
              precision    recall  f1-score   support

     negatif       0.00      0.00      0.00        11
      netral       0.97      1.00      0.98       934
     positif       0.76      0.53      0.62        55

    accuracy                           0.96      1000
   macro avg       0.58      0.51      0.53      1000
weighted avg       0.94      0.96      0.95      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
def predict_svm(text):
    t = stem_and_remove_stopwords(clean_text(text))
    vec = tfidf.transform([t])
    return svm.predict(vec)[0]

In [39]:
def predict_rf(text):
    t = stem_and_remove_stopwords(clean_text(text))
    vec = w2v_avg_vector(t).reshape(1, -1)
    return rf.predict(vec)[0]

In [40]:
def predict_lstm(text):
    t = stem_and_remove_stopwords(clean_text(text))
    seq = tokenizer.texts_to_sequences([t])
    pad = pad_sequences(seq, maxlen=MAX_LEN, padding='post', truncating='post')
    pred = model.predict(pad, verbose=0)[0]
    return np.argmax(pred), float(np.max(pred))

In [41]:
map_label = {0: 'negatif', 1: 'netral', 2: 'positif'}

examples = [
    "Videonya keren banget, saya suka kualitas suaranya",
    "Biasa aja, terlalu panjang dan membosankan",
    "Sangat mengecewakan, banyak kesalahan editing"
]

for ex in examples:
    print("Text:", ex)
    print(" SVM  ->", map_label[predict_svm(ex)])
    print(" RF   ->", map_label[predict_rf(ex)])
    label_idx, prob = predict_lstm(ex)
    print(" LSTM ->", map_label[label_idx])
    print("------")


Text: Videonya keren banget, saya suka kualitas suaranya
 SVM  -> netral
 RF   -> netral
 LSTM -> netral
------
Text: Biasa aja, terlalu panjang dan membosankan
 SVM  -> netral
 RF   -> netral
 LSTM -> netral
------
Text: Sangat mengecewakan, banyak kesalahan editing
 SVM  -> netral
 RF   -> netral
 LSTM -> netral
------


In [42]:
joblib.dump(svm, "svm_tfidf.joblib")
joblib.dump(tfidf, "tfidf_vectorizer.joblib")
joblib.dump(rf, "rf_w2v.joblib")
w2v_model.save("word2vec.model")
model.save("lstm_sentiment.h5")
joblib.dump(tokenizer, "tokenizer.joblib")
df.to_csv("youtube_comments_labelled.csv", index=False)

print("Semua model dan dataset telah disimpan.")



Semua model dan dataset telah disimpan.


In [43]:
!pip install pipreqs
!pipreqs --force .

INFO: Not scanning for jupyter notebooks.
INFO: Successfully saved requirements file in ./requirements.txt


In [44]:
!pip freeze > requirements.txt

In [45]:
print("Saved models and requirements.txt")

Saved models and requirements.txt
