In [8]:
import json
import re
import string
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import tensorflow as tf

import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nlp_id.lemmatizer import Lemmatizer
from nlp_id.stopword import StopWord
from nlp_id.tokenizer import Tokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertForSequenceClassification
from wordcloud import WordCloud

import joblib

  from .autonotebook import tqdm as notebook_tqdm





In [9]:
df = pd.read_csv("D:/22.11.5169/SEM 6/PROJ DATMIN/huh/vvv/csv/Youtube-Sentimen.csv")
df

Unnamed: 0,Time,UserName,Comment,LikeCount,preprocessing,stemming_ulasan
0,2025-05-15T15:04:49Z,@fajarhamdani1134,Sangat betul...,0,[],
1,2025-05-13T21:18:46Z,@SuponoPono-n5g,Ternak MULYONO manaaa ngertiiii......biangnya ...,0,"['ternak', 'mulyono', 'manaa', 'ngertii', 'bia...",ternak mulyono manaa ngertii biang gak jelaass...
2,2025-05-13T15:41:27Z,@BadariahAhmad-b7t,Kerja sawit di msia sudah middle upper class bro,0,"['kerja', 'sawit', 'msia', 'middle', 'upper', ...",kerja sawit msia middle upper class saudara la...
3,2025-05-13T09:29:59Z,@mibaa7578,Seorang yg begitu bijak.. tahniah org muda.❤,0,"['bijak', 'tahniah', 'org', 'muda']",bijak tahniah org muda
4,2025-05-13T06:34:59Z,@RobiRahmathidayat01,Parah 😂,0,['parah'],parah
...,...,...,...,...,...,...
19290,2024-10-19T11:59:14Z,@AndreCirebon,​​@@Sirimons12SMA sya juga ojol SMA kayak lu b...,2,"['sya', 'ojol', 'kayak', 'bang', 'sgtu', 'hasi...",sya ojol kayak bang sgtu hasil sya phk usaha s...
19291,2024-10-19T11:59:17Z,@kingki1953,No. 2 setelah pak gugem,3,"['no', 'gugem']",no gugem
19292,2024-10-19T12:17:37Z,@byonechannel2419,No 2 setelah Ngomongin Uang,3,"['no', 'bicara', 'uang']",no bicara uang
19293,2024-10-19T11:01:08Z,@zapz,summon,5,['summon'],summon


# **PERSIAPAN DATA UNTUK INDOBERT DAN ML KLASIK**
# Gunakan kolom 'processed_comment' yang sudah dioptimasi

In [12]:
data_for_models = df[['processed_comment', 'label']] # 'label' diasumsikan ground truth

df_train, df_test_val = train_test_split(data_for_models, test_size=0.2, random_state=42, stratify=data_for_models['label'])
df_val, df_test = train_test_split(df_test_val, test_size=0.5, random_state=42, stratify=df_test_val['label'])

KeyError: "None of [Index(['processed_comment', 'label'], dtype='object')] are in the [columns]"

In [10]:


print('Training data shape:', df_train.shape)
print('Validation data shape:', df_val.shape)
print('Test data shape:', df_test.shape)

plt.figure(figsize=(5, 5))
sns.countplot(x=df_train['label'])
plt.title("Distribusi Label di Data Training")
plt.show()
df_train.to_csv('data_training_indobert.csv', index=False)

plt.figure(figsize=(5, 5))
sns.countplot(x=df_val['label'])
plt.title("Distribusi Label di Data Validasi")
plt.show()
df_val.to_csv('data_validasi_indobert.csv', index=False)

plt.figure(figsize=(5, 5))
sns.countplot(x=df_test['label'])
plt.title("Distribusi Label di Data Testing")
plt.show()
df_test.to_csv('data_testing_indobert.csv', index=False)

# --- INDOBERT MODEL ---
print("\n--- IndoBERT Model Training & Evaluation ---")
bert_tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p2')

print('Kalimat contoh:', df['processed_comment'].iloc[0])
print('BERT Tokenizer:', bert_tokenizer.tokenize(df['processed_comment'].iloc[0]))

bert_input = bert_tokenizer.encode_plus(
    df['processed_comment'].iloc[0],
    add_special_tokens=True,
    padding='max_length',
    truncation='longest_first',
    max_length=50,
    return_attention_mask=True,
    return_token_type_ids=True
)

print('\nKeys from bert_input:', bert_input.keys())
print('Kalimat\t\t:', df['processed_comment'].iloc[0])
print('Tokenizer\t:', bert_tokenizer.convert_ids_to_tokens(bert_input['input_ids']))
print('Input IDs\t:', bert_input['input_ids'])
print('Token Type IDs\t:', bert_input['token_type_ids'])
print('Attention Mask\t:', bert_input['attention_mask'])

token_lens = []
for txt in df['processed_comment']:
    tokens = bert_tokenizer.encode(txt)
    token_lens.append(len(tokens))

sns.histplot(token_lens, kde=True, stat='density', linewidth=0)
plt.xlim([0, 100])
plt.xlabel('Token count')
plt.title("Distribusi Panjang Token Komentar")
plt.show()

MAX_LENGTH = 50

def convert_example_to_feature(sentence):
    return bert_tokenizer.encode_plus(
        sentence,
        add_special_tokens=True,
        padding='max_length',
        truncation='longest_first',
        max_length=MAX_LENGTH,
        return_attention_mask=True,
        return_token_type_ids=True
    )

def map_example_to_dict(input_ids, attention_masks, token_type_ids, label):
    return {
        "input_ids": input_ids,
        "token_type_ids": token_type_ids,
        "attention_mask": attention_masks,
    }, label

def encode_dataset(data_df):
    input_ids_list = []
    token_type_ids_list = []
    attention_mask_list = []
    label_list = []
    for sentence, label in data_df.to_numpy():
        bert_input = convert_example_to_feature(str(sentence))
        input_ids_list.append(bert_input['input_ids'])
        token_type_ids_list.append(bert_input['token_type_ids'])
        attention_mask_list.append(bert_input['attention_mask'])
        label_list.append([label])
    return tf.data.Dataset.from_tensor_slices((input_ids_list, attention_mask_list, token_type_ids_list, label_list)).map(map_example_to_dict)

train_encoded = encode_dataset(df_train).batch(32)
test_encoded = encode_dataset(df_test).batch(32)
val_encoded = encode_dataset(df_val).batch(32)

bert_model = TFBertForSequenceClassification.from_pretrained(
    'indobenchmark/indobert-base-p2', num_labels=3) # 3 kelas: negatif (0), positif (1), netral (2)

bert_model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.00003),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.keras.metrics.SparseCategoricalAccuracy('accuracy'))

print("\nMemulai pelatihan model IndoBERT...")
bert_history = bert_model.fit(train_encoded, epochs=5,
                              batch_size=32, validation_data=val_encoded)

def plot_graphs(history, string):
    plt.plot(history.history[string])
    plt.plot(history.history['val_'+string])
    plt.xlabel('Epochs')
    plt.ylabel(string)
    plt.legend([string, 'val_'+string])
    plt.title(f"Training and Validation {string.capitalize()}")
    plt.show()

plot_graphs(bert_history, 'accuracy')
plot_graphs(bert_history, 'loss')

print('\nEpoch No.  Train Accuracy   Train Loss         Val Accuracy     Val Loss')
for i in range(len(bert_history.history['accuracy'])):
    print('{:8d} {:10f} \t {:10f} \t {:10f} \t {:10f}'.format(i + 1, bert_history.history['accuracy'][i],
                                                             bert_history.history['loss'][i],
                                                             bert_history.history['val_accuracy'][i],
                                                             bert_history.history['val_loss'][i]))

bert_model.save_weights('bert-model-youtube-sentiment.h5')
print("\nBobot model IndoBERT berhasil disimpan.")

print("\nEvaluasi model IndoBERT pada data testing...")
score = bert_model.evaluate(test_encoded)
print(f"Test Accuracy: {score[1]:.4f}")

predicted_raw = bert_model.predict(test_encoded)
y_pred_bert = np.argmax(predicted_raw['logits'], axis=1)
# Ekstrak label asli dari tf.data.Dataset
y_true_bert = np.array([label for _, label_tensor in test_encoded.unbatch() for label in label_tensor.numpy().flatten()])

print("\nAccuracy Score (IndoBERT):", accuracy_score(y_true_bert, y_pred_bert))

# Menampilkan Classification Report dengan label yang benar
sentiment_labels_numeric = {0: 'negatif', 1: 'positif', 2: 'netral'}
y_true_bert_named = np.array([sentiment_labels_numeric[val] for val in y_true_bert])
y_pred_bert_named = np.array([sentiment_labels_numeric[val] for val in y_pred_bert])

print("\nClassification Report (IndoBERT):\n", classification_report(y_true_bert_named, y_pred_bert_named, zero_division=0))

confm_bert = confusion_matrix(y_true_bert, y_pred_bert)
columns_bert = ['negatif','positif','netral'] # Urutan ini harus cocok dengan mapping numerik 0, 1, 2
df_cm_bert = pd.DataFrame(confm_bert, index=columns_bert, columns=columns_bert)
ax_bert = sns.heatmap(df_cm_bert, cmap='Blues', annot=True, fmt='d')
ax_bert.set_title('Confusion Matrix IndoBERT')
ax_bert.set_xlabel('Label Prediksi')
ax_bert.set_ylabel('Label Sebenarnya')
plt.show()

# --- TRADITIONAL MACHINE LEARNING MODEL COMPARISON ---
print("\n--- Traditional Machine Learning Model Comparison ---")

# Persiapan data untuk ML Klasik
# Menggunakan processed_comment sebagai fitur teks dan 'label' sebagai target
X_train_ml = df_train['processed_comment']
y_train_ml = df_train['label']
X_test_ml = df_test['processed_comment']
y_test_ml = df_test['label']

# Inisialisasi TF-IDF Vectorizer
# Anda bisa menyesuaikan parameter seperti max_features, min_df, max_df
vectorizer = TfidfVectorizer(max_features=5000, min_df=5, max_df=0.8)
X_train_vectorized = vectorizer.fit_transform(X_train_ml)
X_test_vectorized = vectorizer.transform(X_test_ml)

# Simpan vectorizer
joblib.dump(vectorizer, 'vectorizer_ml.pkl')

models = {
    "LinearSVC": LinearSVC(random_state=42, max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "Logistic Regression": LogisticRegression(random_state=42, max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42, n_estimators=100),
    "KNN": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42, n_estimators=100),
    "AdaBoost": AdaBoostClassifier(random_state=42, n_estimators=100),
    "SGD Classifier": SGDClassifier(random_state=42, max_iter=1000, tol=1e-3),
    "MLP Classifier": MLPClassifier(random_state=42, max_iter=300),
    "Dummy Classifier": DummyClassifier(strategy="most_frequent")
}

results = {}

for name, model in models.items():
    print(f"\n--- Evaluasi Model: {name} ---")
    model.fit(X_train_vectorized, y_train_ml)
    y_pred_ml = model.predict(X_test_vectorized)

    acc = accuracy_score(y_test_ml, y_pred_ml)
    prec = precision_score(y_test_ml, y_pred_ml, average='weighted', zero_division=0)
    rec = recall_score(y_test_ml, y_pred_ml, average='weighted', zero_division=0)
    f1 = f1_score(y_test_ml, y_pred_ml, average='weighted', zero_division=0)

    results[name] = {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "F1-Score": f1
    }

    # Menampilkan Classification Report
    sentiment_labels_numeric_ml = {0: 'negatif', 1: 'positif', 2: 'netral'}
    y_true_ml_named = np.array([sentiment_labels_numeric_ml[val] for val in y_test_ml])
    y_pred_ml_named = np.array([sentiment_labels_numeric_ml[val] for val in y_pred_ml])

    print(classification_report(y_true_ml_named, y_pred_ml_named, zero_division=0))

    # Confusion Matrix
    cm_ml = confusion_matrix(y_test_ml, y_pred_ml, labels=[0, 1, 2]) # Sesuaikan dengan mapping numerik Anda
    df_cm_ml = pd.DataFrame(cm_ml, index=['negatif', 'positif', 'netral'], columns=['negatif', 'positif', 'netral'])
    sns.heatmap(df_cm_ml, cmap='Blues', annot=True, fmt='d')
    plt.title(f'Confusion Matrix - {name}')
    plt.xlabel('Label Prediksi')
    plt.ylabel('Label Sebenarnya')
    plt.show()

results_df = pd.DataFrame(results).T.sort_values(by='F1-Score', ascending=False)
print("\nPerbandingan Performa Model Klasifikasi:")
from IPython.display import display # Perlu import ini jika ingin display() bekerja di non-Jupyter notebook
display(results_df.style.background_gradient(cmap='Blues').format("{:.2%}"))

# --- TRAINING DAN SIMPAN MODEL FINAL (CONTOH: DECISION TREE) ---
# Anda bisa memilih model terbaik dari hasil perbandingan di atas
print("\n--- Training Final Model (contoh: Decision Tree) ---")
final_ml_model = DecisionTreeClassifier(random_state=42) # Ganti dengan model terbaik Anda jika berbeda
final_ml_model.fit(X_train_vectorized, y_train_ml)

y_pred_final_ml = final_ml_model.predict(X_test_vectorized)

print(classification_report(y_test_ml, y_pred_final_ml, zero_division=0))

cm_final_ml = confusion_matrix(y_test_ml, y_pred_final_ml, labels=final_ml_model.classes_)

df_cm_final_ml = pd.DataFrame(cm_final_ml, index=['negatif', 'positif', 'netral'], columns=['negatif', 'positif', 'netral'])
sns.heatmap(df_cm_final_ml, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix - Final ML Model (Decision Tree)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

print(f"Akurasi             : {accuracy_score(y_test_ml, y_pred_final_ml):.2f}")
print(f"Precision (Weighted): {precision_score(y_test_ml, y_pred_final_ml, average='weighted', zero_division=0):.2f}")
print(f"Recall (Weighted)   : {recall_score(y_test_ml, y_pred_final_ml, average='weighted', zero_division=0):.2f}")
print(f"F1-Score (Weighted) : {f1_score(y_test_ml, y_pred_final_ml, average='weighted', zero_division=0):.2f}")

joblib.dump(final_ml_model, 'sentiment_ml_model.pkl') # Beri nama file yang jelas
print("\nModel ML Klasik final berhasil disimpan.")


# --- FUNGSI PREDIKSI UNTUK DEPLOYMENT SIMULASI ---
# Load kembali model dan vectorizer (simulasi deployment)
loaded_ml_model = joblib.load('sentiment_ml_model.pkl')
loaded_vectorizer = joblib.load('vectorizer_ml.pkl')

# Fungsi prediksi menggunakan model ML Klasik
def predict_sentiment_ml_inference(comment):
    # Pra-proses komentar menggunakan fungsi yang sama
    processed_comment = my_full_text_preprocessor_optimized(comment)
    # Vektorisasi komentar
    vect_comment = loaded_vectorizer.transform([processed_comment])
    # Prediksi sentimen numerik
    pred_numeric = loaded_ml_model.predict(vect_comment)[0]
    # Konversi numerik ke label teks
    sentiment_map = {0: 'negatif', 1: 'positif', 2: 'netral'}
    return sentiment_map[pred_numeric]

# Fungsi prediksi menggunakan model IndoBERT
def predict_sentiment_bert_inference(comment):
    processed_comment = my_full_text_preprocessor_optimized(comment)
    # Tokenisasi dan encode untuk BERT
    bert_input = convert_example_to_feature(processed_comment)
    input_dict = {
        "input_ids": tf.constant([bert_input['input_ids']], dtype=tf.int32),
        "token_type_ids": tf.constant([bert_input['token_type_ids']], dtype=tf.int32),
        "attention_mask": tf.constant([bert_input['attention_mask']], dtype=tf.int32)
    }
    # Prediksi
    outputs = bert_model(input_dict)
    logits = outputs.logits
    pred_numeric = tf.argmax(logits, axis=1).numpy()[0]
    sentiment_map = {0: 'negatif', 1: 'positif', 2: 'netral'}
    return sentiment_map[pred_numeric]

print("\n--- Simulasi Prediksi Komentar Baru ---")
user_input = input("Masukkan komentar YouTube: ")

# Prediksi menggunakan model ML Klasik
result_ml_inf = predict_sentiment_ml_inference(user_input)
print(f"\nSentimen prediksi (ML Klasik): {result_ml_inf}")

# Prediksi menggunakan model IndoBERT
result_bert_inf = predict_sentiment_bert_inference(user_input)
print(f"Sentimen prediksi (IndoBERT): {result_bert_inf}")

# Prediksi menggunakan model Leksikon
result_lexicon_inf = prediksiSentiment_lexicon_advanced(
    my_full_text_preprocessor_optimized(user_input),
    sentiment_lexicon, all_negation_words, intensifiers, reducers
)
print(f"Sentimen prediksi (Leksikon): {result_lexicon_inf}")


ValueError: Could not interpret value `sentiment_indobert` for `x`. An entry with this name does not appear in `data`.

<Figure size 600x400 with 0 Axes>