In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


ModuleNotFoundError: No module named 'pandas'

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

ModuleNotFoundError: No module named 'sklearn'

In [6]:
# Load dataset
# Read the labeled dataset CSV into `df`, then print the first rows as a
# quick sanity check of the columns that were loaded.
dataset_path = 'dataset_labeled.csv'
df = pd.read_csv(dataset_path)
print(df.head())

          userName                                            content  score  \
0  Pengguna Google  ada bug pas update, login gak bisa katanya jar...      1   
1  Pengguna Google  LEGEND PETIR KETEMU RANDOM MASIH BANYAK TOLOLN...      2   
2  Pengguna Google                                                  👍      5   
3  Pengguna Google  game busuk match tidak seimbang sama sekali ma...      1   
4  Pengguna Google  oke.terima kasih.gamenya Uda bagus.karna saya ...      5   

     label                   at  
0  negatif  2025-05-22 22:57:27  
1  negatif  2025-05-20 23:51:29  
2  positif  2025-05-17 21:03:18  
3  negatif  2025-05-09 14:53:58  
4  positif  2025-05-07 01:14:33  


In [None]:
# Drop unused columns
# Keep only the text and its label, then discard rows where either is missing.
# The result is built as a fresh, independent frame: calling
# dropna(inplace=True) on a column selection can trigger pandas'
# SettingWithCopyWarning, both here and on later column assignments to `df`.
df = df[['content', 'label']].dropna().copy()

In [None]:
# Label encoding
# Fit the encoder on the string labels, then overwrite the column with the
# integer codes. `label_encoder` stays in scope so the codes can be mapped
# back to the original label names later on.
label_encoder = LabelEncoder()
label_encoder.fit(df['label'])
df['label'] = label_encoder.transform(df['label'])

In [None]:
# Tokenization
# Cast the text column to str once and reuse it for fitting and encoding.
texts = df['content'].astype(str)

# Tokenizer configured with num_words=20000; words it does not know are
# mapped to the "<OOV>" token.
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)

# Turn each text into a list of integer ids, then pad at the end ('post')
# so that every row has length 100.
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post')

In [None]:
# Data split (three scenarios)
# Each experiment is (train fraction, model label, use_avg_pooling).
# NOTE(review): the model label is only carried through to the results; the
# architecture is chosen by the use_avg_pooling flag alone, so the second
# scenario is labeled 'LSTM' although it uses average pooling -- confirm the
# intended label.
experiments = [
    (0.8, 'LSTM', False),
    (0.8, 'LSTM', True),
    (0.7, 'LSTM', False)
]

SEED = 42

# Derive the model dimensions from the fitted preprocessing objects instead of
# repeating literals, so this cell cannot drift out of sync with them.
vocab_size = tokenizer.num_words or len(tokenizer.word_index) + 1
max_len = padded_sequences.shape[1]
num_classes = len(label_encoder.classes_)


def _build_model(use_avg_pooling, vocab_size, num_classes):
    """Build and compile the text classifier for one scenario.

    Args:
        use_avg_pooling: if True, the embeddings are reduced with
            GlobalAveragePooling1D; otherwise with an LSTM(64) layer.
        vocab_size: input dimension of the Embedding layer.
        num_classes: number of units in the softmax output layer.

    Returns:
        A compiled Sequential model (sparse categorical cross-entropy loss,
        Adam optimizer, accuracy metric).
    """
    model = Sequential()
    # `input_length` is intentionally omitted: it is deprecated in Keras 3 and
    # the sequence length is inferred from the data on the first call.
    model.add(Embedding(input_dim=vocab_size, output_dim=128))
    if use_avg_pooling:
        model.add(GlobalAveragePooling1D())
    else:
        model.add(LSTM(64))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# The sample texts and their padded encoding do not depend on the scenario,
# so they are prepared once outside the loop.
sample_texts = [
    "fitur update terbaru sangat bagus dan berguna",
    "game crash terus padahal koneksi lancar",
    "oke lah untuk game casual"
]
sample_seq = tokenizer.texts_to_sequences(sample_texts)
sample_pad = pad_sequences(sample_seq, maxlen=max_len, padding='post')

# Convert once to plain arrays; the split below then yields arrays directly.
features = np.asarray(padded_sequences)
labels = np.asarray(df['label'])

results = []
for split_ratio, model_type, use_avg_pooling in experiments:
    # Drop Keras state left over from the previous scenario and reseed, so the
    # scenarios are reproducible and start from comparable initial conditions.
    # set_random_seed seeds Python's `random`, NumPy and TensorFlow.
    tf.keras.backend.clear_session()
    tf.keras.utils.set_random_seed(SEED)

    # Pass the train fraction directly: `1 - split_ratio` is not exact in
    # floating point (1 - 0.7 == 0.30000000000000004) and scikit-learn rounds
    # the test count up, which can shift the split by one sample.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, train_size=split_ratio, random_state=SEED, stratify=labels
    )

    model = _build_model(use_avg_pooling, vocab_size, num_classes)

    # The held-out test split doubles as the validation data, so the last
    # epoch's val_accuracy is what gets reported as test accuracy.
    history = model.fit(X_train, y_train, epochs=5, batch_size=32,
                        validation_data=(X_test, y_test), verbose=2)

    train_acc = history.history['accuracy'][-1]
    val_acc = history.history['val_accuracy'][-1]
    results.append((split_ratio, model_type, use_avg_pooling, train_acc, val_acc))

    print("\nClassification Report for Test Set")
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)
    # Explicit `labels` keeps target_names aligned with the encoder's classes
    # even if a class is absent from both y_test and the predictions.
    print(classification_report(y_test, y_pred_classes,
                                labels=np.arange(num_classes),
                                target_names=label_encoder.classes_))

    # Qualitative check: predict labels for a few hand-written sample texts.
    predictions = model.predict(sample_pad)
    predicted_labels = label_encoder.inverse_transform(np.argmax(predictions, axis=1))
    for text, label in zip(sample_texts, predicted_labels):
        print(f"Text: {text} => Predicted Label: {label}")


In [None]:
# Summarize every experiment on one line: train split size, model label,
# pooling flag, and the final-epoch train / test accuracies.
print("\nExperiment Results:")
for split_ratio, model_type, use_avg_pooling, train_acc, val_acc in results:
    print(f"Split: {split_ratio*100:.0f}%, Model: {model_type}, Pooling: {use_avg_pooling}, Train Acc: {train_acc*100:.2f}%, Test Acc: {val_acc*100:.2f}%")