<a href="https://colab.research.google.com/github/Shiro03kuuhaku/Comparison-of-LSTM-CNN-GRU-and-IndoBERTweet-Methods/blob/main/UAS_DL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install pinned 2.11.0 builds of TensorFlow and the tf-keras package (quiet pip output).
%pip install tf-keras==2.11.0 tensorflow==2.11.0 --quiet
import os
# Set before TensorFlow is imported (that import happens in the next cell).
# NOTE(review): presumably this flag asks TensorFlow to use the legacy tf-keras
# implementation; confirm it has an effect on the pinned 2.11 release.
os.environ["TF_USE_LEGACY_KERAS"] = "1"

In [None]:
# === 1. Import & Preprocessing ===
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns
import tensorflow as tf
import re, string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from nltk.corpus import stopwords
import nltk; nltk.download('stopwords')
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from tensorflow.keras.optimizers import Adam

# Prompt for a file upload in Colab. The returned mapping is not used afterwards;
# the read below expects the upload to have provided "PRDECT_ID.csv".
from google.colab import files
uploaded = files.upload()

# === 2. Load Dataset ===
df = pd.read_csv("PRDECT_ID.csv")
# Strip stray whitespace from the header names so the rename below matches them.
df.columns = df.columns.str.strip()
# Give the two columns used by the rest of the script short, lowercase names.
df.rename(columns={'Customer Review': 'review', 'Emotion': 'emotion'}, inplace=True)
# Rows without a review text or an emotion label cannot be used; drop them.
df.dropna(subset=['review', 'emotion'], inplace=True)

# === 3. Cleaning & Encoding ===
# Built once as a set so the per-word membership test in clean_text is O(1).
stop_words = set(stopwords.words('indonesian'))
def clean_text(t):
    """Lower-case a review, keep only letters, and drop Indonesian stopwords.

    Every non-letter character (punctuation, digits, emoji, whitespace) is
    turned into a space rather than deleted, so words joined only by
    punctuation such as "bagus,murah" stay two tokens instead of collapsing
    into the single unknown token "bagusmurah".

    Args:
        t: Raw review text (a string).

    Returns:
        The remaining words joined by single spaces; may be an empty string.
    """
    spaced = "".join(c if c.isalpha() else " " for c in t.lower())
    return " ".join(w for w in spaced.split() if w not in stop_words)

# Normalised review text used as model input.
df['clean_review'] = df['review'].apply(clean_text)
# Map the emotion names to integer class ids.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['emotion'])
num_classes = len(label_encoder.classes_)

# === 4. Tokenizing LSTM/GRU/CNN ===
y = df['label'].values

# Choose the train/test rows first, so the vocabulary can be learned from the
# training rows only. Splitting row positions with the same test_size and
# random_state selects the same rows as splitting the padded matrix directly.
idx_train, idx_test = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

# Fit the vocabulary on training text only; words that occur only in test
# reviews are encoded as the '<OOV>' token instead of leaking into the vocabulary.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(df['clean_review'].iloc[idx_train])
X_seq = tokenizer.texts_to_sequences(df['clean_review'])
# Pad/truncate every sequence to 100 ids, padding at the end.
X_pad = pad_sequences(X_seq, maxlen=100, padding='post')

X_train, X_test = X_pad[idx_train], X_pad[idx_test]
y_train, y_test = y[idx_train], y[idx_test]

# === 5. Build & Train LSTM ===
# Embedding -> LSTM -> dropout -> softmax over the emotion classes.
model_lstm = Sequential([
    # mask_zero=True: the inputs are padded at the end up to length 100, and
    # without a mask the LSTM keeps updating its state on the padding after the
    # real tokens end, so its final state mostly reflects padding for short reviews.
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=100, mask_zero=True),
    LSTM(64),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
# Labels are integer class ids, hence the sparse cross-entropy loss.
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# 10% of the training rows are held out by Keras for the per-epoch validation metrics.
hist_lstm = model_lstm.fit(X_train, y_train, validation_split=0.1, epochs=3, batch_size=32)
model_lstm.save("model_lstm.h5")

# === 6. Build & Train GRU ===
# Embedding -> GRU -> dropout -> softmax over the emotion classes.
model_gru = Sequential([
    # mask_zero=True: the inputs are padded at the end up to length 100, and
    # without a mask the GRU keeps updating its state on the padding after the
    # real tokens end, so its final state mostly reflects padding for short reviews.
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=100, mask_zero=True),
    GRU(64),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])
# Labels are integer class ids, hence the sparse cross-entropy loss.
model_gru.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# 10% of the training rows are held out by Keras for the per-epoch validation metrics.
hist_gru = model_gru.fit(X_train, y_train, validation_split=0.1, epochs=3, batch_size=32)
model_gru.save("model_gru.h5")

# === 7. Build & Train CNN ===
# 1-D convolutional classifier assembled layer by layer: embedding, one
# width-5 convolution, global max pooling, a dense hidden layer with dropout,
# and a softmax output over the emotion classes.
model_cnn = Sequential()
model_cnn.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=100))
model_cnn.add(Conv1D(128, 5, activation='relu'))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(0.5))
model_cnn.add(Dense(num_classes, activation='softmax'))

# Integer class ids as targets, so the sparse cross-entropy loss is used.
model_cnn.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
hist_cnn = model_cnn.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=3,
    validation_split=0.1,
)
model_cnn.save("model_cnn.h5")

# === 8. IndoBERTweet ===
# NOTE(review): the heading says IndoBERTweet, but the checkpoint loaded below is
# "cahya/bert-base-indonesian-522M"; confirm which model the comparison should use.
bert_tokenizer = AutoTokenizer.from_pretrained("cahya/bert-base-indonesian-522M")

# Fine-tune on the training portion only. Using the same test_size and
# random_state as the LSTM/GRU/CNN split selects the same rows, so the 20% test
# rows are not part of the data this model is trained on.
bert_texts_train, _, bert_y_train, _ = train_test_split(
    df['clean_review'].tolist(), df['label'].values, test_size=0.2, random_state=42)

bert_enc_train = bert_tokenizer(bert_texts_train, padding=True, truncation=True, max_length=128, return_tensors='tf')
# Feed (features dict, integer label) pairs in batches of 16.
dataset = tf.data.Dataset.from_tensor_slices(({
    'input_ids': bert_enc_train['input_ids'],
    'attention_mask': bert_enc_train['attention_mask']
}, bert_y_train)).batch(16)

bert_model = TFAutoModelForSequenceClassification.from_pretrained("cahya/bert-base-indonesian-522M", num_labels=num_classes)
# from_logits=True: the loss is configured for raw, un-normalised class scores.
bert_model.compile(optimizer=Adam(learning_rate=2e-5),
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                   metrics=['accuracy'])
bert_model.fit(dataset, epochs=3)
bert_model.save_pretrained("bert_model")

# === 9. Evaluation Function ===
def evaluate_model(model, X, y_true, name, is_bert=False):
    """Print a classification report and show a confusion-matrix heatmap.

    Args:
        model: Fitted model exposing ``predict``.
        X: Inputs passed unchanged to ``model.predict``.
        y_true: Integer class ids for the rows in ``X``.
        name: Model name shown in the printed header and the plot title.
        is_bert: When true, the class scores are read from the ``'logits'``
            entry of the prediction instead of using the prediction directly.

    Returns:
        None. The report is printed and the heatmap is shown.
    """
    raw = model.predict(X)
    scores = raw['logits'] if is_bert else raw
    y_pred = np.argmax(scores, axis=1)

    class_names = label_encoder.classes_
    # Pin the full label set: if a class is missing from both y_true and y_pred,
    # the report would otherwise reject target_names and the matrix would have
    # fewer rows/columns than the tick labels drawn on it.
    label_ids = np.arange(len(class_names))

    print(f"\n--- {name} ---")
    print(classification_report(y_true, y_pred, labels=label_ids,
                                target_names=class_names, zero_division=0))
    cm = confusion_matrix(y_true, y_pred, labels=label_ids)
    sns.heatmap(cm, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
    plt.title(f'Confusion Matrix: {name}'); plt.xlabel('Pred'); plt.ylabel('True'); plt.show()

# === 10. Evaluation ===
# Score each Keras model on the held-out split, in the order LSTM, GRU, CNN.
for keras_model, model_name in (
    (model_lstm, "LSTM"),
    (model_gru, "GRU"),
    (model_cnn, "CNN"),
):
    evaluate_model(keras_model, X_test, y_test, model_name)

# === 11. Evaluate BERT ===
# Score BERT on the 20% test rows instead of the whole dataset. Using the same
# test_size and random_state as the LSTM/GRU/CNN split selects the same rows,
# so all four reports are computed over the same reviews.
_, bert_texts_test, _, bert_y_test = train_test_split(
    df['clean_review'].tolist(), df['label'].values, test_size=0.2, random_state=42)

bert_enc_test = bert_tokenizer(bert_texts_test, padding=True, truncation=True, max_length=128, return_tensors='tf')
bert_test_dataset = {
    'input_ids': bert_enc_test['input_ids'],
    'attention_mask': bert_enc_test['attention_mask']
}
evaluate_model(bert_model, bert_test_dataset, bert_y_test, "BERT", is_bert=True)

# === 12. Plot Accuracy & Loss ===
def plot_history(history, title):
    """Plot per-epoch accuracy and loss curves side by side.

    The training curve is always drawn. The validation curve is drawn only
    when the history contains the matching ``val_*`` entry, so a history
    recorded without validation data plots without raising ``KeyError``.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value lists (as returned by ``model.fit``).
        title: Model name used as the prefix of both subplot titles.
    """
    logs = history.history
    plt.figure(figsize=(10,4))
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
    for position, (metric, heading) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(logs[metric], label='train')
        val_key = f'val_{metric}'
        if val_key in logs:
            plt.plot(logs[val_key], label='val')
        plt.title(f'{title} {heading}'); plt.legend()
    plt.show()

# Draw the training curves of the three Keras models, in the order LSTM, GRU, CNN.
for fit_history, model_name in (
    (hist_lstm, "LSTM"),
    (hist_gru, "GRU"),
    (hist_cnn, "CNN"),
):
    plot_history(fit_history, model_name)
