In [53]:
# Import library
import pandas as pd
import numpy as np
import keras
import tensorflow as tf

from preprocessing import LSTMPreprocess

In [54]:
# Load Dataset
train_set = pd.read_csv("data/train.csv")
valid_set = pd.read_csv("data/valid.csv")
test_set = pd.read_csv("data/test.csv")

In [55]:
# Preprocess dataset (SPLIT)
label_map = {"positive": 0, "neutral": 1, "negative": 2}
num_classes = len(label_map)

print("Memproses dataset...")
train_texts = train_set["text"]
train_labels = np.array(train_set["label"].map(lambda x: label_map[x]))

test_texts = test_set["text"]
test_labels = np.array(test_set["label"].map(lambda x: label_map[x]))

valid_texts = valid_set["text"]
valid_labels = np.array(valid_set["label"].map(lambda x: label_map[x]))


Memproses dataset...


In [56]:
# Preprocess dataset (Tokenization)
preprocess_cls = LSTMPreprocess()

train_token = preprocess_cls.tokenization(train_texts)
test_token = preprocess_cls.tokenization(test_texts)
valid_token = preprocess_cls.tokenization(valid_texts)

In [57]:
# Model

embedding_dim = 128  # Dimensi vektor embedding
lstm_units = 64  # Jumlah unit di layer LSTM
dropout_rate = 0.5  # Rate untuk dropout
vocab_size = 10000  # Vocabulary Size

# Bangun model Sequential Keras
model = keras.Sequential(
    [
        # 1. Embedding Layer
        keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True
        ),
        # 2. Bidirectional LSTM Layer
        keras.layers.Bidirectional(keras.layers.LSTM(lstm_units)),
        # 3. Dropout Layer
        keras.layers.Dropout(dropout_rate),
        # 4. Dense Layer (Hidden)
        keras.layers.Dense(32, activation="relu"),
        keras.layers.Dropout(0.3),
        # 5. Dense Layer (Output)
        keras.layers.Dense(num_classes, activation="softmax"),
    ]
)

model.build(input_shape=train_token.shape)

# Summary
model.summary()

# Kompilasi model
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)

In [58]:
# Train
epochs = 100
batch_size = 64

history = model.fit(
    train_token,
    train_labels,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(valid_token, valid_labels),
    # callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)]
)

Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 139ms/step - accuracy: 0.3786 - loss: 1.0949 - val_accuracy: 0.3800 - val_loss: 1.0889
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 93ms/step - accuracy: 0.4891 - loss: 1.0687 - val_accuracy: 0.4500 - val_loss: 1.0716
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step - accuracy: 0.5391 - loss: 1.0190 - val_accuracy: 0.4800 - val_loss: 1.0329
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step - accuracy: 0.5469 - loss: 0.9140 - val_accuracy: 0.4800 - val_loss: 1.0034
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 117ms/step - accuracy: 0.7075 - loss: 0.6649 - val_accuracy: 0.5300 - val_loss: 1.1273
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 122ms/step - accuracy: 0.8794 - loss: 0.4255 - val_accuracy: 0.4900 - val_loss: 1.4681
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━

In [59]:
# Simpan bobot model
model.save("lstm_keras.h5")
print("Model Keras telah disimpan ke lstm_keras.h5")

# (Opsional) Simpan seluruh model
# model.save("lstm_model.keras")



Model Keras telah disimpan ke lstm_keras.h5


In [60]:
# Evaluasi model pada data test
print("\nMengevaluasi model pada data test...")
# Use one-hot encoded test_labels for evaluation
loss, accuracy = model.evaluate(test_token, test_labels, verbose=0)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")


# Menghitung F1 Score
y_pred_probs = model.predict(test_token)

# One-hot encode the true labels (y_true)
test_labels_one_hot = tf.keras.utils.to_categorical(test_labels, num_classes=num_classes)

f1 = keras.metrics.F1Score(average='macro')
f1.update_state(test_labels_one_hot, y_pred_probs)
print(f"Test F1 Score: {f1.result().numpy():.4f}")



# Prediksi contoh
print("\nMelakukan prediksi pada beberapa data test...")
predictions = model.predict(test_token[:5])
predicted_labels = np.argmax(predictions, axis=1)

label_reverse_map = {v: k for k, v in label_map.items()}

for i in range(5):
    print(f"Ke-{i+1}")
    # Use original integer test_labels_int for comparison here
    print(f"Label Asli: {label_reverse_map[test_labels[i]]}")
    print(f"Label Prediksi: {label_reverse_map[predicted_labels[i]]}\n")


Mengevaluasi model pada data test...
Test Loss: 6.1082
Test Accuracy: 0.3975
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step
Test F1 Score: 0.3815

Melakukan prediksi pada beberapa data test...
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
Ke-1
Label Asli: positive
Label Prediksi: negative

Ke-2
Label Asli: neutral
Label Prediksi: negative

Ke-3
Label Asli: negative
Label Prediksi: positive

Ke-4
Label Asli: positive
Label Prediksi: negative

Ke-5
Label Asli: neutral
Label Prediksi: neutral

