In [35]:
import pickle

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional, Conv1D, MaxPooling1D
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [36]:
# Load the raw reviews and keep only the rows usable for supervised training.
# A row needs review text (`content`) to be tokenized and a rating (`score`)
# to be labeled: a missing score compares False against every threshold in
# `map_sentiment`, so it would otherwise be silently labeled 'positif'.
data = pd.read_csv('gojek_reviews.csv')
data = data.dropna(subset=['content', 'score'])

## Labeling Sentimen dan Membagi Data ke Training & Testing

In [27]:
def map_sentiment(score):
    """Translate a numeric review rating into a sentiment label.

    Args:
        score: The rating attached to a review.

    Returns:
        'negatif' when the rating is 2 or lower, 'netral' when it is exactly
        3, and 'positif' for every other value.
    """
    # Guard clauses, checked in the same order as the rating scale.
    if score <= 2:
        return 'negatif'
    if score == 3:
        return 'netral'
    return 'positif'

# Derive the sentiment label of every review from its numeric rating.
data['sentiment'] = data['score'].apply(map_sentiment)

# Features are the raw review texts; targets are the derived sentiment labels.
X, y = data['content'], data['sentiment']

# Hold out 25% of the reviews for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

## Tokenisasi dan Padding Teks Review untuk Input Model Deep Learning

In [28]:
# Every review is padded (with trailing zeros) to this many tokens.
max_length = 100

# The vocabulary is learned from the training reviews only; words that are not
# kept in the vocabulary are mapped to the '<OOV>' token.
tokenizer = Tokenizer(num_words=50000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Training split: text -> integer sequences -> fixed-length padded matrix.
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length, padding='post')

# Test split: same transformation, reusing the vocabulary fitted above.
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length, padding='post')

## Encoding Label Sentimen

In [29]:
# Encode the string sentiment labels as integers (the encoder is fitted on the
# training labels only), then one-hot encode them for the softmax classifiers.
# `num_classes` is passed explicitly: without it `to_categorical` infers the
# width from the largest label present, so a split that happens to miss the
# highest-indexed class would yield a matrix with too few columns.
le = LabelEncoder()
y_train_enc = to_categorical(le.fit_transform(y_train), num_classes=len(le.classes_))
y_test_enc = to_categorical(le.transform(y_test), num_classes=len(le.classes_))

## Early Stopping untuk Menghindari Overfitting saat Training Model

In [30]:
# Stop training once validation accuracy has gone two epochs without improving
# (patience=2) and restore the weights of the best-scoring epoch.
early_stop = EarlyStopping(
    monitor='val_accuracy',
    patience=2,
    restore_best_weights=True,
)

## Skema 1 - CNN + LSTM

In [17]:
# Scheme 1 (CNN + LSTM): a 1-D convolution scans windows of 5 embedded tokens,
# max-pooling with pool_size=2 shortens the sequence, and an LSTM summarizes
# the pooled sequence before the 3-way softmax classifier.
#
# `input_length` is no longer passed to Embedding: the argument is deprecated
# in Keras 3 (it only emits a warning) and the sequence length is taken from
# the padded input when the model is first fitted.
model1 = Sequential()
model1.add(Embedding(input_dim=50000, output_dim=128))  # input_dim matches the tokenizer's num_words
model1.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model1.add(MaxPooling1D(pool_size=2))
model1.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model1.add(Dropout(0.5))
model1.add(Dense(3, activation='softmax'))  # one output unit per sentiment class

model1.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as the validation set, so early stopping
# is tuned on the same data that is later used to report performance.
history1 = model1.fit(
    X_train_pad,
    y_train_enc,
    epochs=10,
    batch_size=128,
    validation_data=(X_test_pad, y_test_enc),
    callbacks=[early_stop],
)

Epoch 1/10




[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 412ms/step - accuracy: 0.4107 - loss: 1.0607 - val_accuracy: 0.6433 - val_loss: 0.8653
Epoch 2/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 402ms/step - accuracy: 0.6411 - loss: 0.8613 - val_accuracy: 0.7137 - val_loss: 0.6813
Epoch 3/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 404ms/step - accuracy: 0.7147 - loss: 0.7105 - val_accuracy: 0.7290 - val_loss: 0.6505
Epoch 4/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 385ms/step - accuracy: 0.7383 - loss: 0.6299 - val_accuracy: 0.8373 - val_loss: 0.4417
Epoch 5/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 431ms/step - accuracy: 0.8446 - loss: 0.4278 - val_accuracy: 0.8893 - val_loss: 0.3355
Epoch 6/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 401ms/step - accuracy: 0.8742 - loss: 0.3661 - val_accuracy: 0.9040 - val_loss: 0.2974
Epoch 7/10
[1m71/71[0m [32m━━━

## Skema 2 - BiLSTM

In [18]:
# Scheme 2 (BiLSTM): embedded tokens are read by a bidirectional LSTM, followed
# by a small dense head and the 3-way softmax classifier.
#
# `input_length` is no longer passed to Embedding: the argument is deprecated
# in Keras 3 (it only emits a warning) and the sequence length is taken from
# the padded input when the model is first fitted.
model2 = Sequential([
    Embedding(input_dim=50000, output_dim=128),  # input_dim matches the tokenizer's num_words
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(3, activation='softmax'),  # one output unit per sentiment class
])
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as the validation set, so early stopping
# is tuned on the same data that is later used to report performance.
history2 = model2.fit(
    X_train_pad,
    y_train_enc,
    epochs=10,
    batch_size=128,
    validation_data=(X_test_pad, y_test_enc),
    callbacks=[early_stop],
)

Epoch 1/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 644ms/step - accuracy: 0.5529 - loss: 0.9526 - val_accuracy: 0.8477 - val_loss: 0.4376
Epoch 2/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 636ms/step - accuracy: 0.8938 - loss: 0.3418 - val_accuracy: 0.9677 - val_loss: 0.0956
Epoch 3/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 635ms/step - accuracy: 0.9692 - loss: 0.1010 - val_accuracy: 0.9887 - val_loss: 0.0418
Epoch 4/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 641ms/step - accuracy: 0.9817 - loss: 0.0612 - val_accuracy: 0.9807 - val_loss: 0.0386
Epoch 5/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m86s[0m 695ms/step - accuracy: 0.9842 - loss: 0.0574 - val_accuracy: 0.9913 - val_loss: 0.0305
Epoch 6/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 616ms/step - accuracy: 0.9857 - loss: 0.0485 - val_accuracy: 0.9920 - val_loss: 0.0286
Epoch 7/10
[1m71/71[

## Skema 3 - CNN + BiLSTM

In [19]:
# Scheme 3 (CNN + BiLSTM): same convolution + pooling front end as scheme 1,
# but the pooled sequence is summarized by a bidirectional LSTM before the
# 3-way softmax classifier.
#
# `input_length` is no longer passed to Embedding: the argument is deprecated
# in Keras 3 (it only emits a warning) and the sequence length is taken from
# the padded input when the model is first fitted.
model3 = Sequential()
model3.add(Embedding(input_dim=50000, output_dim=128))  # input_dim matches the tokenizer's num_words
model3.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)))
model3.add(Dropout(0.5))
model3.add(Dense(3, activation='softmax'))  # one output unit per sentiment class

model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): the test split doubles as the validation set, so early stopping
# is tuned on the same data that is later used to report performance.
history3 = model3.fit(
    X_train_pad,
    y_train_enc,
    epochs=10,
    batch_size=128,
    validation_data=(X_test_pad, y_test_enc),
    callbacks=[early_stop],
)

Epoch 1/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 592ms/step - accuracy: 0.5763 - loss: 0.9098 - val_accuracy: 0.8847 - val_loss: 0.3682
Epoch 2/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 573ms/step - accuracy: 0.9336 - loss: 0.2320 - val_accuracy: 0.9793 - val_loss: 0.0709
Epoch 3/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 597ms/step - accuracy: 0.9790 - loss: 0.0791 - val_accuracy: 0.9867 - val_loss: 0.0431
Epoch 4/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 600ms/step - accuracy: 0.9852 - loss: 0.0437 - val_accuracy: 0.9877 - val_loss: 0.0367
Epoch 5/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 568ms/step - accuracy: 0.9871 - loss: 0.0411 - val_accuracy: 0.9920 - val_loss: 0.0253
Epoch 6/10
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 562ms/step - accuracy: 0.9889 - loss: 0.0324 - val_accuracy: 0.9907 - val_loss: 0.0254
Epoch 7/10
[1m71/71[

## Skema 4 - Logistic Regression

In [34]:
# Scheme 4: TF-IDF features (at most 5000 terms, vocabulary fitted on the
# training reviews only) fed to a logistic regression classifier.
tfidf3 = TfidfVectorizer(max_features=5000)
X_train_tfidf3 = tfidf3.fit_transform(X_train)
X_test_tfidf3 = tfidf3.transform(X_test)

# `fit` returns the estimator itself, so construction and training can be chained.
lr_model = LogisticRegression(max_iter=1000).fit(X_train_tfidf3, y_train)
lr_preds = lr_model.predict(X_test_tfidf3)

# Report overall accuracy followed by per-class precision / recall / F1.
lr_accuracy = accuracy_score(y_test, lr_preds)
print("Akurasi Logistic Regression:", lr_accuracy)
print(classification_report(y_test, lr_preds))

Akurasi Logistic Regression: 0.9813333333333333
              precision    recall  f1-score   support

     negatif       0.98      1.00      0.99      1192
      netral       0.99      0.95      0.97       611
     positif       0.98      0.98      0.98      1197

    accuracy                           0.98      3000
   macro avg       0.98      0.98      0.98      3000
weighted avg       0.98      0.98      0.98      3000



## Skema 5 - Naive Bayes

In [33]:
# Scheme 5: TF-IDF features (at most 5000 terms, vocabulary fitted on the
# training reviews only) fed to a multinomial Naive Bayes classifier.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# `fit` returns the estimator itself, so construction and training can be chained.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train)
nb_preds = nb_model.predict(X_test_tfidf)

# Report overall accuracy followed by per-class precision / recall / F1.
nb_accuracy = accuracy_score(y_test, nb_preds)
print("Akurasi Naive Bayes:", nb_accuracy)
print(classification_report(y_test, nb_preds))

Akurasi Naive Bayes: 0.9383333333333334
              precision    recall  f1-score   support

     negatif       0.89      0.99      0.94      1192
      netral       0.99      0.83      0.91       611
     positif       0.96      0.94      0.95      1197

    accuracy                           0.94      3000
   macro avg       0.95      0.92      0.93      3000
weighted avg       0.94      0.94      0.94      3000



## Menyimpan Tokenizer dan Label Encoder ke File untuk Inference Model

In [31]:
# Persist the fitted preprocessing objects so inference code can turn raw text
# into padded sequences and map predicted class indices back to sentiment
# labels exactly as during training.
# Requires `pickle` from the notebook's import cell; without that import this
# cell raises NameError on a fresh kernel.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(le, encoder_file)

In [32]:
# `model4` is never defined in this notebook, so saving it raised NameError on
# a fresh kernel. Save instead whichever of the three trained deep-learning
# models recorded the highest validation accuracy during training (ties go to
# the earlier scheme).
# NOTE(review): confirm this is the model the inference code expects.
trained_models = [(model1, history1), (model2, history2), (model3, history3)]
best_model = max(
    trained_models,
    key=lambda pair: max(pair[1].history['val_accuracy']),
)[0]
best_model.save('best_sentiment_model2.h5')



## Kesimpulan

- Model terbaik dari sisi akurasi dan generalisasi adalah BiLSTM dan CNN-BiLSTM; keduanya mencapai akurasi validasi sekitar 0,99.

- Logistic Regression masih sangat kompetitif: akurasinya (sekitar 0,98) mendekati model deep learning, sehingga cocok untuk deployment cepat.

- Naive Bayes (akurasi sekitar 0,94) cocok digunakan sebagai baseline awal, tetapi tidak unggul dibandingkan model lain.