*Library*

In [39]:
!pip install nltk scikit-learn
!pip install gensim
!pip install tensorflow



In [32]:
# Library untuk Preprocessing dan Deep Learning
import re
import string
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('indonesian'))
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
from google.colab import files

# Mengunggah file CSV ke Colab
uploaded = files.upload()

# Membaca file CSV yang diunggah
df = pd.read_csv(next(iter(uploaded)))
df.head()


Saving shoppe_reviews.csv to shoppe_reviews.csv


Unnamed: 0,review,rating
0,amanah,5
1,good job,5
2,seandainya di tambahin fitur tema gelap lebih ...,5
3,saya mengucapkan banyak terimakasih atas penge...,5
4,Baguss app nya,5


# Labeling

In [34]:
def label_sentiment(rating):
    if rating >= 4:
        return 2  # Positif
    elif rating == 3:
        return 1  # Netral
    else:
        return 0  # Negatif

df['label'] = df['rating'].apply(label_sentiment)

# Processing

In [35]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\d+', '', text)
    text = ' '.join([word for word in text.split() if word not in stop_words])
    return text

df['clean_review'] = df['review'].apply(clean_text)

In [36]:
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['clean_review']).toarray()
y = to_categorical(df['label'])

# Skema Pelatihan 1 (TF-IDF, 80/20 split)

In [37]:
# Pembagian data (80/20)
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.2, random_state=42)

In [38]:
# Model Deep Learning
model1 = Sequential([
    Dense(128, activation='relu', input_shape=(X.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

model1.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model1.fit(X_train1, y_train1, epochs=5, batch_size=128, validation_split=0.1, verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - accuracy: 0.7169 - loss: 0.9277 - val_accuracy: 0.8438 - val_loss: 0.4243
Epoch 2/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 16ms/step - accuracy: 0.8403 - loss: 0.3897 - val_accuracy: 0.8875 - val_loss: 0.2934
Epoch 3/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.9055 - loss: 0.2813 - val_accuracy: 0.9087 - val_loss: 0.2633
Epoch 4/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9281 - loss: 0.2173 - val_accuracy: 0.9150 - val_loss: 0.2613
Epoch 5/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.9377 - loss: 0.1828 - val_accuracy: 0.9137 - val_loss: 0.2681


<keras.src.callbacks.history.History at 0x7dba532c7790>

In [39]:
# Evaluasi model
y_pred1 = np.argmax(model1.predict(X_test1), axis=1)
y_true1 = np.argmax(y_test1, axis=1)
print("Skema 1 - Akurasi:", accuracy_score(y_true1, y_pred1) * 100)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
Skema 1 - Akurasi: 89.8


# Skema 2: Deep Learning (Word2Vec + 80/20 Split)

In [40]:
# Ekstraksi fitur dengan Word2Vec
sentences = [review.split() for review in df['clean_review']]
model_w2v = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

In [42]:
def get_word2vec_features(review):
    words = review.split()
    vec = np.zeros(100)
    count = 0
    for word in words:
        if word in model_w2v.wv:
            vec += model_w2v.wv[word]
            count += 1
    if count > 0:
        vec /= count
    return vec

X_w2v = np.array([get_word2vec_features(review) for review in df['clean_review']])
y = to_categorical(df['label'])

In [43]:
# Pembagian data (80/20)
X_train2, X_test2, y_train2, y_test2 = train_test_split(X_w2v, y, test_size=0.2, random_state=42)

In [44]:
# Model Deep Learning
model2 = Sequential([
    Dense(128, activation='relu', input_shape=(X_w2v.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

model2.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model2.fit(X_train2, y_train2, epochs=5, batch_size=128, validation_split=0.1, verbose=1)

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.7971 - loss: 0.7183 - val_accuracy: 0.8438 - val_loss: 0.5751
Epoch 2/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8343 - loss: 0.5710 - val_accuracy: 0.8438 - val_loss: 0.5000
Epoch 3/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8413 - loss: 0.5074 - val_accuracy: 0.8438 - val_loss: 0.4892
Epoch 4/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8372 - loss: 0.5137 - val_accuracy: 0.8438 - val_loss: 0.4844
Epoch 5/5
[1m57/57[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8293 - loss: 0.5232 - val_accuracy: 0.8438 - val_loss: 0.4821


<keras.src.callbacks.history.History at 0x7dba533bcbd0>

In [45]:
# Evaluasi model
y_pred2 = np.argmax(model2.predict(X_test2), axis=1)
y_true2 = np.argmax(y_test2, axis=1)
print("Skema 2 - Akurasi:", accuracy_score(y_true2, y_pred2) * 100)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Skema 2 - Akurasi: 82.75


# Skema 3: Deep Learning (CountVectorizer + 70/30 Split)

In [46]:
# Ekstraksi fitur dengan CountVectorizer
count_vectorizer = CountVectorizer(max_features=5000)
X_count = count_vectorizer.fit_transform(df['clean_review']).toarray()
y = to_categorical(df['label'])

In [47]:
# Pembagian data (70/30)
X_train3, X_test3, y_train3, y_test3 = train_test_split(X_count, y, test_size=0.3, random_state=42)

In [48]:
# Model Deep Learning
model3 = Sequential([
    Dense(128, activation='relu', input_shape=(X_count.shape[1],)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])

model3.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model3.fit(X_train3, y_train3, epochs=5, batch_size=128, validation_split=0.1, verbose=1)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 24ms/step - accuracy: 0.7510 - loss: 0.9026 - val_accuracy: 0.8586 - val_loss: 0.3671
Epoch 2/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.8789 - loss: 0.3312 - val_accuracy: 0.9057 - val_loss: 0.2668
Epoch 3/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step - accuracy: 0.9154 - loss: 0.2509 - val_accuracy: 0.9171 - val_loss: 0.2520
Epoch 4/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9316 - loss: 0.2065 - val_accuracy: 0.9071 - val_loss: 0.2501
Epoch 5/5
[1m50/50[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 18ms/step - accuracy: 0.9499 - loss: 0.1593 - val_accuracy: 0.9071 - val_loss: 0.2626


<keras.src.callbacks.history.History at 0x7dba18469050>

In [49]:
# Evaluasi model
y_pred3 = np.argmax(model3.predict(X_test3), axis=1)
y_true3 = np.argmax(y_test3, axis=1)
print("Skema 3 - Akurasi:", accuracy_score(y_true3, y_pred3) * 100)

[1m94/94[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
Skema 3 - Akurasi: 89.76666666666667


In [51]:
# Skema 1
print("Classification Report Skema 1:\n", classification_report(y_true1, y_pred1))
print("Confusion Matrix Skema 1:\n", confusion_matrix(y_true1, y_pred1))

# Skema 2
print("Classification Report Skema 2:\n", classification_report(y_true2, y_pred2))
print("Confusion Matrix Skema 2:\n", confusion_matrix(y_true2, y_pred2))

# Skema 3
print("Classification Report Skema 3:\n", classification_report(y_true3, y_pred3))
print("Confusion Matrix Skema 3:\n", confusion_matrix(y_true3, y_pred3))

Classification Report Skema 1:
               precision    recall  f1-score   support

           0       0.67      0.73      0.70       278
           1       0.00      0.00      0.00        67
           2       0.94      0.96      0.95      1655

    accuracy                           0.90      2000
   macro avg       0.54      0.57      0.55      2000
weighted avg       0.87      0.90      0.88      2000

Confusion Matrix Skema 1:
 [[ 204    0   74]
 [  36    0   31]
 [  63    0 1592]]
Classification Report Skema 2:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       278
           1       0.00      0.00      0.00        67
           2       0.83      1.00      0.91      1655

    accuracy                           0.83      2000
   macro avg       0.28      0.33      0.30      2000
weighted avg       0.68      0.83      0.75      2000

Confusion Matrix Skema 2:
 [[   0    0  278]
 [   0    0   67]
 [   0    0 1655]]
Classifica

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
