# Import Library

In [1]:
import pandas as pd

import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from textblob import TextBlob
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Dense
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [2]:
import nltk
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Dataset Loading

In [3]:
df = pd.read_csv('dataset.csv')
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,71fd9775-8281-4340-89f0-5de265197b2b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Game buruk, system matchmaking game paling bur...",4,586,1.9.64.10601,2025-03-28 09:50:38,,,1.9.64.10601
1,d7c2cb2d-48af-4959-999d-d7fff6d7182b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,asli nih game gak danta!! gua udah 7× main di ...,2,457,1.9.64.10601,2025-03-28 14:52:21,,,1.9.64.10601
2,9bd15e3b-7ae1-4827-af1c-f80b9aea4fa2,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,tiap main rank pasti aja ada player beban/male...,1,4555,1.9.65.10602,2025-03-28 15:42:15,,,1.9.65.10602
3,2cc02e76-efa0-4108-ba9d-e174d7ccc98c,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Matchmaking yang sangat buruk ketika bermain s...,1,417,1.9.64.10601,2025-03-30 01:25:13,,,1.9.64.10601
4,ada60d3f-14b4-4a8d-a3d6-61ae50eebf55,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Pengalaman bermain yang buruk, Game sering fre...",1,1645,1.9.64.10601,2025-03-27 14:23:01,,,1.9.64.10601


In [4]:
df.shape

(121500, 11)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121500 entries, 0 to 121499
Data columns (total 11 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              121500 non-null  object
 1   userName              121500 non-null  object
 2   userImage             121500 non-null  object
 3   content               121500 non-null  object
 4   score                 121500 non-null  int64 
 5   thumbsUpCount         121500 non-null  int64 
 6   reviewCreatedVersion  100580 non-null  object
 7   at                    121500 non-null  object
 8   replyContent          5152 non-null    object
 9   repliedAt             5152 non-null    object
 10  appVersion            100580 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.2+ MB


# Preprocessing

In [6]:
df_cleaned = df.drop(columns=['replyContent', 'repliedAt'])
df_cleaned = df_cleaned.dropna()
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100580 entries, 0 to 121499
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              100580 non-null  object
 1   userName              100580 non-null  object
 2   userImage             100580 non-null  object
 3   content               100580 non-null  object
 4   score                 100580 non-null  int64 
 5   thumbsUpCount         100580 non-null  int64 
 6   reviewCreatedVersion  100580 non-null  object
 7   at                    100580 non-null  object
 8   appVersion            100580 non-null  object
dtypes: int64(2), object(7)
memory usage: 7.7+ MB


In [7]:
df_cleaned = df_cleaned.drop_duplicates()
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
Index: 100580 entries, 0 to 121499
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype 
---  ------                --------------   ----- 
 0   reviewId              100580 non-null  object
 1   userName              100580 non-null  object
 2   userImage             100580 non-null  object
 3   content               100580 non-null  object
 4   score                 100580 non-null  int64 
 5   thumbsUpCount         100580 non-null  int64 
 6   reviewCreatedVersion  100580 non-null  object
 7   at                    100580 non-null  object
 8   appVersion            100580 non-null  object
dtypes: int64(2), object(7)
memory usage: 7.7+ MB


In [8]:
def textCleaning(text):
  text = re.sub(r"#[A-Za-z0-9]", "", text)
  text = re.sub(r"@[A-Za-z0-9]", "", text)
  text = re.sub(r"http\S+", "", text)
  text = re.sub(r"RT[\s]", "", text)
  text = re.sub(r"[^\w\s]", "", text)
  text = re.sub(r"[0-9]", "", text)
  text = text.strip(" ")
  text = text.replace("\n", " ")
  text = text.translate(str.maketrans("", "", string.punctuation))
  text = text.lower() # Text Casefolding
  cleaned_text = word_tokenize(text) # Text tokenize
  return cleaned_text

In [9]:
def textFilter(text):
  stopwordsList = set(stopwords.words('indonesian'))
  stopwords_english = set(stopwords.words('english'))
  stopwordsList.update(stopwords_english)
  stopwordsList.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga","ya","gaa","loh","kah","woi","woii","woy"])
  filtered_words = []
  for word in text:
    if word not in stopwordsList:
      filtered_words.append(word)
  return filtered_words

In [10]:
def changeToSentence(wordsList):
  sen = ' '.join(word for word in wordsList)
  return sen

In [11]:
df_cleaned['text_clean'] = df_cleaned['content'].apply(textCleaning)
df_cleaned['text_stopword'] = df_cleaned['text_clean'].apply(textFilter)
df_cleaned['final_text'] = df_cleaned['text_stopword'].apply(changeToSentence)


In [12]:
df_cleaned.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion,text_clean,text_stopword,final_text
0,71fd9775-8281-4340-89f0-5de265197b2b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Game buruk, system matchmaking game paling bur...",4,586,1.9.64.10601,2025-03-28 09:50:38,1.9.64.10601,"[game, buruk, system, matchmaking, game, palin...","[game, buruk, system, matchmaking, game, buruk...",game buruk system matchmaking game buruk yg pe...
1,d7c2cb2d-48af-4959-999d-d7fff6d7182b,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,asli nih game gak danta!! gua udah 7× main di ...,2,457,1.9.64.10601,2025-03-28 14:52:21,1.9.64.10601,"[asli, nih, game, gak, danta, gua, udah, main,...","[asli, nih, game, danta, gua, udah, main, rank...",asli nih game danta gua udah main rank ngebug ...
2,9bd15e3b-7ae1-4827-af1c-f80b9aea4fa2,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,tiap main rank pasti aja ada player beban/male...,1,4555,1.9.65.10602,2025-03-28 15:42:15,1.9.65.10602,"[tiap, main, rank, pasti, aja, ada, player, be...","[main, rank, aja, player, bebanmales, main, pl...",main rank aja player bebanmales main player pe...
3,2cc02e76-efa0-4108-ba9d-e174d7ccc98c,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Matchmaking yang sangat buruk ketika bermain s...,1,417,1.9.64.10601,2025-03-30 01:25:13,1.9.64.10601,"[matchmaking, yang, sangat, buruk, ketika, ber...","[matchmaking, buruk, bermain, solo, gangguan, ...",matchmaking buruk bermain solo gangguan sinyal...
4,ada60d3f-14b4-4a8d-a3d6-61ae50eebf55,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Pengalaman bermain yang buruk, Game sering fre...",1,1645,1.9.64.10601,2025-03-27 14:23:01,1.9.64.10601,"[pengalaman, bermain, yang, buruk, game, serin...","[pengalaman, bermain, buruk, game, freezing, d...",pengalaman bermain buruk game freezing ditenga...


# Labeling

In [13]:
def sentiment_labeling(teks):
    polarity = TextBlob(teks).sentiment.polarity
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

In [14]:
df_cleaned['polarity'] = df_cleaned['final_text'].apply(sentiment_labeling)

In [15]:
df_cleaned.polarity.value_counts()

Unnamed: 0_level_0,count
polarity,Unnamed: 1_level_1
negative,56552
neutral,25671
positive,18357


# Modeling

## Pelatihan dengan Algoritma Deep Learning

- **Ekstraksi Fitur: TF-IDF**
- **Pembagian Data: 80/20**
- **Pelatihan: Deep Learning**

In [16]:
X = df_cleaned['final_text']
y = df_cleaned['polarity']

In [17]:
lencoder = LabelEncoder()
df_cleaned['polarity_encoded'] = lencoder.fit_transform(df_cleaned['polarity'])
y_encoded = df_cleaned['polarity_encoded']
y_categorical = to_categorical(y_encoded)

In [18]:
tfidf_nn = TfidfVectorizer(max_features=5000)
X_tfidf_nn = tfidf_nn.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf_nn, y_categorical, test_size=0.2, random_state=42)

In [19]:
model_nn = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')
])
model_nn.compile(optimizer='Adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [20]:
hist = model_nn.fit(X_train, y_train, epochs=5)

Epoch 1/5
[1m2515/2515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 16ms/step - accuracy: 0.8418 - loss: 0.3833
Epoch 2/5
[1m2515/2515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 15ms/step - accuracy: 0.9837 - loss: 0.0531
Epoch 3/5
[1m2515/2515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 13ms/step - accuracy: 0.9908 - loss: 0.0287
Epoch 4/5
[1m2515/2515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 16ms/step - accuracy: 0.9954 - loss: 0.0159
Epoch 5/5
[1m2515/2515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 12ms/step - accuracy: 0.9977 - loss: 0.0083


In [21]:
test_results = model_nn.evaluate(X_test, y_test, batch_size=1)

[1m20116/20116[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m55s[0m 3ms/step - accuracy: 0.9768 - loss: 0.1167


In [22]:
accuracy_train_nn = hist.history['accuracy'][-1]
accuracy_test_nn = test_results[1]

In [23]:
print('Akurasi Neural Network (Training)', accuracy_train_nn)
print('Akurasi Neural Network (Testing)', accuracy_test_nn)

Akurasi Neural Network (Training) 0.9973404407501221
Akurasi Neural Network (Testing) 0.9781268835067749


## Skema Pelatihan 1

- **Ekstraksi Fitur: TF-IDF**
- **Pembagian Data: 70/30**
- **Pelatihan: Logistic Regression**

In [24]:
tfidf_lr = TfidfVectorizer(max_features=300)
X_tfidf_lr = tfidf_lr.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf_lr, y, test_size=0.3, random_state=42)

In [25]:
lr = LogisticRegression()
lr.fit(X_train.toarray(), y_train)

In [26]:
y_pred_train_lr = lr.predict(X_train.toarray())
y_pred_test_lr = lr.predict(X_test.toarray())

accuracy_train_lr = accuracy_score(y_pred_train_lr, y_train)
accuracy_test_lr = accuracy_score(y_pred_test_lr, y_test)

print('Akurasi Logistic Regression (Training):', accuracy_train_lr)
print('Akurasi Logistic Regression (Testing):', accuracy_test_lr)

Akurasi Logistic Regression (Training): 0.9658409794619777
Akurasi Logistic Regression (Testing): 0.9645058659773315


## Skema Pelatihan 2

- **Ekstraksi Fitur: BoW**
- **Pembagian Data: 80/20**
- **Pelatihan: SVM**

In [27]:
vectorizer = CountVectorizer(max_features=300, min_df=17, max_df=0.8)
X_bow = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=42)

In [28]:
svm = LinearSVC()
svm.fit(X_train.toarray(), y_train)

In [29]:
y_pred_train_svm = svm.predict(X_train.toarray())
y_pred_test_svm = svm.predict(X_test.toarray())

accuracy_train_svm = accuracy_score(y_train, y_pred_train_svm)
accuracy_test_svm = accuracy_score(y_test, y_pred_test_svm)

print("Akurasi SVM (Training):", accuracy_train_svm)
print("Akurasi SVM (Testing):", accuracy_test_svm)

Akurasi SVM (Training): 0.9675382779876716
Akurasi SVM (Testing): 0.9649532710280374


## Skema Pelatihan 3

- **Ekstraksi Fitur: TF-IDF**
- **Pembagian Data: 80/20**
- **Pelatihan: Decision Tree**

In [30]:
tfidf_dt = TfidfVectorizer(max_features=5000)
X_tfidf_dt = tfidf_dt.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_tfidf_dt, y, test_size=0.2, random_state=42)

In [31]:
decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train.toarray(), y_train)

In [32]:
y_pred_train_dt = decision_tree.predict(X_train.toarray())
y_pred_test_dt = decision_tree.predict(X_test.toarray())

accuracy_train_dt = accuracy_score(y_pred_train_dt, y_train)
accuracy_test_dt = accuracy_score(y_pred_test_dt, y_test)

print("Akurasi Decision Tree (Training):", accuracy_train_dt)
print("Akurasi Decision Tree (Testing):", accuracy_test_dt)

Akurasi Decision Tree (Training): 1.0
Akurasi Decision Tree (Testing): 0.9818552396102604


# Hasil Evaluasi

In [33]:
accuracy_df = pd.DataFrame({
    'Model': ['Neural Network', 'Logistic Regression', 'SVM', 'Decision Tree'],
    'Train Accuracy': [accuracy_train_nn, accuracy_train_lr, accuracy_train_svm, accuracy_train_dt],
    'Test Accuracy': [accuracy_test_nn, accuracy_test_lr, accuracy_test_svm, accuracy_test_dt]
})

In [34]:
accuracy_df

Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Neural Network,0.99734,0.978127
1,Logistic Regression,0.965841,0.964506
2,SVM,0.967538,0.964953
3,Decision Tree,1.0,0.981855


In [35]:
accuracy_df_sorted = accuracy_df.sort_values(by='Test Accuracy', ascending=False)
accuracy_df_sorted

Unnamed: 0,Model,Train Accuracy,Test Accuracy
3,Decision Tree,1.0,0.981855
0,Neural Network,0.99734,0.978127
2,SVM,0.967538,0.964953
1,Logistic Regression,0.965841,0.964506


# Inference

In [36]:
def predict_model(text, feature_extractor, model, is_nn=False):
  X_new = feature_extractor.transform([text])
  prediction = model.predict(X_new.toarray() if is_nn else X_new)

  if is_nn:
    prediction = prediction.argmax(axis=1)
    prediction = lencoder.inverse_transform(prediction)

  return prediction[0]

In [37]:
kalimat_baru = "Game jelek, koneksi buruk"

kalimat_baru_cleaned = textCleaning(kalimat_baru)
kalimat_baru_filtered = textFilter(kalimat_baru_cleaned)
kalimat_baru_final = changeToSentence(kalimat_baru_filtered)

prediction_nn = predict_model(kalimat_baru_final, tfidf_nn, model_nn, is_nn=True)
prediction_lr = predict_model(kalimat_baru_final, tfidf_lr, lr)
prediction_svm = predict_model(kalimat_baru_final, vectorizer, svm)
prediction_dt = predict_model(kalimat_baru_final, tfidf_dt, decision_tree)

print("Kalimat: ", kalimat_baru)
print(f'Prediksi Neural Network: {prediction_nn}')
print(f'Prediksi Logistic Regression: {prediction_lr}')
print(f'Prediksi SVM: {prediction_svm}')
print(f'Prediksi Decision Tree: {prediction_dt}')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step
Kalimat:  Game jelek, koneksi buruk
Prediksi Neural Network: negative
Prediksi Logistic Regression: negative
Prediksi SVM: negative
Prediksi Decision Tree: negative
