# 1. Menginstall dan import Library yang dibutuhkan untuk tahap model

In [2]:
!pip install nltk Sastrawi # Library untuk bahasa Indonesia
!pip install scikit-learn

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [66]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.svm import SVC
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [6]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 2. Load Dataset dan EDA

In [89]:
# Load Dataset
df = pd.read_csv("playstore_reviews.csv")
print(df.head())

                                              review sentiment
0              kenapa tdk bisa login akun yaa??🤷🏻‍♀️   negatif
1                           hilang lah semua progres   negatif
2  Aplikasi sangat Baik, banyak Pengetahuan yang ...   positif
3                      sangat bagus dan mudah sekali   positif
4                                              anjay   positif


In [90]:
print("\nDistribusi Sentimen:")
print(df['sentiment'].value_counts())


Distribusi Sentimen:
sentiment
positif    5207
negatif    4265
netral      528
Name: count, dtype: int64


# 3. Pre-processing (Membersihkan Data)

In [9]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

# Untuk membersihkan text (tanda baca, mengubah menjadi semua nya menjadi huruf kecil, dll)

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        words = text.split()
        words = [word for word in words if word not in stop_words]
        words = [stemmer.stem(word) for word in words]
        return " ".join(words)
    return ""
df["clean_review"] = df["review"].apply(clean_text)

In [11]:
print(df["clean_review"].isna().sum())
print((df["clean_review"].str.strip() == "").sum())

0
343


In [12]:
# Menghapus Missing Value
df = df.dropna(subset=["clean_review"])
df = df[df["clean_review"].str.strip() != ""]
print(df["clean_review"].isna().sum())

0


In [14]:
df.to_csv("playstore_reviews_cleaned.csv", index=False)
print("Dataset disimpan sebagai playstore_reviews_cleaned.csv.")

Dataset disimpan sebagai playstore_reviews_cleaned.csv.


Insight:
* Setelah dibersihkan di save dulu dataset nya agar lebih mudah dalam debugging ketika error

# 4. Proses TF-IDF (Ekstraksi dan Label Data)

In [23]:
# Proses Ekstraksi Fitur Dengan TF-IDF
df = pd.read_csv("playstore_reviews_cleaned.csv")
df["clean_review"] = df["clean_review"].astype(str)

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["clean_review"])
X_df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
X_df["sentiment"] = df["sentiment"].values

In [24]:
X_df.to_csv("playstore_reviews_tfidf.csv", index=False)
print("Ekstraksi fitur selesai!")
print(X_df.head())

Ekstraksi fitur selesai!
   aangat  abah  abal  abas  abdet  abdete  abdullah  abglokal  abgus  abiez  \
0     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
1     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
2     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
3     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
4     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   

   ...  yudah  yuhuuuu  yup  yuppie  yutub  ywdh   yy  zaman  zelidraw  \
0  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
1  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
2  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
3  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
4  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   

   sentiment  
0    negatif  
1    negatif  
2   

Insight:
* Hasil ekstraksi TF-IDF ini menunjukkan representasi numerik dari kata-kata paling relevan dalam ulasan pengguna, yang siap digunakan untuk pelatihan model analisi sentimen.

# 5. Pembangunan Model dengan SVM

**Machine Learning Traditional**

In [29]:
df = pd.read_csv("playstore_reviews_tfidf.csv")

Insight:
* Memanggil dataset yang sudah di save setelah proses TF-IDF

In [52]:
df = df[df['sentiment'].isin(['positif', 'negatif'])]
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = le.fit_transform(df['sentiment'])


**Deep Learning**

In [74]:
df_deep = pd.read_csv("playstore_reviews_cleaned.csv")

In [80]:
# Hpyermeter dan Tokenizer
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 150
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df_deep["clean_review"])
X_deep = tokenizer.texts_to_sequences(df_deep["clean_review"])
X_deep = pad_sequences(X_deep, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

In [81]:
df_deep = df_deep[df_deep['sentiment'].isin(['positif', 'negatif'])]
le = LabelEncoder()
y_deep = le.fit_transform(df_deep["sentiment"])

Insight:
* Digunakan untuk algoritma DeepLearning (TextCNN)

# a. Splitting Data

**SVM + RF**

In [70]:
X = df.drop(columns=['sentiment'])
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

**TextCNN**

In [82]:
X_train_deep, X_test_deep, y_train_deep, y_test_deep = train_test_split(
    X_deep, y_deep, test_size=0.2, random_state=42, stratify=y_deep
)

# b. Training Model

* Pembangunan Model untuk TextCNN

In [71]:
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM)(input_layer)

conv1 = Conv1D(256, 3, activation='relu')(embedding_layer)
conv2 = Conv1D(256, 4, activation='relu')(embedding_layer)
conv3 = Conv1D(256, 5, activation='relu')(embedding_layer)

pool1 = GlobalMaxPooling1D()(conv1)
pool2 = GlobalMaxPooling1D()(conv2)
pool3 = GlobalMaxPooling1D()(conv3)

concat = Concatenate()([pool1, pool2, pool3])
dropout1 = Dropout(0.5)(concat)
dense = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense)
output_layer = Dense(2, activation='softmax')(dropout2)

textcnn_model = Model(inputs=input_layer, outputs=output_layer)

textcnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)
textcnn_model.summary()

In [84]:
#Text CNN
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5)

textcnn_model.fit(
    X_train_deep, y_train_deep,
    validation_data=(X_test_deep, y_test_deep),
    batch_size=32,
    epochs=20,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Epoch 1/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 214ms/step - accuracy: 0.9209 - loss: 0.2221 - val_accuracy: 0.8802 - val_loss: 0.3049 - learning_rate: 5.0000e-04
Epoch 2/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 215ms/step - accuracy: 0.9513 - loss: 0.1568 - val_accuracy: 0.8714 - val_loss: 0.3518 - learning_rate: 5.0000e-04
Epoch 3/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 220ms/step - accuracy: 0.9635 - loss: 0.1132 - val_accuracy: 0.8660 - val_loss: 0.4058 - learning_rate: 5.0000e-04
Epoch 4/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 222ms/step - accuracy: 0.9731 - loss: 0.0943 - val_accuracy: 0.8693 - val_loss: 0.4343 - learning_rate: 2.5000e-04


<keras.src.callbacks.history.History at 0x79e49c5c9f50>

In [32]:
# SVM
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [38]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# c. Evaluasi Data

In [33]:
print("Accuracy SVM:", accuracy_score(y_test, y_pred) * 100, "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy SVM: 87.63676148796499 %

Classification Report:
               precision    recall  f1-score   support

     negatif       0.85      0.88      0.87       846
     positif       0.90      0.87      0.88       982

    accuracy                           0.88      1828
   macro avg       0.88      0.88      0.88      1828
weighted avg       0.88      0.88      0.88      1828



In [39]:
print("Accuracy Random Forest:", accuracy_score(y_test, y_pred_rf) * 100, "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf, target_names=le.classes_))

Accuracy Random Forest: 85.77680525164114 %

Classification Report:
               precision    recall  f1-score   support

     negatif       0.83      0.87      0.85       846
     positif       0.88      0.85      0.87       982

    accuracy                           0.86      1828
   macro avg       0.86      0.86      0.86      1828
weighted avg       0.86      0.86      0.86      1828



In [87]:
y_pred_deep = np.argmax(textcnn_model.predict(X_test_deep), axis=1)
acc = accuracy_score(y_test_deep, y_pred_deep)
print(f"\nAccuracy TextCNN: {acc * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test_deep, y_pred_rf, target_names=le.classes_))

[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 88ms/step

Accuracy TextCNN: 88.02%

Classification Report:
               precision    recall  f1-score   support

     negatif       0.83      0.87      0.85       846
     positif       0.88      0.85      0.87       982

    accuracy                           0.86      1828
   macro avg       0.86      0.86      0.86      1828
weighted avg       0.86      0.86      0.86      1828



Kesimpulan:
* Berdasarkan hasil evaluasi model pada ketiga algoritma SVM, Random Forest, dan TextCNN dapat disimpulkan bahwa TextCNN merupakan model dengan performa terbaik untuk tugas klasifikasi sentimen pada dataset ulasan pada aplikasi x atau twitter. TextCNN berhasil mencapai akurasi 88.02%, mengungguli SVM dengan akurasi 87.63% dan Random Forest dengan akurasi 85.77%.
* Selain itu, TextCNN juga menunjukkan skor precision, recall, dan f1-score yang seimbang di kedua kelas (positif dan negatif), yang menandakan kemampuannya dalam menangkap pola teks secara lebih kompleks melalui pendekatan deep learning.
* Dengan demikian, jika mempertimbangkan akurasi dan stabilitas performa pada kedua label sentimen, TextCNN menjadi pilihan paling ideal di antara model yang diuji untuk digunakan dalam klasifikasi sentimen pada data ini.
* Dalam dataset ini seharusnya terdapat 3 kelas label (Netral, Positif dan Negatif), namun pada saat pelatihan model, dikarenakan label negatif yang terlalu sedikit dan kecil, sehingga menganggu hasil dari akhir dari akurasi tersebut, jadi label netral dihapus saat training, dan hanya menggunakan 2 label saja.