# 1. Menginstall dan import Library yang dibutuhkan untuk tahap model

In [1]:
!pip install nltk Sastrawi # Library untuk bahasa Indonesia
!pip install scikit-learn

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [2]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.svm import SVC
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
import nltk
from nltk.corpus import stopwords
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, GlobalMaxPooling1D, Concatenate, Dropout, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# 2. Load Dataset dan EDA

In [33]:
# Load Dataset
df = pd.read_csv("playstore_reviews.csv")
print(df.head())

                                              review  sentiment
0                                        sangat kren          5
1                    sangat membantu untuk informasi          5
2  tiba tiba ditangguhkan anjir padahal masih aku...          1
3  cm liat liat medsos aja tau tau di tangguhkan ...          1
4                                               good          5


In [34]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     10000 non-null  object
 1   sentiment  10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 156.4+ KB
None


# 3. Pre-processing (Membersihkan Data)

In [35]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

# Untuk membersihkan text (tanda baca, mengubah menjadi semua nya menjadi huruf kecil, dll)

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        words = text.split()
        words = [word for word in words if word not in stop_words]
        words = [stemmer.stem(word) for word in words]
        return " ".join(words)
    return ""
df["clean_review"] = df["review"].apply(clean_text)

In [36]:
print(df["clean_review"].isna().sum())
print((df["clean_review"].str.strip() == "").sum())

0
347


In [37]:
# Menghapus Missing Value
df = df.dropna(subset=["clean_review"])
df = df[df["clean_review"].str.strip() != ""]
print(df["clean_review"].isna().sum())

0


In [38]:
# Labeling Data Sentimen
def label_sentiment(score):
    if score >= 4:
        return "positif"
    elif score == 3:
        return "netral"
    else:
        return "negatif"
df['sentiment'] = df['sentiment'].apply(label_sentiment)

Insight:
* Labeling Data dilakukan pada preprocessing data, karena akan digunakan untuk proses ekstraksi TF-IDF dan untuk TextCNN tanpa proses ekstraksi TF-IDF terlebih dahulu

In [39]:
df.to_csv("playstore_reviews_cleaned.csv", index=False)
print("Dataset disimpan sebagai playstore_reviews_cleaned.csv.")

Dataset disimpan sebagai playstore_reviews_cleaned.csv.


Insight:
* Setelah dibersihkan di save dulu dataset nya agar lebih mudah dalam debugging ketika error

# 4. Proses TF-IDF (Ekstraksi Data)

In [43]:
df = pd.read_csv("playstore_reviews_cleaned.csv")

In [44]:
# Proses Ekstraksi Fitur Dengan TF-IDF
df["clean_review"] = df["clean_review"].astype(str)

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["clean_review"])
X_df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
X_df["sentiment"] = df["sentiment"].values

In [45]:
X_df.to_csv("playstore_reviews_tfidf.csv", index=False)
print("Ekstraksi fitur selesai!")
print(X_df.head())

Ekstraksi fitur selesai!
   abah  abal  abas  abdet  abdete  abdullah  abglokal  abgus  abiez  abis  \
0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   0.0   
1   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   0.0   
2   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   0.0   
3   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   0.0   
4   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   0.0   

   ...  ytta  yuda  yudah  yuhuuuu  yup  yuppie  yutub  ywdh  zaman  sentiment  
0  ...   0.0   0.0    0.0      0.0  0.0     0.0    0.0   0.0    0.0    positif  
1  ...   0.0   0.0    0.0      0.0  0.0     0.0    0.0   0.0    0.0    positif  
2  ...   0.0   0.0    0.0      0.0  0.0     0.0    0.0   0.0    0.0    negatif  
3  ...   0.0   0.0    0.0      0.0  0.0     0.0    0.0   0.0    0.0    negatif  
4  ...   0.0   0.0    0.0      0.0  0.0     0.0    0.0   0.0    0.0    positif  

[5 rows x 5001 colu

Insight:
* Hasil ekstraksi TF-IDF ini menunjukkan representasi numerik dari kata-kata paling relevan dalam ulasan pengguna, yang siap digunakan untuk pelatihan model analisi sentimen.

# 5. Pembangunan Model dengan SVM

****

**Machine Learning Traditional**

In [46]:
df = pd.read_csv("playstore_reviews_tfidf.csv")

Insight:
* Memanggil dataset yang sudah di save setelah proses TF-IDF

In [47]:
df = df[df['sentiment'].isin(['positif', 'negatif'])]
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = le.fit_transform(df['sentiment'])


**Deep Learning**

In [53]:
df_deep = pd.read_csv("playstore_reviews_cleaned.csv")

In [59]:
# Hpyermeter dan Tokenizer
MAX_NB_WORDS = 5000
MAX_SEQUENCE_LENGTH = 150
EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, oov_token="<OOV>")
tokenizer.fit_on_texts(df_deep["clean_review"])
X_deep = tokenizer.texts_to_sequences(df_deep["clean_review"])
X_deep = pad_sequences(X_deep, maxlen=MAX_SEQUENCE_LENGTH, padding="post")

In [60]:
df_deep = df_deep[df_deep['sentiment'].isin(['positif', 'negatif'])]
le = LabelEncoder()
y_deep = le.fit_transform(df_deep["sentiment"])

Insight:
* Digunakan untuk algoritma DeepLearning (TextCNN)

# a. Splitting Data

**SVM + RF**

In [51]:
X = df.drop(columns=['sentiment'])
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

**TextCNN**

In [61]:
X_train_deep, X_test_deep, y_train_deep, y_test_deep = train_test_split(
    X_deep, y_deep, test_size=0.2, random_state=42, stratify=y_deep
)

# b. Training Model

In [64]:
# TextCNN
input_layer = Input(shape=(MAX_SEQUENCE_LENGTH,))
embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM)(input_layer)

conv1 = Conv1D(256, 3, activation='relu')(embedding_layer)
conv2 = Conv1D(256, 4, activation='relu')(embedding_layer)
conv3 = Conv1D(256, 5, activation='relu')(embedding_layer)

pool1 = GlobalMaxPooling1D()(conv1)
pool2 = GlobalMaxPooling1D()(conv2)
pool3 = GlobalMaxPooling1D()(conv3)

concat = Concatenate()([pool1, pool2, pool3])
dropout1 = Dropout(0.5)(concat)
dense = Dense(128, activation='relu')(dropout1)
dropout2 = Dropout(0.3)(dense)
output_layer = Dense(2, activation='softmax')(dropout2)

textcnn_model = Model(inputs=input_layer, outputs=output_layer)

textcnn_model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam(learning_rate=1e-3),
    metrics=['accuracy']
)
textcnn_model.summary()
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=1e-5)

textcnn_model.fit(
    X_train_deep, y_train_deep,
    validation_data=(X_test_deep, y_test_deep),
    batch_size=32,
    epochs=20,
    callbacks=[early_stop, reduce_lr],
    verbose=1
)

Epoch 1/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 214ms/step - accuracy: 0.7674 - loss: 0.4889 - val_accuracy: 0.8933 - val_loss: 0.2789 - learning_rate: 0.0010
Epoch 2/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 203ms/step - accuracy: 0.9189 - loss: 0.2324 - val_accuracy: 0.8922 - val_loss: 0.2826 - learning_rate: 0.0010
Epoch 3/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 206ms/step - accuracy: 0.9473 - loss: 0.1625 - val_accuracy: 0.8829 - val_loss: 0.3206 - learning_rate: 0.0010
Epoch 4/20
[1m229/229[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 211ms/step - accuracy: 0.9670 - loss: 0.1018 - val_accuracy: 0.8856 - val_loss: 0.3743 - learning_rate: 5.0000e-04


<keras.src.callbacks.history.History at 0x795a08893b90>

In [65]:
# SVM
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [66]:
# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# c. Evaluasi Data

In [67]:
print("Accuracy SVM:", accuracy_score(y_test, y_pred) * 100, "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy SVM: 89.49096880131363 %

Classification Report:
               precision    recall  f1-score   support

     negatif       0.88      0.90      0.89       848
     positif       0.91      0.89      0.90       979

    accuracy                           0.89      1827
   macro avg       0.89      0.89      0.89      1827
weighted avg       0.90      0.89      0.89      1827



In [68]:
print("Accuracy Random Forest:", accuracy_score(y_test, y_pred_rf) * 100, "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf, target_names=le.classes_))

Accuracy Random Forest: 87.24685276409414 %

Classification Report:
               precision    recall  f1-score   support

     negatif       0.85      0.89      0.87       848
     positif       0.90      0.86      0.88       979

    accuracy                           0.87      1827
   macro avg       0.87      0.87      0.87      1827
weighted avg       0.87      0.87      0.87      1827



In [69]:
y_pred_deep = np.argmax(textcnn_model.predict(X_test_deep), axis=1)
acc = accuracy_score(y_test_deep, y_pred_deep)
print(f"\nAccuracy TextCNN: {acc * 100:.2f}%")
print("\nClassification Report:\n", classification_report(y_test_deep, y_pred_rf, target_names=le.classes_))

[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 49ms/step

Accuracy TextCNN: 89.33%

Classification Report:
               precision    recall  f1-score   support

     negatif       0.85      0.89      0.87       848
     positif       0.90      0.86      0.88       979

    accuracy                           0.87      1827
   macro avg       0.87      0.87      0.87      1827
weighted avg       0.87      0.87      0.87      1827



Kesimpulan:
* Berdasarkan hasil evaluasi model pada ketiga algoritma SVM, Random Forest, dan TextCNN dapat disimpulkan bahwa TextCNN merupakan model dengan performa terbaik untuk tugas klasifikasi sentimen pada dataset ulasan pada aplikasi x atau twitter. SVM berhasil mencapai akurasi 89.49%, mengungguli TextCNN dengan akurasi 89.33% dan Random Forest dengan akurasi 87.24%.
* Dengan demikian, hasil yang didapatkan oleh SVM dan TextCNN sangat tipis sekali, hanya terdapat beda dalam 0.13% saja. Maka kedua algoritma ini bisa disimpulkan sangat cocok untuk dataset ini, dibanding Random Forest.
* Dalam dataset ini seharusnya terdapat 3 kelas label (Netral, Positif dan Negatif), namun pada saat pelatihan model, dikarenakan label negatif yang terlalu sedikit dan kecil, sehingga menganggu hasil dari akhir dari akurasi tersebut, jadi label netral dihapus saat training, dan hanya menggunakan 2 label saja.