In [12]:
# Install library yang ingin digunakan
!pip install google-play-scraper
!pip install nltk Sastrawi # Library untuk bahasa Indonesia
!pip install scikit-learn

Collecting google-play-scraper
  Downloading google_play_scraper-1.2.7-py3-none-any.whl.metadata (50 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/50.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading google_play_scraper-1.2.7-py3-none-any.whl (28 kB)
Installing collected packages: google-play-scraper
Successfully installed google-play-scraper-1.2.7
Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [18]:
# Import Library
from google_play_scraper import reviews
from google_play_scraper import Sort
import pandas as pd
import re
import string
import nltk
import numpy as np
from sklearn.svm import SVC
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

In [7]:
# Download data yang dibutuhkan
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [8]:
# Scrapping Data
app_id = "com.twitter.android"
result, _ = reviews(
    app_id,
    lang='id',
    country='id',
    count=10000,
    sort=Sort.NEWEST
)
df = pd.DataFrame(result)[['content', 'score']]
df.rename(columns={'content': 'review', 'score': 'sentiment'}, inplace=True)

def label_sentiment(score):
    if score >= 4:
        return "positif"
    elif score == 3:
        return "netral"
    else:
        return "negatif"
df['sentiment'] = df['sentiment'].apply(label_sentiment)

df.to_csv("playstore_reviews.csv", index=False)
print(f"Dataset disimpan sebagai playstore_reviews.csv dengan {len(df)} data.")

print("\nDistribusi Sentimen:")
print(df['sentiment'].value_counts())

Dataset disimpan sebagai playstore_reviews.csv dengan 10000 data.

Distribusi Sentimen:
sentiment
positif    5207
negatif    4265
netral      528
Name: count, dtype: int64


In [9]:
# Load Dataset
df = pd.read_csv("playstore_reviews.csv")

In [10]:
print(df.head())

                                              review sentiment
0              kenapa tdk bisa login akun yaa??🤷🏻‍♀️   negatif
1                           hilang lah semua progres   negatif
2  Aplikasi sangat Baik, banyak Pengetahuan yang ...   positif
3                      sangat bagus dan mudah sekali   positif
4                                              anjay   positif


In [11]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()
stop_words = set(stopwords.words('indonesian'))

# Untuk membersihkan text (tanda baca, mengubah menjadi semua nya menjadi huruf kecil, dll)

def clean_text(text):
    if isinstance(text, str):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = re.sub(r'\s+', ' ', text).strip()
        words = text.split()
        words = [word for word in words if word not in stop_words]
        words = [stemmer.stem(word) for word in words]
        return " ".join(words)
    return ""
df["clean_review"] = df["review"].apply(clean_text)

In [12]:
df.to_csv("playstore_reviews_cleaned.csv", index=False)
print("Dataset disimpan sebagai playstore_reviews_cleaned.csv.")

Dataset disimpan sebagai playstore_reviews_cleaned.csv.


In [13]:
# Load Dataset setelah clearing
df = pd.read_csv("playstore_reviews_cleaned.csv")
print(df[["review", "clean_review", "sentiment"]].head())

                                              review  \
0              kenapa tdk bisa login akun yaa??🤷🏻‍♀️   
1                           hilang lah semua progres   
2  Aplikasi sangat Baik, banyak Pengetahuan yang ...   
3                      sangat bagus dan mudah sekali   
4                                              anjay   

                                        clean_review sentiment  
0                                 tdk login akun yaa   negatif  
1                                     hilang progres   negatif  
2  aplikasi tahu bagi aplikasi ai grokai integras...   positif  
3                                        bagus mudah   positif  
4                                              anjay   positif  


In [14]:
print(df["clean_review"].isna().sum())

339


Insight:
* Terdapat missing value sebanyak 339, maka setelah itu kita harus menghapus missing value tersebut di tahap selanjutnya sebelum masuk ke ekstraksi

In [15]:
# Menghapus Missing Value
df = df.dropna(subset=["clean_review"])
df = df[df["clean_review"].str.strip() != ""]
print(df["clean_review"].isna().sum())

0


In [16]:
df.to_csv("playstore_reviews_clearing_missing_value.csv", index=False)

In [20]:
# Proses Ekstraksi Fitur Dengan TF-IDF
df = pd.read_csv("playstore_reviews_clearing_missing_value.csv")
df["clean_review"] = df["clean_review"].astype(str)

tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["clean_review"])
X_df = pd.DataFrame(X.toarray(), columns=tfidf.get_feature_names_out())
X_df["sentiment"] = df["sentiment"].values

In [21]:
X_df.to_csv("playstore_reviews_tfidf.csv", index=False)
print("Ekstraksi fitur selesai!")
print(X_df.head())

Ekstraksi fitur selesai!
   aangat  abah  abal  abas  abdet  abdete  abdullah  abglokal  abgus  abiez  \
0     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
1     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
2     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
3     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   
4     0.0   0.0   0.0   0.0    0.0     0.0       0.0       0.0    0.0    0.0   

   ...  yudah  yuhuuuu  yup  yuppie  yutub  ywdh   yy  zaman  zelidraw  \
0  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
1  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
2  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
3  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   
4  ...    0.0      0.0  0.0     0.0    0.0   0.0  0.0    0.0       0.0   

   sentiment  
0    negatif  
1    negatif  
2   

In [10]:
df = pd.read_csv("playstore_reviews_tfidf.csv")

In [17]:
df = df[df['sentiment'].isin(['positif', 'negatif'])]
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = le.fit_transform(df['sentiment'])


In [7]:
X = df.drop(columns=['sentiment'])
y = df['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
svm = SVC(kernel='linear', C=1)
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)

In [19]:
print("Accuracy SVM:", accuracy_score(y_test, y_pred) * 100, "%")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy SVM: 87.63676148796499 %

Classification Report:
               precision    recall  f1-score   support

     negatif       0.85      0.88      0.87       846
     positif       0.90      0.87      0.88       982

    accuracy                           0.88      1828
   macro avg       0.88      0.88      0.88      1828
weighted avg       0.88      0.88      0.88      1828



Conclusion:
* Model Yang digunakan untuk analisis sentimen adalah menggunakan SVM dengan tingkat akurasi terkahir berada di 87.63% dan angka tersebut suda melewati dari angka rata-rata, sehingga dapat dipertanggungjawabkan hasil nya.
* Dalam dataset ini seharusnya terdapat 3 kelas label (Netral, Positif dan Negatif), namun pada saat pelatihan model, dikarenakan label negatif yang terlalu sedikit dan kecil, sehingga menganggu hasil dari akhir dari akurasi tersebut, jadi label netral dihapus saat training, dan hanya menggunakan 2 label saja.