# Proyek Pertama: Analisis Sentimen

- Nama: Nicolas Debrito
- Email: nicolas.debrito66@gmail.com
- ID Dicoding: reezzy

## Install Library

In [42]:
!pip install sastrawi
!pip install google_play_scraper



## Import Library

In [43]:
import re
import string
import csv
import requests
from io import StringIO
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from google_play_scraper import app, reviews, Sort, reviews_all
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences


nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nicol\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Scraping Data

In [44]:
scrapreview = reviews_all(
    'com.supercell.clashofclans',
    lang='id',
    country='id',
    sort=Sort.MOST_RELEVANT,
    count=10000
)

In [45]:
df = pd.DataFrame(scrapreview)
df.head()

Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,appVersion
0,6cedfaca-247b-4209-a285-39d04ab59e63,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Clash of clans sekarang sering gangguan atau l...,2,1980,16.654.16,2024-12-14 19:45:20,,,16.654.16
1,7d669783-6878-4bda-a5be-1381b11ec532,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Update bug update terbaru : 1.utk pelatihan pa...,3,6,17.126.6,2025-03-26 19:16:26,,,17.126.6
2,2c5f4c06-e407-4fbf-a1a2-b22c42fc2a6f,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,"Yth SuperCell, Coba buat suatu gebrakan yang b...",4,7,17.126.6,2025-03-27 06:47:16,,,17.126.6
3,f846cef1-1211-41e2-97f8-87012065b2ac,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,aku kurangin bintang nya satu jadi bintang emp...,4,3,17.126.6,2025-03-28 17:41:24,,,17.126.6
4,0098ea38-7a6c-46cf-a33b-e2ad29744b90,Pengguna Google,https://play-lh.googleusercontent.com/EGemoI2N...,Game nya makin lama makin aneh karena: 1. Trof...,1,147,17.126.6,2025-03-27 15:58:38,,,17.126.6


In [46]:
df.shape

(49500, 11)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49500 entries, 0 to 49499
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   reviewId              49500 non-null  object        
 1   userName              49500 non-null  object        
 2   userImage             49500 non-null  object        
 3   content               49500 non-null  object        
 4   score                 49500 non-null  int64         
 5   thumbsUpCount         49500 non-null  int64         
 6   reviewCreatedVersion  38637 non-null  object        
 7   at                    49500 non-null  datetime64[ns]
 8   replyContent          0 non-null      object        
 9   repliedAt             0 non-null      object        
 10  appVersion            38637 non-null  object        
dtypes: datetime64[ns](1), int64(2), object(8)
memory usage: 4.2+ MB


In [48]:
df = df.drop_duplicates()

In [49]:
jumlah_ulasan, jumlah_kolom = df.shape
print("Jumlah Ulasan", jumlah_ulasan)
print("Jumlah Kolom", jumlah_kolom)

Jumlah Ulasan 49500
Jumlah Kolom 11


In [50]:
clean_df = df[['content']]
clean_df = clean_df.rename(columns={"content": "review"})
clean_df = clean_df.head(12000)
clean_df

Unnamed: 0,review
0,Clash of clans sekarang sering gangguan atau l...
1,Update bug update terbaru : 1.utk pelatihan pa...
2,"Yth SuperCell, Coba buat suatu gebrakan yang b..."
3,aku kurangin bintang nya satu jadi bintang emp...
4,Game nya makin lama makin aneh karena: 1. Trof...
...,...
11995,Maaf supercell pembaruannya kurang bagus.. Apa...
11996,"update gamenya terlalu sering, Yang bagus cuma..."
11997,"Please Kembalikan barak !, Karena itu adalah s..."
11998,"Untuk game COC ini seru, menantang, dan bertah..."


## Preprocessing Data

In [51]:
def cleaningText(text):
    emoticon_pattern = re.compile(
        "["u"\U0001F600-\U0001F64F"
        u"\U0001F300-\U0001F5FF"
        u"\U0001F680-\U0001F6FF"
        u"\U0001F1E0-\U0001F1FF"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags=re.UNICODE)

    text = emoticon_pattern.sub(r'', text)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)  # Menghapus mention (@username)
    text = re.sub(r'#[A-Za-z0-9]+', '', text)  # Menghapus hashtag
    text = re.sub(r'RT[\s]', '', text)  # Menghapus RT (Retweet)
    text = re.sub(r"http\S+", '', text)  # Menghapus URL
    text = re.sub(r'[0-9]+', '', text)  # Menghapus angka
    text = re.sub(r'[^\w\s]', '', text)  # Menghapus karakter non-alfanumerik

    text = text.replace('\n', ' ')  # Menghapus newline
    text = text.translate(str.maketrans('', '', string.punctuation))  # Menghapus tanda baca
    text = text.strip()  # Menghapus spasi ekstra di awal & akhir

    return text

def casefoldingText(text):
    return text.lower()

slangwords = {
    "@": "di", "abis": "habis", "wtb": "beli", "masi": "masih",
    "wts": "jual", "wtt": "tukar", "bgt": "banget", "maks": "maksimal",
    "gg": "hebat", "noob": "pemula", "op": "terlalu kuat", "ez": "mudah"
}

def fix_slangwords(text):
    words = text.lower().split()  # Konversi ke lowercase lebih awal
    fixed_words = [slangwords.get(word, word) for word in words]
    return ' '.join(fixed_words)

def tokenizingText(text):
    return word_tokenize(text)  # Pastikan nltk.download('punkt') telah dilakukan

def filteringText(text):
    factory = StopWordRemoverFactory()
    listStopwords = set(factory.get_stop_words())  # Stopword bahasa Indonesia
    listStopwords1 = set(stopwords.words('english'))  # Stopword bahasa Inggris

    # Menambahkan stopword tambahan
    listStopwords.update(listStopwords1)
    listStopwords.update(['iya','yaa','gak','nya','na','sih','ku',"di","ga",
                          "ya","gaa","loh","kah","woi","woii","woy"])

    return [word for word in text if word not in listStopwords]

factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemmingText(text):
    return ' '.join(stemmer.stem(word) for word in text)

In [52]:
clean_df['text_clean'] = clean_df['review'].apply(cleaningText)
clean_df['text_casefoldingText'] = clean_df['text_clean'].apply(casefoldingText)
clean_df['text_slangwords'] = clean_df['text_casefoldingText'].apply(fix_slangwords)
clean_df['text_tokenizingText'] = clean_df['text_slangwords'].apply(tokenizingText)
clean_df['text_stopword'] = clean_df['text_tokenizingText'].apply(filteringText)
clean_df['text_stemming'] = clean_df['text_stopword'].apply(stemmingText)

In [53]:
clean_df

Unnamed: 0,review,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_stemming
0,Clash of clans sekarang sering gangguan atau l...,Clash of clans sekarang sering gangguan atau l...,clash of clans sekarang sering gangguan atau l...,clash of clans sekarang sering gangguan atau l...,"[clash, of, clans, sekarang, sering, gangguan,...","[clash, clans, sekarang, sering, gangguan, los...",clash clans sekarang sering ganggu lose connec...
1,Update bug update terbaru : 1.utk pelatihan pa...,Update bug update terbaru utk pelatihan pasuk...,update bug update terbaru utk pelatihan pasuk...,update bug update terbaru utk pelatihan pasuka...,"[update, bug, update, terbaru, utk, pelatihan,...","[update, bug, update, terbaru, utk, pelatihan,...",update bug update baru utk latih pasu pakai wa...
2,"Yth SuperCell, Coba buat suatu gebrakan yang b...",Yth SuperCell Coba buat suatu gebrakan yang be...,yth supercell coba buat suatu gebrakan yang be...,yth supercell coba buat suatu gebrakan yang be...,"[yth, supercell, coba, buat, suatu, gebrakan, ...","[yth, supercell, coba, buat, suatu, gebrakan, ...",yth supercell coba buat suatu gebrak beda sara...
3,aku kurangin bintang nya satu jadi bintang emp...,aku kurangin bintang nya satu jadi bintang emp...,aku kurangin bintang nya satu jadi bintang emp...,aku kurangin bintang nya satu jadi bintang emp...,"[aku, kurangin, bintang, nya, satu, jadi, bint...","[aku, kurangin, bintang, satu, jadi, bintang, ...",aku rangin bintang satu jadi bintang empat kel...
4,Game nya makin lama makin aneh karena: 1. Trof...,Game nya makin lama makin aneh karena Trofiny...,game nya makin lama makin aneh karena trofiny...,game nya makin lama makin aneh karena trofinya...,"[game, nya, makin, lama, makin, aneh, karena, ...","[game, makin, lama, makin, aneh, trofinya, sed...",game makin lama makin aneh trofi sedikit kali ...
...,...,...,...,...,...,...,...
11995,Maaf supercell pembaruannya kurang bagus.. Apa...,Maaf supercell pembaruannya kurang bagus Apala...,maaf supercell pembaruannya kurang bagus apala...,maaf supercell pembaruannya kurang bagus apala...,"[maaf, supercell, pembaruannya, kurang, bagus,...","[maaf, supercell, pembaruannya, kurang, bagus,...",maaf supercell baru kurang bagus trophy legend...
11996,"update gamenya terlalu sering, Yang bagus cuma...",update gamenya terlalu sering Yang bagus cuma ...,update gamenya terlalu sering yang bagus cuma ...,update gamenya terlalu sering yang bagus cuma ...,"[update, gamenya, terlalu, sering, yang, bagus...","[update, gamenya, terlalu, sering, bagus, cuma...",update gamenya terlalu sering bagus cuma grafi...
11997,"Please Kembalikan barak !, Karena itu adalah s...",Please Kembalikan barak Karena itu adalah seb...,please kembalikan barak karena itu adalah seb...,please kembalikan barak karena itu adalah sebu...,"[please, kembalikan, barak, karena, itu, adala...","[please, kembalikan, barak, sebuah, bangunan, ...",please kembali barak buah bangun sangat pentin...
11998,"Untuk game COC ini seru, menantang, dan bertah...",Untuk game COC ini seru menantang dan bertahan...,untuk game coc ini seru menantang dan bertahan...,untuk game coc ini seru menantang dan bertahan...,"[untuk, game, coc, ini, seru, menantang, dan, ...","[game, coc, seru, menantang, bertahan, musuh, ...",game coc seru tantang tahan musuh sya tingkat ...


## Labeling Data

In [54]:
clean_df = clean_df.dropna()
print("Jumlah Ulasan", clean_df.shape[0])

Jumlah Ulasan 12000


In [55]:
lexicon_positive = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_positive.csv')

if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_positive[row[0]] = int(row[1])
else:
    print("Failed to fetch positive lexicon data")

lexicon_negative = dict()

response = requests.get('https://raw.githubusercontent.com/angelmetanosaa/dataset/main/lexicon_negative.csv')

if response.status_code == 200:
    reader = csv.reader(StringIO(response.text), delimiter=',')
    for row in reader:
        lexicon_negative[row[0]] = int(row[1])
else:
    print("Failed to fetch negative lexicon data")

In [56]:
def sentiment_analysis_lexicon_indonesia(text):
    score = 0
    for word in text.split():
        if (word in lexicon_positive):
            score = score + lexicon_positive[word]

    for word in text.split():
        if (word in lexicon_negative):
            score = score + lexicon_negative[word]

    polarity=''

    if (score > 0):
        polarity = 'positive'
    elif (score < 0):
        polarity = 'negative'
    else:
        polarity = 'neutral'

    return score, polarity

In [57]:
results = clean_df['text_stemming'].apply(sentiment_analysis_lexicon_indonesia)
results = list(zip(*results))
clean_df['sentimen_score'] = results[0]
clean_df['sentimen'] = results[1]
print(clean_df['sentimen'].value_counts())

sentimen
negative    7616
positive    3758
neutral      626
Name: count, dtype: int64


In [58]:
clean_df.to_csv('labeling_data.csv', index=False)