Referensi :
*   Review Dataset : https://www.kaggle.com/datasets/grikomsn/lazada-indonesian-reviews
*   Stoplist Dataset : https://www.kaggle.com/oswinrh/indonesian-stoplist
*   Slang Word : https://github.com/okkyibrohim/id-multi-label-hate-speech-and-abusive-language-detection

# Preparing Library

In [None]:
!pip install Sastrawi



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk

nltk.download('punkt')

from tqdm import tqdm
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Preprocessing

## Preparing Data and Handling Missing Values

In [None]:
# Load the raw Lazada review dump and show its size plus the first rows.
csv_path = '/content/drive/MyDrive/Kaggle/lazada/20191002-reviews.csv'
data = pd.read_csv(csv_path)
print(data.shape)
data.head()

(203787, 15)


Unnamed: 0,itemId,category,name,rating,originalRating,reviewTitle,reviewContent,likeCount,upVotes,downVotes,helpful,relevanceScore,boughtDate,clientType,retrievedDate
0,100002528,beli-harddisk-eksternal,Kamal U.,5,,,bagus mantap dah sesui pesanan,0,0,0,True,26.51,09 Apr 2019,androidApp,2019-10-02
1,100002528,beli-harddisk-eksternal,yofanca m.,4,,,"Bagus, sesuai foto",0,0,0,True,22.49,24 Sep 2017,androidApp,2019-10-02
2,100002528,beli-harddisk-eksternal,Lazada Customer,5,,ok mantaaapppp barang sesuai pesanan.. good,okkkkk mantaaaaaaapppp ... goood,0,0,0,True,21.5,04 Apr 2018,androidApp,2019-10-02
3,100002528,beli-harddisk-eksternal,Lazada Customer,4,,,bagus sesuai,0,0,0,True,20.51,22 Sep 2017,androidApp,2019-10-02
4,100002528,beli-harddisk-eksternal,Yosep M.,5,,,,0,0,0,True,16.01,17 Agu 2018,androidApp,2019-10-02


In [None]:
# Keep only the review text and its star rating.
df = data.loc[:, ['reviewContent', 'rating']]
print(df.shape)
df.head()

(203787, 2)


Unnamed: 0,reviewContent,rating
0,bagus mantap dah sesui pesanan,5
1,"Bagus, sesuai foto",4
2,okkkkk mantaaaaaaapppp ... goood,5
3,bagus sesuai,4
4,,5


In [None]:
# Count missing values per column.
df.isna().sum()

reviewContent    96758
rating               0
dtype: int64

In [None]:
# Drop rows with any missing value, then renumber the index from 0.
df = df.dropna()
df = df.reset_index(drop=True)
df.isna().sum()

reviewContent    0
rating           0
dtype: int64

In [None]:
# Size and first rows of the frame after removing missing values.
print(df.shape)
df.head(5)

(107029, 2)


Unnamed: 0,reviewContent,rating
0,bagus mantap dah sesui pesanan,5
1,"Bagus, sesuai foto",4
2,okkkkk mantaaaaaaapppp ... goood,5
3,bagus sesuai,4
4,bima,1


## Regex Sentence

In [None]:
# Patterns are compiled once, outside the loop, instead of on every row.
URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
SYMBOL_PATTERN = re.compile(r'[^.,a-zA-Z0-9 \n\.]')
PUNCTUATION_PATTERN = re.compile(r'[^\w\s]')
WHITESPACE_PATTERN = re.compile(r'\s+')


def clean_review(text):
  """Normalize one raw review string.

  Steps, in order: lowercase, replace URLs with a space, replace every
  character other than letters, digits, space, newline, '.' and ',' with a
  space, delete the remaining punctuation, collapse whitespace runs to a
  single space, and strip leading/trailing whitespace.

  Args:
    text: the raw review; converted with ``str()`` before processing.

  Returns:
    The cleaned review as a ``str``.
  """
  text = str(text).lower()
  text = URL_PATTERN.sub(' ', text)  # remove urls if any
  text = SYMBOL_PATTERN.sub(' ', text)  # remove symbols
  text = PUNCTUATION_PATTERN.sub('', text)  # remove punctuation
  # Collapse whitespace AFTER punctuation removal; doing it before leaves
  # double spaces behind (e.g. "a ... b" -> "a  b").
  text = WHITESPACE_PATTERN.sub(' ', text)
  return text.strip()  # drop leading/trailing newlines, tabs, spaces


# One (cleaned_text, rating) tuple per review, in the original row order.
text_preproc1 = [
  (clean_review(review), rating)
  for review, rating in tqdm(zip(df['reviewContent'], df['rating']), total=len(df))
]

100%|██████████| 107029/107029 [00:03<00:00, 32626.24it/s]


In [None]:
# Number of (text, rating) tuples collected by the cleaning step.
len(text_preproc1)

107029

In [None]:
# Turn the list of (text, rating) tuples into a two-column DataFrame.
preproc1_columns = ['text_preproc1', 'rating']
df1 = pd.DataFrame(data=text_preproc1, columns=preproc1_columns)
df1.head()

Unnamed: 0,text_preproc1,rating
0,bagus mantap dah sesui pesanan,5
1,bagus sesuai foto,4
2,okkkkk mantaaaaaaapppp goood,5
3,bagus sesuai,4
4,bima,1


## Stopword Removal and Slang-Word Normalization from Files

In [None]:
fSlang = '/content/drive/MyDrive/Kaggle/lazada/new_kamusalay.csv'

# Build a dict from the first comma-separated field of each line to the second.
# errors='ignore' silently drops bytes that are not valid UTF-8.
SlangS = {}
with open(fSlang, encoding='utf-8', errors ='ignore', mode='r') as sw:
  for line in sw:
    fields = line.strip().split(',')
    if len(fields) < 2:
      # Blank or malformed line: nothing to map, and indexing would fail.
      continue
    SlangS[fields[0]] = fields[1]

In [None]:
fStop = '/content/drive/MyDrive/Kaggle/lazada/stopwordbahasa.csv'

# One entry per line, stripped of surrounding whitespace, collected into a
# set for fast membership tests. The `with` block closes the file even if
# reading raises.
with open(fStop, encoding='utf-8', errors ='ignore', mode='r') as st:
  StopS = {line.strip() for line in st}

In [None]:
def formaldanstop(t):
  """Replace slang tokens via SlangS and drop tokens found in StopS.

  The input string is tokenized with ``word_tokenize``. Each token that is a
  key of ``SlangS`` is replaced by its mapped value, and tokens (after
  replacement) that appear in ``StopS`` are discarded. The remaining tokens
  are returned joined by single spaces.
  """
  kept = []
  for token in word_tokenize(t):
    token = SlangS.get(token, token)
    if token not in StopS:
      kept.append(token)
  return ' '.join(kept)

In [None]:
# Apply slang replacement and stopword filtering to every cleaned review.
df1['text_preproc2'] = df1['text_preproc1'].apply(formaldanstop)
df1.head()

Unnamed: 0,text_preproc1,rating,text_preproc2
0,bagus mantap dah sesui pesanan,5,bagus mantap sesui pesanan
1,bagus sesuai foto,4,bagus sesuai foto
2,okkkkk mantaaaaaaapppp goood,5,okkkkk mantaaaaaaapppp goood
3,bagus sesuai,4,bagus sesuai
4,bima,1,bima


## Stopword and Stemming from Sastrawi

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stop-word remover and stemmer from their factories.
stopword_factory = StopWordRemoverFactory()
stemmer_factory = StemmerFactory()

stopword = stopword_factory.create_stop_word_remover()
stemmer = stemmer_factory.create_stemmer()

In [None]:
# Run Sastrawi stop-word removal and then stemming on every sentence.
# Iterating the Series directly (with an explicit total) lets tqdm show a
# percentage and ETA; wrapping enumerate() hid the length from tqdm.
clean_text = []
for kalimat in tqdm(df1['text_preproc2'], total=len(df1)):
  stop = stopword.remove(kalimat)
  clean_text.append(stemmer.stem(stop))

107029it [1:08:21, 26.09it/s]   


In [None]:
# Attach the stemmed text and keep only the columns needed downstream.
df1['clean_text'] = clean_text
df2 = df1[['clean_text','rating']]
# Preview the frame that was just built (previously this showed the raw `data`).
df2.head()

Unnamed: 0,itemId,category,name,rating,originalRating,reviewTitle,reviewContent,likeCount,upVotes,downVotes,helpful,relevanceScore,boughtDate,clientType,retrievedDate
0,100002528,beli-harddisk-eksternal,Kamal U.,5,,,bagus mantap dah sesui pesanan,0,0,0,True,26.51,09 Apr 2019,androidApp,2019-10-02
1,100002528,beli-harddisk-eksternal,yofanca m.,4,,,"Bagus, sesuai foto",0,0,0,True,22.49,24 Sep 2017,androidApp,2019-10-02
2,100002528,beli-harddisk-eksternal,Lazada Customer,5,,ok mantaaapppp barang sesuai pesanan.. good,okkkkk mantaaaaaaapppp ... goood,0,0,0,True,21.5,04 Apr 2018,androidApp,2019-10-02
3,100002528,beli-harddisk-eksternal,Lazada Customer,4,,,bagus sesuai,0,0,0,True,20.51,22 Sep 2017,androidApp,2019-10-02
4,100002528,beli-harddisk-eksternal,Yosep M.,5,,,,0,0,0,True,16.01,17 Agu 2018,androidApp,2019-10-02


In [None]:
# Write the cleaned text and ratings to Drive, without the index column.
output_path = '/content/drive/MyDrive/clean_text.csv'
df2.to_csv(output_path, index=False)