# Import Library

In [1]:
import os
import re
from datetime import timedelta, datetime

import joblib
import mapply #Library untuk mengaktifkan multi threading/ multi core
import pandas as pd
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split

In [2]:
# Options handed to mapply.init; the `.mapply` call on df['comment'] later in
# the notebook relies on this initialisation having run.
mapply_settings = dict(
    n_workers=-1,
    chunk_size=2000,
    max_chunks_per_worker=8,
    progressbar=False,
)
mapply.init(**mapply_settings)

## Inisialisasi Proporsi

In [3]:
# Proportion code selects the test fraction passed to train_test_split.
proportion = 2
# Codes 1-4 map to 10%-40% test data; any other code falls back to 20%.
test_size = {1: 0.1, 2: 0.2, 3: 0.3, 4: 0.4}.get(proportion, 0.2)

## Import Corpus Stopword & Normalisasi

In [4]:
# Load the stopword corpus as a list of lines (used by _stopword for membership tests).
with open("stopword.txt") as stopword_file:
    stopword_text = stopword_file.read()
stopwords = stopword_text.splitlines()

In [5]:
# Load the normalisation corpus as a list of lines; _normalisasi splits each
# line on "," and treats field 0 as the word to replace and field 1 as its replacement.
with open("normalisasi.txt") as normalisasi_file:
    normalisasi_text = normalisasi_file.read()
normalisasi = normalisasi_text.splitlines()

# Buat Fungsi Preprocessing

In [6]:
# Cleaning
def _cleaning(term):
    # Hapus pencatuman username    
    _term = re.sub(r"@[\w]*", "", term)
    # Hapus hyperlink  
    _term =  re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', _term)
    # Hapus special character    
    _term = re.sub(r"[()\"#/@;:<>{}*`'+=~|.!?,]", "", _term)
    _term = re.sub('[^a-zA-Zа-яА-Я]+', ' ', _term)
    # Remove multiple space 
    _term = re.sub("\s\s+", " ", _term)
    # Remove emoticons
    _regrex_pattern = re.compile(pattern="["
                                    u"\U0001F600-\U0001F64F"  # emoticons
                                    u"\U0001F300-\U0001F5FF"  # symbols
                                    u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                    "]+", flags=re.UNICODE)
    return re.sub(_regrex_pattern, "", _term)
    
# Casefolding (lowercase)
def _casefolding(term):
    _term = term.lower()
    return _term

# Tokenizing (token)
def _tokenizing(term):
    if not isinstance(term, str): return
    return term.split()

# Normalisasi (Ubah kata alay atau typo)
def _normalisasi(term):
    temp = term
    for w in normalisasi:
        try:
            w_ = w.split(',')
            if term.index(w_[0]):
                temp = [w_[1] if word == w_[0] else word for word in term]
        except:
            continue
    return temp

# Stopword removal (drop words that carry little meaning)
def _stopword(term):
    """Return the tokens of ``term`` that are not in the module-level ``stopwords``."""
    kept = []
    for token in term:
        if token in stopwords:
            continue
        kept.append(token)
    return kept

# Steeming (pengambilan kata dasar)
def _steeming(terms):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    tweet = stemmer.stem(" ".join(terms))
    return tweet.split()

# Text pre-processing pipeline
def normalization(term):
    """Run every pre-processing stage on one comment.

    Returns a list with the output of each stage, in order: cleaning,
    case folding, tokenizing, normalisation, stopword removal, stemming.
    """
    cleaned_text = _cleaning(term)
    lowered_text = _casefolding(cleaned_text)
    tokens = _tokenizing(lowered_text)
    # Local names avoid shadowing the module-level `normalisasi` corpus list.
    normalized_tokens = _normalisasi(tokens)
    filtered_tokens = _stopword(normalized_tokens)
    stemmed_tokens = _steeming(filtered_tokens)
    return [
        cleaned_text,
        lowered_text,
        tokens,
        normalized_tokens,
        filtered_tokens,
        stemmed_tokens,
    ]

def callback(x):
    """Pre-process one comment and return the stage outputs as a pandas Series.

    The Series has one entry per pipeline stage plus ``cleaned``, the stemmed
    tokens joined back into a single space-separated string.
    """
    (cleaned_text, lowered_text, tokens,
     normalized_tokens, filtered_tokens, stemmed_tokens) = normalization(x)
    record = {}
    record['cleaning'] = cleaned_text
    record['casefolding'] = lowered_text
    record['tokenizing'] = tokens
    record['normalisasi'] = normalized_tokens
    record['stopword'] = filtered_tokens
    record['steeming'] = stemmed_tokens
    record['cleaned'] = " ".join(stemmed_tokens)
    return pd.Series(record)

# Import Data Training

In [7]:
# Read the labelled comment dataset (comma-separated) and preview it.
raw_csv_path = "project2.csv"
df = pd.read_csv(raw_csv_path, sep=",")
df

Unnamed: 0,link,no,date,author,comment,id,sentiment,annotator,annotation_id,created_at,updated_at,lead_time
0,https://www.instagram.com/p/B9g3hvMnYrA/,171,"Mar 9, 2020",zifaainnr,Mau syuting apa kak risa,6748,Neutral,1,1439,2022-08-02T13:04:33.451851Z,2022-08-02T13:04:33.451851Z,3.148
1,https://www.instagram.com/p/B9g3hvMnYrA/,170,"Mar 9, 2020",lemonteasquihy,Ke duaa nihh,6747,Neutral,1,1438,2022-08-02T13:04:25.651468Z,2022-08-02T13:04:25.651468Z,3.126
2,https://www.instagram.com/p/B9g3hvMnYrA/,169,"Mar 9, 2020",sulis_saja09,Bear nya manah??? 😍,6746,Neutral,1,1437,2022-08-02T13:04:20.522557Z,2022-08-02T13:04:20.522557Z,2.447
3,https://www.instagram.com/p/B9g3hvMnYrA/,168,"Mar 9, 2020",wldnaar_,Dia smp apa sd sih bingung wkwkwks,6745,Neutral,1,1436,2022-08-02T13:04:15.672919Z,2022-08-02T13:04:15.672919Z,6.164
4,https://www.instagram.com/p/B9g3hvMnYrA/,167,"Mar 9, 2020",ririmwldr_,Masha and the bear versi real sigemesss,6744,Neutral,1,1435,2022-08-02T13:04:07.389773Z,2022-08-02T13:04:07.389773Z,2.406
...,...,...,...,...,...,...,...,...,...,...,...,...
1415,https://www.instagram.com/p/CgWyjY5JkUt/,5,"Jul 23, 2022",myrissa12,Presiden rusia sama amerika dateng ga bang?,5332,Negative,1,24,2022-07-27T15:44:08.154439Z,2022-07-27T15:44:08.154439Z,8.484
1416,https://www.instagram.com/p/CgWyjY5JkUt/,4,"Jul 23, 2022",ginthaputri,hebat wagub sumbar bisa terbang👍🏻,5331,Negative,1,23,2022-07-27T15:43:58.537241Z,2022-07-27T15:43:58.537241Z,2.756
1417,https://www.instagram.com/p/CgWyjY5JkUt/,3,"Jul 23, 2022",anaa_istianah,Bang maafin klo blh saran jgn pakai kuas bulu ...,5330,Positive,1,22,2022-07-27T15:43:54.650729Z,2022-07-27T15:43:54.650729Z,2.655
1418,https://www.instagram.com/p/CgWyjY5JkUt/,2,"Jul 23, 2022",p.oci_,Bang kalau Dine in tapi meja penuh terus mau m...,5329,Positive,1,21,2022-07-27T15:43:50.803404Z,2022-07-27T15:43:50.803404Z,4.277


In [8]:
# Run the pre-processing pipeline on every comment and attach the stage columns.
# The result is bound to `preprocessed` rather than `normalization`: that name
# is the function `callback` calls, and overwriting it with a DataFrame made
# this cell fail on any re-run.
preprocessed = df['comment'].mapply(callback)
# Drop stage columns left over from an earlier run of this cell so that
# re-running does not produce duplicate column names.
df = pd.concat(
    [df.drop(columns=preprocessed.columns, errors='ignore'), preprocessed],
    axis=1,
    join='inner',
)
df

Unnamed: 0,link,no,date,author,comment,id,sentiment,annotator,annotation_id,created_at,updated_at,lead_time,cleaning,casefolding,tokenizing,normalisasi,stopword,steeming,cleaned
0,https://www.instagram.com/p/B9g3hvMnYrA/,171,"Mar 9, 2020",zifaainnr,Mau syuting apa kak risa,6748,Neutral,1,1439,2022-08-02T13:04:33.451851Z,2022-08-02T13:04:33.451851Z,3.148,Mau syuting apa kak risa,mau syuting apa kak risa,"[mau, syuting, apa, kak, risa]","[mau, syuting, apa, kak, risa]","[syuting, kak, risa]","[syuting, kak, risa]",syuting kak risa
1,https://www.instagram.com/p/B9g3hvMnYrA/,170,"Mar 9, 2020",lemonteasquihy,Ke duaa nihh,6747,Neutral,1,1438,2022-08-02T13:04:25.651468Z,2022-08-02T13:04:25.651468Z,3.126,Ke duaa nihh,ke duaa nihh,"[ke, duaa, nihh]","[ke, duaa, nih]","[duaa, nih]","[duaa, nih]",duaa nih
2,https://www.instagram.com/p/B9g3hvMnYrA/,169,"Mar 9, 2020",sulis_saja09,Bear nya manah??? 😍,6746,Neutral,1,1437,2022-08-02T13:04:20.522557Z,2022-08-02T13:04:20.522557Z,2.447,Bear nya manah,bear nya manah,"[bear, nya, manah]","[bear, nya, manah]","[bear, nya, manah]","[bear, nya, manah]",bear nya manah
3,https://www.instagram.com/p/B9g3hvMnYrA/,168,"Mar 9, 2020",wldnaar_,Dia smp apa sd sih bingung wkwkwks,6745,Neutral,1,1436,2022-08-02T13:04:15.672919Z,2022-08-02T13:04:15.672919Z,6.164,Dia smp apa sd sih bingung wkwkwks,dia smp apa sd sih bingung wkwkwks,"[dia, smp, apa, sd, sih, bingung, wkwkwks]","[dia, sampai, apa, sd, sih, bingung, wkwkwks]","[sd, sih, bingung, wkwkwks]","[sd, sih, bingung, wkwkwks]",sd sih bingung wkwkwks
4,https://www.instagram.com/p/B9g3hvMnYrA/,167,"Mar 9, 2020",ririmwldr_,Masha and the bear versi real sigemesss,6744,Neutral,1,1435,2022-08-02T13:04:07.389773Z,2022-08-02T13:04:07.389773Z,2.406,Masha and the bear versi real sigemesss,masha and the bear versi real sigemesss,"[masha, and, the, bear, versi, real, sigemesss]","[masha, and, the, bear, versi, real, sigemesss]","[masha, and, the, bear, versi, real, sigemesss]","[masha, and, the, bear, versi, real, sigemesss]",masha and the bear versi real sigemesss
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1415,https://www.instagram.com/p/CgWyjY5JkUt/,5,"Jul 23, 2022",myrissa12,Presiden rusia sama amerika dateng ga bang?,5332,Negative,1,24,2022-07-27T15:44:08.154439Z,2022-07-27T15:44:08.154439Z,8.484,Presiden rusia sama amerika dateng ga bang,presiden rusia sama amerika dateng ga bang,"[presiden, rusia, sama, amerika, dateng, ga, b...","[presiden, rusia, sama, amerika, dateng, engga...","[presiden, rusia, amerika, dateng, bang]","[presiden, rusia, amerika, dateng, bang]",presiden rusia amerika dateng bang
1416,https://www.instagram.com/p/CgWyjY5JkUt/,4,"Jul 23, 2022",ginthaputri,hebat wagub sumbar bisa terbang👍🏻,5331,Negative,1,23,2022-07-27T15:43:58.537241Z,2022-07-27T15:43:58.537241Z,2.756,hebat wagub sumbar bisa terbang,hebat wagub sumbar bisa terbang,"[hebat, wagub, sumbar, bisa, terbang]","[hebat, wagub, sumbar, bisa, terbang]","[hebat, wagub, sumbar, terbang]","[hebat, wagub, sumbar, terbang]",hebat wagub sumbar terbang
1417,https://www.instagram.com/p/CgWyjY5JkUt/,3,"Jul 23, 2022",anaa_istianah,Bang maafin klo blh saran jgn pakai kuas bulu ...,5330,Positive,1,22,2022-07-27T15:43:54.650729Z,2022-07-27T15:43:54.650729Z,2.655,Bang maafin klo blh saran jgn pakai kuas bulu ...,bang maafin klo blh saran jgn pakai kuas bulu ...,"[bang, maafin, klo, blh, saran, jgn, pakai, ku...","[bang, maafin, klo, blh, saran, jgn, pakai, ku...","[bang, maafin, klo, blh, saran, jgn, pakai, ku...","[bang, maafin, klo, blh, saran, jgn, pakai, ku...",bang maafin klo blh saran jgn pakai kuas bulu ...
1418,https://www.instagram.com/p/CgWyjY5JkUt/,2,"Jul 23, 2022",p.oci_,Bang kalau Dine in tapi meja penuh terus mau m...,5329,Positive,1,21,2022-07-27T15:43:50.803404Z,2022-07-27T15:43:50.803404Z,4.277,Bang kalau Dine in tapi meja penuh terus mau m...,bang kalau dine in tapi meja penuh terus mau m...,"[bang, kalau, dine, in, tapi, meja, penuh, ter...","[bang, kalau, dine, in, tapi, meja, penuh, ter...","[bang, dine, in, meja, penuh, makannya, motor]","[bang, dine, in, meja, penuh, makan, motor]",bang dine in meja penuh makan motor


## Split Data Test & Data Latih

In [9]:
# Split the pre-processed data into training and test sets.
# A fixed seed makes the split (and every file derived from it) reproducible
# across kernel restarts; change RANDOM_STATE to draw a different split.
RANDOM_STATE = 42
train, test = train_test_split(df, test_size=test_size, random_state=RANDOM_STATE)
train.head()

Unnamed: 0,link,no,date,author,comment,id,sentiment,annotator,annotation_id,created_at,updated_at,lead_time,cleaning,casefolding,tokenizing,normalisasi,stopword,steeming,cleaned
33,https://www.instagram.com/p/B9g3hvMnYrA/,138,"Mar 9, 2020",crusshh07,Iiiiiii gemesshhhhhhhhhh,6715,Neutral,1,1406,2022-08-02T13:00:14.930092Z,2022-08-02T13:00:14.930092Z,2.258,Iiiiiii gemesshhhhhhhhhh,iiiiiii gemesshhhhhhhhhh,"[iiiiiii, gemesshhhhhhhhhh]","[iiiiiii, gemesshhhhhhhhhh]","[iiiiiii, gemesshhhhhhhhhh]","[iiiiiii, gemesshhhhhhhhhh]",iiiiiii gemesshhhhhhhhhh
1311,https://www.instagram.com/p/CgWyjY5JkUt/,109,"Jul 23, 2022",pangestufirmanbudiana,👍🙏🙏,5436,Neutral,1,128,2022-07-30T10:07:02.480412Z,2022-07-30T10:07:02.480412Z,2.347,,,[],[],[],[],
102,https://www.instagram.com/p/B9g3hvMnYrA/,69,"Mar 9, 2020",puppypinnypigy,Aaa gemesss,6646,Positive,1,1337,2022-08-02T12:52:31.341341Z,2022-08-02T12:52:31.341341Z,2.929,Aaa gemesss,aaa gemesss,"[aaa, gemesss]","[aaa, gemas]","[aaa, gemas]","[aaa, gemas]",aaa gemas
867,https://www.instagram.com/p/Ce3lI-uJF68/,8,"Jun 16, 2022",hendraken.22,Ulangin😍,5882,Positive,1,572,2022-08-01T11:41:37.268974Z,2022-08-01T11:41:37.268974Z,1.819,Ulangin,ulangin,[ulangin],[ulangin],[ulangin],[ulangin],ulangin
197,https://www.instagram.com/p/CAkd0VfHSSD/,80,"May 24, 2020",ryntyptry_,Aing juga atuh💖,6551,Neutral,1,1242,2022-08-02T12:40:04.091321Z,2022-08-02T12:40:04.091321Z,3.075,Aing juga atuh,aing juga atuh,"[aing, juga, atuh]","[aing, juga, atuh]","[aing, atuh]","[aing, atuh]",aing atuh


In [10]:
# Write the split and the full pre-processed dataset to disk.
dataset_dir = "./data/dataset_"+str(proportion)
# Create the output folder if needed so the cell works on a fresh checkout
# (previously the folder had to be created by hand).
os.makedirs(dataset_dir, exist_ok=True)

train.to_csv(dataset_dir+"/cleaned_train.csv", index=False)
test.to_csv(dataset_dir+"/cleaned_test.csv", index=False)

# train = pd.read_csv("./data/dataset_"+str(proportion)+"/cleaned_train.csv")
# test = pd.read_csv("./data/dataset_"+str(proportion)+"/cleaned_test.csv")
df.to_csv(dataset_dir+"/cleaned_dataset.csv", index=False)
model_dump = {'train': train, 'test': test}
# Use a context manager so the file handle is closed (and flushed) once the
# dump finishes; the previous bare open() was never closed.
with open(dataset_dir+"/preprocessing"+".model", "wb") as dump_file:
    joblib.dump(model_dump, dump_file)

# Export Menjadi File CSV

In [11]:
# Export the full pre-processed table; the result folder is created if it
# does not exist yet instead of requiring a manual step.
result_dir = "./data/dataset_"+str(proportion)+"/hasil"
os.makedirs(result_dir, exist_ok=True)
filename = result_dir+"/Hasil_PreProcessing_Comment.csv"
df.to_csv(filename, index=False)

In [12]:
# Writes the pre-processed table to the same path as the previous export cell.
filename = "".join(["./data/dataset_", str(proportion), "/hasil/Hasil_PreProcessing_Comment.csv"])
df.to_csv(path_or_buf=filename, index=False)