# Import Library

In [1]:
import re
import nltk
import swifter
import pandas as pd
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to C:\Users\HP Victus
[nltk_data]     16\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\HP Victus
[nltk_data]     16\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Read Dataset

In [2]:
file_path = r'C:\Users\HP Victus 16\Documents\TA_Code\Dataset\Dataset_Review_Film.csv'
df = pd.read_csv(file_path)

df.drop('date', axis=1, inplace=True)
df.head(10)

Unnamed: 0,judul,username,tweet,plot,akting,direktor
0,the desperate hour,penggores_pena,the desperate hour lakewood 2022 beberapa wakt...,1,1,1
1,pursuit,angeldevyoo,ini edisi males review jd singkat aja ya beber...,1,1,0
2,pursuit,lovinasoenmi,plot utama adalah dua orang deserter pursuit a...,1,1,0
3,pursuit,yeobuun,film hereditary horror thrill midsommar gatau...,0,0,0
4,pursuit,bebekpakelobe,batman paling manusiawi combatnya juga sadis s...,1,0,0
5,pursuit,sonweeendy,iyaa aku udah baca itu yaampun sumpa keren pa...,1,0,0
6,pursuit,patheticlone,aku kasih judul nya aja liat kebioskop21 com ...,1,0,1
7,pursuit,tramadanur,yang memberi ruang terhadap plot twist tersemb...,-1,0,-1
8,pursuit,KeiNicro,tampan tailor nah kalo yg ini disebut sebut mi...,1,1,0
9,pursuit,kakdeo,62 cold pursuit 2019 jangan berani berani gang...,1,0,0


# Cleansing Data



In [3]:
def cleansing(data):
    # remove multiple space, enter, tab, \n, dan \mvs\
    data = re.sub(r'[\n\t]|/mvs/|\s+', ' ', data)
    
    # remove hashtag and mention
    data = re.sub(r"#\w+|@\w+", "", data)
    
    # remove URLs
    data = re.sub(r'https?:\/\/(?:www\.)?\S+', '', data)
    
    # remove special character
    data = re.sub('[^A-Za-z0-9]+', ' ', data)
    
    # remove number
    data = re.sub(r"\d+", "", data)
    
    return data

# Preprocessing


In [4]:
def preprocessing(data):
    # Cleansing Data
    data = cleansing(data)
    
    # Case Folding
    data = data.lower()
        
    return data

In [5]:
tqdm.pandas(desc="Cleansing Data : ")
df['clean_text'] = df['tweet'].progress_apply(preprocessing)
df.head(10)

Cleansing Data : 100%|████████████████████████████████████████████████████████| 17247/17247 [00:00<00:00, 43298.04it/s]


Unnamed: 0,judul,username,tweet,plot,akting,direktor,clean_text
0,the desperate hour,penggores_pena,the desperate hour lakewood 2022 beberapa wakt...,1,1,1,the desperate hour lakewood beberapa waktu la...
1,pursuit,angeldevyoo,ini edisi males review jd singkat aja ya beber...,1,1,0,ini edisi males review jd singkat aja ya beber...
2,pursuit,lovinasoenmi,plot utama adalah dua orang deserter pursuit a...,1,1,0,plot utama adalah dua orang deserter pursuit a...
3,pursuit,yeobuun,film hereditary horror thrill midsommar gatau...,0,0,0,film hereditary horror thrill midsommar gatau...
4,pursuit,bebekpakelobe,batman paling manusiawi combatnya juga sadis s...,1,0,0,batman paling manusiawi combatnya juga sadis s...
5,pursuit,sonweeendy,iyaa aku udah baca itu yaampun sumpa keren pa...,1,0,0,iyaa aku udah baca itu yaampun sumpa keren pa...
6,pursuit,patheticlone,aku kasih judul nya aja liat kebioskop21 com ...,1,0,1,aku kasih judul nya aja liat kebioskop com aj...
7,pursuit,tramadanur,yang memberi ruang terhadap plot twist tersemb...,-1,0,-1,yang memberi ruang terhadap plot twist tersemb...
8,pursuit,KeiNicro,tampan tailor nah kalo yg ini disebut sebut mi...,1,1,0,tampan tailor nah kalo yg ini disebut sebut mi...
9,pursuit,kakdeo,62 cold pursuit 2019 jangan berani berani gang...,1,0,0,cold pursuit jangan berani berani ganggu kel...


# Tokenization

In [6]:
# Tokenization
def tokenizing(data):
    return nltk.tokenize.word_tokenize(data)

tqdm.pandas(desc="Tokenizing Data : ")
df['tokens'] = df['clean_text'].progress_apply(tokenizing)
df.head(10)

Tokenizing Data : 100%|████████████████████████████████████████████████████████| 17247/17247 [00:01<00:00, 9074.58it/s]


Unnamed: 0,judul,username,tweet,plot,akting,direktor,clean_text,tokens
0,the desperate hour,penggores_pena,the desperate hour lakewood 2022 beberapa wakt...,1,1,1,the desperate hour lakewood beberapa waktu la...,"[the, desperate, hour, lakewood, beberapa, wak..."
1,pursuit,angeldevyoo,ini edisi males review jd singkat aja ya beber...,1,1,0,ini edisi males review jd singkat aja ya beber...,"[ini, edisi, males, review, jd, singkat, aja, ..."
2,pursuit,lovinasoenmi,plot utama adalah dua orang deserter pursuit a...,1,1,0,plot utama adalah dua orang deserter pursuit a...,"[plot, utama, adalah, dua, orang, deserter, pu..."
3,pursuit,yeobuun,film hereditary horror thrill midsommar gatau...,0,0,0,film hereditary horror thrill midsommar gatau...,"[film, hereditary, horror, thrill, midsommar, ..."
4,pursuit,bebekpakelobe,batman paling manusiawi combatnya juga sadis s...,1,0,0,batman paling manusiawi combatnya juga sadis s...,"[batman, paling, manusiawi, combatnya, juga, s..."
5,pursuit,sonweeendy,iyaa aku udah baca itu yaampun sumpa keren pa...,1,0,0,iyaa aku udah baca itu yaampun sumpa keren pa...,"[iyaa, aku, udah, baca, itu, yaampun, sumpa, k..."
6,pursuit,patheticlone,aku kasih judul nya aja liat kebioskop21 com ...,1,0,1,aku kasih judul nya aja liat kebioskop com aj...,"[aku, kasih, judul, nya, aja, liat, kebioskop,..."
7,pursuit,tramadanur,yang memberi ruang terhadap plot twist tersemb...,-1,0,-1,yang memberi ruang terhadap plot twist tersemb...,"[yang, memberi, ruang, terhadap, plot, twist, ..."
8,pursuit,KeiNicro,tampan tailor nah kalo yg ini disebut sebut mi...,1,1,0,tampan tailor nah kalo yg ini disebut sebut mi...,"[tampan, tailor, nah, kalo, yg, ini, disebut, ..."
9,pursuit,kakdeo,62 cold pursuit 2019 jangan berani berani gang...,1,0,0,cold pursuit jangan berani berani ganggu kel...,"[cold, pursuit, jangan, berani, berani, ganggu..."


# Normalization


In [7]:
normalizad_word = pd.read_excel("dict_norm_film.xlsx")

normalizad_word_dict = {}

for index, row in normalizad_word.iterrows():
    if row[0] not in normalizad_word_dict:
        normalizad_word_dict[row[0]] = row[1] 

def normalized_term(document):
    return [normalizad_word_dict[term] if term in normalizad_word_dict else term for term in document]

tqdm.pandas(desc="Normalizing Data : ")
df['tokens_normalized'] = df['tokens'].progress_apply(normalized_term)
df.head(10)

Normalizing Data : 100%|█████████████████████████████████████████████████████| 17247/17247 [00:00<00:00, 254735.99it/s]


Unnamed: 0,judul,username,tweet,plot,akting,direktor,clean_text,tokens,tokens_normalized
0,the desperate hour,penggores_pena,the desperate hour lakewood 2022 beberapa wakt...,1,1,1,the desperate hour lakewood beberapa waktu la...,"[the, desperate, hour, lakewood, beberapa, wak...","[the, desperate, hour, lakewood, beberapa, wak..."
1,pursuit,angeldevyoo,ini edisi males review jd singkat aja ya beber...,1,1,0,ini edisi males review jd singkat aja ya beber...,"[ini, edisi, males, review, jd, singkat, aja, ...","[ini, edisi, males, review, jd, singkat, aja, ..."
2,pursuit,lovinasoenmi,plot utama adalah dua orang deserter pursuit a...,1,1,0,plot utama adalah dua orang deserter pursuit a...,"[plot, utama, adalah, dua, orang, deserter, pu...","[plot, utama, adalah, dua, orang, deserter, pu..."
3,pursuit,yeobuun,film hereditary horror thrill midsommar gatau...,0,0,0,film hereditary horror thrill midsommar gatau...,"[film, hereditary, horror, thrill, midsommar, ...","[film, hereditary, horror, thrill, midsommar, ..."
4,pursuit,bebekpakelobe,batman paling manusiawi combatnya juga sadis s...,1,0,0,batman paling manusiawi combatnya juga sadis s...,"[batman, paling, manusiawi, combatnya, juga, s...","[batman, paling, manusiawi, bertarung nya, jug..."
5,pursuit,sonweeendy,iyaa aku udah baca itu yaampun sumpa keren pa...,1,0,0,iyaa aku udah baca itu yaampun sumpa keren pa...,"[iyaa, aku, udah, baca, itu, yaampun, sumpa, k...","[iyaa, aku, udah, baca, itu, yaampun, sumpa, k..."
6,pursuit,patheticlone,aku kasih judul nya aja liat kebioskop21 com ...,1,0,1,aku kasih judul nya aja liat kebioskop com aj...,"[aku, kasih, judul, nya, aja, liat, kebioskop,...","[aku, kasih, judul, nya, aja, liat, kebioskop,..."
7,pursuit,tramadanur,yang memberi ruang terhadap plot twist tersemb...,-1,0,-1,yang memberi ruang terhadap plot twist tersemb...,"[yang, memberi, ruang, terhadap, plot, twist, ...","[yang, memberi, ruang, terhadap, plot, twist, ..."
8,pursuit,KeiNicro,tampan tailor nah kalo yg ini disebut sebut mi...,1,1,0,tampan tailor nah kalo yg ini disebut sebut mi...,"[tampan, tailor, nah, kalo, yg, ini, disebut, ...","[tampan, tailor, nah, kalo, yg, ini, disebut, ..."
9,pursuit,kakdeo,62 cold pursuit 2019 jangan berani berani gang...,1,0,0,cold pursuit jangan berani berani ganggu kel...,"[cold, pursuit, jangan, berani, berani, ganggu...","[cold, pursuit, jangan, berani, berani, ganggu..."


# Stopword Removal

In [8]:
# memilih kata penting
listStopword =  stopwords.words('indonesian')
# append additional stopword
listStopword.extend(['yg', 'dg', 'rt', 'dgn', 'ny', 'd', 'klo', 'kalo', 'amp', 
                     'biar', 'bikin', 'bilang', 'krn', 'nya', 'nih', 'sih', 'si',
                     'tau', 'tdk', 'tuh', 'utk', 'ya', 'jd', 'jgn', 'sdh', 'aja', 
                     'n', 't', 'nyg', 'hehe', 'pen', 'u', 'nan', 'loh', 'rt',
                     '&amp', 'p', 'jd', 'nek', 'e', 'yo', 'o', 'np', 'nw', 
                     'https', 'http', 't', 'co', 'moilnyaw', 'agshshsjs','dmp',
                     'dfess','zdy','ahiak','wooooy', 'hhahahahahaha','murmuring',
                     'hhaaddeehhhh','eey','raheuuut','hemmmhh','ahahahahahahaha',
                     'bunggg','ndag','urghhhh','xdd','uehe','ewkwkwk','ltl',
                     'aaaakk','rvlog','cuuy','broooooooooo','woiiiiiii','woooooh',
                     'yeuuu','hahahhaahhaahahhah','dihh','yass','utbk','skskskk',
                     'hahahahahhaha','brrip', 'humn','laaaaah','hikd','wkkwkwkwk',
                     'hhahahaah','hahahahahaah','wwkaokwoakwoak','awkakwkawkak',
                     'weyyy','umphhh','pfttttttt','lsc','nyoih','dhhddgh',
                     'uuiii','tbz','hahhhhh','bhak','tteh','nyingnying','pfff',
                     'aiiiihhh','thg','akakakakak','pgw','bradd','wessss','dgm',
                     'miiin','diihh','ajskahsjajsjajsk','woilah','tars','cokkk',
                     'jiaahhhh','wkwkwkwkwkwkwkwk','wkwkwkkwkw','coeg','upis',
                     'jdksbskebsksj','hahehoh','uwuwuwuwuuw', 'aksjsjsksks','jcm',
                     'ndeeeer','hwhw','ququ','boooook','kwah','nyees','geekbahpodcast',
                     'sumargo','flickmagazine','lacuk','wowoowowowowowoo','fassbender',
                     'aaaaak','uwaaaaa','yaelahhh','djdiidhdhsndndjdijdd','fsai',
                     'rrrrrah','aaaaakkk','eeeuuuuhhh','tenenbaum','hadohh','potus',
                     'monek','arggghhhhhh','aaaaaarghh', 'heheheheheueu','iteotfw',
                     'semsem','tobiojersey','idot','morim','sekkk','ejenan',
                     'afkzkunuqkfkigieowkdbjosk','fvck','wkakakka','terrrrrr',
                     'ajsksksjshsjs','dpmallxxi', 'yourwiuwiu','sksksjsj','peele',
                     'huuuuwwaaaahhhh','hahhart','mbahput','huwaaaaaa','fckd',
                     'mmjd','knet','gwangj','nyahahahaha','aaarhh','',
                     'booos','owo','sedeksegaraga','fxxxing','lolll','auliarani',
                     'ehehehhe','hshhshs','theeeeeeee','iqiyi','weeeeehhhh',
                     'tncfu','crotan','huancuuk', 'boxboxid','mchtsvrtn','decc',
                     'ohmy','jff','hukshuks','htgawm','ckli','huweeee',
                     'aaaaarrgghhh','antongekngok','mnet','faaak','trns'
                     'wsowksksnwk','hotrl','falk','egggghhhhhh','fakkkk',
                     'iftw','aokbab','cuklawwww','poonpiriya','nattawut',
                     'kimcop','wkwkwkwkkwkwkwkwk','ftd','ashkskaksjd',
                     'aaaaahggggg','pimchanok','satsetsatset','swadikap',
                     'bebb','hahahahahh','uwma','dangal','dpo','eyyy',
                     'provokemagazine','bhahahahak','seeeeh','wedewww',
                     'tvxq','ueueu','dinitain','lahyaa','hamberr','cexi',
                     'ngentott','wkowkwo','smlmt','fwb','ngeten','whatthefvck',
                     'diputarbelit','cakmangkok', 'ovrl','siiiihhh','daddt',
                     'paspispus','ueueueu','wkaokqowlaowkok','iwww','rdwnfrmnsyh',
                     'lapett','hddu','lsk','asshskhs','wkakakakak','wataheck',
                     'wabb','kamseeeeeeeuuuu','cooy','tefak','mzk','lautnernyaaa',
                     'uben','sils','ngekngok','foxmoviespremiumhd','pbio','tepuuu',
                     'haihhhh', 'tsom', 'wwkwkwkwkk','samoek','brayy','peroah',
                     'pppfffttt','hhahaa','hailaaa','aghf','ahfakk','jooooo',
                     'elaahhhh','kuhhh','hhhhhhhh','wowkwkwk', 'vyll','hiikkkhhh',
                     'ajskqjskaana','wooooiiiiiii','ahelah','rtya','ngebe','rctv',
                     'cuar','hunham','bocen','semprol','gamonin','upill',
                     'lyv','bamf','arghtt','hahsjdbeocnkenfkdnjsbxockskcisndkdcdknxkdmkwndkdnkandksnsnmdjdnskansjbwjsndjdnsjxndmsn',
                     'pelirit','uwoogh','dofp','deeyach','hhahahaaa','hass','mbts',
                     'akwkwk','omgomg','onggg', 'csrnya','leutualy','bebeto','shayy',
                     'widihh','aghny','skskak','wwkwkwkkw','wqwqw','wadohhh','pubg',
                     'habede','kaaah','fuckk', 'yaksip','watchmenid','akwkwoowk',
                     'eskil','klikfilm','molatv','wokwowko','luurrrr','yasss',
                     'waaaaahhhh','duuuhhhh','wooooowww','qwtwfwtwywgsvqqqqwtwuwisip',
                     'tourent', 'hfa','hshdh','tflix','waduu','duilah','drakorindo',
                     'cuiii','pft','arhdjjdb','yaaw','smzbsmale','mbaaakk','shjjshfj','ahahahhh',
                     'terwow','hajahjhs','boomm','ihiyy','snewen','skksks','shshsjsj',
                     'aksksk','aaaahsgehwnahwhwhwvwghq','hadadahhhhh','nderrrr',
                     'aksjsggs','anjahskahs','woowwwww', 'hhhhhhh','ueueueue','heyyyyy',
                     'sjsiznsksk','bahhh','hahhehhoh','shshsh','wtcb','uwahh','hshshshsh',
                     'terrrr','aaaaaak','yihuy','aaargghhh', 'oyeee','hadeeuh','ftf','dipnya',
                     'woohee','nokhorom','ihhhhh','mincot','wkwkwwkwkwkwk','bruhhhhh',
                     'akwkwow','hheu','pisuh','haaaaaaaa', 'dadadahhh','ealaaahh','asdgjhfjkgdfh',
                     'shittt','bensu','mewgulf','yeeehaaaaaa','woohoooooo','skwk','ajdhsnjdhs',
                     'asfsgahshsh','posin','koks','hueeee', 'kuaya','huwawiwew','laff','booooommm',
                     'kubaper','ngiung','awkawk','wkwkkwwk','asshdhgshsjsk','wkwkwkkwkwwkwk',
                     'blaaah','yawlaa','krktr','starbak', 'wtffff','gtv','oalahh','seeeeehhhhh',
                     'momz','mamirt','xzkx','akahdhffvh','hdhejsjdks','wle','hfft','broou',
                     'coooy','woeee', 'hfttt','sksksksks','djaaaannnnccfc','akwjwj','ooooooo',
                     'hhhfftt','kwkwkkw','cihh','laaaaaaa','areyoulostbebiguuyrl','hdeehhh','chuakss',
                     'ponhub','ovt', 'mbaaaa','oooooom','skjsw','whaha','yeyeyeee','ajdghdkh','cenya',
                     'kkekekkk','coyyyy','hahahhahahaha','ahshwvwhsbwvs','otm','arggghh','ngahaha',
                     'eeeemmmm','ahayyyyy','heyhey','tfc','ckckrt','nmn','kulapongvanich','aitthipat',
                     'bitt','kahh','aaaakkkkk','duuuu','mskwydit','wkkwkwkwkwk', 'tchy','ngkoh',
                     'eyyyyy','fahhhh','dsah','eekin','wodnrhwkxneualo','alaaaaa','ueueeee','cucmeyyy',
                     'akhkh','luwh','citraxxi','hahajja', 'hasshhh','oohalah','beeppp','jbcu','waaaawaw',
                     'ahelahhh','ffuffu','hrrrggghh','xoxooxo','arggg','wkwkkwkwkw','ajkdkdks',
                     'aksksksjdkcj','nsd', 'colimon','knetz','soop','plekkkk','heumm','ngerong',
                     'woylah','hmmzz','ahsjskskfjk','huvt','skfnfikejd','mweehehe','iddkdksk','hsjsks',
                     'masnov','lghdtv','lgbtqvwxyz','aksksnnshdksk','hshs','adoyaai','jkfhegfliagwifgiywqgfyqwgfyqgfyiegf',
                     'waaay','nintik','asdghkkl','hwhwhwh','wkakak','aowkwowk','sjsjsjsjs',
                     'aksjskajwyatsh','muahahahahah','lbhs','wkwkwkwjsjsjssj','wqwqwqwq','ftw',
                     'wiboe','tnk','ekwkwk','hshsh','awowkowk','kdpp','meehhh','ajel',
                     'beehh','mgm','angrok','yaowoohh','akakskks','ppkm','meeks','ngokkkk',
                     'akhsks','whwhwh','kstew','fipm','tgc','wasweswos', 'cretttttt','cinn',
                     'ahahabab','etdahhhh','fck','wkwkwkkwkwkwkwkwkkw','nttd','beugh','xnya',
                     'sksksks','vxkxhsnxndmdjdndjdjdn','wwkw','ngederegdeg','borr',
                     'wadau','sembet','emk','shshshshshshshshshsh','huftttttt','yayayay',
                     'aksowkdoek','jasjuss','alhambra','molotovgirl','dantdm','gituhdwndiabwdhabwfh',
                     'dctv','nyetil', 'eeaao','hshshshs','uhuuy','zzoe','atez','bler','blor','sampis',
                     'jlo','iyeel','wooowwww','arhhhg','heeuuuu','wkkwwk', 'wkakakakakaka','uhuyyy',
                     'skwmwkwmwkw','bjir','wkwkwkwkkwu','wewew','bakekok','bnha','ahik','movimax',
                     'paragonxxi','oowhhh','tsang','lgux', 'ngegarong','lahkok','tcog','wzd','jder','ntond',
                     'ginit','wadidaw','kelulus','klux','uwoh','cokk','jedak','junji', 'ahoy','nyumm',
                     'pfoa','waoww','woee','eok','acau','mcdonalds','flis','bekasinians','aaakkk',
                     'awokwok','faakkk','ueueue', 'woooy','sksks','jbjb','awikwok','slebew','bayona',
                     'nsfw','unru','tnh','kpai','jsb','ptj','snaxx','covid', 'gidik','minsik','gamon',
                     'lgbtq','animovie','tasm','jnt','fastfurious','morbius','zsjl','seeeeee','wwkwk',
                     'disneyyy','ndakik', 'imdbnya','cinecrib','wst','ifwt','cmbyn','menfess','wckd',
                     'swank','brou','fsog','ittipat','nkcthi','tabetai','giur', 'ngl','sksk', 'afuk',
                     'jcw','xtina','kroc','blcu','sksksk','trll','jpf','doss','dcu','bvs','cgv','cilers',
                     'wwy', 'tdkr','mjb', 'nwh','bcu','dceu','kdm','okja','cruella','eternals','insidous',
                     'annabel', 'insidius', 'fiftyshades', 'insidiouos'])

# convert list to dictionary
listStopword = set(listStopword)

In [9]:
def stopword_removal(tokens):
  removed = []
  for t in tokens:
    if t not in listStopword:
        removed.append(t)
  return removed
    
tqdm.pandas(desc="Stopword Removal: ")
df['stopwords'] = df['tokens_normalized'].progress_apply(stopword_removal)
df.head(10)

Stopword Removal: 100%|██████████████████████████████████████████████████████| 17247/17247 [00:00<00:00, 302512.30it/s]


Unnamed: 0,judul,username,tweet,plot,akting,direktor,clean_text,tokens,tokens_normalized,stopwords
0,the desperate hour,penggores_pena,the desperate hour lakewood 2022 beberapa wakt...,1,1,1,the desperate hour lakewood beberapa waktu la...,"[the, desperate, hour, lakewood, beberapa, wak...","[the, desperate, hour, lakewood, beberapa, wak...","[the, desperate, hour, lakewood, bersalah, cer..."
1,pursuit,angeldevyoo,ini edisi males review jd singkat aja ya beber...,1,1,0,ini edisi males review jd singkat aja ya beber...,"[ini, edisi, males, review, jd, singkat, aja, ...","[ini, edisi, males, review, jd, singkat, aja, ...","[edisi, males, review, singkat, tontonan, libu..."
2,pursuit,lovinasoenmi,plot utama adalah dua orang deserter pursuit a...,1,1,0,plot utama adalah dua orang deserter pursuit a...,"[plot, utama, adalah, dua, orang, deserter, pu...","[plot, utama, adalah, dua, orang, deserter, pu...","[plot, utama, orang, deserter, pursuit, pembur..."
3,pursuit,yeobuun,film hereditary horror thrill midsommar gatau...,0,0,0,film hereditary horror thrill midsommar gatau...,"[film, hereditary, horror, thrill, midsommar, ...","[film, hereditary, horror, thrill, midsommar, ...","[film, hereditary, horror, thrill, midsommar, ..."
4,pursuit,bebekpakelobe,batman paling manusiawi combatnya juga sadis s...,1,0,0,batman paling manusiawi combatnya juga sadis s...,"[batman, paling, manusiawi, combatnya, juga, s...","[batman, paling, manusiawi, bertarung nya, jug...","[batman, manusiawi, bertarung nya, sadis, scen..."
5,pursuit,sonweeendy,iyaa aku udah baca itu yaampun sumpa keren pa...,1,0,0,iyaa aku udah baca itu yaampun sumpa keren pa...,"[iyaa, aku, udah, baca, itu, yaampun, sumpa, k...","[iyaa, aku, udah, baca, itu, yaampun, sumpa, k...","[iyaa, udah, baca, yaampun, sumpa, keren, para..."
6,pursuit,patheticlone,aku kasih judul nya aja liat kebioskop21 com ...,1,0,1,aku kasih judul nya aja liat kebioskop com aj...,"[aku, kasih, judul, nya, aja, liat, kebioskop,...","[aku, kasih, judul, nya, aja, liat, kebioskop,...","[kasih, judul, liat, kebioskop, com, ready, pl..."
7,pursuit,tramadanur,yang memberi ruang terhadap plot twist tersemb...,-1,0,-1,yang memberi ruang terhadap plot twist tersemb...,"[yang, memberi, ruang, terhadap, plot, twist, ...","[yang, memberi, ruang, terhadap, plot, twist, ...","[ruang, plot, twist, tersembunyi, sulit, diteb..."
8,pursuit,KeiNicro,tampan tailor nah kalo yg ini disebut sebut mi...,1,1,0,tampan tailor nah kalo yg ini disebut sebut mi...,"[tampan, tailor, nah, kalo, yg, ini, disebut, ...","[tampan, tailor, nah, kalo, yg, ini, disebut, ...","[tampan, tailor, the, pursuit, of, happyness, ..."
9,pursuit,kakdeo,62 cold pursuit 2019 jangan berani berani gang...,1,0,0,cold pursuit jangan berani berani ganggu kel...,"[cold, pursuit, jangan, berani, berani, ganggu...","[cold, pursuit, jangan, berani, berani, ganggu...","[cold, pursuit, berani, berani, ganggu, keluar..."


# Stemming Token

In [10]:
# create stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# stemmed
def stemmed_wrapper(term):
    return stemmer.stem(term)


term_dict = {}
for document in df['stopwords']:
    for term in document:
        if term not in term_dict:
            term_dict[term] = ' '
            
print(len(term_dict))

for term in tqdm(term_dict):
    term_dict[term] = stemmed_wrapper(term)
print(term_dict)

22878


100%|████████████████████████████████████████████████████████████████████████████| 22878/22878 [22:31<00:00, 16.92it/s]






In [11]:
# apply stemmed term to dataframe
def get_stemmed_term(document):
    return [term_dict[term] for term in document]

df['stemmed'] = df['stopwords'].swifter.apply(get_stemmed_term)
print(df['stemmed'])

Pandas Apply:   0%|          | 0/17247 [00:00<?, ?it/s]

0        [the, desperate, hour, lakewood, salah, cerita...
1        [edisi, males, review, singkat, tonton, libur,...
2        [plot, utama, orang, deserter, pursuit, buru, ...
3        [film, hereditary, horror, thrill, midsommar, ...
4        [batman, manusiawi, tarung nya, sadis, scene, ...
                               ...                        
17242    [kasi, rekomendasi, film, better, days, bagus,...
17243    [film, better, days, wow, wow, woooow, kereeee...
17244    [better, days, ni, film, akal, akal, siape, du...
17245    [tarik, pas, ending, alternatif, putus, pakai,...
17246    [better, days, hahahaha, nemu, film, pantes, n...
Name: stemmed, Length: 17247, dtype: object


# De-Tokenizer

In [12]:
def detokenizer(tokens):
  return TreebankWordDetokenizer().detokenize(tokens)

tqdm.pandas(desc="De-tokenizer: ")
df['detokenize'] = df['stemmed'].progress_apply(detokenizer)
df.head(10)

De-tokenizer: 100%|███████████████████████████████████████████████████████████| 17247/17247 [00:01<00:00, 14894.37it/s]


Unnamed: 0,judul,username,tweet,plot,akting,direktor,clean_text,tokens,tokens_normalized,stopwords,stemmed,detokenize
0,the desperate hour,penggores_pena,the desperate hour lakewood 2022 beberapa wakt...,1,1,1,the desperate hour lakewood beberapa waktu la...,"[the, desperate, hour, lakewood, beberapa, wak...","[the, desperate, hour, lakewood, beberapa, wak...","[the, desperate, hour, lakewood, bersalah, cer...","[the, desperate, hour, lakewood, salah, cerita...",the desperate hour lakewood salah cerita suara...
1,pursuit,angeldevyoo,ini edisi males review jd singkat aja ya beber...,1,1,0,ini edisi males review jd singkat aja ya beber...,"[ini, edisi, males, review, jd, singkat, aja, ...","[ini, edisi, males, review, jd, singkat, aja, ...","[edisi, males, review, singkat, tontonan, libu...","[edisi, males, review, singkat, tonton, libur,...",edisi males review singkat tonton libur dp des...
2,pursuit,lovinasoenmi,plot utama adalah dua orang deserter pursuit a...,1,1,0,plot utama adalah dua orang deserter pursuit a...,"[plot, utama, adalah, dua, orang, deserter, pu...","[plot, utama, adalah, dua, orang, deserter, pu...","[plot, utama, orang, deserter, pursuit, pembur...","[plot, utama, orang, deserter, pursuit, buru, ...",plot utama orang deserter pursuit buru wamil j...
3,pursuit,yeobuun,film hereditary horror thrill midsommar gatau...,0,0,0,film hereditary horror thrill midsommar gatau...,"[film, hereditary, horror, thrill, midsommar, ...","[film, hereditary, horror, thrill, midsommar, ...","[film, hereditary, horror, thrill, midsommar, ...","[film, hereditary, horror, thrill, midsommar, ...",film hereditary horror thrill midsommar gatau ...
4,pursuit,bebekpakelobe,batman paling manusiawi combatnya juga sadis s...,1,0,0,batman paling manusiawi combatnya juga sadis s...,"[batman, paling, manusiawi, combatnya, juga, s...","[batman, paling, manusiawi, bertarung nya, jug...","[batman, manusiawi, bertarung nya, sadis, scen...","[batman, manusiawi, tarung nya, sadis, scene, ...",batman manusiawi tarung nya sadis scene pursui...
5,pursuit,sonweeendy,iyaa aku udah baca itu yaampun sumpa keren pa...,1,0,0,iyaa aku udah baca itu yaampun sumpa keren pa...,"[iyaa, aku, udah, baca, itu, yaampun, sumpa, k...","[iyaa, aku, udah, baca, itu, yaampun, sumpa, k...","[iyaa, udah, baca, yaampun, sumpa, keren, para...","[iyaa, udah, baca, yaampun, sumpa, keren, para...",iyaa udah baca yaampun sumpa keren parah endin...
6,pursuit,patheticlone,aku kasih judul nya aja liat kebioskop21 com ...,1,0,1,aku kasih judul nya aja liat kebioskop com aj...,"[aku, kasih, judul, nya, aja, liat, kebioskop,...","[aku, kasih, judul, nya, aja, liat, kebioskop,...","[kasih, judul, liat, kebioskop, com, ready, pl...","[kasih, judul, liat, bioskop, com, ready, play...",kasih judul liat bioskop com ready player one ...
7,pursuit,tramadanur,yang memberi ruang terhadap plot twist tersemb...,-1,0,-1,yang memberi ruang terhadap plot twist tersemb...,"[yang, memberi, ruang, terhadap, plot, twist, ...","[yang, memberi, ruang, terhadap, plot, twist, ...","[ruang, plot, twist, tersembunyi, sulit, diteb...","[ruang, plot, twist, sembunyi, sulit, tebak, s...",ruang plot twist sembunyi sulit tebak sensor r...
8,pursuit,KeiNicro,tampan tailor nah kalo yg ini disebut sebut mi...,1,1,0,tampan tailor nah kalo yg ini disebut sebut mi...,"[tampan, tailor, nah, kalo, yg, ini, disebut, ...","[tampan, tailor, nah, kalo, yg, ini, disebut, ...","[tampan, tailor, the, pursuit, of, happyness, ...","[tampan, tailor, the, pursuit, of, happyness, ...",tampan tailor the pursuit of happyness usung p...
9,pursuit,kakdeo,62 cold pursuit 2019 jangan berani berani gang...,1,0,0,cold pursuit jangan berani berani ganggu kel...,"[cold, pursuit, jangan, berani, berani, ganggu...","[cold, pursuit, jangan, berani, berani, ganggu...","[cold, pursuit, berani, berani, ganggu, keluar...","[cold, pursuit, berani, berani, ganggu, keluar...",cold pursuit berani berani ganggu keluarga lia...


# Saving Output

In [13]:
df.to_pickle('preprocessed_df.pkl')