# Import Libraries

In [1]:
import nltk
import gensim
import numpy as np
import pandas as pd
from tqdm import tqdm
from gensim.models import FastText

# Tweet Dataset

In [2]:
# Load the preprocessed tweets, keep only the 'detokenize' text column
# (renamed to 'tokens_'), and drop exact duplicate tweets.
# NOTE(review): the path is machine-specific, and read_pickle can execute
# arbitrary code, so only load pickle files from a trusted source.
tweet_df = (
    pd.DataFrame(
        pd.read_pickle(r'C:\Users\HP Victus 16\Documents\TA_Code\Preprocessing\preprocessed_df.pkl')['detokenize']
    )
    .rename(columns={'detokenize': 'tokens_'})
    .drop_duplicates(subset='tokens_', keep='first')
    .reset_index(drop=True)
)
tweet_df.head()

Unnamed: 0,tokens_
0,the desperate hour lakewood salah cerita suara...
1,edisi males review singkat tonton libur dp des...
2,plot utama orang deserter pursuit buru wamil j...
3,film hereditary horror thrill midsommar gatau ...
4,batman manusiawi tarung nya sadis scene pursui...


# Tokenizing Tweet Dataset

In [3]:
# Tokenization
def tokenizing(data):
    """Split one document string into a list of word tokens.

    Delegates to NLTK's ``word_tokenize``; ``data`` is passed through unchanged.
    """
    word_tokens = nltk.tokenize.word_tokenize(data)
    return word_tokens

# Register tqdm's progress_apply on pandas objects with a tweet-specific label.
tqdm.pandas(desc="Tokenizing Tweet Data : ")

# Cast every row to str first so the tokenizer only ever receives strings,
# then replace the raw text with its token list.
tweet_text = tweet_df['tokens_'].astype(str)
tweet_df['tokens_'] = tweet_text.progress_apply(tokenizing)
tweet_df.head()

Tokenizing Tweet Data : 100%|██████████████████████████████████████████████████| 17116/17116 [00:01<00:00, 9330.87it/s]


Unnamed: 0,tokens_
0,"[the, desperate, hour, lakewood, salah, cerita..."
1,"[edisi, males, review, singkat, tonton, libur,..."
2,"[plot, utama, orang, deserter, pursuit, buru, ..."
3,"[film, hereditary, horror, thrill, midsommar, ..."
4,"[batman, manusiawi, tarung, nya, sadis, scene,..."


# Tweet Corpus FastText 

In [4]:
# Train FastText embeddings on the tokenized tweets. min_count=1 keeps every
# token in the vocabulary; all other hyperparameters use gensim's defaults.
model_tweet = FastText(
    sentences=tweet_df['tokens_'],
    window=4,
    min_count=1,
    workers=6,
)

# Persist the trained model under the current working directory.
model_tweet.save("fasttext_tweet.model")

In [5]:
# Reload the saved tweet model from disk and keep a handle on its word vectors.
model_tweet = FastText.load("fasttext_tweet.model")
ft = model_tweet.wv

# Sanity check: the ten nearest neighbours of a film-related query word.
ft.most_similar('sinematografi', topn=10)

[('cinematografi', 0.9991486072540283),
 ('sinematografis', 0.9986787438392639),
 ('fotografi', 0.9981692433357239),
 ('sinematography', 0.9974574446678162),
 ('pornografi', 0.9970502257347107),
 ('filmografi', 0.9955441951751709),
 ('cinematograpy', 0.9953588247299194),
 ('sinematografer', 0.9952602386474609),
 ('fotografik', 0.9936928153038025),
 ('visual', 0.9936772584915161)]

# News Dataset

In [6]:
# read dataset
news_df = pd.read_csv(r'C:\Users\HP Victus 16\Documents\TA_Code\Dataset\indonews-clean.csv')
news_df = news_df[['clean_data']].copy(deep=True)
news_df = news_df.rename(columns={'clean_data': 'tokens_'})
news_df = news_df.drop_duplicates(subset='tokens_', keep='first').reset_index(drop=True)
news_df.head()

Unnamed: 0,tokens_
0,jakarta wakil gubernur daerah khusus ibukota j...
1,jakarta badan awas milu daerah khusus ibukota ...
2,jakarta wakil ketua komisi ix dewan perwakilan...
3,jakarta pasang calon nomor urut tiga anies bas...
4,jakarta rumah partai golkar sedikit guncang uj...


# Tokenizing News Dataset

In [7]:
# Coerce every article to str so the tokenizer only ever receives strings.
# Note that a missing value would become the literal text 'nan' here.
news_df['tokens_'] = news_df['tokens_'].apply(str)

# Replace each article's text with its list of word tokens, showing a tqdm
# progress bar while the tokenizer runs.
tqdm.pandas(desc="Tokenizing News Data : ")
news_df['tokens_'] = news_df['tokens_'].progress_apply(tokenizing)
news_df.head()

Tokenizing News Data : 100%|█████████████████████████████████████████████████| 140890/140890 [01:15<00:00, 1871.72it/s]


Unnamed: 0,tokens_
0,"[jakarta, wakil, gubernur, daerah, khusus, ibu..."
1,"[jakarta, badan, awas, milu, daerah, khusus, i..."
2,"[jakarta, wakil, ketua, komisi, ix, dewan, per..."
3,"[jakarta, pasang, calon, nomor, urut, tiga, an..."
4,"[jakarta, rumah, partai, golkar, sedikit, gunc..."


# News Corpus FastText

In [8]:
# Train FastText embeddings on the tokenized news articles. min_count=1 keeps
# every token in the vocabulary; other hyperparameters use gensim's defaults.
model_news = FastText(
    sentences=news_df['tokens_'],
    window=4,
    min_count=1,
    workers=6,
)

# Persist the trained model under the current working directory.
model_news.save("fasttext_news.model")

In [9]:
# Reload the saved news model from disk and keep a handle on its word vectors.
model_news = FastText.load("fasttext_news.model")
ft = model_news.wv

# Sanity check: the ten nearest neighbours of 'film' in the news embedding space.
ft.most_similar('film', topn=10)

[('filmfilm', 0.9886583685874939),
 ('mfilm', 0.9735487103462219),
 ('aadcfilm', 0.9573121070861816),
 ('filmkubo', 0.9388416409492493),
 ('rsfilm', 0.9371312260627747),
 ('filmjoy', 0.9197675585746765),
 ('filmn', 0.9083369374275208),
 ('filma', 0.9066123366355896),
 ('films', 0.905907392501831),
 ('mimefilm', 0.8948878049850464)]

# Concatenate the Two Datasets

In [10]:
# Stack the tokenized news articles and tweets into one corpus.
# ignore_index=True gives the combined frame a fresh 0..n-1 index, so row
# labels coming from the two source frames no longer collide.
df = pd.concat([news_df, tweet_df], ignore_index=True)

# Rename the shared token column to reflect that it now holds both sources.
df = df.rename(columns={'tokens_': 'merged'})
df.head()

Unnamed: 0,merged
0,"[jakarta, wakil, gubernur, daerah, khusus, ibu..."
1,"[jakarta, badan, awas, milu, daerah, khusus, i..."
2,"[jakarta, wakil, ketua, komisi, ix, dewan, per..."
3,"[jakarta, pasang, calon, nomor, urut, tiga, an..."
4,"[jakarta, rumah, partai, golkar, sedikit, gunc..."


# News + Tweet Corpus FastText

In [11]:
# Train FastText embeddings on the combined news + tweet corpus. min_count=1
# keeps every token in the vocabulary; other hyperparameters use gensim's
# defaults.
model_conc = FastText(
    sentences=df['merged'],
    window=4,
    min_count=1,
    workers=6,
)

# Persist the trained model under the current working directory.
model_conc.save("fasttext_news_tweet.model")

In [23]:
# Reload the saved combined model from disk and keep a handle on its word vectors.
model_conc = FastText.load("fasttext_news_tweet.model")
ft = model_conc.wv

# Inspect a wider neighbourhood (top 40) around 'skrip' in the combined space.
ft.most_similar('skrip', topn=40)

[('tanskrip', 0.8184252977371216),
 ('skripsi', 0.7843674421310425),
 ('skripochka', 0.7686924934387207),
 ('enskripsi', 0.7673084139823914),
 ('skript', 0.745566189289093),
 ('skrit', 0.732649028301239),
 ('deskripsiin', 0.7300468683242798),
 ('traskrip', 0.7249947786331177),
 ('transkrip', 0.7214085459709167),
 ('munuskrip', 0.716574490070343),
 ('manuskrip', 0.699177086353302),
 ('deskripsi', 0.689950704574585),
 ('dienskripsi', 0.6833628416061401),
 ('inskripsi', 0.6783464550971985),
 ('makrip', 0.677362322807312),
 ('begrip', 0.677038848400116),
 ('terenskripsi', 0.6664600372314453),
 ('skr', 0.6575965881347656),
 ('alrip', 0.654687225818634),
 ('dienkrip', 0.6442853808403015),
 ('scriptnya', 0.6367748379707336),
 ('sketsa', 0.6353818774223328),
 ('skrg', 0.6351134777069092),
 ('transkripsi', 0.6341580152511597),
 ('naskah', 0.6323281526565552),
 ('scripnya', 0.62846440076828),
 ('mendeskripskannya', 0.620667576789856),
 ('skrd', 0.6196370124816895),
 ('dokumentar', 0.616838157176