In [34]:
! pip install gensim



In [64]:
import os
import pandas as pd
from tqdm.auto import tqdm

from nltk.tokenize import word_tokenize

## Prepare Corpus

In [36]:
train_data = pd.read_csv("./data_worthcheck/train.csv")
test_data = pd.read_csv("./data_worthcheck/test.csv")

In [37]:
train_data["label"] = train_data["label"].map({'no': 0, 'yes':1})
test_data["label"] = test_data["label"].map({'no': 0, 'yes':1})

In [38]:
print("TRAIN DATA")
len_train_0 = len(train_data[train_data["label"] == 0])
len_train_1 = len(train_data[train_data["label"] == 1])
len_train = len(train_data)
print("NO: ", len_train_0)
print("YES: ", len_train_1)
print("NO + YES = ", len_train_0 + len_train_1)
print("TOTAL: ", len_train)

TRAIN DATA
NO:  15512
YES:  6089
NO + YES =  21601
TOTAL:  21601


In [39]:
print("TEST DATA")
len_test_0 = len(test_data[test_data["label"] == 0])
len_test_1 = len(test_data[test_data["label"] == 1])
len_test = len(test_data)
print("NO: ", len_test_0)
print("YES: ", len_test_1)
print("NO + YES = ", len_test_0 + len_test_1)
print("TOTAL: ", len_test)

TEST DATA
NO:  2093
YES:  707
NO + YES =  2800
TOTAL:  2800


In [40]:
test_data.head()

Unnamed: 0,text_a,label
0,jek dajal ga depok bang,0
1,detikcom untung depok masuk wilayah nya ridwan...,0
2,df dom jakarta depok yg gunain vc cabang nya c...,0
3,your2rl depok jkt,0
4,doakan indonesia selamat virus corona pkb depo...,1


In [41]:
train_data = train_data.iloc[:, 1:]

In [42]:
train_data.head()

Unnamed: 0,text_a,label
0,betewe buka twitter cuman ngetweet liat home b...,0
1,mas piyuuu mugo2 corona tuh mulut tersumpal ma...,0
2,e100ss gini buka informasi sejelas nya identit...,1
3,neng solo wes ono terduga corona cobo neng ati...,0
4,midiahn nii akun gak takut takut nya isu coron...,0


In [54]:
train_sentences = [word_tokenize(text.lower()) for text in tqdm(train_data.text_a)]
test_sentences = [word_tokenize(text.lower()) for text in tqdm(test_data.text_a)]

  0%|          | 0/21601 [00:00<?, ?it/s]

  0%|          | 0/2800 [00:00<?, ?it/s]

## Preprocessing

In [65]:
import spacy
import nltk

##### 1. CASE FOLDING

In [77]:
# CASE FOLDING

for idx,sentence in enumerate(tqdm(train_data["text_a"])):
    train_data.at[idx,'text_a'] = sentence.lower()

for idx,sentence in enumerate(tqdm(test_data["text_a"])):
    test_data.at[idx,"text_a"] = sentence.lower()


  0%|          | 0/21601 [00:00<?, ?it/s]

  0%|          | 0/2800 [00:00<?, ?it/s]

##### 2. STEMMING

In [82]:
! pip install Sastrawi



In [88]:
# STEMMING

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

factory = StemmerFactory()
stemmer = factory.create_stemmer()

for idx,sentence in enumerate(tqdm(train_data["text_a"])):
    train_data.at[idx,'text_a'] = stemmer.stem(sentence)

for idx,sentence in enumerate(tqdm(test_data["text_a"])):
    test_data.at[idx,"text_a"] = stemmer.stem(sentence)

  0%|          | 0/21601 [00:00<?, ?it/s]

  0%|          | 0/2800 [00:00<?, ?it/s]

##### 3. STOP WORD REMOVAL

In [89]:
# STOP WORD REMOVAL

from nltk.corpus import stopwords

indonesian_stopwords = stopwords.words('indonesian')

train_stop_removed = []
test_stop_removed = []

for sentence in tqdm(train_data["text_a"]):
    sentence_stop_removed = [word for word in sentence.split(" ") if word not in indonesian_stopwords]
    sentence_stop_removed = " ".join(sentence_stop_removed)
    train_stop_removed.append(sentence_stop_removed)

for sentence in tqdm(test_data["text_a"]):
    sentence_stop_removed = [word for word in sentence.split(" ") if word not in indonesian_stopwords]
    sentence_stop_removed = " ".join(sentence_stop_removed)
    test_stop_removed.append(sentence_stop_removed)

train_data["text_a"] = train_stop_removed
test_data["text_a"] = test_stop_removed

  0%|          | 0/21601 [00:00<?, ?it/s]

  0%|          | 0/2800 [00:00<?, ?it/s]

## Train FastText Model

In [96]:
#data preparation for fasttext

# create fasttext readable train data
with open('fasttext_train.txt', 'w+') as f:
    for each_text, each_label in zip(train_data['text_a'], train_data['label']):
        f.writelines(f'__label__{each_label} {each_text}\n')

In [97]:
# create fasttext readable test data
with open('fasttext_test.txt', 'w') as f:
    for each_text, each_label in zip(test_data['text_a'], test_data['label']):
        f.writelines(f'__label__{each_label} {each_text}\n')

In [100]:
! head -n 10 fasttext_train.txt

__label__0 betewe buka twitter cuman ngetweet liat home berita corona panik pikir ndamau buka2 home yg aware aja i ll stay at home nda rumah kalo nda penting2 banget
__label__0 mas piyuuu mugo2 corona tuh mulut sumpal ma corona
__label__1 e100ss gin buka informasi nya identitas daerah derita jangkit info masyarakat isolasi nya kontak langsung derita positif corona tutup tutup
__label__0 neng solo wes ono duga corona cobo neng ati mu neng conora
__label__0 midiahn nii akun gak takut takut nya isu corona wkwkwkw
__label__0 hey corona prrgi sna
__label__0 gara corona masuk aja mesti scan jidat gw kek jajan indomaret
__label__1 jokowi menteri2 nya sila tes corona
__label__1 cegah corona other moms minum multivitamin my mom minum rebus sambiloto
__label__0 mamaciaaa mnrut gue jngan dkt2 corona cb dkt yg y puspa jaya damri als dkt tran jakarta aj


In [101]:
! head -n 10 fasttext_test.txt

__label__0 jek dajal ga depok bang
__label__0 detikcom untung depok masuk wilayah nya ridwan kamil kalo masuk wilayah nya anis abis lu bully ama buzzer kolam
__label__0 df dom jakarta depok yg gunain vc cabang nya cabang yg cantum pas kesana gabisa bayar pake shopeepay
__label__0 your2rl depok jkt
__label__1 doa indonesia selamat virus corona pkb depok gelar nusantara bershalawat
__label__1 warga depok ganggu isu corona
__label__1 kenapaa dengar kabar salah wni positif corona depok tinggal ku ku kawatir takut
__label__0 hug f cibinong bogor depok ga makan siang bareng m24
__label__0 mukenahhh tlongggg ak maw hp ak kentank bingits sdh belah hadiah ultah hshs ak depok btw follback yh
__label__0 g00d p4r3nts gilir corvid 19 jakarta beda depok banjir jakarta


In [110]:
import fasttext

In [56]:
# HYPERPARAMETERS
vector_size = 128
window = 5
min_count = 3
workers = 4
iter = 1000

In [104]:
file_to_train = 'fasttext_train.txt'

model = fasttext.train_supervised(
    file_to_train,
    dim = vector_size,
    epoch = iter,
    minCount = 3,
    verbose= True,
    ws = window
    )

Read 0M words
Number of words:  11862
Number of labels: 2
Progress: 100.0% words/sec/thread: 1415012 lr:  0.000000 avg.loss:  0.048748 ETA:   0h 0m 0s


In [106]:
os.makedirs('model/fasttext/', exist_ok=True)
model.save_model('model/fasttext/trained.fasttext')

## Test FastText

In [143]:
file_to_test = 'fasttext_test.txt'

print('Berikut ini nilai precision, recall, da f1 score dari hasil testing file test')

result = model.test(
    file_to_test)

num_data = result[0]
precision = result[1]
recall = result[2]

print("Number of test data :", num_data)
print("Precision :", precision)
print("Recall :", recall)

Berikut ini nilai precision, recall, da f1 score dari hasil testing file test
Number of test data : 2800
Precision : 0.7857142857142857
Recall : 0.7857142857142857


In [116]:
# manual testing

print('Berikut ini nilai precision, recall, dan f1 score dari pengujian testing file secara manual.')

precision = 0
recall = 0

model.

for idx,sentence in enumerate(tqdm(test_data["text_a"])):
    m

for idx,sentence in enumerate(tqdm(test_data["text_a"])):
    test_data.at[idx,"text_a"] = sentence.lower()

(('__label__0',), array([1.00001001]))