# Fine-tune IndoBERTweet dengan Huggingface API Trainer
Paper: Fajri Koto, Jey Han Lau, and Timothy Baldwin. IndoBERTweet: A Pretrained Language Model for Indonesian Twitter with Effective Domain-Specific Vocabulary Initialization. In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing (EMNLP 2021), Dominican Republic (virtual).  
Github: https://github.com/indolem/IndoBERTweet

In [1]:
# Optional
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Import Lib

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn import svm

# Import Data
data harus sudah di stemming

In [119]:
text = pd.read_csv("/content/drive/Shareddrives/DAC UNPAD: Yaudahlah/FINAL MENANG BISMILLAH/Data/Balance/NonStopword_Min2_Clean/clean_train.csv", na_values=" ", keep_default_na=False)
text = text.dropna(how="any")

In [120]:
text

Unnamed: 0.1,Unnamed: 0,raw,processed,label
0,0,"@AREAJULID IMO, lbh baik jangan salahin ppkm l...",imo lebih baik jangan salahkan ppkm lagi ini m...,2
1,1,@kembaransilent Iya Yg viral grgr bikin acara ...,iya yang viral gara-gara buat acara pas ppkm itu,1
2,2,Saat ppkm begini kerja jadi enak banget cuman ...,saat ppkm begini kerja jadi enak banget hanya ...,2
3,3,#SukseskanPPKMLevel4\nPPKM itu mmbatasi mobili...,ppkm itu mmbatasi mobilitasmu tapi bukan berar...,2
4,4,@bitiesfess ppkm itu ra makanya hacep di kamar...,ppkm itu ra makanya hacep di kamar bukan di pu...,1
...,...,...,...,...
3847,3847,@erasedpoem wah bakalan seru ada temen gym cwk...,wah akan seru ada teman gym cwk ini nanti seha...,2
3848,3848,PPKM level 99 raja terakhirnya siapa ya?,ppkm level raja terakhirnya siapa iya,1
3849,3849,PPKM: Para Penista Kapan Mampoes ?? https://t....,ppkm para penista kapan mampoes,0
3850,3850,Kayanya ini pergi terpagi selama Ppkm deh :(,kayaknya ini pergi terpagi selama ppkm deh,1


In [121]:
train, test = train_test_split(text, test_size=0.2)

In [122]:
train_label = np.array(train['label'].to_list())
train_text = np.array(train['processed'].to_list())

In [123]:
test_label = np.array(test['label'].to_list())
test_text = np.array(test['processed'].to_list())

In [124]:
text[text['processed'].isnull() == True]

Unnamed: 0.1,Unnamed: 0,raw,processed,label


## Vectorisasi Data Text dengan TfID

In [125]:
vectorizer = TfidfVectorizer(max_features=250)
vectorizer = vectorizer.fit(text['processed'].to_list())

In [126]:
train_vectorized = vectorizer.transform(train_text)
test_vectorized = vectorizer.transform(test_text)

# Modelling

In [127]:
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_vectorized, train_label)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

## Validating

In [128]:
prediction_svm = SVM.score(test_vectorized, test_label)

In [129]:
print("acc :", prediction_svm)

acc : 0.708171206225681


In [130]:
data_testing = SVM.predict(test_vectorized)

In [131]:
test_out = test.reset_index(drop=True)
test_out["predict"] = data_testing

In [132]:
test_out.to_csv("/content/drive/Shareddrives/DAC UNPAD: Yaudahlah/FINAL MENANG BISMILLAH/Error Analysis/SVM_imbalanced_Stemming.csv")

## Cross Validation

In [15]:
from sklearn.model_selection import cross_val_score
cross_val_score(SVM, test_vectorized, test_label, cv=5)

array([0.7       , 0.69044586, 0.69617834, 0.67834395, 0.6955414 ])

# Model Input Manual

In [16]:
testing = pd.read_csv("/content/drive/Shareddrives/DAC UNPAD: Yaudahlah/FINAL MENANG BISMILLAH/Data/Non_Stem_Clean/clean_test.csv")

In [17]:
testing

Unnamed: 0,raw,processed,label
0,Ppkm buat kami yg ga dipulau jawapun rada deg ...,ppkm dipulau jawapun deg degan kerja takut per...,?
1,"Ditengah orang2 yang pada mikir ;\n- ""waduh PP...",ditengah orang mikir ppkm mulai besok mr nepat...,?
2,PPKM KENAPA BERLAKU SAMPAI 20 JULI..!? TANGGAL...,ppkm berlaku juli tanggal juli hari raya idul ...,?
3,PPKM kenapa sumatera ngga ya? Dan lain2 juga. ...,ppkm sumatera iya kompak,?
4,sch! ppkm darurat kan cuman jawa-bali ya berar...,sch ppkm darurat jawa bali iya berarti ptm dae...,?
...,...,...,...
195,@monika_monika77 Aq blm mutual kk\nKrn skrng P...,mutual kakak ppkm jadi konsentrasi rumah worki...,?
196,"PPKM mematahkan segalanya:"")",ppkm mematahkan,?
197,"@PT_Transjakarta min mau tanya, selama PPKM Da...",min tanya ppkm darurat jam operasional malam n...,?
198,Mematuhi PPKM Mikro Darurat tujuannya menyelam...,mematuhi ppkm mikro darurat tujuannya menyelam...,?


In [None]:
def predict(text: str):
    text_vec = vectorizer.transform([text])
    pred = SVM.predict(text_vec)
    print(pred)

In [None]:
predict("enak banget keluar luar")

[0]
