In [57]:
#Import Library
import numpy as np
import pandas as pd
import re
import string
from sklearn.metrics import f1_score,accuracy_score
from simpletransformers.classification import ClassificationModel
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import logging
import warnings
warnings.simplefilter("ignore")
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"


In [58]:
# Load Dataset
test_df = pd.read_csv('DATA/v1/test.csv')
test_df['status'] = test_df['status'].apply(lambda x : str(x).strip())

In [59]:
test_df.head()

Unnamed: 0,instansi,reference,status
0,Kementerian Koperasi dan Usaha Kecil dan Menengah,Kementrian Koperasi dan UKM,yes
1,Kementerian Koperasi dan Usaha Kecil dan Menengah,KemenkopUKM,yes
2,Kementerian Koperasi dan Usaha Kecil dan Menengah,UKM,no
3,Kementerian Koperasi dan Usaha Kecil dan Menengah,Usaha Kecil dan Menengah,no
4,Kementerian Pendayagunaan Aparatur Negara dan ...,PANRB,yes


In [60]:
count_df = test_df['status'].value_counts().reset_index().rename({"status" : "count", "index" : "status"}, axis=1)
count_df['pct'] = 100*count_df['count']/len(test_df)
count_df

Unnamed: 0,status,count,pct
0,no,395,67.062818
1,yes,194,32.937182


In [61]:
#Ubah nama kolom
test_df = test_df.rename({"instansi" : "sentence1", "reference" : "sentence2", "status" : "label"}, axis=1)

In [62]:
#label
test_df['label'] = test_df['label'].map({"no" : 0, "yes" : 1})

In [63]:
test_df[['sentence1', 'sentence2']] = test_df[['sentence1', 'sentence2']].applymap(lambda x : str(x).lower())

In [64]:
test_df.head()

Unnamed: 0,sentence1,sentence2,label
0,kementerian koperasi dan usaha kecil dan menengah,kementrian koperasi dan ukm,1
1,kementerian koperasi dan usaha kecil dan menengah,kemenkopukm,1
2,kementerian koperasi dan usaha kecil dan menengah,ukm,0
3,kementerian koperasi dan usaha kecil dan menengah,usaha kecil dan menengah,0
4,kementerian pendayagunaan aparatur negara dan ...,panrb,1


In [65]:
test_df = test_df.rename({'sentence1':'text_a', 'sentence2':'text_b', 'label' : 'labels'}, axis=1)

In [66]:
list_model = ['indobert_base_p2', 'indobert_base_p1']

In [67]:
INDEX = 1
model = ClassificationModel('bert', "E:\Agus Folder\FIle pusatAI-ITB\Project\Gratifikasi KPK\Models\{}/bestModel".format(list_model[INDEX]), use_cuda=True, cuda_device = 0, 
        args={"use_multiprocessing": False, 
              "use_multiprocessing_for_evaluation": False, 
              "process_count": 1}) 

In [68]:
def clean_text(text):
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation.replace('?', '')), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\r', '', text)
    text = text.replace('?', ' ?')
    text = re.sub('[.;:!\'?,\"()\[\]*~]', '', text)
    text = re.sub('(<br\s*/><br\s*/>)|(\-)|(\/)', '', text)
    text = re.sub(r"^(â€œ)", "" ,text)
    return text

In [69]:
import string
test_df['text_a'] = test_df['text_a'].astype(str).apply(clean_text)

In [70]:
##Prediction
string = []
strings = []
for i in test_df.index:
  string.append(test_df['text_a'][i])
  string.append(test_df['text_b'][i])
  strings.append(string)
  string = []
predictions, raw_outputs = model.predict(strings)

  0%|          | 0/74 [00:00<?, ?it/s]

In [71]:
labels_true = test_df['labels'].values.tolist()

In [72]:
#F1 Score
f1 = f1_score(labels_true, y_pred=predictions, average='micro')
acc = accuracy_score(labels_true, predictions)

# Print Score
print("F1 Score Best Model : {}".format(f1))
print('Accuracy Score Best Model : {}'.format(acc))

F1 Score Best Model : 0.9422750424448217
Accuracy Score Best Model : 0.9422750424448217


In [73]:
test_df['prediksi'] = predictions

In [74]:
#Save result (prediction and F1 score)
script_path = ""
new_abs_path = os.path.join(script_path, f"result")
if not os.path.exists(new_abs_path):
  os.mkdir(new_abs_path)

test_df.to_csv('result/prediksi_{}.csv'.format(list_model[INDEX]), sep=";")
with open('result/score_{}.txt'.format(list_model[INDEX]), 'w') as f:
    f.write('F1 Score : {}'.format(f1))
    f.write('\n')
    f.write('Accuracy Score : {}'.format(acc))

# Print in txt format
print("F1 Score Best Model : {}".format(f1))
print('Accuracy Score Best Model : {}'.format(acc))

F1 Score Best Model : 0.9422750424448217
Accuracy Score Best Model : 0.9422750424448217
