In [1]:
import pandas as pd
import numpy as np
import os
import gc
import random

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

import transformers
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

2023-04-04 17:15:54.968045: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-04 17:15:55.946734: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-04 17:15:55.946846: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
MODEL_TYPE = 'bert-base-multilingual-cased'
L_RATE = 1e-5
MAX_LEN = 512

NUM_EPOCHS = 6
BATCH_SIZE = 2
NUM_CORES = os.cpu_count() - 2

In [3]:
os.environ["WANDB_DISABLED"] = "true"

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [5]:
df_train = pd.read_csv("dataset/indo_java_nli_training.csv", sep='\t')
df_train = df_train.sample(frac=1).reset_index(drop=True) #shuffle the data

In [6]:
df_train_new = pd.DataFrame()
df_train_new["premise"] = df_train["premise"]
df_train_new["hypothesis"] = df_train["jv_hypothesis"]
df_train_new["label"] = df_train["label"]
df_train_new.head()

Unnamed: 0,premise,hypothesis,label
0,"Gempa bumi magnitudo 7,8 mengguncang sebagian ...",Kabeh kerusakan ing San Francisco ketenger gem...,1
1,"Dibuat dengan seukuran tubuh mereka, serta dib...",Krono biasane dijupukaken kembang ing kono-kon...,0
2,Partai Tani Indonesia adalah partai politik ya...,Partai Petani Filipina yaiku partai politik si...,1
3,Empat dari penumpang Lufthansa berasal dari Je...,Jumlah penumpang Lufthansa sing mburi penerban...,2
4,"Menurutnya, sang adik dalam keadaan baik-baik ...","""Adik ku masih ing pengawasan dokter.""",0


In [7]:
df_valid = pd.read_csv("dataset/indo_java_nli_validation.csv", sep='\t')
df_valid = df_valid.sample(frac=1).reset_index(drop=True) #shuffle the data

In [8]:
df_valid_new = pd.DataFrame()
df_valid_new["premise"] = df_valid["premise"]
df_valid_new["hypothesis"] = df_valid["jv_hypothesis"]
df_valid_new["label"] = df_valid["label"]
df_valid_new.head()

Unnamed: 0,premise,hypothesis,label
0,"Pada 15 April 2002, pesawat Air China Boeing 7...",Pesawat Air China yaiku pesawat sing jenengé C...,1
1,"Sementara itu, BEI mencatat frekuensi perdagan...",'Ora ana saham sing nilainipun ngisor' dalam b...,2
2,Dia juga tergabung dalam timnas U-17 dan U-20 ...,Nangsa riko melu ing timnas U-17.,2
3,Syahrir mendirikan Partai Sosialis Indonesia p...,Syahrir kayadene Partai Sosialis Indonesia.,0
4,"Cikande adalah desa di kecamatan Saguling, Kab...","""Cikande ora tegese ing Kabupaten Bandung Wétan.""",2


In [9]:
df_test = pd.read_csv("dataset/indo_java_nli_testing.csv", sep='\t')
df_test = df_test.sample(frac=1).reset_index(drop=True) #shuffle the data

In [10]:
df_test_new = pd.DataFrame()
df_test_new["premise"] = df_test["premise"]
df_test_new["hypothesis"] = df_test["jv_hypothesis"]
df_test_new["label"] = df_test["label"]
df_test_new.head()

Unnamed: 0,premise,hypothesis,label
0,Martina Hingis memegang rekor sebagai petenis ...,Martina Hingis umur 90 taun ing taun 1997.,2
1,"Sementara, sebelumnya pelantun hit Can 't Feel...",Can &#39;t Feel My Face iku judhul album kasebut.,2
2,"Menurutnya, ketersediaan pangan di DKI Jakarta...",Kasedhiya pangan ing DKI Jakarta aman.,0
3,Foto itu memperlihatkan sosok istri Raul Lemos...,Bojone Raul Lemos iku tukang becak.,1
4,Presiden Soekarno mengeluarkan Keputusan Presi...,Présidhèn ngangkat akèh Pahlawan Kamardikan Na...,1


In [11]:
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

In [12]:
class CompDataset(Dataset):
    def __init__(self, df):
        self.df_data = df
        
    def __getitem__(self, index):
        sentence1 = self.df_data.loc[index, 'premise']
        sentence2 = self.df_data.loc[index, 'hypothesis']
        
        encoded_dict = tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation='longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        
        target = torch.tensor(self.df_data.loc[index, 'label'])
        sample = {"input_ids": padded_token_list, "attention_mask": att_mask, "label": target}
        
        return sample
    
    def __len__(self):
        return len(self.df_data)
    

In [13]:
train_data_cmp = CompDataset(df_train_new)
valid_data_cmp = CompDataset(df_valid_new)
test_data_cmp = CompDataset(df_test_new)

In [14]:
print(f"Jumlah core: {str(NUM_CORES)}")

Jumlah core: 4


In [15]:
model = BertForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=3)

# Send the model to the device.
# model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [16]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [17]:
training_args = TrainingArguments(
    output_dir="saved_models/Indo-Javanese-NLI/BaselineModels/bert-base-multilingual-cased",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    seed=101,
    learning_rate=L_RATE,
    report_to="none" #"azure-ml"
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_cmp,
    eval_dataset=valid_data_cmp,
    compute_metrics=compute_metrics,
)

In [19]:
# Train pre-trained model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.0009,1.056013,0.530724,0.530724,0.530724,0.516729
2,0.9702,1.238811,0.610833,0.610833,0.610833,0.609998
3,0.9338,1.323917,0.593537,0.593537,0.593537,0.594793
4,0.903,1.622457,0.608557,0.608557,0.608557,0.607083
5,0.8592,2.001213,0.604005,0.604005,0.604005,0.604749
6,0.6094,2.245729,0.604005,0.604005,0.604005,0.60316


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

TrainOutput(global_step=30990, training_loss=0.8996020031190757, metrics={'train_runtime': 18981.8016, 'train_samples_per_second': 3.265, 'train_steps_per_second': 1.633, 'total_flos': 1.630776963078144e+16, 'train_loss': 0.8996020031190757, 'epoch': 6.0})

In [20]:
prediction = trainer.predict(test_data_cmp)

In [21]:
print("Testing metrics:", prediction[2])

Testing metrics: {'test_loss': 2.4819486141204834, 'test_accuracy': 0.5570195365742844, 'test_precision': 0.5570195365742844, 'test_recall': 0.5570195365742844, 'test_f1': 0.5557291418423154, 'test_runtime': 210.997, 'test_samples_per_second': 10.431, 'test_steps_per_second': 5.218}


In [22]:
# # Preprocess raw predictions
y_pred = np.argmax(prediction[0], axis=1)

In [23]:
def return_label(the_label):
    str_label = ""
    if str(the_label) == "0":
        str_label = "entail"
    elif str(the_label) == "1":
        str_label = "neutral"
    else:
        str_label = "contradict"
    return str_label

In [24]:
for idx, row in df_test_new.iterrows():
    ground_truth = prediction[1][idx]
    if y_pred[idx] != ground_truth:
        print("==========================================================================================")
        print(f"Premis: {row['premise']}") 
        print(f"Hipotesis: {row['hypothesis']}")
        print(f"True Label: {return_label(ground_truth)}") 
        print(f"Pred Label: {return_label(y_pred[idx])}")
        print("==========================================================================================")

Premis: Sementara, sebelumnya pelantun hit Can 't Feel My Face tersebut mengakui judul albumnya Starboy terinspirasi dari hit ikonik milik David Bowie yakni Starman.
Hipotesis: Can &#39;t Feel My Face iku judhul album kasebut.
True Label: contradict
Pred Label: neutral
Premis: Foto itu memperlihatkan sosok istri Raul Lemos yang tengah duduk di bangku pengemudi becak.
Hipotesis: Bojone Raul Lemos iku tukang becak.
True Label: neutral
Pred Label: entail
Premis: Dua puluh turnamen Piala Dunia telah dimenangkan oleh delapan tim nasional berbeda. Brasil telah menjuarai Piala Dunia sebanyak lima kali, dan merupakan satu-satunya tim yang secara rutin mengikuti setiap turnamen. Juara Piala Dunia lainnya adalah Italia dan Jerman dengan empat gelar juara, Argentina, Uruguay dan Prancis dengan dua gelar juara, serta Inggris dan Spanyol masing-masing dengan satu gelar juara.
Hipotesis: Ana 9 tim nasional sing menang Piala Dunia.
True Label: contradict
Pred Label: neutral
Premis: Panduannya bermasa

Premis: Bahwa agar Dinas Tenaga Kerja dan Transmigrasi Kabupaten Ogan Komering Ilir dapat melaksanakan tugasnya secara berdaya guna dan berhasil dipandang perlu untuk menetapkan susunan Organisasi dan Tata Kerja Dinas Tenaga Kerja dan Transmigrasi Kabupaten Ogan Komering Ilir.
Hipotesis: Dinas Tenaga Kerja lan Transmigrasi wis nindakake tugas kanthi bener.
True Label: neutral
Pred Label: entail
Premis: "Niatnya kerja yang lurus saja dan enggak aneh-aneh," lanjutnya. "Via itu niatnya bekerja dan berkarier."
Hipotesis: Via ora ateges aneh.
True Label: entail
Pred Label: contradict
Premis: McGregor mengakui bahwa hasrat terbesarnya adalah pertarungan. Sehingga apapun medan yang ia tempuh, hal itu bukanlah masalah besar.
Hipotesis: Medan perang kanggo McGregor ora pati penting kanggo dheweke.
True Label: entail
Pred Label: neutral
Premis: Namun, warga sekitar justru antusias karena menganggap Ahok lah yang mengunjungi permukiman mereka.
Hipotesis: Ahok dolan menyang pemukiman warga karo pa