In [1]:
import pandas as pd
import numpy as np
import os
import gc
import random

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

import transformers
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW

2023-04-04 09:26:02.688216: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-04 09:26:03.645601: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-04 09:26:03.645710: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
MODEL_TYPE = 'bert-base-uncased'
L_RATE = 1e-5
MAX_LEN = 512

NUM_EPOCHS = 10
BATCH_SIZE = 4
NUM_CORES = os.cpu_count() - 2

In [3]:
os.environ["WANDB_DISABLED"] = "true"

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [5]:
df_train = pd.read_csv("dataset/indo_java_nli_training.csv", sep='\t')
df_train = df_train.sample(frac=1).reset_index(drop=True) #shuffle the data

In [6]:
df_train_new = pd.DataFrame()
df_train_new["premise"] = df_train["premise"]
df_train_new["hypothesis"] = df_train["jv_hypothesis"]
df_train_new["label"] = df_train["label"]
df_train_new.head()

Unnamed: 0,premise,hypothesis,label
0,Selulit sendiri merupakan kondisi munculnya ga...,Selulit yaiku kondhisi munculna garis halu ing...,0
1,Banpres adalah sebuah desa yang terletak di ke...,Sumatera Selatan ana desa sing ana jenengé Ban...,0
2,Itulah sebabnya masakan Indonesia memiliki cit...,Indonesia duwe rempah-rempah sing aneka rupa.,0
3,"Selain itu, ia juga memiliki andil dari rumah ...",Sing konangan ngrangsang ing panggung gedhong.,0
4,"""Moonlight Resonance"" adalah serial drama HDTV...","""Serial drama ""Moonlight Resonance"" tayang nga...",1


In [7]:
df_valid = pd.read_csv("dataset/indo_java_nli_validation.csv", sep='\t')
df_valid = df_valid.sample(frac=1).reset_index(drop=True) #shuffle the data

In [8]:
df_valid_new = pd.DataFrame()
df_valid_new["premise"] = df_valid["premise"]
df_valid_new["hypothesis"] = df_valid["jv_hypothesis"]
df_valid_new["label"] = df_valid["label"]
df_valid_new.head()

Unnamed: 0,premise,hypothesis,label
0,Janji tatanan Angkatan Laut baru untuk mengama...,Tatanan Angkatan Laut anyar ora pernah ngandhu...,2
1,Kompleks Candi Dieng ini memiliki usia yang su...,Pembangunan kompleks Candi Dieng rampung ing a...,1
2,Apple membeli NeXT pada tanggal 20 Desember 19...,Apple ndhuwur ing NexT.,0
3,"Usai proses mentoring, imbuhnya, satu dari emp...",Foster ora bakal nglakoni konser.,2
4,Namun Polri bersedia membantu bila Biro Invest...,Polri saiki ngunakake FBI.,2


In [9]:
df_test = pd.read_csv("dataset/indo_java_nli_testing.csv", sep='\t')
df_test = df_test.sample(frac=1).reset_index(drop=True) #shuffle the data

In [10]:
df_test_new = pd.DataFrame()
df_test_new["premise"] = df_test["premise"]
df_test_new["hypothesis"] = df_test["jv_hypothesis"]
df_test_new["label"] = df_test["label"]
df_test_new.head()

Unnamed: 0,premise,hypothesis,label
0,Beragam penduduk asli mendiami Alaska selama r...,Wong asli sing manggon ing Alaska ora beda-beda.,2
1,Tercatat 37 dokter di Italia meninggal akibat ...,Luwih saka enem ewu tenaga medis uga kena infe...,0
2,"""Seperti ada kerusakan di bagian belakang,"" ka...",Iku bener ana karusakan ing mburi kaya Sean ng...,1
3,Unsur busana lain yang sangat penting adalah u...,Upuh ulen-ulen iku dudu unsur sing penting ban...,2
4,Kendati vegetasi laut hanya memiliki proporsi ...,Kemampuan kanggo nyimpen karbon saka vegetasi ...,0


In [11]:
tokenizer = BertTokenizer.from_pretrained(MODEL_TYPE)

In [12]:
class CompDataset(Dataset):
    def __init__(self, df):
        self.df_data = df
        
    def __getitem__(self, index):
        sentence1 = self.df_data.loc[index, 'premise']
        sentence2 = self.df_data.loc[index, 'hypothesis']
        
        encoded_dict = tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation='longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        
        target = torch.tensor(self.df_data.loc[index, 'label'])
        sample = {"input_ids": padded_token_list, "attention_mask": att_mask, "label": target}
        
        return sample
    
    def __len__(self):
        return len(self.df_data)
    

In [13]:
train_data_cmp = CompDataset(df_train_new)
valid_data_cmp = CompDataset(df_valid_new)
test_data_cmp = CompDataset(df_test_new)

In [14]:
print(f"Jumlah core: {str(NUM_CORES)}")

Jumlah core: 4


In [15]:
model = BertForSequenceClassification.from_pretrained(MODEL_TYPE, num_labels=3)

# Send the model to the device.
# model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [16]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [17]:
training_args = TrainingArguments(
    output_dir="saved_models/Indo-Javanese-NLI/BaselineModels/bert-base-epoch10",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    seed=101,
    learning_rate=L_RATE,
    report_to="none" #"azure-ml"
)

In [18]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_cmp,
    eval_dataset=valid_data_cmp,
    compute_metrics=compute_metrics,
)

In [19]:
# Train pre-trained model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.9665,0.966495,0.565316,0.565316,0.565316,0.569173
2,0.9437,0.944532,0.57533,0.57533,0.57533,0.578268
3,0.8726,0.957268,0.580337,0.580337,0.580337,0.584269
4,0.7762,1.10448,0.564861,0.564861,0.564861,0.566667
5,0.6752,1.226829,0.552117,0.552117,0.552117,0.551497
6,0.6838,1.563101,0.555303,0.555303,0.555303,0.55487
7,0.5821,2.122674,0.541193,0.541193,0.541193,0.543536
8,0.5344,2.300878,0.548475,0.548475,0.548475,0.54837
9,0.5499,2.405989,0.536641,0.536641,0.536641,0.534598
10,0.3983,2.562595,0.540282,0.540282,0.540282,0.540116


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

TrainOutput(global_step=25830, training_loss=0.6918137252400007, metrics={'train_runtime': 26077.5836, 'train_samples_per_second': 3.961, 'train_steps_per_second': 0.991, 'total_flos': 2.71796160513024e+16, 'train_loss': 0.6918137252400007, 'epoch': 10.0})

In [20]:
prediction = trainer.predict(test_data_cmp)

In [21]:
print("Testing metrics:", prediction[2])

Testing metrics: {'test_loss': 2.7814669609069824, 'test_accuracy': 0.5152203543843707, 'test_precision': 0.5152203543843707, 'test_recall': 0.5152203543843707, 'test_f1': 0.517358398267481, 'test_runtime': 205.3543, 'test_samples_per_second': 10.718, 'test_steps_per_second': 2.683}


In [22]:
# # Preprocess raw predictions
y_pred = np.argmax(prediction[0], axis=1)

In [23]:
def return_label(the_label):
    str_label = ""
    if str(the_label) == "0":
        str_label = "entail"
    elif str(the_label) == "1":
        str_label = "neutral"
    else:
        str_label = "contradict"
    return str_label

In [24]:
for idx, row in df_test_new.iterrows():
    ground_truth = prediction[1][idx]
    if y_pred[idx] != ground_truth:
        print("==========================================================================================")
        print(f"Premis: {row['premise']}") 
        print(f"Hipotesis: {row['hypothesis']}")
        print(f"True Label: {return_label(ground_truth)}") 
        print(f"Pred Label: {return_label(y_pred[idx])}")
        print("==========================================================================================")

Premis: Beragam penduduk asli mendiami Alaska selama ribuan tahun sebelum datangnya orang Eropa ke daerah ini.
Hipotesis: Wong asli sing manggon ing Alaska ora beda-beda.
True Label: contradict
Pred Label: entail
Premis: Tercatat 37 dokter di Italia meninggal akibat terinfeksi COVID-19, sementara 6.000 lebih tenaga medis juga terinfeksi oleh virus Corona.
Hipotesis: Luwih saka enem ewu tenaga medis uga kena infeksi virus Corona.
True Label: entail
Pred Label: contradict
Premis: Kendati vegetasi laut hanya memiliki proporsi 0,05 persen dari biomassa vegetasi darat, tetapi justru memiliki kemampuan menyimpan karbon yang sebanding dengan vegetasi darat.
Hipotesis: Kemampuan kanggo nyimpen karbon saka vegetasi segara luwih apik tinimbang vegetasi terestrial kanggo biomas padha.
True Label: entail
Pred Label: neutral
Premis: Pernikahan mereka tidak sah secara hukum dan Decker bulan depan akan menikahi tunangannya. Decker menghabiskan beberapa bulan di rumah Jeffs ketika pria itu tengah dala