In [1]:
import pandas as pd
import numpy as np
import os
import gc
import random
import gdown

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

import transformers
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

  from .autonotebook import tqdm as notebook_tqdm
2023-04-03 04:32:57.057972: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-03 04:33:16.244648: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-03 04:33:16.244779: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
comet_ml is installed but `COMET_API_KEY` is not set.


In [2]:
MODEL_TYPE = 'xlm-roberta-base'
L_RATE = 1e-6
MAX_LEN = 512

NUM_EPOCHS = 15
BATCH_SIZE = 4
NUM_CORES = os.cpu_count() - 2

In [3]:
os.environ["WANDB_DISABLED"] = "true"

In [4]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [5]:
uri = "https://drive.google.com/uc?id=1aE9w2rqgW-j3PTgjnmHDjulNwp-Znb6i"
output = "dataset/indo_java_nli_training.csv"
if not os.path.exists("dataset/"):
  os.makedirs("dataset/")
gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1aE9w2rqgW-j3PTgjnmHDjulNwp-Znb6i
To: /mnt/batch/tasks/shared/LS_root/mounts/clusters/machinelearning-research/code/Users/jalaluddin.94/dataset/indo_java_nli_training.csv
100%|██████████| 4.31M/4.31M [00:00<00:00, 91.9MB/s]


'dataset/indo_java_nli_training.csv'

In [6]:
uri = "https://drive.google.com/uc?id=1YlQ9_8CvQbTSb5-2BjIfiYT-cy7pe6YM"
output = "dataset/indo_java_nli_validation.csv"
if not os.path.exists("dataset/"):
  os.makedirs("dataset/")
gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1YlQ9_8CvQbTSb5-2BjIfiYT-cy7pe6YM
To: /mnt/batch/tasks/shared/LS_root/mounts/clusters/machinelearning-research/code/Users/jalaluddin.94/dataset/indo_java_nli_validation.csv
100%|██████████| 887k/887k [00:00<00:00, 21.6MB/s]


'dataset/indo_java_nli_validation.csv'

In [7]:
uri = "https://drive.google.com/uc?id=1Zz_rHeI7fPUuA04zt9gCWyl5RYhrYPn0"
output = "dataset/indo_java_nli_testing.csv"
if not os.path.exists("dataset/"):
  os.makedirs("dataset/")
gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1Zz_rHeI7fPUuA04zt9gCWyl5RYhrYPn0
To: /mnt/batch/tasks/shared/LS_root/mounts/clusters/machinelearning-research/code/Users/jalaluddin.94/dataset/indo_java_nli_testing.csv
100%|██████████| 1.02M/1.02M [00:00<00:00, 29.4MB/s]


'dataset/indo_java_nli_testing.csv'

In [8]:
df_train = pd.read_csv("dataset/indo_java_nli_training.csv", sep='\t')
df_train = df_train.sample(frac=1).reset_index(drop=True) #shuffle the data

In [9]:
df_train_new = pd.DataFrame()
df_train_new["premise"] = df_train["premise"]
df_train_new["hypothesis"] = df_train["jv_hypothesis"]
df_train_new["label"] = df_train["label"]
df_train_new.head()

Unnamed: 0,premise,hypothesis,label
0,Kuntowijoyo yakin bahwa pandangan ini akan beg...,Kuntowijoyo ora yakin pandangan iki isih ana p...,1
1,Kemudian dilanjutkan pada proses pembangunan l...,"""Kerja pembangunan pasar lan terminal bakal di...",0
2,Kecamatan ini merupakan satu dari dua puluh du...,Kecamatan iki ana ing Kabupatèn Sumba Timur.,0
3,"""Jakarta, CNN Indonesia—Saat musim hujan tubuh...",Musim ujan tubuh bakal kentel kedelik mudhun s...,2
4,"Pada 2012, mereka menghasilkan enam pemuda yan...",Sistem pungsèn sumarah iki diwènèhi ing taun 2...,0


In [10]:
df_valid = pd.read_csv("dataset/indo_java_nli_validation.csv", sep='\t')
df_valid = df_valid.sample(frac=1).reset_index(drop=True) #shuffle the data

In [11]:
df_valid_new = pd.DataFrame()
df_valid_new["premise"] = df_valid["premise"]
df_valid_new["hypothesis"] = df_valid["jv_hypothesis"]
df_valid_new["label"] = df_valid["label"]
df_valid_new.head()

Unnamed: 0,premise,hypothesis,label
0,Banpres adalah sebuah desa yang terletak di ke...,"""Dhésa Banpres tenan nglébokaké ing Kabupatèn ...",2
1,Dusun ini dikelilingi jalan semi aspal yang lu...,Dhukuh iki durung bisa dimasuki mobil.,2
2,Peringkat atas kewajiban jangka pendek Moody's...,Moody's ngresiki pangrembugan ya lumantar dhek...,0
3,"Michelle Monaghan, yang tampil di Mission: Imp...",Michelle Monaghan ora pernah nampil ing Misión...,2
4,"Kalimat lainnya dalam catatan itu berbunyi ""ka...",Isine catetan iki nyalahake kita.,0


In [5]:
df_test = pd.read_csv("dataset/indo_java_nli_testing.csv", sep='\t')
df_test = df_test.sample(frac=1).reset_index(drop=True) #shuffle the data

In [6]:
df_test_new = pd.DataFrame()
df_test_new["premise"] = df_test["premise"]
df_test_new["hypothesis"] = df_test["jv_hypothesis"]
df_test_new["label"] = df_test["label"]
df_test_new.head()

Unnamed: 0,premise,hypothesis,label
0,Salah satunya seorang lelaki yang sedang memak...,Salah sijine yaiku wong wadon sing lagi mangan...,2
1,Bagi mereka yang mengikuti transisi media sosi...,Capitol Hill dumunung ing Washington DC,1
2,"Serial drama ini dibintangi Shota Matsuda, Kat...",Seri drama iki dibintangi Shota Matsuda.,0
3,Penderita maag perlu menghindari makanan yang ...,Panganan sing pedhes banget kudu dihindari den...,0
4,Sutan Syahrir ditetapkan sebagai salah seorang...,Sutan Syahrir dipunangkat dados salah satungga...,2


In [7]:
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

In [8]:
class CompDataset(Dataset):
    def __init__(self, df):
        self.df_data = df
        
    def __getitem__(self, index):
        sentence1 = self.df_data.loc[index, 'premise']
        sentence2 = self.df_data.loc[index, 'hypothesis']
        
        encoded_dict = tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation='longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        
        target = torch.tensor(self.df_data.loc[index, 'label'])
        sample = {"input_ids": padded_token_list, "attention_mask": att_mask, "label": target}
        
        return sample
    
    def __len__(self):
        return len(self.df_data)
    

In [9]:
train_data_cmp = CompDataset(df_train_new)
valid_data_cmp = CompDataset(df_valid_new)
test_data_cmp = CompDataset(df_test_new)

In [21]:
training_loader_iter = next(iter(train_data_cmp))
print(training_loader_iter)

{'input_ids': tensor([     0,   5900,     18,  10678,    513,   1410,  36199,   4238,  55509,
           485,    945,  13006,   2747, 117597,    273,    638,  96382,  14366,
          9274,    123,  14135,     14,   4341,      5,      2,      2,   5900,
            18,  10678,    513,   1410,   3620,  36199,  55509,   2209,     83,
          1043,   3877, 117597,    446,    700,  76323,   3305,  96382, 101174,
             5,      2,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,   

In [22]:
print(f"Jumlah core: {str(NUM_CORES)}")

Jumlah core: 4


In [23]:
train_dataloader = DataLoader(train_data_cmp,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_CORES)

In [24]:
valid_dataloader = DataLoader(valid_data_cmp,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_CORES)

In [25]:
test_dataloader = DataLoader(test_data_cmp,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_CORES)

In [27]:
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_TYPE, 
    num_labels = 3
)

# Send the model to the device.
# model.to(device)

Some weights of the model checkpoint at xlm-roberta-base were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.out_p

In [16]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [29]:
training_args = TrainingArguments(
    output_dir="saved_models/Indo-Javanese-NLI/BaselineModels/XLMR-base-epoch10",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    seed=101,
    learning_rate=L_RATE,
    report_to="none" #"azure-ml"
)

In [30]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_cmp,
    eval_dataset=valid_data_cmp,
    compute_metrics=compute_metrics
)

In [31]:
# Train pre-trained model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.1184,1.18831,0.291761,0.291761,0.291761,0.131796
2,1.11,1.104693,0.340919,0.340919,0.340919,0.173353
3,1.1074,1.098304,0.367319,0.367319,0.367319,0.197355
4,1.1023,1.10548,0.291761,0.291761,0.291761,0.131796
5,1.1026,1.106959,0.291761,0.291761,0.291761,0.131796
6,1.1027,1.096153,0.367319,0.367319,0.367319,0.197355
7,1.1003,1.102371,0.291761,0.291761,0.291761,0.131796
8,1.1015,1.098479,0.367319,0.367319,0.367319,0.197355
9,1.0981,1.098808,0.340919,0.340919,0.340919,0.173353
10,1.0996,1.098641,0.340919,0.340919,0.340919,0.173353


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

TrainOutput(global_step=25830, training_loss=1.1062323215438836, metrics={'train_runtime': 28757.1084, 'train_samples_per_second': 3.592, 'train_steps_per_second': 0.898, 'total_flos': 2.71796160513024e+16, 'train_loss': 1.1062323215438836, 'epoch': 10.0})

In [10]:
# CHECKPOINT_MODEL_PATH = "saved_models/Indo-Javanese-NLI/BaselineModels/XLMR-base-epoch10"

In [11]:
# model_new = XLMRobertaForSequenceClassification.from_pretrained(
#     CHECKPOINT_MODEL_PATH + "/checkpoint-10332", 
#     num_labels = 3
# )

In [17]:
# test_trainer = Trainer(
#     model = model_new,
#     compute_metrics = compute_metrics
# )

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [18]:
# # Make prediction
prediction = trainer.predict(test_data_cmp) # test_trainer.predict(test_data_cmp)

In [19]:
print(prediction)

PredictionOutput(predictions=array([[-0.1229597 ,  0.01109497, -0.0834606 ],
       [-0.1229597 ,  0.01109496, -0.08346058],
       [-0.12295969,  0.01109498, -0.08346058],
       ...,
       [-0.12295971,  0.01109496, -0.08346057],
       [-0.1229597 ,  0.01109496, -0.08346057],
       [-0.12295968,  0.01109496, -0.08346057]], dtype=float32), label_ids=array([2, 1, 0, ..., 2, 2, 0]), metrics={'test_loss': 1.1060373783111572, 'test_accuracy': 0.28577919127669243, 'test_precision': 0.28577919127669243, 'test_recall': 0.28577919127669243, 'test_f1': 0.12703541435550497, 'test_runtime': 192.321, 'test_samples_per_second': 11.444, 'test_steps_per_second': 1.435})


In [25]:
print(len(prediction[1]))
print(len(test_data_cmp))

2201
2201


In [27]:
print(df_test_new.iloc[0])

premise       Salah satunya seorang lelaki yang sedang memak...
hypothesis    Salah sijine yaiku wong wadon sing lagi mangan...
label                                                         2
Name: 0, dtype: object


In [28]:
print(prediction[1][0])

2


In [31]:
# # Preprocess raw predictions
y_pred = np.argmax(prediction[0], axis=1)

In [38]:
def return_label(the_label):
    str_label = ""
    if str(the_label) == "0":
        str_label = "entail"
    elif str(the_label) == "1":
        str_label = "neutral"
    else:
        str_label = "contradict"
    return str_label

In [39]:
for idx, row in df_test_new.iterrows():
    ground_truth = prediction[1][idx]
    if y_pred[idx] != ground_truth:
        print("==========================================================================================")
        print(f"Premis: {row['premise']}") 
        print(f"Hipotesis: {row['hypothesis']}")
        print(f"True Label: {return_label(ground_truth)}") 
        print(f"Pred Label: {return_label(y_pred[idx])}")
        print("==========================================================================================")

Premis: Salah satunya seorang lelaki yang sedang memakan permen karet yang dengan paksa dikeluarkan dari mulutnya.
Hipotesis: Salah sijine yaiku wong wadon sing lagi mangan permen karet sing dicopot kanthi paksa saka cangkeme.
True Label: contradict
Pred Label: neutral
Premis: Serial drama ini dibintangi Shota Matsuda, Katsumi Takahashi, Riko Narumi, Yuki Uchida, dan Satomi Kobayashi.
Hipotesis: Seri drama iki dibintangi Shota Matsuda.
True Label: entail
Pred Label: neutral
Premis: Penderita maag perlu menghindari makanan yang berlemak, goreng-gorengan, santan, mie, sayuran yang membentuk banyak gas (seperti kol, sawi, lobak), makanan terlalu pedas, kopi, minuman bersoda, dan minuman mengandung alkohol.
Hipotesis: Panganan sing pedhes banget kudu dihindari dening penderita ulcer.
True Label: entail
Pred Label: neutral
Premis: Sutan Syahrir ditetapkan sebagai salah seorang Pahlawan Nasional Indonesia pada tanggal 9 April 1966 melalui Keppres Nomor 76 Tahun 1966.
Hipotesis: Sutan Syahrir

Premis: "Umumnya, takoyaki berwarna kuning kecoklatan karena adonan yang terbuat dari tepung dan telur."
Hipotesis: Adonan takoyaki digawe saka daging sapi giling.
True Label: contradict
Pred Label: neutral
Premis: Pakaian formal yang dikenakan pejabat sipil (bunkan) dijahit di bagian bawah ketiak.
Hipotesis: Pejabat sipil ora nganggo sandhangan.
True Label: contradict
Pred Label: neutral
Premis: Perang Inggris-Burma Pertama berlangsung dari tahun 1823 hingga 1826.
Hipotesis: Inggris lan Burma wis hubungan diplomatik apik wiwit negara loro dibangun, supaya perang antarane negara loro ora tau kelakon.
True Label: contradict
Pred Label: neutral
Premis: Suasana yang tenang dan asri akan menyambut pengunjung ketika memasuki kawasan candi Dieng ini.
Hipotesis: Tlatah Candhi Dieng ora bisa dileboni pengunjung.
True Label: contradict
Pred Label: neutral
Premis: Leher nya bergerak maju dalam konfrontasi yang menonjolkan urat-uratnya, tangannya bergetar.
Hipotesis: Dheweke melu konfrontasi.
Tru

In [33]:
print(len(y_pred))

2201
