In [1]:
# ! pip install --upgrade gdown
# ! pip install transformers
# ! pip install 

In [2]:
import pandas as pd
import numpy as np
import os
import gc
import random

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(555)

import transformers
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

2023-04-05 07:08:00.027706: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-05 07:08:18.326500: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-04-05 07:08:18.326627: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
comet_ml is installed but `COMET_API_KEY` is not set.


In [3]:
MODEL_TYPE = 'xlm-roberta-large'
L_RATE = 1e-6
MAX_LEN = 512

NUM_EPOCHS = 6
BATCH_SIZE = 1
NUM_CORES = os.cpu_count() - 2

In [4]:
os.environ["WANDB_DISABLED"] = "true"

In [5]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [6]:
df_train = pd.read_csv("azureml://subscriptions/c1ca5205-8fcd-4d2f-b06c-ac9a5f6644b4/resourcegroups/jalaluddin.94-rg/workspaces/wrkspace-machinelearning-research/datastores/workspaceblobstore/paths/UI/2023-04-03_060650_UTC/indonli-with-java-chatgpt-training-data.csv", sep='\t')
df_train = df_train.sample(frac=1).reset_index(drop=True) #shuffle the data

In [7]:
df_train_new = pd.DataFrame()
df_train_new["premise"] = df_train["premise"]
df_train_new["hypothesis"] = df_train["jv_hypothesis"]
df_train_new["label"] = df_train["label"]
df_train_new.head()

Unnamed: 0,premise,hypothesis,label
0,Kualitas yang rendah ini dapat mengakibatkan k...,Kualitas sing kurang ora nindakake komunikasi ...,2
1,Beberapa peradaban kuno terletak di sekitar pe...,"""Peradaban kuna nang pinggir ora kepengin éfèk...",2
2,Tentunya Tiongkok akan menyajikan banyak peran...,Cina ora duwe helikopter.,2
3,Jaminan kemerdekaannya berjanji untuk menghila...,"""Jaminan kemerdekaan iki dipenginaken déning w...",1
4,Danau Batur dikenal sebagai danau terbesar yan...,Danau Batur ora ana ing Bali.,2


In [8]:
df_valid = pd.read_csv("azureml://subscriptions/c1ca5205-8fcd-4d2f-b06c-ac9a5f6644b4/resourcegroups/jalaluddin.94-rg/workspaces/wrkspace-machinelearning-research/datastores/workspaceblobstore/paths/UI/2023-04-03_060939_UTC/indonli-with-java-chatgpt-validation-data.csv", sep='\t')
df_valid = df_valid.sample(frac=1).reset_index(drop=True) #shuffle the data

In [9]:
df_valid_new = pd.DataFrame()
df_valid_new["premise"] = df_valid["premise"]
df_valid_new["hypothesis"] = df_valid["jv_hypothesis"]
df_valid_new["label"] = df_valid["label"]
df_valid_new.head()

Unnamed: 0,premise,hypothesis,label
0,Air hangat dapat membantu saluran pencernaan b...,"""Jamu hangat nglempekno aliran ing pembuluh da...",2
1,Qalawun merebut Latakia pada tahun 1287 dan Tr...,Negara salibis County of Tripoli bubar ing tau...,2
2,Seiring menguatnya histeria pemilihan Presiden...,Ketentraman ati Karel ora terancam papan sepisan.,2
3,"Perry dan Snoop Dogg menampilkan ""California G...",Snoop Dogg tampil dhewe ing MTV Movie Awards.,2
4,Akan ada batas pada kemampuan nya untuk belanj...,Pamérintah ngatur anggaran.,1


In [10]:
df_test = pd.read_csv("azureml://subscriptions/c1ca5205-8fcd-4d2f-b06c-ac9a5f6644b4/resourcegroups/jalaluddin.94-rg/workspaces/wrkspace-machinelearning-research/datastores/workspaceblobstore/paths/UI/2023-04-03_061029_UTC/indonli-with-java-chatgpt-testing-data.csv", sep='\t')
df_test = df_test.sample(frac=1).reset_index(drop=True) #shuffle the data

In [11]:
df_test_new = pd.DataFrame()
df_test_new["premise"] = df_test["premise"]
df_test_new["hypothesis"] = df_test["jv_hypothesis"]
df_test_new["label"] = df_test["label"]
df_test_new.head()

Unnamed: 0,premise,hypothesis,label
0,Lebih lanjut dr Adeline menjelaskan frekuensi ...,Frekuensi mangan nalika pasa sing disaranake d...,0
1,"""Head of Medical Kalbe Nutritionals dr Muliama...",ASI minangka sumber sistem kekebalan bayi.,0
2,Kasus hewan pertama yang positif COVID-19 di A...,Ana akeh jinis macan ing Zoo Bronx.,1
3,"Di samping itu, sebagian warga juga bekerja se...",Sawetara warga ora kerja ing luar negeri.,0
4,GERD (Gastroesophageal Reflux Disease) adalah ...,GERD minangka kondisi tambah asam lambung.,2


In [12]:
tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)

In [13]:
class CompDataset(Dataset):
    def __init__(self, df):
        self.df_data = df
        
    def __getitem__(self, index):
        sentence1 = self.df_data.loc[index, 'premise']
        sentence2 = self.df_data.loc[index, 'hypothesis']
        
        encoded_dict = tokenizer.encode_plus(
            sentence1,
            sentence2,
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation='longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        
        target = torch.tensor(self.df_data.loc[index, 'label'])
        sample = {"input_ids": padded_token_list, "attention_mask": att_mask, "label": target}
        
        return sample
    
    def __len__(self):
        return len(self.df_data)
    

In [14]:
train_data_cmp = CompDataset(df_train_new)
valid_data_cmp = CompDataset(df_valid_new)
test_data_cmp = CompDataset(df_test_new)

In [15]:
training_loader_iter = next(iter(train_data_cmp))
print(training_loader_iter)

{'input_ids': tensor([     0, 232718,    119,  37092,    485,   1802, 119998,  90940,   2253,
           704,    177, 112470,      4,  13166,    734, 101485,      9, 130565,
          7389,  36069,  11782,      5,      2,      2, 232718,   5367,  14391,
          3620,    300,   1127,  33849,  90940,   5219,  51718,   1098,    704,
        112470,      5,      2,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,      1,      1,      1,      1,      1,      1,      1,
             1,      1,   

In [16]:
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_TYPE, 
    num_labels = 3
)

# Send the model to the device.
# model.to(device)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-large and are newly initialized: ['classifier.out

In [17]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [18]:
training_args = TrainingArguments(
    output_dir="saved_models/Indo-Javanese-NLI/BaselineModels/XLMR-large",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=NUM_EPOCHS,
    seed=101,
    learning_rate=L_RATE,
    report_to="none" #"azure-ml"
)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data_cmp,
    eval_dataset=valid_data_cmp,
    compute_metrics=compute_metrics
)

In [20]:
# Train pre-trained model
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.2651,1.480126,0.591261,0.591261,0.591261,0.593115
2,1.5957,2.17174,0.619936,0.619936,0.619936,0.61795
3,1.8134,2.065101,0.634502,0.634502,0.634502,0.634468
4,1.5645,1.765647,0.67228,0.67228,0.67228,0.672792
5,1.3201,1.989853,0.685025,0.685025,0.685025,0.685802
6,1.5223,1.997512,0.689577,0.689577,0.689577,0.690628


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
[Handler.handle()] Failed to read or parse request from socket: Expecting value: line 1 column 1

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Bad pipe message: %s [b'\x0f\x00\x0c\x00\x1a\x00\t\x00\x14\x00\x11\x00\x19\x00\x08\x00\x06']Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be a

TrainOutput(global_step=61980, training_loss=1.544870816403414, metrics={'train_runtime': 73852.9765, 'train_samples_per_second': 0.839, 'train_steps_per_second': 0.839, 'total_flos': 5.776130106169344e+16, 'train_loss': 1.544870816403414, 'epoch': 6.0})

In [21]:
# # Make prediction
prediction = trainer.predict(test_data_cmp)

In [22]:
print(prediction)

PredictionOutput(predictions=array([[ 4.7773843, -2.5426106, -2.9645848],
       [ 4.3887057, -1.9837533, -2.966195 ],
       [-2.8831387,  5.047405 , -3.03932  ],
       ...,
       [ 4.9967422, -2.4342542, -3.2563076],
       [ 1.9321915, -1.3218817, -1.0576552],
       [-4.0110793, -1.9076651,  5.564022 ]], dtype=float32), label_ids=array([0, 0, 1, ..., 0, 1, 2]), metrics={'test_loss': 2.0355732440948486, 'test_accuracy': 0.6756019990913221, 'test_precision': 0.6756019990913221, 'test_recall': 0.6756019990913221, 'test_f1': 0.6769119360983353, 'test_runtime': 726.6501, 'test_samples_per_second': 3.029, 'test_steps_per_second': 3.029})


In [23]:
print(prediction[2])

{'test_loss': 2.0355732440948486, 'test_accuracy': 0.6756019990913221, 'test_precision': 0.6756019990913221, 'test_recall': 0.6756019990913221, 'test_f1': 0.6769119360983353, 'test_runtime': 726.6501, 'test_samples_per_second': 3.029, 'test_steps_per_second': 3.029}


In [24]:
# # Preprocess raw predictions
y_pred = np.argmax(prediction[0], axis=1)

In [25]:
def return_label(the_label):
    str_label = ""
    if str(the_label) == "0":
        str_label = "entail"
    elif str(the_label) == "1":
        str_label = "neutral"
    else:
        str_label = "contradict"
    return str_label

In [26]:
for idx, row in df_test_new.iterrows():
    ground_truth = prediction[1][idx]
    if y_pred[idx] != ground_truth:
        print("==========================================================================================")
        print(f"Premis: {row['premise']}") 
        print(f"Hipotesis: {row['hypothesis']}")
        print(f"True Label: {return_label(ground_truth)}") 
        print(f"Pred Label: {return_label(y_pred[idx])}")
        print("==========================================================================================")

Premis: Di samping itu, sebagian warga juga bekerja sebagai Tenaga Kerja Indonesia di Luar negeri.
Hipotesis: Sawetara warga ora kerja ing luar negeri.
True Label: entail
Pred Label: contradict
Premis: GERD (Gastroesophageal Reflux Disease) adalah kondisi jangka panjang di mana asam dari lambung naik ke kerongkongan. Gejala ini dapat menyebabkan kerusakan jaringan pada tubuh, penyempitan kerongkongan, masalah pernapasan, bahkan berkembang menjadi kanker.
Hipotesis: GERD minangka kondisi tambah asam lambung.
True Label: contradict
Pred Label: entail
Premis: Beragam penduduk asli mendiami Alaska selama ribuan tahun sebelum datangnya orang Eropa ke daerah ini.
Hipotesis: Wong Eropa minangka imigran ing wilayah iki.
True Label: entail
Pred Label: contradict
Premis: Tapi mereka tidak melakukan hal yang sama kepada penonton lokal. Jika kami orang Korea, mereka akan memeriksa tubuh kami secara detil.
Hipotesis: Pamirsa lokal dipriksa.
True Label: neutral
Pred Label: entail
Premis: Sayangnya, 

Premis: Betapa terkejutnya ia saat melihat ternyata orang yang nyaris ia tabrak itu adalah LARA, dalam keadaan hamil besar.
Hipotesis: LARA kaget.
True Label: neutral
Pred Label: entail
Premis: Wabah virus Corona dan penetapan pembatasan sosial berskala besar (PSBB) membuat banyak masyarakat Indonesia memenuhi kebutuhannya lewat belanja online, termasuk lewat e-commerce seperti Tokopedia.
Hipotesis: Tokopedia tutup layanan amarga wabah virus Corona.
True Label: contradict
Pred Label: neutral
Premis: Malaysia adalah sebuah negara federal yang terdiri dari tiga belas negeri (negara bagian) dan tiga wilayah federal di Asia Tenggara dengan luas 329.847 km persegi.
Hipotesis: Malaysia duwe 13 negara.
True Label: entail
Pred Label: contradict
Premis: Pavlovsky adalah presiden dari Yayasan Politik yang Efektif (FEP). 
Hipotesis: Yayasan kanggo Politik Efektif didegaké déning Pavlovsky.
True Label: neutral
Pred Label: entail
Premis: Liliyana Natsir sudah tiga kali berpartisipasi di ajang Olimp