# Transfer Learning Approach for Cross-Lingual NLI

## Import Libraries and Setup Environment Variables

In [1]:
import pandas as pd
import numpy as np
import os
import gc
import random
import gdown
import time
from tqdm import tqdm, trange

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

# set a seed value
torch.manual_seed(205)

from datasets import load_dataset

import wandb

import transformers
from transformers import TrainingArguments, Trainer
from transformers import EarlyStoppingCallback
from transformers import PreTrainedModel, PretrainedConfig
from transformers import BertTokenizer, BertModel, BertForSequenceClassification #, XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from transformers import AdamW

In [2]:
# MODEL_TYPE = 'xlm-roberta-base'
TOKENIZER_TYPE = 'bert-base-multilingual-cased'
MBERT_TYPE = 'bert-base-multilingual-cased'
MODEL_TYPE = 'jalaluddin94/nli_mbert'
MODEL_PATH = 'D:/Training/Machine Learning/NLP/NLI/saved_models/Indo-Javanese-NLI/ResearchedModels/'

L_RATE = 3e-6
STUDENT_LRATE = 3e-6
MAX_LEN = 512
NUM_EPOCHS = 25
BATCH_SIZE = 8
BATCH_NORM_EPSILON = 1e-5
LAMBDA_L2 = 3e-5

HF_TOKEN = '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'

NUM_CORES = os.cpu_count() - 2

In [3]:
# %env WANDB_NOTEBOOK_NAME=/home/sagemaker-user/PPT/BERT_BiLSTM_Game_Review.ipynb
%env WANDB_API_KEY=97b170d223eb55f86fe1fbf9640831ad76381a74
wandb.login()

env: WANDB_API_KEY=97b170d223eb55f86fe1fbf9640831ad76381a74


[34m[1mwandb[0m: Currently logged in as: [33mjalaluddin-94[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [4]:
# %env WANDB_PROJECT=javanese_nli
%env WANDB_LOG_MODEL='end'
run = wandb.init(
  project="javanese_nli",
  notes="Experiment transfer learning on Bandyopadhyay's paper",
  name="transfer-learning-paper",
  tags=["transferlearning", "bandyopadhyay"]
)

env: WANDB_LOG_MODEL='end'


In [5]:
os.environ["WANDB_AGENT_MAX_INITIAL_FAILURES"]="1024"

In [6]:
os.environ["WANDB_AGENT_DISABLE_FLAPPING"]="true"

In [7]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Download and Prepare Dataset

### Download Dataset

In [8]:
# uri = "https://drive.google.com/uc?id=1aE9w2rqgW-j3PTgjnmHDjulNwp-Znb6i"
# output = "dataset/indo_java_nli_training.csv"
# if not os.path.exists("dataset/"):
#   os.makedirs("dataset/")
# gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

In [9]:
# uri = "https://drive.google.com/uc?id=1YlQ9_8CvQbTSb5-2BjIfiYT-cy7pe6YM"
# output = "dataset/indo_java_nli_validation.csv"
# if not os.path.exists("dataset/"):
#   os.makedirs("dataset/")
# gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

In [10]:
# uri = "https://drive.google.com/uc?id=1Zz_rHeI7fPUuA04zt9gCWyl5RYhrYPn0"
# output = "dataset/indo_java_nli_testing.csv"
# if not os.path.exists("dataset/"):
#   os.makedirs("dataset/")
# gdown.download(url=uri, output=output, quiet=False, fuzzy=True)

### Prepare Dataset for Student 

In [11]:
df_train = pd.read_csv("D:/Training/Machine Learning/Datasets/NLI/IndoJavaNLI/indojavanesenli-train.csv", sep='\t')
df_train = df_train.sample(frac=1).reset_index(drop=True) #shuffle the data

In [12]:
df_train_student = pd.DataFrame()
df_train_student["premise"] = df_train["premise"]
df_train_student["hypothesis"] = df_train["jv_hypothesis_mongo"]
df_train_student["label"] = df_train["label"]
df_train_student.head()

Unnamed: 0,premise,hypothesis,label
0,Edisi novel di Britania Raya memenangkan Man B...,novel wis nduweni edisi sing diterbitkan neng ...,0
1,Sarana olahragapun tidak luput dari pemikiran ...,lapangan sapak bola digugah.,0
2,Penduduk kabupaten Raja Ampat mayoritas memelu...,"kajaba agama kristen, akeh warga kabupaten raj...",1
3,Petisi yang diinisiasi oleh Tyler Sigmon itu m...,tyler sigmon ora tau menginisiasi siji petisi.,2
4,"Untuk video, Air 2 bisa merekam video 4K 60fps...",banyu 2 isa nduweni resolusi 5k.,2


In [13]:
df_valid = pd.read_csv("D:/Training/Machine Learning/Datasets/NLI/IndoJavaNLI/indojavanesenli-valid.csv", sep='\t')
df_valid = df_valid.sample(frac=1).reset_index(drop=True) #shuffle the data

In [14]:
df_valid_student = pd.DataFrame()
df_valid_student["premise"] = df_valid["premise"]
df_valid_student["hypothesis"] = df_valid["jv_hypothesis_mongo"]
df_valid_student["label"] = df_valid["label"]
df_valid_student.head()

Unnamed: 0,premise,hypothesis,label
0,Selain itu pengamanan ketat juga disiagakan di...,ora ana pangamanan sing kedelok neng kutha ban...,2
1,Rumah makan ini biasanya dijadikan tempat tran...,omah madhang iki ora mengenke tunggangan bus k...,2
2,Utama adalah salah satu kelurahan di Kecamatan...,cimahi kidul ngrupakne siji kecamatan neng kut...,0
3,Dewan Riset Nasional yang dikelola pemerintah ...,itali isih upadi atos kanggo mengurangi penyeb...,1
4,"Pasangan nomor satu dunia asal Indonesia, Marc...","pasangan nomor siji donya seka indonesia, marc...",0


In [15]:
df_test = pd.read_csv("D:/Training/Machine Learning/Datasets/NLI/IndoJavaNLI/indojavanesenli-test.csv", sep='\t')
df_test = df_test.sample(frac=1).reset_index(drop=True) #shuffle the data

In [16]:
df_test_student = pd.DataFrame()
df_test_student["premise"] = df_test["premise"]
df_test_student["premise"] = df_test_student["premise"].astype(str)
df_test_student["hypothesis"] = df_test["jv_hypothesis_mongo"]
df_test_student["hypothesis"] = df_test_student["hypothesis"].astype(str)
df_test_student["label"] = df_test["label"]
df_test_student.head()

Unnamed: 0,premise,hypothesis,label
0,Esarhadon kembali ke ibu kota Niniwe dan menga...,esarhado lair neng siji kutha terpencil.,1
1,Pengunjung dapat menikmati kawah serta kawasan...,pengunjung kudu ngetoke dhuwit kanggo menaiki ...,1
2,"Menurut Le Figaro, rencana pemerintah ini bisa...","pamerentah mengalokasikan 15,5 yuta euro saka ...",1
3,Asia Tenggara memiliki letak strategis dan sum...,asia nduweni sumber daya alam sing akeh.,1
4,Memetika pada umumnya mengadopsi konsep dari t...,memetika nggunakne model matematika.,1


### Prepare Dataset for Teacher

Dataset from teacher will be from "IndoNLI", and using Indonesian only.

In [17]:
df_train_t = pd.DataFrame()
df_train_t["premise"] = df_train["premise"]
df_train_t["hypothesis"] = df_train["hypothesis"]
df_train_t["label"] = df_train["label"]
df_train_t = df_train_t.sample(frac=1).reset_index(drop=True)
display(df_train_t)

Unnamed: 0,premise,hypothesis,label
0,"Makamnya sempat tak diketahui, lalu Zain bin A...",Makamnya selalu terkenal.,2
1,Tanjung Johor adalah salah satu kelurahan di K...,Provinsi Jambi memiliki kelurahan bernama Tanj...,0
2,Daerah yang dikontrol oleh Bogd Khaan kira-kir...,Bogd Khaan besar di Mongolia.,1
3,"Pada hari Selasa, Amerika Serikat bisa saja me...",Amerika Serikat tidak mungkin memililh preside...,2
4,Meskipun isu keamanan membuat beberapa perusah...,Eric Yuan gagal meyakinkan investor.,2
...,...,...,...
10325,"Norman adalah salah satu teman baik ku, jadi s...",Norman adalah teman baik ku.,0
10326,Arsenal sukses mencatatkan tiga poin krusial y...,Arsenal mencatatkan tiga poin krusial dalam la...,0
10327,Bale mulai tersisih dari tim inti Real Madrid ...,Bale sakit akibat ia cedera.,1
10328,"Sayangnya, sebuah kecelakaan fatal telah meren...",Ia meninggal di tahun 1994.,0


In [18]:
print("Count per class train:") 
print(df_train_t['label'].value_counts())

Count per class train:
0    3476
2    3439
1    3415
Name: label, dtype: int64


In [19]:
df_valid_t = pd.DataFrame()
df_valid_t["premise"] = df_valid["premise"]
df_valid_t["hypothesis"] = df_valid["hypothesis"]
df_valid_t["label"] = df_valid["label"]
df_valid_t = df_valid_t.sample(frac=1).reset_index(drop=True)
display(df_valid_t)

Unnamed: 0,premise,hypothesis,label
0,"Pada pukul 7:00 pagi tanggal 21 November 1980,...",Kebakaran restoran hotel di Deli terjadi kuran...,2
1,Bagi Anda yang memiliki hobi menantang seperti...,Banyak wisata mendaki di Bogor.,1
2,Tim Korea Selatan berhasil menjuarai turnamen ...,Tim Korea Selatan sangat berbakat.,0
3,"""Dokter tidak memiliki alat-alat ini, mereka h...",Dokter memiliki lembar resep dan suntikan berd...,0
4,"Dengan mencampur kopi dengan susu, maka akan l...",Kopi dicampur susu lebih aman untuk dikonsumsi.,0
...,...,...,...
2192,"Film ""Merah Putih Memanggil"" yang disutradarai...",Film 'Merah Putih Memanggil' tayang di bioskop...,1
2193,Bank Indonesia (BI) mencatat posisi cadangan d...,Posisi cadangan devisi Indonesia pada akhir Ag...,0
2194,Adapun batuk pada malam hari dapat menjadi per...,Terlalu lama berada di kondisi dingin bisa men...,1
2195,Adi mengingatkan tersangka ini terkait anggara...,Kasus korupsi KB II belum dapat dibuktikan.,1


In [20]:
print("Count per class valid:") 
print(df_valid_t['label'].value_counts())

Count per class valid:
0    807
2    749
1    641
Name: label, dtype: int64


In [21]:
df_test_t = pd.DataFrame()
df_test_t["premise"] = df_test["premise"]
df_test_t["hypothesis"] = df_test["hypothesis"]
df_test_t["label"] = df_test["label"]
df_test_t = df_test_t.sample(frac=1).reset_index(drop=True)
display(df_test_t)

Unnamed: 0,premise,hypothesis,label
0,Analisis menunjukkan hubungan langsung antara ...,Tidak terdapat analisis yang melibatkan jumlah...,2
1,Bangkok terpilih sebagai tuan rumah Piala Thom...,Bangkok terpilih sebagai tuan rumah Piala Thom...,0
2,Pengelola Nama Domain Internet Indonesia (Pand...,Pandi sering meluncurkan domain.,1
3,"Pada tahun 1933, penduduknya berjumlah 19.000 ...","Pada tahun 1933, penduduknya berjumlah 19.000 ...",0
4,Wangi adalah sebuah desa yang berada di kecama...,Terdapat sebuah desa bernama desa Wangi.,0
...,...,...,...
2196,Lucasfilm resmi mengungkapkan sinopsis film le...,Sinopsis film lepas Star Wars tidak diungkapka...,2
2197,Asal nama perusahaan tersebut tidak diketahui ...,Reruntuhan tersebut ditemukan saat penghancura...,2
2198,"Sewaktu album ini direkam, Yuko Hara mengambil...",Yuko Hara tidak pernah mengambil cuti.,2
2199,Oerip menentang kebijakan pemerintah yang dian...,Oerip menilai kebijakan pemerintah.,0


In [22]:
print("Count per class test:") 
print(df_test_t['label'].value_counts())

Count per class test:
0    808
2    764
1    629
Name: label, dtype: int64


## Preprocessing

### Tokenization

In [23]:
# tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_TYPE)
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_TYPE)

In [24]:
class CompDataset(Dataset):
    def __init__(self, df_teacher, df_student):
        self.df_data_teacher = df_teacher
        self.df_data_student = df_student
        
    def __getitem__(self, index):
        # Teacher
        sentence_teacher_1 = self.df_data_teacher.loc[index, 'premise']
        sentence_teacher_2 = self.df_data_teacher.loc[index, 'hypothesis']
        
        encoded_dict_teacher = tokenizer.encode_plus(
            sentence_teacher_1,
            sentence_teacher_2,
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation='longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        padded_token_list_teacher = encoded_dict_teacher['input_ids'][0]
        att_mask_teacher = encoded_dict_teacher['attention_mask'][0]
        tok_type_id_teacher = encoded_dict_teacher['token_type_ids'][0]
        
        target_teacher = torch.tensor([self.df_data_teacher.loc[index, 'label']])
        lt_target_teacher = torch.LongTensor(target_teacher)
        onehot_encoded_lbl_teacher = F.one_hot(lt_target_teacher, num_classes=3) # 3 classes: entails, neutral, contradict
        
        # Student
        sentence_student_1 = self.df_data_student.loc[index, 'premise']
        sentence_student_2 = self.df_data_student.loc[index, 'hypothesis']
        
        encoded_dict_student = tokenizer.encode_plus(
            sentence_student_1,
            sentence_student_2,
            add_special_tokens = True,
            max_length = MAX_LEN,
            truncation='longest_first',
            padding = 'max_length',
            return_attention_mask = True,
            return_tensors = 'pt'
        )
        
        padded_token_list_student = encoded_dict_student['input_ids'][0]
        att_mask_student = encoded_dict_student['attention_mask'][0]
        tok_type_id_student = encoded_dict_student['token_type_ids'][0]
        
        target_student = torch.tensor([self.df_data_student.loc[index, 'label']])
        lt_target_student = torch.LongTensor(target_student)
        onehot_encoded_lbl_student = F.one_hot(lt_target_student, num_classes=3) # 3 classes: entails, neutral, contradict
        
        output = {
            "input_ids_teacher": padded_token_list_teacher, 
            "attention_mask_teacher": att_mask_teacher,
            "token_type_ids_teacher": tok_type_id_teacher,
            "lbl_teacher": onehot_encoded_lbl_teacher,
            "input_ids_student": padded_token_list_student, 
            "attention_mask_student": att_mask_student,
            "token_type_ids_student": tok_type_id_student,
            "lbl_student": onehot_encoded_lbl_student
        }
        
        return output
    
    def __len__(self):
        return len(self.df_data_teacher)

Tokenize dataset

In [25]:
train_data_cmp = CompDataset(df_train_t, df_train_student)
valid_data_cmp = CompDataset(df_valid_t, df_valid_student)
test_data_cmp = CompDataset(df_test_t, df_test_student)

Create dataloader

In [26]:
train_dataloader = DataLoader(train_data_cmp, batch_size = BATCH_SIZE)
valid_dataloader = DataLoader(valid_data_cmp, batch_size = BATCH_SIZE)
test_dataloader = DataLoader(test_data_cmp, batch_size = BATCH_SIZE)

## Model

Transfer Learning model as per Bandyopadhyay, D., et al (2022) paper

In [27]:
# bert_student_model = BertModel.from_pretrained(
#             MBERT_TYPE,
#             num_labels = 3,
#             output_hidden_states=True
#         )
# bert_student_model = bert_student_model.to(device)

In [28]:
# optimizer_student = AdamW(
#     bert_student_model.parameters(), 
#     lr=STUDENT_LRATE
# )

In [29]:
class TransferLearningPaper(PreTrainedModel):
    def __init__(self, config, lambda_kld, learningrate_student):
        super(TransferLearningPaper, self).__init__(config)
        
        self.bert_model_teacher = BertModel.from_pretrained(
            MODEL_TYPE, # using pretrained mBERT in INA language
            num_labels = 3,
            output_hidden_states=True
        )
    
        self.bert_model_student = BertModel.from_pretrained(
            MBERT_TYPE,
            num_labels = 3,
            output_hidden_states=True
        )
        self.optimizer_student = AdamW(
            self.bert_model_student.parameters(), 
            lr=learningrate_student
        )
        
        self.linear = nn.Linear(config.hidden_size, 3)  # Linear layer
        self.softmax = nn.Softmax(dim=1)  # Softmax activation
        
        self.cross_entropy = nn.CrossEntropyLoss()
        self.kld = nn.KLDivLoss(reduction='batchmean')
        
        # Initialize the weights of the linear layer
        self.linear.weight.data.normal_(mean=0.0, std=0.02)
        self.linear.bias.data.zero_()
        
        self.lambda_kld = lambda_kld
    
    def forward(self, input_ids_teacher, attention_mask_teacher, token_type_ids_teacher, lbl_teacher, input_ids_student, attention_mask_student, token_type_ids_student, lbl_student):
        # assume the label is already one-hot encoded
        
        self.bert_model_teacher.eval()
        self.bert_model_student.eval()
        
        with torch.no_grad():
            outputs_teacher = self.bert_model_teacher(input_ids=input_ids_teacher, attention_mask=attention_mask_teacher, token_type_ids=token_type_ids_teacher)
            outputs_student = self.bert_model_student(input_ids=input_ids_student, attention_mask=attention_mask_student, token_type_ids=token_type_ids_student)
        
            # take CLS token of the last hidden state
            pooled_output_teacher = outputs_teacher[0][:, 0, :]
            pooled_output_student = outputs_student[0][:, 0, :]
        
        linear_output = self.linear(pooled_output_student) # the output's logits
        softmax_linear_output = F.log_softmax(linear_output, dim=1)
        
        lbl_student = lbl_student[:,0,:].float()
        lbl_teacher = lbl_teacher[:,0,:].float()
        softmax_linear_output = softmax_linear_output.float()
        
        cross_entropy_loss = self.cross_entropy(softmax_linear_output, lbl_student)
        total_kld = self.kld(F.log_softmax(pooled_output_student, dim=1), F.softmax(pooled_output_teacher, dim=1))
        
        joint_loss = cross_entropy_loss + (self.lambda_kld * total_kld )
        
        return {"loss": joint_loss, "logits": softmax_linear_output}
    
    def update_param_student_model(self, loss):
        # Doing customized backpropagation for student's model
        self.optimizer_student.zero_grad()
        loss.backward()
        self.optimizer_student.step()

In [30]:
config = PretrainedConfig(
    problem_type = "single_label_classification",
    id2label = {
        "0": "ENTAIL",
        "1": "NEUTRAL",
        "2": "CONTRADICTION"
    },
    label2id = {
        "ENTAIL": 0,
        "NEUTRAL": 1,
        "CONTRADICTION": 2
    },
    num_labels = 3,
    hidden_size = 768,
    name_or_path = "indojavanesenli-transfer-learning",
    finetuning_task = "indonesian-javanese natural language inference"
)
print(config)
transferlearning_model = TransferLearningPaper(
    config = config,
    lambda_kld = 0.25, # antara 0.01-0.5
    learningrate_student = STUDENT_LRATE
)
transferlearning_model = transferlearning_model.to(device)

PretrainedConfig {
  "_name_or_path": "indojavanesenli-transfer-learning",
  "finetuning_task": "indonesian-javanese natural language inference",
  "hidden_size": 768,
  "id2label": {
    "0": "ENTAIL",
    "1": "NEUTRAL",
    "2": "CONTRADICTION"
  },
  "label2id": {
    "CONTRADICTION": 2,
    "ENTAIL": 0,
    "NEUTRAL": 1
  },
  "problem_type": "single_label_classification",
  "transformers_version": "4.27.3"
}



Some weights of the model checkpoint at jalaluddin94/nli_mbert were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias

## Training

Collect garbage

In [31]:
gc.collect()

81

Function to compute metrics

In [32]:
def compute_metrics(p):
    print("Computing metrics...")
    pred, labels = p
    pred = np.argmax(pred[:,0,:], axis=1)
    print("pred:", pred)
    print("labels", labels)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='weighted')
    
    print("f1 score:", f1)

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1_score": f1}

Manual training function

In [33]:
def train(the_model, train_data):
    the_model.train()
    
    batch_loss = 0
    
    for batch, data in enumerate(train_data):
        input_ids_teacher = data["input_ids_teacher"].to(device)
        attention_mask_teacher = data["attention_mask_teacher"].to(device)
        token_type_ids_teacher = data["token_type_ids_teacher"].to(device)
        lbl_teacher = data["lbl_teacher"].to(device)
        input_ids_student = data["input_ids_student"].to(device)
        attention_mask_student = data["attention_mask_student"].to(device)
        token_type_ids_student = data["token_type_ids_student"].to(device)
        lbl_student = data["lbl_student"].to(device)
        
        output = the_model(
            input_ids_teacher = input_ids_teacher, 
            attention_mask_teacher = attention_mask_teacher, 
            token_type_ids_teacher = token_type_ids_teacher, 
            lbl_teacher = lbl_teacher, 
            input_ids_student = input_ids_student, 
            attention_mask_student = attention_mask_student, 
            token_type_ids_student = token_type_ids_student, 
            lbl_student = lbl_student
        )
        
        loss_model = output["loss"]
        batch_loss += loss_model
        wandb.log({"loss": loss_model})
        
        # Backpropagation
        the_model.update_param_student_model(loss_model)
    
    training_loss = batch_loss / BATCH_SIZE
    
    return training_loss

In [34]:
def validate(the_model, valid_data):
    the_model.eval()
    
    batch_loss = 0
    
    with torch.no_grad():
        for batch, data in enumerate(valid_data):
            input_ids_teacher = data["input_ids_teacher"].to(device)
            attention_mask_teacher = data["attention_mask_teacher"].to(device)
            token_type_ids_teacher = data["token_type_ids_teacher"].to(device)
            lbl_teacher = data["lbl_teacher"].to(device)
            input_ids_student = data["input_ids_student"].to(device)
            attention_mask_student = data["attention_mask_student"].to(device)
            token_type_ids_student = data["token_type_ids_student"].to(device)
            lbl_student = data["lbl_student"].to(device)

            output = the_model(
                input_ids_teacher = input_ids_teacher, 
                attention_mask_teacher = attention_mask_teacher, 
                token_type_ids_teacher = token_type_ids_teacher, 
                lbl_teacher = lbl_teacher, 
                input_ids_student = input_ids_student, 
                attention_mask_student = attention_mask_student, 
                token_type_ids_student = token_type_ids_student, 
                lbl_student = lbl_student
            )

            loss_model = output["loss"]
            batch_loss += loss_model
            wandb.log({"eval_loss": loss_model})
    
        eval_loss = batch_loss / BATCH_SIZE
    
    return eval_loss

In [35]:
def training_sequence(the_model, train_data, valid_data, epochs):
    track_train_loss = []
    track_val_loss = []
    
    t = trange(epochs, colour="green", position=0, leave=True)
    for ep in t:
        training_loss = train(the_model, train_data)
        valid_loss = validate(the_model, valid_data)
        
        track_train_loss.append(training_loss)
        track_val_loss.append(valid_loss)
        
        t.set_description(f"Epoch [{ep + 1}/{epochs}] - Training loss: {training_loss:.2f} Validation loss: {valid_loss:.2f}")
        
        if valid_loss < min(track_val_loss) or ep + 1 == 1:
            the_model.save_pretrained(
                save_directory = MODEL_PATH + "indojavanesenli-transfer-learning"
            )
    return {
        "training_loss": track_train_loss,
        "validation_loss": track_val_loss
    }

In [36]:
training_sequence(transferlearning_model, train_dataloader, valid_dataloader, NUM_EPOCHS)

Epoch [50/50] - Training loss: 183.34 Validation loss: 38.31: 100%|[32m█████████████████[0m| 50/50 [9:31:45<00:00, 686.11s/it][0m


{'training_loss': [],
 'validation_loss': [tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, device='cuda:0'),
  tensor(38.3089, devic

In [38]:
transferlearning_model.save_pretrained(save_directory = MODEL_PATH + "indojavanesenli-transfer-learning")

In [37]:
wandb.finish()

0,1
eval_loss,▇▄▃▂▅▅▁▅█▅▇▅▆▅▆▆▆▄▇▅▅▆▃▃█▄▇▆▆▆▆▆▇▄▄▆▄▅▇█
loss,▇▄▃▄▅█▆▃▄█▃▆▅▂▇▆▅▅▂▇▆█▅▆▆█▅▅█▅▅▁▇▅▆▃▄▅▇█

0,1
eval_loss,1.18867
loss,1.31325


Training using Trainer from Huggingface (couldn't work)

In [30]:
# training_args = TrainingArguments(
#     output_dir=MODEL_PATH + "indojavanesenli-transfer-learning/",
#     save_strategy="no", # no
#     evaluation_strategy="epoch",
#     logging_strategy="epoch",
#     learning_rate=L_RATE,
#     per_device_train_batch_size=BATCH_SIZE,
#     per_device_eval_batch_size=BATCH_SIZE,
#     overwrite_output_dir=True,
#     num_train_epochs=NUM_EPOCHS,
#     weight_decay=LAMBDA_L2,
#     hub_token=HF_TOKEN,
#     report_to="wandb",
#     gradient_accumulation_steps=1000,
#     push_to_hub=False,
#     run_name="transfer-learning-paper-lambda-0.25"
# )

# trainer = Trainer(
#     model=transferlearning_model.to(device),
#     args=training_args,
#     train_dataset=train_data_cmp,
#     eval_dataset=valid_data_cmp,
#     compute_metrics=compute_metrics
# )

# trainer.train()



lbl_teacher.size: torch.Size([6, 3])
lbl_student.size: torch.Size([6, 3])


RuntimeError: The size of tensor a (3) must match the size of tensor b (768) at non-singleton dimension 1

In [None]:
# fin_eval = trainer.evaluate()
# wandb.log({"f1_score": fin_eval["eval_f1_score"], "eval_loss": fin_eval["eval_loss"], "accuracy": fin_eval["eval_accuracy"], "precision": fin_eval["eval_precision"], "recall": fin_eval["eval_recall"]})

In [None]:
# trainer.save_model()