<h1>Preparing training, validation and testing dataset, and loading the BD-SHS trained model<h1>

In [5]:
!pip install --quiet transformers

In [6]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

In [7]:
#defining some hyperparameters
max_number_input_tokens=256
batch_size_training = 16
first_dropout_rate = 0.3
hidden_output = 768
bert_model_name = "sagorsarker/bangla-bert-base"
adam_opt_lr = 3e-5
scheduler_step = 1
scheduler_gamma = 0.8
epochs = 10
classes = 2
need_split_dataset=False

In [8]:
from google.colab import drive
drive.mount('/content/drive')
DirPath = ('/content/drive/My Drive/Test/')
Finetuned_model_path = DirPath+bert_model_name+"_CustomBertBengaliFullDataset6epoch885044valacc.pth"
CollectedDatasetFileName = "Final_data.csv"
CollectedDatasetPath = DirPath+"EMNLP/"+CollectedDatasetFileName
SplittedTrainFileName = "train.csv"
SplittedValFileName = "dev.csv"
SplittedTrainDataPath = DirPath+"EMNLP/"+SplittedTrainFileName
SplittedValDataPath = DirPath+"EMNLP/"+SplittedValFileName

Mounted at /content/drive


In [9]:
def interchange(df_train,pos,label):
  #setting the first sample to be with label '0'
  zero_index = df_train[df_train['label'] == label].index[0]
  first_index=pos
  # interchange the samples
  df_train.iloc[[first_index, zero_index]] = df_train.iloc[[zero_index, first_index]]
  return df_train

In [10]:
def balanceclasses(df_train):
  class_counts = df_train['label'].value_counts()
  min_count = class_counts.max()

  # Create new DataFrames for each class with fewer samples
  new_dfs = []
  for label, count in class_counts.items():
    if count == min_count:
        continue
    df_label = df_train[df_train['label'] == label]
    num_copies = min_count // count
    new_df_label = pd.concat([df_label] * num_copies, ignore_index=True)
    new_df_label = new_df_label.head(min_count-count)
    #print(new_df_label.head(10))
    new_dfs.append(new_df_label)

  # Concatenate the new DataFrames with the original DataFrame
  df_balanced = pd.concat([df_train] + new_dfs, ignore_index=True).sample(frac=1).reset_index(drop=True)
  return df_balanced

In [11]:
from sklearn.model_selection import train_test_split

#splitting the dataset and saving
if need_split_dataset==True:
  #dataset loading
  df = pd.read_csv(CollectedDatasetPath)[ ['Text','label'] ]
  print(f'df label counts\n',df['label'].value_counts())
  # check if there is any NaN value in the dataframe
  print(f'null values: {df.isna().sum()}')

  #null indices
  null_index = df.index[df.isna().any(axis=1)]
  print(f'null indices: {null_index}')

  #dropping null values
  df = df.dropna()

  df_train, df_val = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
  df_train.to_csv(SplittedTrainDataPath)
  df_val.to_csv(SplittedValDataPath)
else:
  df_train = pd.read_csv(SplittedTrainDataPath)[ ['text','label'] ]
  df_val = pd.read_csv(SplittedValDataPath)[ ['text','label'] ]

#df_train = pd.concat([df_train, df_val], ignore_index=True)

# count the number of each unique label in train and validation dataframes
train_label_counts = df_train['label'].value_counts()
val_label_counts = df_val['label'].value_counts()

#setting the first sample to be with label '0'
zero_index = df_train[df_train['label'] == 0].index[0]
first_index=0
# interchange the samples
df_train.iloc[[first_index, zero_index]] = df_train.iloc[[zero_index, first_index]]

print(df_train.shape)
print(df_val.shape)
print('Train label counts:\n', train_label_counts)
print('Validation label counts:\n', val_label_counts)

print("\n after making copies:")
#balance all classes making copies
df_train = balanceclasses(df_train)
print(df_train['label'].value_counts())

#setting the first sample to be with label '0'
zero_index = df_train[df_train['label'] == 0].index[0]
first_index=0
# interchange the samples
df_train.iloc[[first_index, zero_index]] = df_train.iloc[[zero_index, first_index]]

#setting the first sample to be with label '0'
zero_index = df_train[df_train['label'] == 1].index[0]
first_index=1
# interchange the samples
df_train.iloc[[first_index, zero_index]] = df_train.iloc[[zero_index, first_index]]

print(df_train)
print(df_val)

(2700, 2)
(1330, 2)
Train label counts:
 0    1389
1     922
2     389
Name: label, dtype: int64
Validation label counts:
 0    717
1    417
2    196
Name: label, dtype: int64

 after making copies:
2    1389
0    1389
1    1389
Name: label, dtype: int64
                                                   text  label
0     কে নিবে এই দায় কলেজ শিক্ষার্থীরা না ব্যবসায়ী...      0
1     শুয়েরের বাচচা তোরা কি দেশের শাসন ব্যবস্থা মানি...      1
2     যে ব্যক্তি এই ঘটনা ঘটিয়েছে তাকে গুলি করে হত্যা...      2
3     এখানে জা সব আওয়ামিলীগের দালাল আপু ভোট দিতে না ...      1
4     হিজাব পড়লে যদি তালেবান উপাধি দেয়া হয়।তাহলে ...      2
...                                                 ...    ...
4162  হে আল্লাহ আপনি ভাৰতীয় মুসলিম ভাই বোন দেৰকে ৰক্...      0
4163  এখনো যদি বিজিপির উপরে গুলি চুপ থাকে তহলে তাদের...      1
4164                                      ঠিক বলছেন ভাই      0
4165                   দালালের বাচ্চারা কার ভয় পাস তোরা      1
4166  তাহলে তোদের যে ২ কোটি হিন্দু বাংলাদেশে আছে ওদে

In [12]:
class NewsDatasets(Dataset):
    def __init__(self, data, max_length=max_number_input_tokens):
        self.data = data

        self.config = {
            "max_length": max_length,
            "padding": "max_length",
            "return_tensors": "pt",
            "truncation": True,
            "add_special_tokens": True,
            "truncation_strategy":"longest_first"
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        value = self.data.iloc[idx]
        return value['text'] , value['label']

In [13]:
training_data = NewsDatasets(df_train)
train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)

val_data = NewsDatasets(df_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size_training, shuffle=True)

test_data = NewsDatasets(df_val)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=True)

In [14]:
class HateSpeechBert(nn.Module):

    def __init__(self, bert):
        super(HateSpeechBert, self).__init__()

        self.bert = bert

        # dropout layer
        self.dropout = nn.Dropout(first_dropout_rate)

        # relu activation function
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()

        # dense layer 1
        self.fc1 = nn.Linear(hidden_output*2, hidden_output)

        #dense layer 2
        self.fc2 = nn.Linear(hidden_output, 128)

        # dense layer 2 (Output layer)
        self.fc3 = nn.Linear(128, 2)

        #softmax
        self.softmax = nn.Softmax(dim=1)

    # define the forward pass
    def forward(self, input_ids, token_type_ids, attention_mask):
        # pass the inputs to the model
        out = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        mean, _ = torch.max(out[0], 1)
        x= torch.cat((mean,out[1]), dim=1)

        x = self.dropout(x)

        x = self.fc1(x)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.relu(x)

        # output layer
        x = self.fc3(x)
        x = self.softmax(x)

        return x

In [15]:
class BERTBengali(nn.Module):
    def __init__(self, bert):
        super(BERTBengali, self).__init__()
        #self.bert = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
        self.bert = bert
        self.bert_drop = nn.Dropout(0.2)
        self.out = nn.Linear(hidden_output, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        bo = self.bert_drop(output[1])

        output = self.out(bo)
        return output

In [16]:
class BERTBengaliPooler(nn.Module):
    def __init__(self, bert):
        super(BERTBengaliPooler, self).__init__()
        self.bert = bert
        #self.bert.pooler.dense = nn.Linear(bert.config.hidden_size, bert.config.hidden_size)
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.out = nn.Linear(bert.config.hidden_size, classes)
        #softmax
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output
        bo = self.bert_drop(pooled_output)

        output = self.out(bo)
        output = self.softmax(bo)
        return output

In [17]:
class CustomBERTBengali(nn.Module):
    def __init__(self, bert):
        super(CustomBERTBengali, self).__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.tanh = nn.Tanh()
        self.out = nn.Linear(hidden_output * 3, classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        o1 = outputs.hidden_states[-1]
        o2 = outputs.pooler_output
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        pooled_output = o2
        cat = torch.cat((apool, mpool, pooled_output), 1)
        bo = self.bert_drop(cat)
        logits = self.out(bo)
        logits = self.softmax(logits)
        return logits

In [18]:
class BERTBengaliTwo(nn.Module):
    def __init__(self, bert):
        super(BERTBengaliTwo, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l0 =  nn.Linear(hidden_output * 2, classes)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        out = torch.cat((outputs.hidden_states[-1], outputs.hidden_states[-2]), dim=-1)
        out = self.drop_out(out)
        out = out[:,0,:]
        logits = self.l0(out)
        logits = self.softmax(logits)
        return logits

In [19]:
class BERTBengaliLastTwoPooler(nn.Module):
    def __init__(self, bert):
        super(BERTBengaliLastTwoPooler, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l0 =  nn.Linear(hidden_output * 3, classes)
        #torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        mpool, _ = torch.max(outputs.hidden_states[-1], 1)
        out = torch.cat((mpool, outputs.hidden_states[-2][:,0,:],outputs.pooler_output), dim=-1)
        out = self.drop_out(out)
        #out = out[:,0,:]
        logits = self.l0(out)
        logits = self.softmax(logits)
        return logits

In [20]:
bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#creating the structure to contain finetuned bert
struct_model = CustomBERTBengali(bert)
struct_model.to(device)

#loading the finetuned model which have leaned necessary info from other domain
struct_model.load_state_dict(torch.load(Finetuned_model_path))

# Access the bert model
finetuned_bert_base = struct_model.bert

<h1>Creating model architechture for VITD training model, freezing necessary layers, loding parameters into the architechture from BD-SHS trained model and training on VITD dataset. Transfer learning second step, first Training session <h1>

In [22]:
###Hyperparameter for the new model
#defining some hyperparameters
max_number_input_tokens=256
batch_size_training = 8
first_dropout_rate = 0.0
hidden_output = 768
bert_model_name = "sagorsarker/bangla-bert-base"
adam_opt_lr = 3e-5
scheduler_step = 1
scheduler_gamma = 0.98
epochs = 100
classes = 3
#need_split_dataset=False

In [23]:
class NewsDatasets(Dataset):
    def __init__(self, data, max_length=max_number_input_tokens):
        self.data = data

        self.config = {
            "max_length": max_length,
            "padding": "max_length",
            "return_tensors": "pt",
            "truncation": True,
            "add_special_tokens": True,
            "truncation_strategy":"longest_first"
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        value = self.data.iloc[idx]
        return value['text'] , value['label']

In [24]:
training_data = NewsDatasets(df_train)
train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=False)

val_data = NewsDatasets(df_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size_training, shuffle=True)

test_data = NewsDatasets(df_val)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=True)

In [25]:
# #model for finetuning collected data
# class BERTBengaliLastTwoPooler(nn.Module):
#     def __init__(self, bert):
#         super(BERTBengaliLastTwoPooler, self).__init__()
#         self.bert = bert
#         self.drop_out = nn.Dropout(first_dropout_rate)
#         self.l0 =  nn.Linear(hidden_output * 3, classes)
#         #torch.nn.init.normal_(self.l0.weight, std=0.02)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids
#         )
#         mpool, _ = torch.max(outputs.hidden_states[-1], 1)
#         out = torch.cat((mpool, outputs.hidden_states[-2][:,0,:],outputs.pooler_output), dim=-1)
#         out = self.drop_out(out)
#         #out = out[:,0,:]
#         logits = self.l0(out)
#         logits = self.softmax(logits)
#         return logits

In [26]:
# class CustomBERTBengali(nn.Module):
#     def __init__(self, bert):
#         super(CustomBERTBengali, self).__init__()
#         self.bert = bert
#         self.bert_drop = nn.Dropout(first_dropout_rate)
#         self.tanh = nn.Tanh()
#         self.out = nn.Linear(hidden_output * 3, classes)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids
#         )
#         o1 = outputs.hidden_states[-1]
#         o2 = outputs.pooler_output
#         apool = torch.mean(o1, 1)
#         mpool, _ = torch.max(o1, 1)
#         pooled_output = o2
#         cat = torch.cat((apool, mpool, pooled_output), 1)
#         bo = self.bert_drop(cat)
#         logits = self.out(bo)
#         #logits = self.softmax(logits)
#         return logits

In [27]:
#model for finetuning collected data
class BERTBengaliLastTwoPoolerFreeze(nn.Module):
    def __init__(self, bert):
        super(BERTBengaliLastTwoPoolerFreeze, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l2 = nn.Linear(hidden_output * 3, hidden_output * 2)
        self.activation = nn.Tanh()
        self.l1 = nn.Linear(hidden_output * 2, hidden_output * 2)
        self.activation = nn.Tanh()
        self.l0 = nn.Linear(hidden_output * 2, classes)
        #torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        mpool, _ = torch.max(outputs.hidden_states[-1], 1)
        out = torch.cat((outputs.hidden_states[-2][:,0,:], mpool,outputs.pooler_output), dim=-1)
        out = self.drop_out(out)
        out = self.l2(out)
        out = self.activation(out)
        out = self.l1(out)
        out = self.activation(out)
        logits = self.l0(out)
        #prob = self.softmax(logits)
        return logits


In [28]:
#model for finetuning collected data
class BERTBengaliLastTwoPoolerFreezePrev(nn.Module):
    def __init__(self, bert):
        super(BERTBengaliLastTwoPoolerFreezePrev, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l1 = nn.Linear(hidden_output * 3, hidden_output * 2)
        self.activation = nn.Tanh()
        self.l0 = nn.Linear(hidden_output * 2, classes)
        #torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        mpool, _ = torch.max(outputs.hidden_states[-1], 1)
        out = torch.cat((outputs.hidden_states[-2][:,0,:], mpool,outputs.pooler_output), dim=-1)
        out = self.drop_out(out)
        out = self.l1(out)
        out = self.activation(out)
        logits = self.l0(out)
        #prob = self.softmax(logits)
        return logits


In [29]:
model = BERTBengaliLastTwoPoolerFreeze(finetuned_bert_base)
# model2Forlastlayers = BERTBengaliLastTwoPoolerFreezePrev(finetuned_bert_base)

model.to(device)
# model2Forlastlayers.to(device)
# model2Forlastlayers.load_state_dict(torch.load(DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_acc1_frozen_acc1.pth"))

# model.l0 = model2Forlastlayers.l0
# model.l2 = model2Forlastlayers.l1
# model.bert = struct_model.bert

model.load_state_dict(torch.load(DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_finalhs_frozen_acc1_step2.pth"))

for params in model.bert.parameters():
  params.requires_grad = True
for params in model.bert.encoder.parameters():
  params.requires_grad = True
for params in model.l2.parameters():
  params.requires_grad = True
for params in model.l1.parameters():
  params.requires_grad = True
for params in model.l0.parameters():
  params.requires_grad = True

for name, param in model.named_parameters():
  if param.requires_grad:
      print(f"name: {name} is trainable")
  else:
      print(f"name: {name} is non-trainable")

name: bert.embeddings.word_embeddings.weight is trainable
name: bert.embeddings.position_embeddings.weight is trainable
name: bert.embeddings.token_type_embeddings.weight is trainable
name: bert.embeddings.LayerNorm.weight is trainable
name: bert.embeddings.LayerNorm.bias is trainable
name: bert.encoder.layer.0.attention.self.query.weight is trainable
name: bert.encoder.layer.0.attention.self.query.bias is trainable
name: bert.encoder.layer.0.attention.self.key.weight is trainable
name: bert.encoder.layer.0.attention.self.key.bias is trainable
name: bert.encoder.layer.0.attention.self.value.weight is trainable
name: bert.encoder.layer.0.attention.self.value.bias is trainable
name: bert.encoder.layer.0.attention.output.dense.weight is trainable
name: bert.encoder.layer.0.attention.output.dense.bias is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.weight is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.bias is trainable
name: bert.encoder.layer.0.inter

In [30]:
#testing if the input of model works before starting training
s = "আমি বাংলায় গান গাই। [SEP]"
t = tokenizer.encode_plus(s, return_tensors="pt").to(device)
print(t)
out = model(**t)
print(out)

{'input_ids': tensor([[  101,  2169,  2492,  9294,  2552, 13985,  1014,   102,   102]],
       device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[ 0.8516,  0.2068, -0.3792]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


In [31]:
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=adam_opt_lr)
criterion = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=scheduler_step, gamma=scheduler_gamma)

In [32]:
def train(model, dataloader, optimizer, criterion, config):
    model.train()  # prep model for training
    train_loss = 0
    for batch in tqdm(dataloader):
        text, labels = batch

        model.zero_grad()

        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        #labels = labels.to(device)
        labels = labels.to(device, dtype=torch.long)  # Convert labels to torch.long

        # move things to model
        logs = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(logs, labels)
        #print("successfully calculated criterion in train!")
        train_loss += loss.item() * input_ids.size(0)
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return train_loss

In [33]:
def evaluate(model, dataloader, criterion, config):
    total = 0
    correct = 0
    valid_loss = 0.0
    label_0_TP = 0
    label_0_TN = 0
    label_0_FP = 0
    label_0_FN = 0

    label_1_TP = 0
    label_1_TN = 0
    label_1_FP = 0
    label_1_FN = 0

    label_2_TP = 0
    label_2_TN = 0
    label_2_FP = 0
    label_2_FN = 0

    model.eval()  # prep model for evaluation
    for batch in dataloader:
        text, labels = batch
        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device, dtype=torch.long)

        # move things to model
        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss_p = criterion(output, labels)
        # update running validation loss
        valid_loss += loss_p.item() * input_ids.size(0)
        # calculate accuracy
        proba = torch.exp(output)
        top_p, top_class = proba.topk(1, dim=1)
        equals = top_class == labels.view(*top_class.shape)
        # accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        #print(f'predicted: {predicted} labels: {labels}')
        label_0_TP += ((predicted == 0) & (labels == 0)).sum().item()
        label_0_TN += ((predicted != 0) & (labels != 0)).sum().item()
        label_0_FP += ((predicted == 0) & (labels != 0)).sum().item()
        label_0_FN += ((predicted != 0) & (labels == 0)).sum().item()

        label_1_TP += ((predicted == 1) & (labels == 1)).sum().item()
        label_1_TN += ((predicted != 1) & (labels != 1)).sum().item()
        label_1_FP += ((predicted == 1) & (labels != 1)).sum().item()
        label_1_FN += ((predicted != 1) & (labels == 1)).sum().item()

        label_2_TP += ((predicted == 2) & (labels == 2)).sum().item()
        label_2_TN += ((predicted != 2) & (labels != 2)).sum().item()
        label_2_FP += ((predicted == 2) & (labels != 2)).sum().item()
        label_2_FN += ((predicted != 2) & (labels == 2)).sum().item()

    return total, correct, valid_loss, label_0_TP, label_0_TN, label_0_FP, label_0_FN, label_1_TP, label_1_TN, label_1_FP, label_1_FN, label_2_TP, label_2_TN, label_2_FP, label_2_FN


In [34]:

tokenizer_config = {
    "max_length": max_number_input_tokens,
    "padding": "max_length",
    "return_tensors": "pt",
    "truncation": True,
    "add_special_tokens": True,
     "truncation_strategy":"longest_first"
}

In [None]:
train_loss_data, valid_loss_data = [], []
valid_loss_min = np.Inf
since = time.time()
best_loss = np.inf
best_acc=0
sml = 1e-10
best_f1=0.4390

for epoch in range(epochs):


    if epoch==1:
      training_data = NewsDatasets(df_train)
      train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    total = 0
    correct = 0
    label_0_TP = 0
    label_0_TN = 0
    label_0_FP = 0
    label_0_FN = 0

    label_1_TP = 0
    label_1_TN = 0
    label_1_FP = 0
    label_1_FN = 0

    label_2_TP = 0
    label_2_TN = 0
    label_2_FP = 0
    label_2_FN = 0


    e_since = time.time()

    # Train Model
    train_loss += train(model, train_dataloader, optimizer, criterion, tokenizer_config)
    # Now Evaluate
    out = evaluate(model, val_dataloader, criterion, tokenizer_config)
    total += out[0]
    correct += out[1]
    valid_loss += out[2]
    label_0_TP += out[3]
    label_0_TN += out[4]
    label_0_FP += out[5]
    label_0_FN += out[6]

    label_1_TP += out[7]
    label_1_TN += out[8]
    label_1_FP += out[9]
    label_1_FN += out[10]

    label_2_TP += out[11]
    label_2_TN += out[12]
    label_2_FP += out[13]
    label_2_FN += out[14]

    # Calculate precision, recall, and F1-score for each class
    label_0_precision = label_0_TP / (label_0_TP + label_0_FP+sml)
    label_0_recall = label_0_TP / (label_0_TP + label_0_FN+sml)
    label_0_f1_score = 2 * (label_0_precision * label_0_recall) / (label_0_precision + label_0_recall+sml)

    label_1_precision = label_1_TP / (label_1_TP + label_1_FP+sml)
    label_1_recall = label_1_TP / (label_1_TP + label_1_FN+sml)
    label_1_f1_score = 2 * (label_1_precision * label_1_recall) / (label_1_precision + label_1_recall+sml)

    label_2_precision = label_2_TP / (label_2_TP + label_2_FP+sml)
    label_2_recall = label_2_TP / (label_2_TP + label_2_FN+sml)
    label_2_f1_score = 2 * (label_2_precision * label_2_recall) / (label_2_precision + label_2_recall+sml)

    # Calculate combined F1-score
    combined_f1_score = (label_0_f1_score + label_1_f1_score + label_2_f1_score) / 3

    # Calculate micro TP, TN, FP, FN values
    micro_TP = label_0_TP + label_1_TP + label_2_TP
    micro_TN = label_0_TN + label_1_TN + label_2_TN
    micro_FP = label_0_FP + label_1_FP + label_2_FP
    micro_FN = label_0_FN + label_1_FN + label_2_FN

    # Calculate micro precision, recall, and F1 score
    micro_precision = micro_TP / (micro_TP + micro_FP)
    micro_recall = micro_TP / (micro_TP + micro_FN)
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)

    scheduler.step()

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(val_dataloader.dataset)

    val_acc=correct / total * 100

    # calculate train loss and running loss
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    if combined_f1_score > best_f1:
        best_f1 = combined_f1_score
        torch.save(model.state_dict(), DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_finalhs_unfrozenfrozen_acc1_sub.pth")
        print(f'saved on epoch: {epoch+1}')

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tVal Accuracy: {:.4f}".format(correct / total * 100))
    print("\tLabel 0 Precision: {:.4f}\tLabel 0 Recall: {:.4f}\tLabel 0 F1-score: {:.4f}\n"
      "\tLabel 1 Precision: {:.4f}\tLabel 1 Recall: {:.4f}\tLabel 1 F1-score: {:.4f}\n"
      "\tLabel 2 Precision: {:.4f}\tLabel 2 Recall: {:.4f}\tLabel 2 F1-score: {:.4f}\n"
      "\tCombined F1-score: {:.4f}".format(label_0_precision, label_0_recall, label_0_f1_score,
                                            label_1_precision, label_1_recall, label_1_f1_score,
                                            label_2_precision, label_2_recall, label_2_f1_score,
                                            combined_f1_score))
    print(f'micro precision: {micro_precision}, Micro recall: {micro_recall}, micro f1: {micro_f1}')

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/100


  0%|          | 0/521 [00:00<?, ?it/s]

saved on epoch: 1
	Train loss:0.968731.. 	Valid Loss:0.898504.. 	Val Accuracy: 56.0150
	Label 0 Precision: 0.7207	Label 0 Recall: 0.6695	Label 0 F1-score: 0.6941
	Label 1 Precision: 0.6723	Label 1 Recall: 0.2854	Label 1 F1-score: 0.4007
	Label 2 Precision: 0.2998	Label 2 Recall: 0.7449	Label 2 F1-score: 0.4275
	Combined F1-score: 0.5074
micro precision: 0.5601503759398496, Micro recall: 0.5601503759398496, micro f1: 0.5601503759398496
Epoch: 2/100


  0%|          | 0/521 [00:00<?, ?it/s]

saved on epoch: 2
	Train loss:0.799102.. 	Valid Loss:0.835134.. 	Val Accuracy: 67.5940
	Label 0 Precision: 0.7300	Label 0 Recall: 0.7880	Label 0 F1-score: 0.7579
	Label 1 Precision: 0.6215	Label 1 Recall: 0.5827	Label 1 F1-score: 0.6015
	Label 2 Precision: 0.5515	Label 2 Recall: 0.4643	Label 2 F1-score: 0.5042
	Combined F1-score: 0.6212
micro precision: 0.6759398496240602, Micro recall: 0.6759398496240602, micro f1: 0.6759398496240602
Epoch: 3/100


  0%|          | 0/521 [00:00<?, ?it/s]

saved on epoch: 3
	Train loss:0.652156.. 	Valid Loss:0.854174.. 	Val Accuracy: 71.1278
	Label 0 Precision: 0.7327	Label 0 Recall: 0.8563	Label 0 F1-score: 0.7897
	Label 1 Precision: 0.6658	Label 1 Recall: 0.6019	Label 1 F1-score: 0.6322
	Label 2 Precision: 0.7043	Label 2 Recall: 0.4133	Label 2 F1-score: 0.5209
	Combined F1-score: 0.6476
micro precision: 0.7112781954887218, Micro recall: 0.7112781954887218, micro f1: 0.7112781954887217
Epoch: 4/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.558824.. 	Valid Loss:1.039654.. 	Val Accuracy: 66.3910
	Label 0 Precision: 0.8191	Label 0 Recall: 0.6569	Label 0 F1-score: 0.7291
	Label 1 Precision: 0.5706	Label 1 Recall: 0.7170	Label 1 F1-score: 0.6355
	Label 2 Precision: 0.4892	Label 2 Recall: 0.5765	Label 2 F1-score: 0.5293
	Combined F1-score: 0.6313
micro precision: 0.6639097744360902, Micro recall: 0.6639097744360902, micro f1: 0.6639097744360902
Epoch: 5/100


  0%|          | 0/521 [00:00<?, ?it/s]

saved on epoch: 5
	Train loss:0.508669.. 	Valid Loss:1.067359.. 	Val Accuracy: 70.7519
	Label 0 Precision: 0.7385	Label 0 Recall: 0.8312	Label 0 F1-score: 0.7822
	Label 1 Precision: 0.6912	Label 1 Recall: 0.5635	Label 1 F1-score: 0.6209
	Label 2 Precision: 0.6011	Label 2 Recall: 0.5612	Label 2 F1-score: 0.5805
	Combined F1-score: 0.6612
micro precision: 0.7075187969924812, Micro recall: 0.7075187969924812, micro f1: 0.7075187969924812
Epoch: 6/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.469576.. 	Valid Loss:1.296007.. 	Val Accuracy: 70.3759
	Label 0 Precision: 0.7406	Label 0 Recall: 0.8285	Label 0 F1-score: 0.7821
	Label 1 Precision: 0.6693	Label 1 Recall: 0.6019	Label 1 F1-score: 0.6338
	Label 2 Precision: 0.5948	Label 2 Recall: 0.4643	Label 2 F1-score: 0.5215
	Combined F1-score: 0.6458
micro precision: 0.7037593984962406, Micro recall: 0.7037593984962406, micro f1: 0.7037593984962406
Epoch: 7/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.450402.. 	Valid Loss:1.195408.. 	Val Accuracy: 71.0526
	Label 0 Precision: 0.7398	Label 0 Recall: 0.8368	Label 0 F1-score: 0.7853
	Label 1 Precision: 0.6804	Label 1 Recall: 0.5923	Label 1 F1-score: 0.6333
	Label 2 Precision: 0.6282	Label 2 Recall: 0.5000	Label 2 F1-score: 0.5568
	Combined F1-score: 0.6585
micro precision: 0.7105263157894737, Micro recall: 0.7105263157894737, micro f1: 0.7105263157894737
Epoch: 8/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.423144.. 	Valid Loss:1.438659.. 	Val Accuracy: 68.2707
	Label 0 Precision: 0.7443	Label 0 Recall: 0.7796	Label 0 F1-score: 0.7616
	Label 1 Precision: 0.6076	Label 1 Recall: 0.6163	Label 1 F1-score: 0.6119
	Label 2 Precision: 0.5897	Label 2 Recall: 0.4694	Label 2 F1-score: 0.5227
	Combined F1-score: 0.6321
micro precision: 0.6827067669172933, Micro recall: 0.6827067669172933, micro f1: 0.6827067669172933
Epoch: 9/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.438099.. 	Valid Loss:1.524104.. 	Val Accuracy: 68.1203
	Label 0 Precision: 0.7416	Label 0 Recall: 0.7727	Label 0 F1-score: 0.7568
	Label 1 Precision: 0.6497	Label 1 Recall: 0.5827	Label 1 F1-score: 0.6144
	Label 2 Precision: 0.5215	Label 2 Recall: 0.5561	Label 2 F1-score: 0.5383
	Combined F1-score: 0.6365
micro precision: 0.681203007518797, Micro recall: 0.681203007518797, micro f1: 0.681203007518797
Epoch: 10/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.432641.. 	Valid Loss:1.426394.. 	Val Accuracy: 68.3459
	Label 0 Precision: 0.7539	Label 0 Recall: 0.7476	Label 0 F1-score: 0.7507
	Label 1 Precision: 0.5940	Label 1 Recall: 0.6595	Label 1 F1-score: 0.6250
	Label 2 Precision: 0.6282	Label 2 Recall: 0.5000	Label 2 F1-score: 0.5568
	Combined F1-score: 0.6442
micro precision: 0.6834586466165413, Micro recall: 0.6834586466165413, micro f1: 0.6834586466165413
Epoch: 11/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.438864.. 	Valid Loss:1.372371.. 	Val Accuracy: 68.9474
	Label 0 Precision: 0.7548	Label 0 Recall: 0.7727	Label 0 F1-score: 0.7636
	Label 1 Precision: 0.6129	Label 1 Recall: 0.6379	Label 1 F1-score: 0.6251
	Label 2 Precision: 0.5988	Label 2 Recall: 0.4949	Label 2 F1-score: 0.5419
	Combined F1-score: 0.6436
micro precision: 0.6894736842105263, Micro recall: 0.6894736842105263, micro f1: 0.6894736842105263
Epoch: 12/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.457303.. 	Valid Loss:1.530546.. 	Val Accuracy: 68.5714
	Label 0 Precision: 0.7574	Label 0 Recall: 0.7531	Label 0 F1-score: 0.7552
	Label 1 Precision: 0.6799	Label 1 Recall: 0.5755	Label 1 F1-score: 0.6234
	Label 2 Precision: 0.5000	Label 2 Recall: 0.6735	Label 2 F1-score: 0.5739
	Combined F1-score: 0.6508
micro precision: 0.6857142857142857, Micro recall: 0.6857142857142857, micro f1: 0.6857142857142857
Epoch: 13/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.456860.. 	Valid Loss:1.414020.. 	Val Accuracy: 67.1429
	Label 0 Precision: 0.7743	Label 0 Recall: 0.6890	Label 0 F1-score: 0.7292
	Label 1 Precision: 0.5483	Label 1 Recall: 0.7626	Label 1 F1-score: 0.6379
	Label 2 Precision: 0.7232	Label 2 Recall: 0.4133	Label 2 F1-score: 0.5260
	Combined F1-score: 0.6310
micro precision: 0.6714285714285714, Micro recall: 0.6714285714285714, micro f1: 0.6714285714285714
Epoch: 14/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.433308.. 	Valid Loss:1.514209.. 	Val Accuracy: 66.3158
	Label 0 Precision: 0.8121	Label 0 Recall: 0.6388	Label 0 F1-score: 0.7151
	Label 1 Precision: 0.5258	Label 1 Recall: 0.8058	Label 1 F1-score: 0.6364
	Label 2 Precision: 0.6929	Label 2 Recall: 0.4490	Label 2 F1-score: 0.5449
	Combined F1-score: 0.6321
micro precision: 0.6631578947368421, Micro recall: 0.6631578947368421, micro f1: 0.6631578947368421
Epoch: 15/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.504319.. 	Valid Loss:1.308715.. 	Val Accuracy: 69.4737
	Label 0 Precision: 0.7559	Label 0 Recall: 0.7559	Label 0 F1-score: 0.7559
	Label 1 Precision: 0.6122	Label 1 Recall: 0.6739	Label 1 F1-score: 0.6416
	Label 2 Precision: 0.6558	Label 2 Recall: 0.5153	Label 2 F1-score: 0.5771
	Combined F1-score: 0.6582
micro precision: 0.6947368421052632, Micro recall: 0.6947368421052632, micro f1: 0.6947368421052632
Epoch: 16/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.455743.. 	Valid Loss:1.401153.. 	Val Accuracy: 69.0977
	Label 0 Precision: 0.7000	Label 0 Recall: 0.8591	Label 0 F1-score: 0.7714
	Label 1 Precision: 0.6615	Label 1 Recall: 0.5156	Label 1 F1-score: 0.5795
	Label 2 Precision: 0.7040	Label 2 Recall: 0.4490	Label 2 F1-score: 0.5483
	Combined F1-score: 0.6331
micro precision: 0.6909774436090226, Micro recall: 0.6909774436090226, micro f1: 0.6909774436090226
Epoch: 17/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.493745.. 	Valid Loss:1.224707.. 	Val Accuracy: 69.1729
	Label 0 Precision: 0.7584	Label 0 Recall: 0.7531	Label 0 F1-score: 0.7558
	Label 1 Precision: 0.6184	Label 1 Recall: 0.6451	Label 1 F1-score: 0.6315
	Label 2 Precision: 0.6066	Label 2 Recall: 0.5663	Label 2 F1-score: 0.5858
	Combined F1-score: 0.6577
micro precision: 0.6917293233082706, Micro recall: 0.6917293233082706, micro f1: 0.6917293233082706
Epoch: 18/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.498155.. 	Valid Loss:1.370210.. 	Val Accuracy: 67.7444
	Label 0 Precision: 0.7773	Label 0 Recall: 0.7155	Label 0 F1-score: 0.7451
	Label 1 Precision: 0.5803	Label 1 Recall: 0.6930	Label 1 F1-score: 0.6317
	Label 2 Precision: 0.5756	Label 2 Recall: 0.5051	Label 2 F1-score: 0.5380
	Combined F1-score: 0.6383
micro precision: 0.6774436090225564, Micro recall: 0.6774436090225564, micro f1: 0.6774436090225564
Epoch: 19/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.452578.. 	Valid Loss:1.288963.. 	Val Accuracy: 67.9699
	Label 0 Precision: 0.7888	Label 0 Recall: 0.6876	Label 0 F1-score: 0.7347
	Label 1 Precision: 0.5644	Label 1 Recall: 0.7986	Label 1 F1-score: 0.6614
	Label 2 Precision: 0.6783	Label 2 Recall: 0.3980	Label 2 F1-score: 0.5016
	Combined F1-score: 0.6326
micro precision: 0.6796992481203008, Micro recall: 0.6796992481203008, micro f1: 0.6796992481203008
Epoch: 20/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.457247.. 	Valid Loss:1.248994.. 	Val Accuracy: 69.2481
	Label 0 Precision: 0.7547	Label 0 Recall: 0.7810	Label 0 F1-score: 0.7676
	Label 1 Precision: 0.6481	Label 1 Recall: 0.6139	Label 1 F1-score: 0.6305
	Label 2 Precision: 0.5440	Label 2 Recall: 0.5357	Label 2 F1-score: 0.5398
	Combined F1-score: 0.6460
micro precision: 0.6924812030075188, Micro recall: 0.6924812030075188, micro f1: 0.6924812030075188
Epoch: 21/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.463915.. 	Valid Loss:1.663263.. 	Val Accuracy: 62.3308
	Label 0 Precision: 0.8373	Label 0 Recall: 0.5453	Label 0 F1-score: 0.6605
	Label 1 Precision: 0.4911	Label 1 Recall: 0.8561	Label 1 F1-score: 0.6241
	Label 2 Precision: 0.5956	Label 2 Recall: 0.4133	Label 2 F1-score: 0.4880
	Combined F1-score: 0.5909
micro precision: 0.6233082706766917, Micro recall: 0.6233082706766917, micro f1: 0.6233082706766917
Epoch: 22/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.492192.. 	Valid Loss:1.240415.. 	Val Accuracy: 67.2932
	Label 0 Precision: 0.7451	Label 0 Recall: 0.7992	Label 0 F1-score: 0.7712
	Label 1 Precision: 0.7018	Label 1 Recall: 0.4628	Label 1 F1-score: 0.5578
	Label 2 Precision: 0.4510	Label 2 Recall: 0.6582	Label 2 F1-score: 0.5353
	Combined F1-score: 0.6214
micro precision: 0.6729323308270677, Micro recall: 0.6729323308270677, micro f1: 0.6729323308270677
Epoch: 23/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.475712.. 	Valid Loss:1.362328.. 	Val Accuracy: 67.8947
	Label 0 Precision: 0.7432	Label 0 Recall: 0.7671	Label 0 F1-score: 0.7550
	Label 1 Precision: 0.5682	Label 1 Recall: 0.6691	Label 1 F1-score: 0.6145
	Label 2 Precision: 0.7475	Label 2 Recall: 0.3776	Label 2 F1-score: 0.5017
	Combined F1-score: 0.6237
micro precision: 0.6789473684210526, Micro recall: 0.6789473684210526, micro f1: 0.6789473684210526
Epoch: 24/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.452766.. 	Valid Loss:1.339262.. 	Val Accuracy: 69.6241
	Label 0 Precision: 0.7526	Label 0 Recall: 0.7936	Label 0 F1-score: 0.7726
	Label 1 Precision: 0.6452	Label 1 Recall: 0.6019	Label 1 F1-score: 0.6228
	Label 2 Precision: 0.5730	Label 2 Recall: 0.5408	Label 2 F1-score: 0.5564
	Combined F1-score: 0.6506
micro precision: 0.6962406015037594, Micro recall: 0.6962406015037594, micro f1: 0.6962406015037594
Epoch: 25/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.423955.. 	Valid Loss:1.386623.. 	Val Accuracy: 68.3459
	Label 0 Precision: 0.7725	Label 0 Recall: 0.7294	Label 0 F1-score: 0.7504
	Label 1 Precision: 0.6166	Label 1 Recall: 0.6403	Label 1 F1-score: 0.6282
	Label 2 Precision: 0.5409	Label 2 Recall: 0.6071	Label 2 F1-score: 0.5721
	Combined F1-score: 0.6502
micro precision: 0.6834586466165413, Micro recall: 0.6834586466165413, micro f1: 0.6834586466165413
Epoch: 26/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.416943.. 	Valid Loss:1.463950.. 	Val Accuracy: 68.2707
	Label 0 Precision: 0.7490	Label 0 Recall: 0.7615	Label 0 F1-score: 0.7552
	Label 1 Precision: 0.6105	Label 1 Recall: 0.6163	Label 1 F1-score: 0.6134
	Label 2 Precision: 0.5833	Label 2 Recall: 0.5357	Label 2 F1-score: 0.5585
	Combined F1-score: 0.6424
micro precision: 0.6827067669172933, Micro recall: 0.6827067669172933, micro f1: 0.6827067669172933
Epoch: 27/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.442815.. 	Valid Loss:1.361985.. 	Val Accuracy: 67.8195
	Label 0 Precision: 0.7490	Label 0 Recall: 0.7615	Label 0 F1-score: 0.7552
	Label 1 Precision: 0.6090	Label 1 Recall: 0.5827	Label 1 F1-score: 0.5956
	Label 2 Precision: 0.5594	Label 2 Recall: 0.5765	Label 2 F1-score: 0.5678
	Combined F1-score: 0.6395
micro precision: 0.6781954887218045, Micro recall: 0.6781954887218045, micro f1: 0.6781954887218045
Epoch: 28/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.405206.. 	Valid Loss:1.410495.. 	Val Accuracy: 68.7970
	Label 0 Precision: 0.7269	Label 0 Recall: 0.8020	Label 0 F1-score: 0.7626
	Label 1 Precision: 0.6202	Label 1 Recall: 0.5755	Label 1 F1-score: 0.5970
	Label 2 Precision: 0.6579	Label 2 Recall: 0.5102	Label 2 F1-score: 0.5747
	Combined F1-score: 0.6448
micro precision: 0.6879699248120301, Micro recall: 0.6879699248120301, micro f1: 0.6879699248120301
Epoch: 29/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.414810.. 	Valid Loss:1.625117.. 	Val Accuracy: 64.8872
	Label 0 Precision: 0.7840	Label 0 Recall: 0.7085	Label 0 F1-score: 0.7443
	Label 1 Precision: 0.6207	Label 1 Recall: 0.5180	Label 1 F1-score: 0.5647
	Label 2 Precision: 0.4162	Label 2 Recall: 0.7092	Label 2 F1-score: 0.5245
	Combined F1-score: 0.6112
micro precision: 0.6488721804511278, Micro recall: 0.6488721804511278, micro f1: 0.6488721804511278
Epoch: 30/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.465145.. 	Valid Loss:1.281877.. 	Val Accuracy: 68.9474
	Label 0 Precision: 0.6956	Label 0 Recall: 0.8731	Label 0 F1-score: 0.7743
	Label 1 Precision: 0.6627	Label 1 Recall: 0.5372	Label 1 F1-score: 0.5934
	Label 2 Precision: 0.7283	Label 2 Recall: 0.3418	Label 2 F1-score: 0.4653
	Combined F1-score: 0.6110
micro precision: 0.6894736842105263, Micro recall: 0.6894736842105263, micro f1: 0.6894736842105263
Epoch: 31/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.416293.. 	Valid Loss:1.264791.. 	Val Accuracy: 69.4737
	Label 0 Precision: 0.6935	Label 0 Recall: 0.9024	Label 0 F1-score: 0.7842
	Label 1 Precision: 0.7252	Label 1 Recall: 0.4556	Label 1 F1-score: 0.5596
	Label 2 Precision: 0.6444	Label 2 Recall: 0.4439	Label 2 F1-score: 0.5257
	Combined F1-score: 0.6232
micro precision: 0.6947368421052632, Micro recall: 0.6947368421052632, micro f1: 0.6947368421052632
Epoch: 32/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.461878.. 	Valid Loss:0.997643.. 	Val Accuracy: 68.1203
	Label 0 Precision: 0.6802	Label 0 Recall: 0.9135	Label 0 F1-score: 0.7798
	Label 1 Precision: 0.7598	Label 1 Recall: 0.3717	Label 1 F1-score: 0.4992
	Label 2 Precision: 0.5890	Label 2 Recall: 0.4898	Label 2 F1-score: 0.5348
	Combined F1-score: 0.6046
micro precision: 0.681203007518797, Micro recall: 0.681203007518797, micro f1: 0.681203007518797
Epoch: 33/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.417820.. 	Valid Loss:1.379690.. 	Val Accuracy: 69.3233
	Label 0 Precision: 0.7613	Label 0 Recall: 0.8006	Label 0 F1-score: 0.7804
	Label 1 Precision: 0.6813	Label 1 Recall: 0.5588	Label 1 F1-score: 0.6140
	Label 2 Precision: 0.4915	Label 2 Recall: 0.5867	Label 2 F1-score: 0.5349
	Combined F1-score: 0.6431
micro precision: 0.6932330827067669, Micro recall: 0.6932330827067669, micro f1: 0.6932330827067669
Epoch: 34/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.431747.. 	Valid Loss:1.387543.. 	Val Accuracy: 70.0000
	Label 0 Precision: 0.7326	Label 0 Recall: 0.8368	Label 0 F1-score: 0.7812
	Label 1 Precision: 0.6801	Label 1 Recall: 0.5659	Label 1 F1-score: 0.6178
	Label 2 Precision: 0.5793	Label 2 Recall: 0.4847	Label 2 F1-score: 0.5278
	Combined F1-score: 0.6423
micro precision: 0.7, Micro recall: 0.7, micro f1: 0.7
Epoch: 35/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.410227.. 	Valid Loss:1.327678.. 	Val Accuracy: 70.0752
	Label 0 Precision: 0.7591	Label 0 Recall: 0.7824	Label 0 F1-score: 0.7706
	Label 1 Precision: 0.6299	Label 1 Recall: 0.6571	Label 1 F1-score: 0.6432
	Label 2 Precision: 0.6218	Label 2 Recall: 0.4949	Label 2 F1-score: 0.5511
	Combined F1-score: 0.6550
micro precision: 0.7007518796992481, Micro recall: 0.7007518796992481, micro f1: 0.7007518796992481
Epoch: 36/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.394642.. 	Valid Loss:1.368047.. 	Val Accuracy: 70.1504
	Label 0 Precision: 0.7433	Label 0 Recall: 0.8117	Label 0 F1-score: 0.7760
	Label 1 Precision: 0.6320	Label 1 Recall: 0.6259	Label 1 F1-score: 0.6289
	Label 2 Precision: 0.6716	Label 2 Recall: 0.4592	Label 2 F1-score: 0.5455
	Combined F1-score: 0.6501
micro precision: 0.7015037593984963, Micro recall: 0.7015037593984963, micro f1: 0.7015037593984963
Epoch: 37/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.424134.. 	Valid Loss:1.181495.. 	Val Accuracy: 68.4211
	Label 0 Precision: 0.6890	Label 0 Recall: 0.8898	Label 0 F1-score: 0.7766
	Label 1 Precision: 0.7059	Label 1 Recall: 0.4317	Label 1 F1-score: 0.5357
	Label 2 Precision: 0.6174	Label 2 Recall: 0.4694	Label 2 F1-score: 0.5333
	Combined F1-score: 0.6152
micro precision: 0.6842105263157895, Micro recall: 0.6842105263157895, micro f1: 0.6842105263157895
Epoch: 38/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.463260.. 	Valid Loss:1.604577.. 	Val Accuracy: 61.8797
	Label 0 Precision: 0.8209	Label 0 Recall: 0.5690	Label 0 F1-score: 0.6722
	Label 1 Precision: 0.5703	Label 1 Recall: 0.7002	Label 1 F1-score: 0.6286
	Label 2 Precision: 0.3832	Label 2 Recall: 0.6276	Label 2 F1-score: 0.4758
	Combined F1-score: 0.5922
micro precision: 0.618796992481203, Micro recall: 0.618796992481203, micro f1: 0.618796992481203
Epoch: 39/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.723489.. 	Valid Loss:1.276429.. 	Val Accuracy: 30.6767
	Label 0 Precision: 0.0000	Label 0 Recall: 0.0000	Label 0 F1-score: 0.0000
	Label 1 Precision: 0.6211	Label 1 Recall: 0.6211	Label 1 F1-score: 0.6211
	Label 2 Precision: 0.1634	Label 2 Recall: 0.7602	Label 2 F1-score: 0.2690
	Combined F1-score: 0.2967
micro precision: 0.3067669172932331, Micro recall: 0.3067669172932331, micro f1: 0.3067669172932331
Epoch: 40/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.700541.. 	Valid Loss:1.249205.. 	Val Accuracy: 32.1053
	Label 0 Precision: 1.0000	Label 0 Recall: 0.0014	Label 0 F1-score: 0.0028
	Label 1 Precision: 0.5861	Label 1 Recall: 0.6691	Label 1 F1-score: 0.6249
	Label 2 Precision: 0.1723	Label 2 Recall: 0.7500	Label 2 F1-score: 0.2803
	Combined F1-score: 0.3026
micro precision: 0.32105263157894737, Micro recall: 0.32105263157894737, micro f1: 0.32105263157894737
Epoch: 41/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.699697.. 	Valid Loss:1.274030.. 	Val Accuracy: 30.8271
	Label 0 Precision: 0.0000	Label 0 Recall: 0.0000	Label 0 F1-score: 0.0000
	Label 1 Precision: 0.6087	Label 1 Recall: 0.6379	Label 1 F1-score: 0.6230
	Label 2 Precision: 0.1613	Label 2 Recall: 0.7347	Label 2 F1-score: 0.2645
	Combined F1-score: 0.2958
micro precision: 0.3082706766917293, Micro recall: 0.3082706766917293, micro f1: 0.3082706766917293
Epoch: 42/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.697639.. 	Valid Loss:1.273305.. 	Val Accuracy: 30.6767
	Label 0 Precision: 0.0000	Label 0 Recall: 0.0000	Label 0 F1-score: 0.0000
	Label 1 Precision: 0.6436	Label 1 Recall: 0.5803	Label 1 F1-score: 0.6103
	Label 2 Precision: 0.1744	Label 2 Recall: 0.8469	Label 2 F1-score: 0.2892
	Combined F1-score: 0.2998
micro precision: 0.3067669172932331, Micro recall: 0.3067669172932331, micro f1: 0.3067669172932331
Epoch: 43/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.503871.. 	Valid Loss:1.180948.. 	Val Accuracy: 68.5714
	Label 0 Precision: 0.7009	Label 0 Recall: 0.8759	Label 0 F1-score: 0.7787
	Label 1 Precision: 0.6828	Label 1 Recall: 0.4748	Label 1 F1-score: 0.5601
	Label 2 Precision: 0.5972	Label 2 Recall: 0.4388	Label 2 F1-score: 0.5059
	Combined F1-score: 0.6149
micro precision: 0.6857142857142857, Micro recall: 0.6857142857142857, micro f1: 0.6857142857142857
Epoch: 44/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.557700.. 	Valid Loss:1.111783.. 	Val Accuracy: 67.1429
	Label 0 Precision: 0.6794	Label 0 Recall: 0.8898	Label 0 F1-score: 0.7705
	Label 1 Precision: 0.7249	Label 1 Recall: 0.3981	Label 1 F1-score: 0.5139
	Label 2 Precision: 0.5494	Label 2 Recall: 0.4541	Label 2 F1-score: 0.4972
	Combined F1-score: 0.5939
micro precision: 0.6714285714285714, Micro recall: 0.6714285714285714, micro f1: 0.6714285714285714
Epoch: 45/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.464349.. 	Valid Loss:1.057933.. 	Val Accuracy: 67.7444
	Label 0 Precision: 0.7707	Label 0 Recall: 0.7266	Label 0 F1-score: 0.7480
	Label 1 Precision: 0.5894	Label 1 Recall: 0.6643	Label 1 F1-score: 0.6246
	Label 2 Precision: 0.5598	Label 2 Recall: 0.5255	Label 2 F1-score: 0.5421
	Combined F1-score: 0.6382
micro precision: 0.6774436090225564, Micro recall: 0.6774436090225564, micro f1: 0.6774436090225564
Epoch: 46/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.447432.. 	Valid Loss:1.064625.. 	Val Accuracy: 62.7820
	Label 0 Precision: 0.8312	Label 0 Recall: 0.5565	Label 0 F1-score: 0.6667
	Label 1 Precision: 0.5062	Label 1 Recall: 0.7818	Label 1 F1-score: 0.6145
	Label 2 Precision: 0.5340	Label 2 Recall: 0.5612	Label 2 F1-score: 0.5473
	Combined F1-score: 0.6095
micro precision: 0.6278195488721805, Micro recall: 0.6278195488721805, micro f1: 0.6278195488721805
Epoch: 47/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.441278.. 	Valid Loss:1.358710.. 	Val Accuracy: 64.2105
	Label 0 Precision: 0.8123	Label 0 Recall: 0.6095	Label 0 F1-score: 0.6964
	Label 1 Precision: 0.5151	Label 1 Recall: 0.7770	Label 1 F1-score: 0.6195
	Label 2 Precision: 0.5706	Label 2 Recall: 0.4745	Label 2 F1-score: 0.5181
	Combined F1-score: 0.6113
micro precision: 0.6421052631578947, Micro recall: 0.6421052631578947, micro f1: 0.6421052631578947
Epoch: 48/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.433931.. 	Valid Loss:1.297605.. 	Val Accuracy: 67.5940
	Label 0 Precision: 0.7765	Label 0 Recall: 0.7266	Label 0 F1-score: 0.7507
	Label 1 Precision: 0.5892	Label 1 Recall: 0.6571	Label 1 F1-score: 0.6213
	Label 2 Precision: 0.5361	Label 2 Recall: 0.5306	Label 2 F1-score: 0.5333
	Combined F1-score: 0.6351
micro precision: 0.6759398496240602, Micro recall: 0.6759398496240602, micro f1: 0.6759398496240602
Epoch: 49/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.423624.. 	Valid Loss:1.356263.. 	Val Accuracy: 68.8722
	Label 0 Precision: 0.7547	Label 0 Recall: 0.7768	Label 0 F1-score: 0.7656
	Label 1 Precision: 0.6049	Label 1 Recall: 0.6499	Label 1 F1-score: 0.6266
	Label 2 Precision: 0.6111	Label 2 Recall: 0.4490	Label 2 F1-score: 0.5176
	Combined F1-score: 0.6366
micro precision: 0.6887218045112782, Micro recall: 0.6887218045112782, micro f1: 0.6887218045112782
Epoch: 50/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.426542.. 	Valid Loss:1.186521.. 	Val Accuracy: 69.6241
	Label 0 Precision: 0.7420	Label 0 Recall: 0.8061	Label 0 F1-score: 0.7727
	Label 1 Precision: 0.6287	Label 1 Recall: 0.6091	Label 1 F1-score: 0.6188
	Label 2 Precision: 0.6395	Label 2 Recall: 0.4796	Label 2 F1-score: 0.5481
	Combined F1-score: 0.6465
micro precision: 0.6962406015037594, Micro recall: 0.6962406015037594, micro f1: 0.6962406015037594
Epoch: 51/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.438501.. 	Valid Loss:1.273740.. 	Val Accuracy: 68.4211
	Label 0 Precision: 0.7783	Label 0 Recall: 0.7294	Label 0 F1-score: 0.7531
	Label 1 Precision: 0.5675	Label 1 Recall: 0.7362	Label 1 F1-score: 0.6409
	Label 2 Precision: 0.6838	Label 2 Recall: 0.4082	Label 2 F1-score: 0.5112
	Combined F1-score: 0.6351
micro precision: 0.6842105263157895, Micro recall: 0.6842105263157895, micro f1: 0.6842105263157895
Epoch: 52/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.434314.. 	Valid Loss:1.263960.. 	Val Accuracy: 69.0226
	Label 0 Precision: 0.7431	Label 0 Recall: 0.7908	Label 0 F1-score: 0.7662
	Label 1 Precision: 0.6423	Label 1 Recall: 0.5899	Label 1 F1-score: 0.6150
	Label 2 Precision: 0.5707	Label 2 Recall: 0.5357	Label 2 F1-score: 0.5526
	Combined F1-score: 0.6446
micro precision: 0.6902255639097744, Micro recall: 0.6902255639097744, micro f1: 0.6902255639097744
Epoch: 53/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.425865.. 	Valid Loss:1.281036.. 	Val Accuracy: 67.1429
	Label 0 Precision: 0.6993	Label 0 Recall: 0.7978	Label 0 F1-score: 0.7453
	Label 1 Precision: 0.6518	Label 1 Recall: 0.5252	Label 1 F1-score: 0.5817
	Label 2 Precision: 0.5795	Label 2 Recall: 0.5204	Label 2 F1-score: 0.5484
	Combined F1-score: 0.6251
micro precision: 0.6714285714285714, Micro recall: 0.6714285714285714, micro f1: 0.6714285714285714
Epoch: 54/100


  0%|          | 0/521 [00:00<?, ?it/s]

In [None]:
torch.cuda.empty_cache()

In [None]:
from matplotlib import pyplot as plt

plt.plot(train_loss_data, label="Training loss")
plt.plot(valid_loss_data, label="validation loss")
plt.legend(frameon=False)

In [None]:
model.load_state_dict(torch.load(DirPath+bert_model_name+"_lasttwopooler_contest_val.pth", map_location = device))

In [None]:
all_preds = []
all_labels = []

for batch in test_dataloader:
    text, labels = batch
    inputs = tokenizer.batch_encode_plus(
        text, **tokenizer_config
    )
    input_ids = inputs['input_ids'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    labels = labels.to(device)

    # move things to model
    output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)
    preds = output.detach().cpu().numpy()
    preds = np.argmax(preds, axis = 1)
    all_preds.extend(preds)
    all_labels.extend(labels.cpu().numpy())

In [None]:
from sklearn.metrics import classification_report

# preds = np.argmax(preds, axis = 1)
print(classification_report(all_labels, all_preds))

<h1>Training the model with All Collected dataset with the selected model and hyperparameters(Code not yet updated and run, ignore it)<h1>

In [None]:
!pip install --quiet transformers

In [None]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

In [None]:
#df loading
df_train = pd.read_csv('train.csv')[['sentence','hate speech']]
df_val = pd.read_csv('val.csv')[['sentence','hate speech']]
df_test = pd.read_csv('test.csv')[['sentence','hate speech']]

#concatenating all the data
df_train = pd.concat([df_train, df_val, df_test], ignore_index=True)

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)
print(df_train)
print(df_train.describe())

In [None]:
#defining previous hyperparameters got from testing
max_number_input_tokens=256
batch_size_training = 16
first_dropout_rate = 0.3
hidden_output = 768
bert_model_name = "sagorsarker/bangla-bert-base"
adam_opt_lr = 3e-5
scheduler_step = 1
scheduler_gamma = 0.8
epochs = 6
classes = 2

In [None]:
from google.colab import drive
drive.mount('/content/drive')
DirPath = ('/content/drive/My Drive/Test/')

In [None]:
class NewsDatasets(Dataset):
    def __init__(self, data, max_length=max_number_input_tokens):
        self.data = data

        self.config = {
            "max_length": max_length,
            "padding": "max_length",
            "return_tensors": "pt",
            "truncation": True,
            "add_special_tokens": True,
            "truncation_strategy":"longest_first"
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        value = self.data.iloc[idx]
        return value['sentence'] , value['hate speech']

In [None]:
training_data = NewsDatasets(df_train)
train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)

val_data = NewsDatasets(df_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size_training, shuffle=True)

test_data = NewsDatasets(df_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=True)

In [None]:
class CustomBERTBengali(nn.Module):
    def __init__(self, bert):
        super(CustomBERTBengali, self).__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.tanh = nn.Tanh()
        self.out = nn.Linear(hidden_output * 3, classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        o1 = outputs.hidden_states[-1]
        o2 = outputs.pooler_output
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        pooled_output = o2
        cat = torch.cat((apool, mpool, pooled_output), 1)
        bo = self.bert_drop(cat)
        logits = self.out(bo)
        logits = self.softmax(logits)
        return logits

In [None]:
bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CustomBERTBengali(bert)
model.to(device)

In [None]:
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW(model.parameters(), lr=adam_opt_lr)
criterion = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=scheduler_step, gamma=scheduler_gamma)

In [None]:
def train(model, dataloader, optimizer, criterion, config):
    model.train()  # prep model for training
    train_loss = 0
    for batch in tqdm(dataloader):
        text, labels = batch

        model.zero_grad()

        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        #labels = labels.to(device)
        labels = labels.to(device, dtype=torch.long)  # Convert labels to torch.long

        # move things to model
        logs = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(logs, labels)
        train_loss += loss.item() * input_ids.size(0)
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return train_loss

In [None]:
def evaluate(model, dataloader, criterion, config):
    total = 0
    correct = 0
    valid_loss = 0.0

    model.eval()  # prep model for evaluation
    for batch in dataloader:
        text, labels = batch
        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        # move things to model
        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        loss_p = criterion(output, labels)
        # update running validation loss
        valid_loss += loss_p.item() * input_ids.size(0)
        # calculate accuracy
        proba = torch.exp(output)
        top_p, top_class = proba.topk(1, dim=1)
        equals = top_class == labels.view(*top_class.shape)
        # accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    return total, correct, valid_loss

In [None]:
tokenizer_config = {
    "max_length": max_number_input_tokens,
    "padding": "max_length",
    "return_tensors": "pt",
    "truncation": True,
    "add_special_tokens": True,
     "truncation_strategy":"longest_first"
}

In [None]:
train_loss_data, valid_loss_data = [], []
valid_loss_min = np.Inf
since = time.time()
best_loss = np.inf

for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    total = 0
    correct = 0
    e_since = time.time()

    # Train Model
    train_loss += train(model, train_dataloader, optimizer, criterion, tokenizer_config)
    # Now Evaluate
    out = evaluate(model, val_dataloader, criterion, tokenizer_config)
    total += out[0]
    correct += out[1]
    valid_loss += out[2]

    scheduler.step()

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(val_dataloader.dataset)

    # calculate train loss and running loss
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    if True:
        best_loss = valid_loss
        torch.save(model.state_dict(), DirPath+bert_model_name+"_CustomBertBengaliFullDataset6epoch885044valacc.pth")
        print(f'epoch: {epoch+1}')

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tVal Accuracy: {:.4f}".format(correct / total * 100))

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))