<h1>Preparing the training, validation and testing datasets, and loading the model trained in the first training session of the 2nd step of transfer learning</h1>

In [5]:
!pip install --quiet transformers

In [6]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AutoModelForMaskedLM, AutoTokenizer

In [7]:
# Hyperparameters for the first session (two-class model).

# -- data / tokenisation --
max_number_input_tokens = 256
batch_size_training = 16
need_split_dataset = False  # False: reload the already-saved train/dev CSV files

# -- model --
bert_model_name = "bert-base-multilingual-cased"
hidden_output = 768          # used as the base width of the linear heads
first_dropout_rate = 0.3     # passed to nn.Dropout in the model heads
classes = 2

# -- optimisation --
adam_opt_lr = 3e-5
scheduler_step = 1
scheduler_gamma = 0.8
epochs = 10

In [8]:
from google.colab import drive

# Mount Google Drive; every dataset and checkpoint path below lives under it.
drive.mount('/content/drive')

DirPath = '/content/drive/My Drive/Test/'

# Checkpoint produced by the earlier fine-tuning run.
Finetuned_model_path = f"{DirPath}{bert_model_name}_CustomBertBengaliFullDataset6epoch885044valacc.pth"

# File names of the collected dataset and of its train/validation split.
CollectedDatasetFileName = "Final_data.csv"
SplittedTrainFileName = "train.csv"
SplittedValFileName = "dev.csv"

# Full paths (all three files sit in the EMNLP/ sub-folder).
CollectedDatasetPath = f"{DirPath}EMNLP/{CollectedDatasetFileName}"
SplittedTrainDataPath = f"{DirPath}EMNLP/{SplittedTrainFileName}"
SplittedValDataPath = f"{DirPath}EMNLP/{SplittedValFileName}"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
def interchange(df_train,pos,label):
  """Swap, in place, the row at position `pos` with the first row whose 'label' equals `label`.

  The first matching row is located by *position*, so the swap is correct for
  any index (shuffled, filtered, non-integer). The previous version took an
  index label from `.index[0]` and used it as an `.iloc` position, which is
  only valid for a default 0..n-1 index.

  Args:
    df_train: DataFrame with a 'label' column; modified in place.
    pos: integer row position that should receive the matching row.
    label: label value to look for.

  Returns:
    The same `df_train` object, after the swap.

  Raises:
    IndexError: if no row carries `label`.
  """
  matches = np.flatnonzero((df_train['label'] == label).to_numpy())
  if matches.size == 0:
    raise IndexError(f"no row with label {label!r} found to move to position {pos}")
  src = int(matches[0])

  if src != pos:
    # Swap cell by cell with scalar access: no index alignment is involved and
    # every column keeps its dtype.
    for col in range(df_train.shape[1]):
      df_train.iat[pos, col], df_train.iat[src, col] = df_train.iat[src, col], df_train.iat[pos, col]
  return df_train

In [10]:
def balanceclasses(df_train, random_state=None):
  """Oversample every minority class up to the size of the largest class.

  Rows of each smaller class are repeated (whole copies, in their original
  order) and the first `largest - count` of them are appended, so every label
  ends up with exactly as many rows as the most frequent one. The result is
  shuffled and given a fresh 0..n-1 index.

  Args:
    df_train: DataFrame with a 'label' column; not modified.
    random_state: forwarded to `DataFrame.sample` for the final shuffle. The
      default `None` keeps the previous unseeded behaviour; pass an int for a
      reproducible order.

  Returns:
    A new, shuffled, class-balanced DataFrame.
  """
  class_counts = df_train['label'].value_counts()
  # Size every class is grown to (the *largest* class count).
  target_count = class_counts.max()

  # One frame of extra rows per under-represented class
  extra_frames = []
  for label, count in class_counts.items():
    if count == target_count:
        continue
    df_label = df_train[df_train['label'] == label]
    # floor(target/count) whole copies always contain at least target-count rows
    num_copies = target_count // count
    repeated = pd.concat([df_label] * num_copies, ignore_index=True)
    extra_frames.append(repeated.head(target_count - count))

  # Original rows plus the extra copies, shuffled
  df_balanced = (
      pd.concat([df_train] + extra_frames, ignore_index=True)
      .sample(frac=1, random_state=random_state)
      .reset_index(drop=True)
  )
  return df_balanced

In [11]:
from sklearn.model_selection import train_test_split

#splitting the dataset and saving
if need_split_dataset:
  #dataset loading; the collected file names its text column 'Text', while every
  #later cell (and the reload branch below) reads 'text', so normalise it here
  df = pd.read_csv(CollectedDatasetPath)[ ['Text','label'] ].rename(columns={'Text': 'text'})
  print('df label counts\n',df['label'].value_counts())
  # check if there is any NaN value in the dataframe
  print(f'null values: {df.isna().sum()}')

  #null indices
  null_index = df.index[df.isna().any(axis=1)]
  print(f'null indices: {null_index}')

  #dropping null values
  df = df.dropna()

  df_train, df_val = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
  df_train.to_csv(SplittedTrainDataPath)
  df_val.to_csv(SplittedValDataPath)

  # The split keeps the original (now non-contiguous) row labels. Give both
  # frames a fresh 0..n-1 index so the positional row swaps below hit the
  # intended rows.
  df_train = df_train.reset_index(drop=True)
  df_val = df_val.reset_index(drop=True)
else:
  df_train = pd.read_csv(SplittedTrainDataPath)[ ['text','label'] ]
  df_val = pd.read_csv(SplittedValDataPath)[ ['text','label'] ]

# df_train = pd.concat([df_train, df_val], ignore_index=True)
# df_val = df_train

# count the number of each unique label in train and validation dataframes
train_label_counts = df_train['label'].value_counts()
val_label_counts = df_val['label'].value_counts()

#setting the first sample to be with label '0'
df_train = interchange(df_train, 0, 0)

print(df_train.shape)
print(df_val.shape)
print('Train label counts:\n', train_label_counts)
print('Validation label counts:\n', val_label_counts)

print("\n after making copies:")
#balance all classes making copies
df_train = balanceclasses(df_train)
print(df_train['label'].value_counts())

#after the shuffle done by balanceclasses: first sample gets label '0' again ...
df_train = interchange(df_train, 0, 0)

#... and the second sample gets label '1'
df_train = interchange(df_train, 1, 1)

print(df_train)
print(df_val)

(2700, 2)
(1330, 2)
Train label counts:
 0    1389
1     922
2     389
Name: label, dtype: int64
Validation label counts:
 0    717
1    417
2    196
Name: label, dtype: int64

 after making copies:
1    1389
0    1389
2    1389
Name: label, dtype: int64
                                                   text  label
0     ভাই তাহলে স্কুলের নির্দিষ্ট পোশাক থাকে কেন?এই ...      0
1     হায়রে সময় টিভি।।সত্য কে গুজব আর গুজব কে সত্য ব...      1
2     ওর ভাই বললনা যে আপনি ওরে নাস্তা খাওয়াইয়ায় যেকো...      1
3     এই সমস্ত গঠনা রুখতে হলে ঢাকা বিশ্ববিদ্যালয়ে গ...      0
4                               য়ে দুধরে ভাই মাল কি সেই      1
...                                                 ...    ...
4162  এক নিউজে বলল ব্যবসায়ী ব্যবসায়ী কথাকাটাকাটির ...      0
4163  এই সব নেতা গুলো কোন কাজের না,এরা সব নোংরা খাবা...      1
4164  পুলিশ কেন ঘুষ খায় পুলিশ কেন দুষ্টু পুলিশ কেন ন...      1
4165  সব কিছু চলছে বহিরাগতদের মদদে ওদের কঠিন বিচার চ...      2
4166  কর্তব্যরত পুলিশ ডিউটি থেকে অনুপস্থিত ছিল কেন? 

In [12]:
# #df loading
# df_train = pd.read_csv('train.csv')[['sentence','hate speech']]
# df_val = pd.read_csv('val.csv')[['sentence','hate speech']]
# df_test = pd.read_csv('test.csv')[['sentence','hate speech']]

# #concatenating all the data
# df_train = pd.concat([df_train, df_val, df_test], ignore_index=True)

# print(df_train.shape)
# print(df_val.shape)
# print(df_test.shape)
# print(df_train)
# print(df_train.describe())

In [None]:
class NewsDatasets(Dataset):
    """Dataset over a DataFrame that exposes 'text' and 'label' columns.

    Each item is the raw ``(text, label)`` pair of one row; no tokenisation is
    done here. ``self.config`` holds tokenizer keyword arguments built from
    ``max_length``; nothing inside this class reads it.
    """

    def __init__(self, data, max_length=max_number_input_tokens):
        self.data = data
        self.config = dict(
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            truncation_strategy="longest_first",
        )

    def __len__(self):
        # one sample per DataFrame row
        return len(self.data)

    def __getitem__(self, idx):
        # positional row lookup
        row = self.data.iloc[idx]
        return row['text'], row['label']

In [None]:
# Training batches are reshuffled every epoch.
training_data = NewsDatasets(df_train)
train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)

# Evaluation loaders keep a fixed order: shuffling gives no benefit for
# evaluation and makes per-batch debugging non-reproducible.
val_data = NewsDatasets(df_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size_training, shuffle=False)

# NOTE(review): the "test" loader is built from df_val, i.e. it is the
# validation set again, not a held-out test split -- confirm this is intended.
test_data = NewsDatasets(df_val)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=False)

In [None]:
class HateSpeechBert(nn.Module):
    """BERT encoder followed by a three-layer MLP head emitting two-class probabilities.

    The head input is the concatenation of (a) the element-wise max over
    dimension 1 of the encoder's first output and (b) the encoder's second
    output.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert

        # regularisation and non-linearities
        self.dropout = nn.Dropout(first_dropout_rate)
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()  # registered, but forward() never calls it

        # MLP head: 2*hidden -> hidden -> 128 -> 2
        self.fc1 = nn.Linear(hidden_output*2, hidden_output)
        self.fc2 = nn.Linear(hidden_output, 128)
        self.fc3 = nn.Linear(128, 2)

        # normalises the two outputs into probabilities
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, token_type_ids, attention_mask):
        """Return a (batch, 2) tensor of class probabilities."""
        encoded = self.bert(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        first_out, second_out = encoded[0], encoded[1]

        # max over dimension 1 of the first encoder output
        max_pooled = torch.max(first_out, 1).values
        features = self.dropout(torch.cat((max_pooled, second_out), dim=1))

        hidden = self.relu(self.fc1(features))
        hidden = self.relu(self.fc2(hidden))

        # output layer
        return self.softmax(self.fc3(hidden))

In [None]:
class BERTBengali(nn.Module):
    """BERT encoder with dropout and a single linear layer producing two logits."""

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(0.2)
        self.out = nn.Linear(hidden_output, 2)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return (batch, 2) logits computed from the encoder's second output."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        second_out = encoded[1]
        return self.out(self.bert_drop(second_out))

In [None]:
class BERTBengaliPooler(nn.Module):
    """BERT encoder whose pooled output feeds dropout, a linear layer and softmax.

    Args:
        bert: encoder module; must expose `config.hidden_size` and return an
            object with a `pooler_output` attribute.
    """

    def __init__(self, bert):
        super(BERTBengaliPooler, self).__init__()
        self.bert = bert
        #self.bert.pooler.dense = nn.Linear(bert.config.hidden_size, bert.config.hidden_size)
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.out = nn.Linear(bert.config.hidden_size, classes)
        #softmax
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return a (batch, classes) tensor of class probabilities."""
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        pooled_output = outputs.pooler_output
        bo = self.bert_drop(pooled_output)

        output = self.out(bo)
        # Softmax must normalise the class scores. The previous code applied it
        # to `bo` (the dropout output), which discarded the linear layer and
        # returned a (batch, hidden_size) tensor.
        output = self.softmax(output)
        return output

In [None]:
class CustomBERTBengali(nn.Module):
    """BERT encoder with a linear + softmax head over three concatenated summaries.

    The head sees, concatenated along dimension 1: the mean and the max of the
    last hidden state over dimension 1, and the encoder's pooler output.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.tanh = nn.Tanh()  # registered, but forward() never calls it
        self.out = nn.Linear(hidden_output * 3, classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return a (batch, classes) tensor of class probabilities."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        last_hidden = encoded.hidden_states[-1]

        avg_pooled = last_hidden.mean(dim=1)
        max_pooled = last_hidden.max(dim=1).values
        features = torch.cat((avg_pooled, max_pooled, encoded.pooler_output), 1)

        scores = self.out(self.bert_drop(features))
        return self.softmax(scores)

In [None]:
class BERTBengaliTwo(nn.Module):
    """BERT encoder classified from position 0 of its last two hidden states.

    The last two hidden states are concatenated along the feature axis, passed
    through dropout, sliced at sequence position 0 and fed to a linear + softmax
    head.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l0 = nn.Linear(hidden_output * 2, classes)
        # re-initialise the head weights from N(0, 0.02^2)
        torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return a (batch, classes) tensor of class probabilities."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        states = encoded.hidden_states
        stacked = torch.cat((states[-1], states[-2]), dim=-1)
        # dropout is applied to the full sequence tensor *before* slicing,
        # exactly as in the original ordering
        first_position = self.drop_out(stacked)[:, 0, :]
        return self.softmax(self.l0(first_position))

In [None]:
class BERTBengaliLastTwoPooler(nn.Module):
    """BERT encoder with a linear + softmax head over three concatenated vectors.

    Head input, concatenated along the last axis: the max over dimension 1 of
    the last hidden state, position 0 of the second-to-last hidden state, and
    the encoder's pooler output.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l0 = nn.Linear(hidden_output * 3, classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return a (batch, classes) tensor of class probabilities."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        states = encoded.hidden_states
        max_pooled = states[-1].max(dim=1).values
        features = torch.cat(
            (max_pooled, states[-2][:, 0, :], encoded.pooler_output),
            dim=-1,
        )
        scores = self.l0(self.drop_out(features))
        return self.softmax(scores)

In [None]:
# Pretrained encoder; hidden states of all layers are requested because the
# model heads in this notebook read `outputs.hidden_states`.
bert = BertModel.from_pretrained(
    bert_model_name,
    output_hidden_states=True,
)
# Matching tokenizer for the same checkpoint name.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/491 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/660M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#creating the structure to contain finetuned bert
struct_model = CustomBERTBengali(bert)
struct_model.to(device)

#loading the finetuned model which has learned necessary info from the other domain.
#map_location remaps the stored tensors onto the active device, so a checkpoint
#saved on a GPU still loads on a CPU-only runtime.
struct_model.load_state_dict(torch.load(Finetuned_model_path, map_location=device))

# Access the bert model
finetuned_bert_base = struct_model.bert

<h1>Creating the model architecture for VITD training, freezing the necessary layers, loading parameters into the architecture from the previously trained model, and training on the VITD dataset. Transfer learning second step, second training session</h1>

In [83]:
### Hyperparameters for the new (three-class) model; these override the earlier cell.

# -- data / tokenisation --
max_number_input_tokens = 256
batch_size_training = 8
#need_split_dataset=False

# -- model --
bert_model_name = "bert-base-multilingual-cased"
hidden_output = 768          # used as the base width of the linear heads
first_dropout_rate = 0.0     # passed to nn.Dropout in the model heads
classes = 3

# -- optimisation --
adam_opt_lr = 3e-5
scheduler_step = 1
scheduler_gamma = 0.88
epochs = 100

In [84]:
class NewsDatasets(Dataset):
    """Dataset over a DataFrame that exposes 'text' and 'label' columns.

    Items are raw ``(text, label)`` pairs; tokenisation is left to the caller.
    ``self.config`` stores tokenizer keyword arguments derived from
    ``max_length`` and is not read anywhere inside this class.
    """

    def __init__(self, data, max_length=max_number_input_tokens):
        self.data = data
        self.config = dict(
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            truncation_strategy="longest_first",
        )

    def __len__(self):
        # number of DataFrame rows
        return len(self.data)

    def __getitem__(self, idx):
        # positional row lookup
        sample = self.data.iloc[idx]
        return sample['text'], sample['label']

In [85]:
# Training batches are reshuffled every epoch.
training_data = NewsDatasets(df_train)
train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)

# Evaluation loaders keep a fixed order: shuffling gives no benefit for
# evaluation and makes per-batch debugging non-reproducible.
val_data = NewsDatasets(df_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size_training, shuffle=False)

# NOTE(review): the "test" loader is built from df_val, i.e. it is the
# validation set again, not a held-out test split -- confirm this is intended.
test_data = NewsDatasets(df_val)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=False)

In [86]:
# #model for finetuning collected data
# class BERTBengaliLastTwoPooler(nn.Module):
#     def __init__(self, bert):
#         super(BERTBengaliLastTwoPooler, self).__init__()
#         self.bert = bert
#         self.drop_out = nn.Dropout(first_dropout_rate)
#         self.l0 =  nn.Linear(hidden_output * 3, classes)
#         #torch.nn.init.normal_(self.l0.weight, std=0.02)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids
#         )
#         mpool, _ = torch.max(outputs.hidden_states[-1], 1)
#         out = torch.cat((mpool, outputs.hidden_states[-2][:,0,:],outputs.pooler_output), dim=-1)
#         out = self.drop_out(out)
#         #out = out[:,0,:]
#         logits = self.l0(out)
#         logits = self.softmax(logits)
#         return logits

In [87]:
# class CustomBERTBengali(nn.Module):
#     def __init__(self, bert):
#         super(CustomBERTBengali, self).__init__()
#         self.bert = bert
#         self.bert_drop = nn.Dropout(first_dropout_rate)
#         self.tanh = nn.Tanh()
#         self.out = nn.Linear(hidden_output * 3, classes)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids
#         )
#         o1 = outputs.hidden_states[-1]
#         o2 = outputs.pooler_output
#         apool = torch.mean(o1, 1)
#         mpool, _ = torch.max(o1, 1)
#         pooled_output = o2
#         cat = torch.cat((apool, mpool, pooled_output), 1)
#         bo = self.bert_drop(cat)
#         logits = self.out(bo)
#         #logits = self.softmax(logits)
#         return logits

In [88]:
class BERTBengaliLastTwoPooler(nn.Module):
    """BERT encoder with a single linear head that returns raw logits.

    Head input, concatenated along the last axis: the max over dimension 1 of
    the last hidden state, position 0 of the second-to-last hidden state, and
    the encoder's pooler output. No softmax is applied in forward().
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l0 = nn.Linear(hidden_output * 3, classes)
        self.softmax = nn.Softmax(dim=1)  # registered, but forward() returns logits

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return (batch, classes) unnormalised scores."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        states = encoded.hidden_states
        max_pooled = states[-1].max(dim=1).values
        features = torch.cat(
            (max_pooled, states[-2][:, 0, :], encoded.pooler_output),
            dim=-1,
        )
        return self.l0(self.drop_out(features))

In [89]:
class BERTBengaliLastTwoPoolerP(nn.Module):
    """Two-output variant of the last-two/pooler head; returns raw logits.

    Same feature construction as BERTBengaliLastTwoPooler, but the linear head
    always has 2 outputs regardless of the `classes` setting.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l0 = nn.Linear(hidden_output * 3, 2)
        self.softmax = nn.Softmax(dim=1)  # registered, but forward() returns logits

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return (batch, 2) unnormalised scores."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        states = encoded.hidden_states
        max_pooled = states[-1].max(dim=1).values
        features = torch.cat(
            (max_pooled, states[-2][:, 0, :], encoded.pooler_output),
            dim=-1,
        )
        return self.l0(self.drop_out(features))

In [90]:
class CustomBERTBengali(nn.Module):
    """BERT encoder with a two-output linear + softmax head over mean and max pooling.

    `token_type_ids` is accepted for signature compatibility but is not passed
    to the encoder. The pooler output is not used by this variant.
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.tanh = nn.Tanh()  # registered, but forward() never calls it
        self.out = nn.Linear(hidden_output * 2, 2)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return a (batch, 2) tensor of class probabilities."""
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        last_hidden = encoded.hidden_states[-1]

        # mean and max of the last hidden state over dimension 1
        avg_pooled = last_hidden.mean(dim=1)
        max_pooled = last_hidden.max(dim=1).values
        features = torch.cat((avg_pooled, max_pooled), 1)

        scores = self.out(self.bert_drop(features))
        return self.softmax(scores)

In [91]:
#model for finetuning collected data
class BERTBengaliLastTwoPoolerFreeze(nn.Module):
    """BERT encoder followed by a three-layer tanh MLP head that returns raw logits.

    Head: Linear(3*hidden -> 2*hidden) -> tanh -> Linear(2*hidden -> 2*hidden)
    -> tanh -> Linear(2*hidden -> classes).

    The head input is the concatenation of position 0 of the second-to-last
    hidden state, the max over dimension 1 of the last hidden state, and the
    encoder's pooler output.

    NOTE(review): the previous forward() concatenated only the first two of
    these (2*hidden features) while `l2` expects 3*hidden, so every call raised
    a shape error. The layer shapes are kept as declared (so the state_dict
    layout is unchanged) and the pooler output is restored in the concat. If
    the intent was instead to drop the pooler output, `l2` must take
    `hidden_output * 2` inputs -- confirm.
    """

    def __init__(self, bert):
        super(BERTBengaliLastTwoPoolerFreeze, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l2 = nn.Linear(hidden_output * 3, hidden_output * 2)
        # one stateless Tanh is shared by both hidden layers
        self.activation = nn.Tanh()
        self.l1 = nn.Linear(hidden_output * 2, hidden_output * 2)
        self.l0 = nn.Linear(hidden_output * 2, classes)
        #torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.softmax = nn.Softmax(dim=1)  # registered, but forward() returns logits

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return (batch, classes) unnormalised scores."""
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        mpool, _ = torch.max(outputs.hidden_states[-1], 1)
        # three hidden-sized vectors, matching the 3*hidden input width of l2
        out = torch.cat((outputs.hidden_states[-2][:,0,:], mpool, outputs.pooler_output), dim=-1)
        out = self.drop_out(out)
        out = self.l2(out)
        out = self.activation(out)
        out = self.l1(out)
        out = self.activation(out)
        logits = self.l0(out)
        #prob = self.softmax(logits)
        return logits


In [92]:
#model for finetuning collected data
class BERTBengaliLastTwoPoolerFreezePrev(nn.Module):
    """BERT encoder followed by a two-layer tanh MLP head that returns raw logits.

    Head: Linear(2*hidden -> 2*hidden) -> tanh -> Linear(2*hidden -> classes).

    The head input is the concatenation of position 0 of the second-to-last
    hidden state and the max over dimension 1 of the last hidden state.

    NOTE(review): the previous forward() also concatenated the pooler output
    (3*hidden features) while `l1` expects 2*hidden, so every call raised a
    shape error. The layer shape is kept as declared (so the state_dict layout
    is unchanged) and the pooler output is dropped from the concat. If the
    pooler output was meant to be used, `l1` must take `hidden_output * 3`
    inputs -- confirm.
    """

    def __init__(self, bert):
        super(BERTBengaliLastTwoPoolerFreezePrev, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l1 = nn.Linear(hidden_output * 2, hidden_output * 2)
        self.activation = nn.Tanh()
        self.l0 = nn.Linear(hidden_output * 2, classes)
        #torch.nn.init.normal_(self.l0.weight, std=0.02)
        self.softmax = nn.Softmax(dim=1)  # registered, but forward() returns logits

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return (batch, classes) unnormalised scores."""
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        mpool, _ = torch.max(outputs.hidden_states[-1], 1)
        # two hidden-sized vectors, matching the 2*hidden input width of l1
        out = torch.cat((outputs.hidden_states[-2][:,0,:], mpool), dim=-1)
        out = self.drop_out(out)
        out = self.l1(out)
        out = self.activation(out)
        logits = self.l0(out)
        #prob = self.softmax(logits)
        return logits


In [93]:
bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
model = BERTBengaliLastTwoPooler(bert)
# NOTE(review): this second wrapper holds the *same* `bert` instance as `model`,
# so any weight update through one is visible through the other.
model2Forlastlayers = BERTBengaliLastTwoPoolerP(bert)

model.to(device)
# model2Forlastlayers.to(device)
# model2Forlastlayers.load_state_dict(torch.load(DirPath+'Models by Sami/'+bert_model_name+"_lasttwopooler_fromHS_freezeencoder_f1.pth"))

# model.l0 = model2Forlastlayers.l0
# model.l2 = model2Forlastlayers.l1
# model.bert = model2Forlastlayers.bert

# map_location remaps the stored tensors onto the active device, so a
# checkpoint saved on a GPU still loads on a CPU-only runtime.
model.load_state_dict(torch.load(DirPath+'Models by Sami/'+bert_model_name+"_lasttwopooler_fromHS_freezeencoder_f1.pth", map_location=device))

# Freeze the whole encoder first (this already covers the embeddings) ...
for params in model.bert.parameters():
  params.requires_grad = False
# ... then re-enable training for the transformer layers, the pooler and the head.
for params in model.bert.encoder.parameters():
  params.requires_grad = True
for params in model.bert.pooler.parameters():
  params.requires_grad = True
# for params in model.l2.parameters():
#   params.requires_grad = True
# for params in model.l1.parameters():
#   params.requires_grad = True
for params in model.l0.parameters():
  params.requires_grad = True

# Report which parameters will be updated during training.
for name, param in model.named_parameters():
  if param.requires_grad:
      print(f"name: {name} is trainable")
  else:
      print(f"name: {name} is non-trainable")

name: bert.embeddings.word_embeddings.weight is non-trainable
name: bert.embeddings.position_embeddings.weight is non-trainable
name: bert.embeddings.token_type_embeddings.weight is non-trainable
name: bert.embeddings.LayerNorm.weight is non-trainable
name: bert.embeddings.LayerNorm.bias is non-trainable
name: bert.encoder.layer.0.attention.self.query.weight is trainable
name: bert.encoder.layer.0.attention.self.query.bias is trainable
name: bert.encoder.layer.0.attention.self.key.weight is trainable
name: bert.encoder.layer.0.attention.self.key.bias is trainable
name: bert.encoder.layer.0.attention.self.value.weight is trainable
name: bert.encoder.layer.0.attention.self.value.bias is trainable
name: bert.encoder.layer.0.attention.output.dense.weight is trainable
name: bert.encoder.layer.0.attention.output.dense.bias is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.weight is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.bias is trainable
name: bert.e

In [94]:
#testing if the input of model works before starting training
# Smoke test: encode a single sentence, move the encoding to `device`, and run
# one forward pass. Gradients are not disabled and the model mode is not
# changed here, so the printed tensor carries a grad_fn.
s = "আমি বাংলায় গান গাই। [SEP]"
t = tokenizer.encode_plus(s, return_tensors="pt").to(device)
print(t)
# The encoding's keys (input_ids, token_type_ids, attention_mask) are passed
# to the model's forward() as keyword arguments.
out = model(**t)
print(out)

{'input_ids': tensor([[  101,   938, 37376,   100,   950, 18770,   950, 40102,   920,   102,
           102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[ 1.0077, -1.0590, -0.3506]], device='cuda:0',
       grad_fn=<AddmmBackward0>)


In [95]:
from torch.optim.lr_scheduler import StepLR

# Hand only the parameters left trainable to the optimizer.
optimizer = AdamW(
    [p for p in model.parameters() if p.requires_grad],
    lr=adam_opt_lr,
)
criterion = nn.CrossEntropyLoss()
# Multiply the learning rate by `scheduler_gamma` every `scheduler_step` scheduler steps.
scheduler = StepLR(
    optimizer,
    step_size=scheduler_step,
    gamma=scheduler_gamma,
)

In [96]:
def train(model, dataloader, optimizer, criterion, config):
    """Run one training epoch and return the summed (sample-weighted) loss.

    Args:
        model: module called with `input_ids`, `attention_mask`, `token_type_ids`.
        dataloader: iterable yielding `(texts, labels)` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as `criterion(model_output, labels)`.
        config: keyword arguments forwarded to `tokenizer.batch_encode_plus`.

    Returns:
        Sum over batches of `batch_loss * batch_size`; the caller divides by
        the dataset size to obtain the mean loss.

    Uses the module-level `tokenizer` and `device`.
    """
    model.train()  # enable training-mode behaviour
    running_loss = 0

    for text, labels in tqdm(dataloader):
        model.zero_grad()

        # tokenise the raw texts and move everything to the target device
        encoded = tokenizer.batch_encode_plus(text, **config)
        input_ids = encoded['input_ids'].to(device)
        token_type_ids = encoded['token_type_ids'].to(device)
        attention_mask = encoded['attention_mask'].to(device)
        labels = labels.to(device, dtype=torch.long)  # cast labels to torch.long for the loss

        predictions = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        batch_loss = criterion(predictions, labels)
        # weight the batch loss by the batch size
        running_loss += batch_loss.item() * input_ids.size(0)
        batch_loss.backward()

        # cap the gradient norm at 1.0 to guard against exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return running_loss

In [97]:
def evaluate(model, dataloader, criterion, config):
    """Evaluate `model` on `dataloader` and collect loss, accuracy and per-class counts.

    The whole pass runs under `torch.no_grad()`, so no autograd graph is built
    for validation batches.

    Args:
        model: module called with `input_ids`, `attention_mask`, `token_type_ids`;
            it is switched to eval mode and left there.
        dataloader: iterable yielding `(texts, labels)` batches.
        criterion: loss called as `criterion(model_output, labels)`.
        config: keyword arguments forwarded to `tokenizer.batch_encode_plus`.

    Returns:
        A 15-tuple:
        `(total, correct, valid_loss,
          label_0_TP, label_0_TN, label_0_FP, label_0_FN,
          label_1_TP, label_1_TN, label_1_FP, label_1_FN,
          label_2_TP, label_2_TN, label_2_FP, label_2_FN)`
        where `valid_loss` is the sum over batches of `batch_loss * batch_size`
        and the TP/TN/FP/FN counts are one-vs-rest for classes 0, 1 and 2.

    Uses the module-level `tokenizer` and `device`.
    """
    class_ids = (0, 1, 2)
    # one-vs-rest confusion counts per class, stored as [TP, TN, FP, FN]
    counts = {c: [0, 0, 0, 0] for c in class_ids}
    total = 0
    correct = 0
    valid_loss = 0.0

    model.eval()  # prep model for evaluation
    with torch.no_grad():
        for text, labels in dataloader:
            inputs = tokenizer.batch_encode_plus(
                text, **config
            )
            input_ids = inputs['input_ids'].to(device)
            token_type_ids = inputs['token_type_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device, dtype=torch.long)

            # move things to model
            output = model(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)

            # update running validation loss (weighted by batch size)
            valid_loss += criterion(output, labels).item() * input_ids.size(0)

            # predicted class = index of the largest score
            _, predicted = torch.max(output, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            for c in class_ids:
                is_pred = predicted == c
                is_true = labels == c
                counts[c][0] += (is_pred & is_true).sum().item()    # TP
                counts[c][1] += (~is_pred & ~is_true).sum().item()  # TN
                counts[c][2] += (is_pred & ~is_true).sum().item()   # FP
                counts[c][3] += (~is_pred & is_true).sum().item()   # FN

    return (total, correct, valid_loss, *counts[0], *counts[1], *counts[2])


In [98]:

# Keyword arguments forwarded to the tokenizer by train() and evaluate():
# every text is padded/truncated to `max_number_input_tokens` and returned as
# PyTorch tensors.
tokenizer_config = dict(
    max_length=max_number_input_tokens,
    padding="max_length",
    return_tensors="pt",
    truncation=True,
    add_special_tokens=True,
    truncation_strategy="longest_first",
)

In [99]:
def _precision_recall_f1(tp, fp, fn, eps):
    """Return (precision, recall, f1) from one-vs-rest counts; `eps` keeps the denominators non-zero."""
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)
    return precision, recall, f1


train_loss_data, valid_loss_data = [], []
# np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0
valid_loss_min = np.inf
since = time.time()
best_loss = np.inf
best_acc=0
sml = 1e-10
# NOTE(review): presumably the best macro-F1 reached in an earlier session, so
# only checkpoints that beat it are saved -- confirm.
best_f1=0.5812

for epoch in range(epochs):

    # NOTE(review): rebuilds the training loader from the same df_train at the
    # start of the second epoch; kept as in the original.
    if epoch==1:
      training_data = NewsDatasets(df_train)
      train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)
    print("Epoch: {}/{}".format(epoch + 1, epochs))

    e_since = time.time()

    # Train Model (returns the sample-weighted loss sum for the epoch)
    train_loss = 0.0 + train(model, train_dataloader, optimizer, criterion, tokenizer_config)
    # Now Evaluate
    out = evaluate(model, val_dataloader, criterion, tokenizer_config)
    (total, correct, valid_loss,
     label_0_TP, label_0_TN, label_0_FP, label_0_FN,
     label_1_TP, label_1_TN, label_1_FP, label_1_FN,
     label_2_TP, label_2_TN, label_2_FP, label_2_FN) = out

    # Calculate precision, recall, and F1-score for each class
    label_0_precision, label_0_recall, label_0_f1_score = _precision_recall_f1(label_0_TP, label_0_FP, label_0_FN, sml)
    label_1_precision, label_1_recall, label_1_f1_score = _precision_recall_f1(label_1_TP, label_1_FP, label_1_FN, sml)
    label_2_precision, label_2_recall, label_2_f1_score = _precision_recall_f1(label_2_TP, label_2_FP, label_2_FN, sml)

    # Calculate combined (macro-averaged) F1-score
    combined_f1_score = (label_0_f1_score + label_1_f1_score + label_2_f1_score) / 3

    # Calculate micro TP, TN, FP, FN values
    micro_TP = label_0_TP + label_1_TP + label_2_TP
    micro_TN = label_0_TN + label_1_TN + label_2_TN
    micro_FP = label_0_FP + label_1_FP + label_2_FP
    micro_FN = label_0_FN + label_1_FN + label_2_FN

    # Calculate micro precision, recall, and F1 score. The guards avoid a
    # ZeroDivisionError when no prediction is correct (or the set is empty);
    # for non-zero denominators the values are exactly those of the plain formula.
    micro_precision = micro_TP / (micro_TP + micro_FP) if (micro_TP + micro_FP) else 0.0
    micro_recall = micro_TP / (micro_TP + micro_FN) if (micro_TP + micro_FN) else 0.0
    if micro_precision + micro_recall:
        micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)
    else:
        micro_f1 = 0.0

    scheduler.step()

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(val_dataloader.dataset)

    val_acc=correct / total * 100

    # record the epoch losses (scaled by 100)
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    # keep only checkpoints that improve the macro F1
    if combined_f1_score > best_f1:
        best_f1 = combined_f1_score
        torch.save(model.state_dict(), DirPath+'Models by Sami/'+bert_model_name+"_lasttwopooler_fromHS_contest_midpoolernonfreeze_f1.pth")
        print(f'saved on epoch: {epoch+1}')

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tVal Accuracy: {:.4f}".format(val_acc))
    print("\tLabel 0 Precision: {:.4f}\tLabel 0 Recall: {:.4f}\tLabel 0 F1-score: {:.4f}\n"
      "\tLabel 1 Precision: {:.4f}\tLabel 1 Recall: {:.4f}\tLabel 1 F1-score: {:.4f}\n"
      "\tLabel 2 Precision: {:.4f}\tLabel 2 Recall: {:.4f}\tLabel 2 F1-score: {:.4f}\n"
      "\tCombined F1-score: {:.4f}".format(label_0_precision, label_0_recall, label_0_f1_score,
                                            label_1_precision, label_1_recall, label_1_f1_score,
                                            label_2_precision, label_2_recall, label_2_f1_score,
                                            combined_f1_score))
    print(f'micro precision: {micro_precision}, Micro recall: {micro_recall}, micro f1: {micro_f1}')

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.829483.. 	Valid Loss:1.129772.. 	Val Accuracy: 56.0902
	Label 0 Precision: 0.8840	Label 0 Recall: 0.3933	Label 0 F1-score: 0.5444
	Label 1 Precision: 0.4976	Label 1 Recall: 0.7410	Label 1 F1-score: 0.5954
	Label 2 Precision: 0.3974	Label 2 Recall: 0.7908	Label 2 F1-score: 0.5290
	Combined F1-score: 0.5563
micro precision: 0.5609022556390978, Micro recall: 0.5609022556390978, micro f1: 0.5609022556390978
Epoch: 2/100


  0%|          | 0/521 [00:00<?, ?it/s]

saved on epoch: 2
	Train loss:0.541311.. 	Valid Loss:0.813111.. 	Val Accuracy: 70.7519
	Label 0 Precision: 0.7961	Label 0 Recall: 0.7406	Label 0 F1-score: 0.7673
	Label 1 Precision: 0.5908	Label 1 Recall: 0.8034	Label 1 F1-score: 0.6809
	Label 2 Precision: 0.7812	Label 2 Recall: 0.3827	Label 2 F1-score: 0.5137
	Combined F1-score: 0.6540
micro precision: 0.7075187969924812, Micro recall: 0.7075187969924812, micro f1: 0.7075187969924812
Epoch: 3/100


  0%|          | 0/521 [00:00<?, ?it/s]

saved on epoch: 3
	Train loss:0.368552.. 	Valid Loss:0.919658.. 	Val Accuracy: 70.9774
	Label 0 Precision: 0.8165	Label 0 Recall: 0.7322	Label 0 F1-score: 0.7721
	Label 1 Precision: 0.6134	Label 1 Recall: 0.7458	Label 1 F1-score: 0.6732
	Label 2 Precision: 0.6000	Label 2 Recall: 0.5510	Label 2 F1-score: 0.5745
	Combined F1-score: 0.6732
micro precision: 0.7097744360902256, Micro recall: 0.7097744360902256, micro f1: 0.7097744360902256
Epoch: 4/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.278993.. 	Valid Loss:1.363790.. 	Val Accuracy: 69.3985
	Label 0 Precision: 0.7249	Label 0 Recall: 0.8563	Label 0 F1-score: 0.7852
	Label 1 Precision: 0.7356	Label 1 Recall: 0.4604	Label 1 F1-score: 0.5664
	Label 2 Precision: 0.5270	Label 2 Recall: 0.5969	Label 2 F1-score: 0.5598
	Combined F1-score: 0.6371
micro precision: 0.693984962406015, Micro recall: 0.693984962406015, micro f1: 0.693984962406015
Epoch: 5/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.183524.. 	Valid Loss:1.689367.. 	Val Accuracy: 72.6316
	Label 0 Precision: 0.7474	Label 0 Recall: 0.8870	Label 0 F1-score: 0.8112
	Label 1 Precision: 0.7003	Label 1 Recall: 0.5827	Label 1 F1-score: 0.6361
	Label 2 Precision: 0.6591	Label 2 Recall: 0.4439	Label 2 F1-score: 0.5305
	Combined F1-score: 0.6593
micro precision: 0.7263157894736842, Micro recall: 0.7263157894736842, micro f1: 0.7263157894736842
Epoch: 6/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.124272.. 	Valid Loss:2.088251.. 	Val Accuracy: 72.1053
	Label 0 Precision: 0.7319	Label 0 Recall: 0.8870	Label 0 F1-score: 0.8020
	Label 1 Precision: 0.6882	Label 1 Recall: 0.5875	Label 1 F1-score: 0.6339
	Label 2 Precision: 0.7429	Label 2 Recall: 0.3980	Label 2 F1-score: 0.5183
	Combined F1-score: 0.6514
micro precision: 0.7210526315789474, Micro recall: 0.7210526315789474, micro f1: 0.7210526315789474
Epoch: 7/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.090159.. 	Valid Loss:2.123689.. 	Val Accuracy: 72.8571
	Label 0 Precision: 0.7684	Label 0 Recall: 0.8466	Label 0 F1-score: 0.8056
	Label 1 Precision: 0.6603	Label 1 Recall: 0.6619	Label 1 F1-score: 0.6611
	Label 2 Precision: 0.7049	Label 2 Recall: 0.4388	Label 2 F1-score: 0.5409
	Combined F1-score: 0.6692
micro precision: 0.7285714285714285, Micro recall: 0.7285714285714285, micro f1: 0.7285714285714285
Epoch: 8/100


  0%|          | 0/521 [00:00<?, ?it/s]

saved on epoch: 8
	Train loss:0.044461.. 	Valid Loss:2.244994.. 	Val Accuracy: 72.4812
	Label 0 Precision: 0.8159	Label 0 Recall: 0.7727	Label 0 F1-score: 0.7937
	Label 1 Precision: 0.6230	Label 1 Recall: 0.7530	Label 1 F1-score: 0.6819
	Label 2 Precision: 0.6531	Label 2 Recall: 0.4898	Label 2 F1-score: 0.5598
	Combined F1-score: 0.6784
micro precision: 0.724812030075188, Micro recall: 0.724812030075188, micro f1: 0.7248120300751879
Epoch: 9/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.032382.. 	Valid Loss:2.372397.. 	Val Accuracy: 71.8045
	Label 0 Precision: 0.7924	Label 0 Recall: 0.7824	Label 0 F1-score: 0.7874
	Label 1 Precision: 0.6377	Label 1 Recall: 0.7050	Label 1 F1-score: 0.6697
	Label 2 Precision: 0.6211	Label 2 Recall: 0.5102	Label 2 F1-score: 0.5602
	Combined F1-score: 0.6724
micro precision: 0.7180451127819549, Micro recall: 0.7180451127819549, micro f1: 0.7180451127819549
Epoch: 10/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.029714.. 	Valid Loss:2.659488.. 	Val Accuracy: 71.5789
	Label 0 Precision: 0.7918	Label 0 Recall: 0.7796	Label 0 F1-score: 0.7857
	Label 1 Precision: 0.6112	Label 1 Recall: 0.7314	Label 1 F1-score: 0.6659
	Label 2 Precision: 0.7040	Label 2 Recall: 0.4490	Label 2 F1-score: 0.5483
	Combined F1-score: 0.6666
micro precision: 0.7157894736842105, Micro recall: 0.7157894736842105, micro f1: 0.7157894736842105
Epoch: 11/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.019753.. 	Valid Loss:2.742683.. 	Val Accuracy: 73.0827
	Label 0 Precision: 0.7681	Label 0 Recall: 0.8452	Label 0 F1-score: 0.8048
	Label 1 Precision: 0.6784	Label 1 Recall: 0.6475	Label 1 F1-score: 0.6626
	Label 2 Precision: 0.6713	Label 2 Recall: 0.4898	Label 2 F1-score: 0.5664
	Combined F1-score: 0.6779
micro precision: 0.7308270676691729, Micro recall: 0.7308270676691729, micro f1: 0.730827067669173
Epoch: 12/100


  0%|          | 0/521 [00:00<?, ?it/s]

	Train loss:0.006282.. 	Valid Loss:2.984473.. 	Val Accuracy: 70.9023
	Label 0 Precision: 0.8130	Label 0 Recall: 0.7517	Label 0 F1-score: 0.7812
	Label 1 Precision: 0.6116	Label 1 Recall: 0.7098	Label 1 F1-score: 0.6570
	Label 2 Precision: 0.5902	Label 2 Recall: 0.5510	Label 2 F1-score: 0.5699
	Combined F1-score: 0.6694
micro precision: 0.7090225563909774, Micro recall: 0.7090225563909774, micro f1: 0.7090225563909774
Epoch: 13/100


  0%|          | 0/521 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [82]:
# Ask PyTorch's CUDA caching allocator to release cached GPU memory that is no longer in use.
torch.cuda.empty_cache()

In [None]:
# Write the weights currently held in `model` to Drive under a separate filename.
# NOTE(review): this saves whatever state `model` is in when the cell runs, which is
# not necessarily the best-F1 checkpoint written inside the training loop - confirm.
torch.save(model.state_dict(), DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_finalhs_midnonfrozen_acc1_sub_finaluntested.pth")

In [None]:
from matplotlib import pyplot as plt

# Per-epoch loss curves. The training loop appends the average epoch loss
# multiplied by 100 to each list, hence the y-axis label.
fig, ax = plt.subplots()
ax.plot(train_loss_data, label="Training loss")
ax.plot(valid_loss_data, label="validation loss")
ax.set_xlabel("Epoch (0-indexed)")
ax.set_ylabel("Average loss x 100")
ax.set_title("Training vs. validation loss")
ax.legend(frameon=False)
# Render explicitly so the cell does not echo the repr of the last expression.
plt.show()

In [None]:
# Load previously saved weights into `model`, remapping stored tensors onto `device`.
# NOTE(review): "_lasttwopooler_contest_val.pth" does not match the filename of any
# torch.save call visible in this part of the notebook - confirm it is the intended checkpoint.
model.load_state_dict(torch.load(DirPath+bert_model_name+"_lasttwopooler_contest_val.pth", map_location = device))

In [None]:
# Collect predicted and true class indices over the whole test loader.
all_preds = []
all_labels = []

# Inference only: switch dropout (and similar layers) to eval behaviour and skip
# autograd bookkeeping so no graph is kept alive for each test batch.
model.eval()
with torch.no_grad():
    for text, labels in test_dataloader:
        inputs = tokenizer.batch_encode_plus(
            text, **tokenizer_config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # move things to model
        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

        # Highest-scoring class per sample.
        preds = np.argmax(output.cpu().numpy(), axis=1)
        all_preds.extend(preds)
        # Labels are only collected, never fed to the model, so they are not moved to `device`.
        all_labels.extend(labels.cpu().numpy())

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the predictions gathered from the test loader.
test_report = classification_report(all_labels, all_preds)
print(test_report)

<h1>Training the model on the full collected dataset with the selected model and hyperparameters (code not yet updated or run; ignore it)</h1>

In [None]:
!pip install --quiet transformers

In [None]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

In [None]:
# Read the three CSV splits, keeping only the text column and its label column.
kept_columns = ['sentence', 'hate speech']
df_train, df_val, df_test = [
    pd.read_csv(csv_name)[kept_columns]
    for csv_name in ('train.csv', 'val.csv', 'test.csv')
]

# Fold the validation and test rows into the training frame (fresh 0..n-1 index).
df_train = pd.concat([df_train, df_val, df_test], ignore_index=True)

# Shapes first (one per line), then the combined frame and its summary statistics.
print(df_train.shape, df_val.shape, df_test.shape, sep='\n')
print(df_train, df_train.describe(), sep='\n')

In [None]:
# Hyperparameters carried over from the earlier experiments.

# --- tokenisation / batching ---
max_number_input_tokens = 256   # passed to the tokenizer as max_length
batch_size_training = 16        # batch size for every DataLoader

# --- model ---
bert_model_name = "sagorsarker/bangla-bert-base"
hidden_output = 768             # the classifier input width is 3 * hidden_output
first_dropout_rate = 0.3        # dropout applied before the classifier layer
classes = 2                     # number of output classes

# --- optimisation ---
adam_opt_lr = 3e-5              # AdamW learning rate
scheduler_step = 1              # StepLR step_size
scheduler_gamma = 0.8           # StepLR decay factor
epochs = 6

In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; checkpoints are written below DirPath.
drive.mount('/content/drive')

DirPath = '/content/drive/My Drive/Test/'

In [None]:
class NewsDatasets(Dataset):
    """Expose a DataFrame as a dataset of ``(sentence, hate speech)`` pairs.

    Args:
        data: frame-like object indexed positionally through ``.iloc``; each
            row must provide the ``'sentence'`` and ``'hate speech'`` fields.
        max_length: value stored under ``"max_length"`` in ``self.config``.
    """

    def __init__(self, data, max_length=max_number_input_tokens):
        self.data = data

        # Tokenizer keyword arguments kept on the instance; nothing inside this
        # class reads them.
        self.config = dict(
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            truncation_strategy="longest_first",
        )

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the raw text and its label for the row at position ``idx``."""
        row = self.data.iloc[idx]
        return row['sentence'], row['hate speech']

In [None]:
# Training batches are reshuffled at the start of every epoch.
training_data = NewsDatasets(df_train)
train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)

# Evaluation loaders keep the dataset order: shuffling brings no benefit when
# only aggregate metrics are computed, and a fixed order keeps evaluation
# reproducible and predictions aligned with the source rows.
val_data = NewsDatasets(df_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size_training, shuffle=False)

test_data = NewsDatasets(df_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=False)

In [None]:
class CustomBERTBengali(nn.Module):
    """BERT encoder followed by a linear classifier over three pooled views.

    The classifier input is the concatenation of
      * the mean over dim 1 of the last entry of ``hidden_states``,
      * the max over dim 1 of the same tensor, and
      * the encoder's ``pooler_output``,
    which is why the linear layer takes ``hidden_output * 3`` features.

    ``forward`` returns raw, unnormalised class scores (logits). The notebook
    trains this model with ``nn.CrossEntropyLoss``, which applies log-softmax
    internally; normalising here as well would apply softmax twice, bounding
    the loss and shrinking the gradients. Argmax-based predictions are the
    same with or without the softmax. Call ``torch.softmax(logits, dim=1)`` on
    the result when probabilities are needed.

    Args:
        bert: encoder called as ``bert(input_ids, attention_mask=...,
            token_type_ids=...)`` whose result exposes ``hidden_states`` and
            ``pooler_output`` (so it must be built with hidden-state output enabled).
    """

    def __init__(self, bert):
        super().__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        # `tanh` and `softmax` hold no parameters and are not used by forward();
        # they are kept so the module's attributes stay as before.
        self.tanh = nn.Tanh()
        self.out = nn.Linear(hidden_output * 3, classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return class logits with one row per input sequence.

        NOTE(review): the pooling assumes dim 1 of the hidden states is the
        token axis, i.e. (batch, seq, hidden) - confirm against the encoder.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        last_hidden = outputs.hidden_states[-1]
        pooled_output = outputs.pooler_output

        apool = torch.mean(last_hidden, 1)
        mpool, _ = torch.max(last_hidden, 1)

        cat = torch.cat((apool, mpool, pooled_output), 1)
        bo = self.bert_drop(cat)

        # Raw scores only: the loss function does its own normalisation.
        logits = self.out(bo)
        return logits

In [None]:
# Pretrained encoder; output_hidden_states=True makes its forward pass also return
# the per-layer hidden states, which the custom model reads via `outputs.hidden_states`.
bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
# Tokenizer loaded from the same checkpoint name as the encoder.
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Wrap the pretrained encoder in the custom classifier and move its weights to `device`.
model = CustomBERTBengali(bert)
model.to(device)

In [None]:
from torch.optim.lr_scheduler import StepLR

# AdamW over every parameter returned by model.parameters().
optimizer = AdamW(model.parameters(), lr=adam_opt_lr)
# Cross-entropy over class scores with integer class-index targets.
criterion = nn.CrossEntropyLoss()
# Multiplies the learning rate by `scheduler_gamma` every `scheduler_step` calls to scheduler.step().
scheduler = StepLR(optimizer, step_size=scheduler_step, gamma=scheduler_gamma)

In [None]:
def train(model, dataloader, optimizer, criterion, config):
    """Run one optimisation pass over ``dataloader``.

    Relies on the notebook-level ``tokenizer``, ``device`` and ``tqdm``.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader: iterable yielding ``(text, labels)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(model_output, labels)``.
        config: keyword arguments forwarded to ``tokenizer.batch_encode_plus``.

    Returns:
        Sum over batches of ``batch loss * batch size``; the caller divides by
        the dataset size to obtain the average loss.
    """
    model.train()  # enable training-mode behaviour (e.g. dropout)
    running_loss = 0

    for text, labels in tqdm(dataloader):
        model.zero_grad()

        encoded = tokenizer.batch_encode_plus(text, **config)
        input_ids, token_type_ids, attention_mask = (
            encoded[field].to(device)
            for field in ('input_ids', 'token_type_ids', 'attention_mask')
        )
        # The loss expects integer class indices.
        labels = labels.to(device, dtype=torch.long)

        logs = model(
            token_type_ids=token_type_ids,
            input_ids=input_ids,
            attention_mask=attention_mask,
        )

        batch_loss = criterion(logs, labels)
        # Weight by batch size so a smaller final batch is accounted for correctly.
        running_loss += batch_loss.item() * input_ids.size(0)
        batch_loss.backward()

        # Cap the gradient norm at 1.0 to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return running_loss

In [None]:
def evaluate(model, dataloader, criterion, config):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Relies on the notebook-level ``tokenizer`` and ``device``. The pass runs
    under ``torch.no_grad()`` so no autograd graph is built for the batches.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments; returns one score per class.
        dataloader: iterable yielding ``(text, labels)`` batches.
        criterion: loss called as ``criterion(model_output, labels)``.
        config: keyword arguments forwarded to ``tokenizer.batch_encode_plus``.

    Returns:
        ``(total, correct, valid_loss)``: number of samples seen, number of
        samples whose highest-scoring class equals the label, and the sum over
        batches of ``batch loss * batch size``.
    """
    total = 0
    correct = 0
    valid_loss = 0.0

    model.eval()  # prep model for evaluation
    with torch.no_grad():
        for text, labels in dataloader:
            inputs = tokenizer.batch_encode_plus(
                text, **config
            )
            input_ids = inputs['input_ids'].to(device)
            token_type_ids = inputs['token_type_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            # Same integer dtype the training step uses for its targets.
            labels = labels.to(device, dtype=torch.long)

            # move things to model
            output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)

            loss_p = criterion(output, labels)
            # Weight by batch size so a smaller final batch is accounted for correctly.
            valid_loss += loss_p.item() * input_ids.size(0)

            # Predicted class = index of the highest score.
            _, predicted = torch.max(output, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return total, correct, valid_loss

In [None]:
# Keyword arguments handed to tokenizer.batch_encode_plus for every batch:
# pad/truncate each text to max_number_input_tokens and return PyTorch tensors.
tokenizer_config = dict(
    max_length=max_number_input_tokens,
    padding="max_length",
    return_tensors="pt",
    truncation=True,
    add_special_tokens=True,
    truncation_strategy="longest_first",
)

In [None]:
# Train for a fixed number of epochs, recording losses and overwriting the
# checkpoint after every epoch, so the file ends up with the last epoch's weights.
train_loss_data, valid_loss_data = [], []
# np.inf is the supported spelling; the np.Inf alias was removed in NumPy 2.0.
# valid_loss_min is not read anywhere in this cell.
valid_loss_min = np.inf
since = time.time()
best_loss = np.inf

for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    e_since = time.time()  # epoch start time (not read in this cell)

    # Train Model: returns the batch-size-weighted sum of batch losses
    train_loss = train(model, train_dataloader, optimizer, criterion, tokenizer_config)
    # Now Evaluate: sample count, correct count, weighted loss sum
    total, correct, valid_loss = evaluate(model, val_dataloader, criterion, tokenizer_config)

    scheduler.step()

    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(val_dataloader.dataset)

    # stored scaled by 100 for plotting
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    # Unconditional save: every epoch overwrites the same checkpoint file.
    best_loss = valid_loss
    torch.save(model.state_dict(), DirPath+bert_model_name+"_CustomBertBengaliFullDataset6epoch885044valacc.pth")
    print(f'epoch: {epoch+1}')

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tVal Accuracy: {:.4f}".format(correct / total * 100))

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))