<h1>Loading train, validation and test data</h1>

In [1]:
!pip install --quiet transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m58.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m37.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m97.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m84.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m34.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AutoModelForMaskedLM, AutoTokenizer

In [3]:
### Hyperparameters for the new model
# Every tunable value used by the cells below lives in this cell.
# Maximum token length passed to the tokenizer (used as `max_length`).
max_number_input_tokens=512
# Batch size shared by the train, validation and test DataLoaders.
batch_size_training = 8
# Dropout probability applied to the pooled features before the classifier head.
first_dropout_rate = 0.0
# Base width used to size the Linear layers of the heads (multiplied by 2 or 3 there).
hidden_output = 768
# Checkpoint name handed to `from_pretrained` for both encoder and tokenizer.
bert_model_name = "bert-base-multilingual-cased"
# Learning rate for AdamW.
adam_opt_lr = 8e-6
# StepLR settings: multiply the learning rate by `scheduler_gamma` every `scheduler_step` scheduler steps.
scheduler_step = 1
scheduler_gamma = 0.98
# Number of passes over the training set.
epochs = 8
# Number of output classes of the classifier head.
classes = 2
# NOTE(review): `model_layer` and `name_change` are not referenced by any visible cell — confirm before removing.
model_layer = ''
name_change=''
# Text inserted between Headline and Content when a sample is built
# (the literal evaluates to: space, two backslashes, space).
headlineContentSeparator = ' \\\\ '

# Other options
# NOTE(review): the visible training loop saves the best checkpoint without consulting this flag.
isSaveModel = False

In [4]:
from google.colab import drive

# Mount Google Drive so the dataset and checkpoint paths below resolve.
drive.mount('/content/drive')

# Project root on Drive; every other path is built from it.
DirPath = '/content/drive/My Drive/FakeNews/'
TestPath = f'{DirPath}test.csv'
ValPath = f'{DirPath}val.csv'
TrainPath = f'{DirPath}train.csv'
ModelPath = f'{DirPath}Models/FNBaseline_mbertCased.pth'

Mounted at /content/drive


In [5]:
def interchange(df_train, pos, label, label_col='label'):
  """Swap the row at position ``pos`` with the first row whose label equals ``label``.

  The swap is done in place and the same DataFrame is returned. The index of
  the frame is left untouched; only the row contents trade places.

  Args:
    df_train: DataFrame containing the column ``label_col``.
    pos: Integer (positional) row number to receive the matching sample.
    label: Label value to search for.
    label_col: Name of the label column (defaults to ``'label'``).

  Returns:
    ``df_train`` itself, after the swap.

  Raises:
    IndexError: if no row carries ``label``.
  """
  # Look the match up by *position*: `.index[0]` would give an index label,
  # which is only a valid `.iloc` argument when the index is a default RangeIndex.
  match_positions = np.flatnonzero((df_train[label_col] == label).to_numpy())
  if match_positions.size == 0:
    raise IndexError(f"no row with {label_col} == {label!r}")
  first_index = pos
  match_index = int(match_positions[0])

  # Relabel the swapped rows with the destination index so the assignment gives
  # the same result whether pandas assigns positionally or aligns on the index.
  swapped = df_train.iloc[[match_index, first_index]].copy()
  swapped.index = df_train.index[[first_index, match_index]]
  df_train.iloc[[first_index, match_index]] = swapped
  return df_train

In [6]:
def balanceclasses(df_train, label_col='label', random_state=None):
  """Oversample every minority class up to the size of the largest class.

  Rows of each smaller class are repeated cyclically (in their original order)
  until the class has as many rows as the most frequent one. The combined
  frame is then shuffled and given a fresh 0..n-1 index.

  Args:
    df_train: DataFrame containing the column ``label_col``.
    label_col: Name of the label column. Defaults to ``'label'``; pass
      ``'Label'`` for frames that use the capitalised column name.
    random_state: Forwarded to ``DataFrame.sample`` so the shuffle can be
      reproduced. ``None`` gives a different order on every call.

  Returns:
    A new, shuffled DataFrame in which all classes have equal counts.
  """
  class_counts = df_train[label_col].value_counts()
  # Size of the largest class: the count every other class is padded up to.
  target_count = class_counts.max()

  # One frame of duplicated rows per class that is smaller than the target.
  extra_frames = []
  for label, count in class_counts.items():
    if count == target_count:
      continue
    df_label = df_train[df_train[label_col] == label]
    # count * (target // count) always exceeds the deficit, so head() can trim to it.
    num_copies = target_count // count
    repeated = pd.concat([df_label] * num_copies, ignore_index=True)
    extra_frames.append(repeated.head(target_count - count))

  # Append the duplicates to the original rows, then shuffle and reindex.
  df_balanced = pd.concat([df_train] + extra_frames, ignore_index=True)
  return df_balanced.sample(frac=1, random_state=random_state).reset_index(drop=True)

In [7]:
# Read the three pre-split CSV files (train / validation / test) from Drive.
df_train, df_val, df_test = (
    pd.read_csv(csv_path) for csv_path in (TrainPath, ValPath, TestPath)
)

# Row/column counts of each split, in train, validation, test order.
for split_frame in (df_train, df_val, df_test):
    print(split_frame.shape)

# Preview and numeric summary of the training split.
print(df_train)
print(df_train.describe())

# Class distribution of each split, in the same order as above.
for split_frame in (df_train, df_val, df_test):
    print(split_frame['Label'].value_counts())

(43106, 3)
(9237, 3)
(9238, 3)
                                                Headline  \
0             ঢাকায় বাসের ধাক্কায় কৃষি কর্মকর্তার মৃত্যু   
1                   জাপানে ঘূর্ণিঝড় ট্রামির আঘাতে নিহত ২   
2      প্রতিরক্ষা মন্ত্রণালয়ে বিশাল নিয়োগ বিজ্ঞপ্তি ২০১৮   
3            তত্ত্বাবধায়ক সরকার গঠনের নির্দেশনা চেয়ে রিট   
4      জগাখিচুড়ি ঐক্যের কোনও ভবিষ্যৎ নেই: ওবায়দুল কাদের   
...                                                  ...   
43101  মধ্য প্রদেশে আশ্রয় কেন্দ্রে প্রতিবন্ধী নারীকে ...   
43102    ঢাকা ভাগের প্রতিবাদে মহা-সমাবেশ | দৈনিক মতিকণ্ঠ   
43103  ChotoBhai' এর ভিডিও রিপোর্ট করে takeout 2.0 থে...   
43104  'আমি কিছুই জানি না, ও তো সৎ ছিল,' মিঠুন প্রসঙ্...   
43105             হাসানুল হক ইনুর বিরুদ্ধ মানহানির মামলা   

                                                 Content  Label  
0      নিজস্ব প্রতিবেদক : রাজধানীর ফার্মগেটে বাসের ধা...      1  
1      জাপানে শক্তিশালী ঘূর্ণিঝড়ের আঘাতে দুজনের প্রাণ...      1  
2      প্রতিরক্ষা মন্ত্রণালয়ের নিয়ন্ত্রণাধীন প্রধা

<h1>Preparing training, validation and test data. Preparing the model. Training the model</h1>

In [8]:
class NewsDatasets(Dataset):
    """Map-style dataset yielding ``(text, label)`` pairs from a news DataFrame.

    ``text`` is the row's ``Headline`` and ``Content`` joined by the
    module-level ``headlineContentSeparator``; ``label`` is the ``Label`` value.
    """

    def __init__(self, data, max_length=max_number_input_tokens):
        """Store the frame and a tokenizer keyword-argument dict.

        Args:
            data: DataFrame with ``Headline``, ``Content`` and ``Label`` columns.
            max_length: Value stored under ``"max_length"`` in ``self.config``.
        """
        self.data = data
        # Kept on the instance only; no method of this class reads it.
        self.config = dict(
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            truncation_strategy="longest_first",
        )

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(headline + separator + content, label)`` for positional row ``idx``."""
        row = self.data.iloc[idx]
        text = row['Headline'] + headlineContentSeparator + row['Content']
        return text, row['Label']

In [9]:
# Wrap each split in the dataset class (train, validation, test order).
training_data, val_data, test_data = (
    NewsDatasets(split_frame) for split_frame in (df_train, df_val, df_test)
)

# Only the training loader shuffles; validation and test keep file order.
train_dataloader = DataLoader(training_data, shuffle=True, batch_size=batch_size_training)
val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size_training)
test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size_training)

In [10]:
# #model for finetuning collected data
# class BERTBengaliLastTwoPooler(nn.Module):
#     def __init__(self, bert):
#         super(BERTBengaliLastTwoPooler, self).__init__()
#         self.bert = bert
#         self.drop_out = nn.Dropout(first_dropout_rate)
#         self.l0 =  nn.Linear(hidden_output * 3, classes)
#         #torch.nn.init.normal_(self.l0.weight, std=0.02)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids
#         )
#         mpool, _ = torch.max(outputs.hidden_states[-1], 1)
#         out = torch.cat((mpool, outputs.hidden_states[-2][:,0,:],outputs.pooler_output), dim=-1)
#         out = self.drop_out(out)
#         #out = out[:,0,:]
#         logits = self.l0(out)
#         logits = self.softmax(logits)
#         return logits

In [11]:
# class CustomBERTBengali(nn.Module):
#     def __init__(self, bert):
#         super(CustomBERTBengali, self).__init__()
#         self.bert = bert
#         self.bert_drop = nn.Dropout(first_dropout_rate)
#         self.tanh = nn.Tanh()
#         self.out = nn.Linear(hidden_output * 3, classes)
#         self.softmax = nn.Softmax(dim=1)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(
#             input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids
#         )
#         o1 = outputs.hidden_states[-1]
#         o2 = outputs.pooler_output
#         apool = torch.mean(o1, 1)
#         mpool, _ = torch.max(o1, 1)
#         pooled_output = o2
#         cat = torch.cat((apool, mpool, pooled_output), 1)
#         bo = self.bert_drop(cat)
#         logits = self.out(bo)
#         #logits = self.softmax(logits)
#         return logits

In [12]:
class BERTBengaliLastTwoPooler(nn.Module):
    """Linear classifier on top of three encoder features.

    The features, concatenated along the last dimension, are:
    the max over dim 1 of the last hidden state, position 0 (along dim 1) of
    the second-to-last hidden state, and the encoder's ``pooler_output``.
    The encoder must therefore return ``hidden_states``.
    """

    def __init__(self, bert):
        """Store the encoder ``bert`` and build dropout plus the linear head."""
        super().__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l0 = nn.Linear(hidden_output * 3, classes)
        # Kept for completeness; forward() returns raw scores and never applies it.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores with ``classes`` columns."""
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        last_hidden = encoded.hidden_states[-1]
        second_last_hidden = encoded.hidden_states[-2]

        max_pooled = last_hidden.max(dim=1).values
        first_position = second_last_hidden[:, 0, :]
        features = torch.cat((max_pooled, first_position, encoded.pooler_output), dim=-1)

        return self.l0(self.drop_out(features))

In [13]:
class CustomBERTBengali(nn.Module):
    """Linear classifier on mean- and max-pooled last-layer encoder states.

    ``forward`` returns raw, unnormalised scores. The notebook trains with
    ``nn.CrossEntropyLoss``, which applies log-softmax itself, so applying
    ``Softmax`` here as well would squash the scores twice and weaken the
    gradients. Call ``self.softmax`` on the output when probabilities are needed.
    The encoder must return ``hidden_states``.
    """

    def __init__(self, bert):
        """Store the encoder ``bert`` and build dropout plus the linear head."""
        super(CustomBERTBengali, self).__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.tanh = nn.Tanh()
        # Two pooled vectors are concatenated, hence hidden_output * 2 inputs.
        self.out = nn.Linear(hidden_output * 2, classes)
        # Not applied in forward(); available for turning scores into probabilities.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return unnormalised class scores.

        ``token_type_ids`` is accepted so all models share one call signature,
        but it is not forwarded to the encoder.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
        )
        last_hidden = outputs.hidden_states[-1]
        # Pool over dim 1; padded positions are included in both reductions.
        apool = torch.mean(last_hidden, 1)
        mpool, _ = torch.max(last_hidden, 1)
        features = torch.cat((apool, mpool), 1)
        features = self.bert_drop(features)
        logits = self.out(features)
        return logits

In [14]:
#model for finetuning collected data
class BERTBengaliLastTwoPoolerFreeze(nn.Module):
    """Three-layer MLP head (l2 -> l1 -> l0, Tanh in between) on encoder features.

    The features, concatenated along the last dimension, are: position 0
    (along dim 1) of the second-to-last hidden state, the max over dim 1 of
    the last hidden state, and the encoder's ``pooler_output``. That gives
    ``hidden_output * 3`` values per sample, matching the input width of
    ``l2``. The encoder must return ``hidden_states``.
    """

    def __init__(self, bert):
        """Store the encoder ``bert`` and build the dropout and MLP layers."""
        super(BERTBengaliLastTwoPoolerFreeze, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        self.l2 = nn.Linear(hidden_output * 3, hidden_output * 2)
        self.l1 = nn.Linear(hidden_output * 2, hidden_output * 2)
        # A single parameter-free Tanh is reused after l2 and after l1.
        self.activation = nn.Tanh()
        self.l0 = nn.Linear(hidden_output * 2, classes)
        # Not applied in forward(); raw scores are returned for CrossEntropyLoss.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return unnormalised class scores with ``classes`` columns."""
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        mpool, _ = torch.max(outputs.hidden_states[-1], 1)
        # All three features are required: l2 is sized for hidden_output * 3
        # inputs, so leaving pooler_output out raises a shape error.
        out = torch.cat(
            (outputs.hidden_states[-2][:, 0, :], mpool, outputs.pooler_output),
            dim=-1,
        )
        out = self.drop_out(out)
        out = self.activation(self.l2(out))
        out = self.activation(self.l1(out))
        logits = self.l0(out)
        return logits


In [15]:
#model for finetuning collected data
class BERTBengaliLastTwoPoolerFreezePrev(nn.Module):
    """Two-layer MLP head (l1 -> Tanh -> l0) on three encoder features.

    The features, concatenated along the last dimension, are: position 0
    (along dim 1) of the second-to-last hidden state, the max over dim 1 of
    the last hidden state, and the encoder's ``pooler_output``. The encoder
    must return ``hidden_states``.
    """

    def __init__(self, bert):
        """Store the encoder ``bert`` and build the dropout and MLP layers."""
        super(BERTBengaliLastTwoPoolerFreezePrev, self).__init__()
        self.bert = bert
        self.drop_out = nn.Dropout(first_dropout_rate)
        # forward() concatenates three hidden_output-wide vectors, so l1 must
        # accept hidden_output * 3 inputs (a 2x input width raises a shape error).
        # NOTE(review): assumes the three-feature forward() is the intended design — confirm.
        self.l1 = nn.Linear(hidden_output * 3, hidden_output * 2)
        self.activation = nn.Tanh()
        self.l0 = nn.Linear(hidden_output * 2, classes)
        # Not applied in forward(); raw scores are returned for CrossEntropyLoss.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """Return unnormalised class scores with ``classes`` columns."""
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        mpool, _ = torch.max(outputs.hidden_states[-1], 1)
        out = torch.cat(
            (outputs.hidden_states[-2][:, 0, :], mpool, outputs.pooler_output),
            dim=-1,
        )
        out = self.drop_out(out)
        out = self.activation(self.l1(out))
        logits = self.l0(out)
        return logits


In [16]:
# Pretrained encoder, configured to return every layer's hidden states
# (the classifier heads read hidden_states[-1] and hidden_states[-2]).
bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
# Tokenizer matching the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Wrap the encoder in the classification head and move the whole model to `device`
# (Module.to returns the module itself, so `model` is the moved instance).
model = BERTBengaliLastTwoPooler(bert).to(device)
# model2Forlastlayers.to(device)
# model2Forlastlayers.load_state_dict(torch.load(DirPath+'Models by Sami/'+bert_model_name+"_modeltest.pth"))

# model.l0 = model2Forlastlayers.l0
# model.l2 = model2Forlastlayers.l1
# model.bert = model2Forlastlayers.bert

# model.load_state_dict(torch.load(DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_HScollected_lastfrozen_acc1_sub.pth"))

# for params in model.bert.parameters():
#   params.requires_grad = False
# for params in model.bert.embeddings.parameters():
#   params.requires_grad = True
# for params in model.bert.encoder.parameters():
#   params.requires_grad = False
# # for params in model.l2.parameters():
# #   params.requires_grad = True
# # for params in model.l1.parameters():
# #   params.requires_grad = True
# for params in model.l0.parameters():
#   params.requires_grad = True

# Report, parameter by parameter, whether the optimizer will be allowed to update it.
for name, param in model.named_parameters():
  status = "trainable" if param.requires_grad else "non-trainable"
  print(f"name: {name} is {status}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

name: bert.embeddings.word_embeddings.weight is trainable
name: bert.embeddings.position_embeddings.weight is trainable
name: bert.embeddings.token_type_embeddings.weight is trainable
name: bert.embeddings.LayerNorm.weight is trainable
name: bert.embeddings.LayerNorm.bias is trainable
name: bert.encoder.layer.0.attention.self.query.weight is trainable
name: bert.encoder.layer.0.attention.self.query.bias is trainable
name: bert.encoder.layer.0.attention.self.key.weight is trainable
name: bert.encoder.layer.0.attention.self.key.bias is trainable
name: bert.encoder.layer.0.attention.self.value.weight is trainable
name: bert.encoder.layer.0.attention.self.value.bias is trainable
name: bert.encoder.layer.0.attention.output.dense.weight is trainable
name: bert.encoder.layer.0.attention.output.dense.bias is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.weight is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.bias is trainable
name: bert.encoder.layer.0.inter

In [17]:
#testing if the input of model works before starting training
s = "আমি বাংলায় গান গাই। [SEP]"

# debugging dataloader
# i = 0
# for batch in train_dataloader:
#   text, labels = batch
#   for j in range(len(text)):
#     print(i+1)
#     i+=1

# s=headlineContentSeparator
# print(s)
# Tokenize the sample sentence, move the encoding to the model's device and show it.
t = tokenizer.encode_plus(s, return_tensors="pt")
t = t.to(device)
print(t)

# One forward pass to confirm the model accepts the tokenizer's output keys.
out = model(**t)
print(out)

{'input_ids': tensor([[  101,   938, 37376,   100,   950, 18770,   950, 40102,   920,   102,
           102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}
tensor([[ 0.2539, -0.3514]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [18]:
# Show one training example (positional row 94): headline, then article body.
sample_row = df_train.iloc[94]
print(sample_row['Headline'], sample_row['Content'])

গ্যাসের দাম বাড়ানোর সিদ্ধান্ত থেকে সরেছে সরকার হাসান মাহামুদ : শিল্পকারখানা, বাণিজ্যিক প্রতিষ্ঠান ও যানবাহনের জন্য গ্যাসের দাম বাড়ানোর ঘোষণা আসার কথা ছিল চলতি সপ্তাহেই। সরকারের সংশ্লিষ্ট সংস্থাগুলোর পক্ষ থেকে গ্যাসের দাম বৃদ্ধির পরিষ্কার আভাসও ছিল। কিন্তু জাতীয় সংসদ নির্বাচনকে সামনে রেখে এ সিদ্ধান্ত থেকে সরে এসেছে সরকার। বাংলাদেশ এনার্জি রেগুলেটরি কমিশনের (বিইআরসি) একটি বিশ্বস্ত সূত্র জানিয়েছে, নির্বাচনকে সামনে রেখে গ্যাসের মূল্য বৃদ্ধির ব্যাপারে অনেক কিছু ভেবে দেখেছে কমিশন। আপাতত গ্যাসের দাম বাড়ানোর সিদ্ধান্ত কার্যকর করছে না সরকার। জানা গেছে, এর আগে উচ্চমূল্যে তরলীকৃত প্রাকৃতিক গ্যাস (এলএনজি) আমদানির ওপর সম্পূরক শুল্ক (এসডি) মওকুফের জন্য সরকারের সিদ্ধান্তের অপেক্ষায় থাকা বিইআরসি সম্পূরক শুল্ক ব্যয় মেটাতেই গ্যাসের মূল্য বৃদ্ধির পরিকল্পনা করে। কিন্তু প্রধানমন্ত্রী শেখ হাসিনার নির্দেশে জ্বালানি সংক্রান্ত সর্বোচ্চ নীতিনির্ধারণী সংস্থা গ্যাসের মূল্য বৃদ্ধির বিষয়টি আপাতত স্থগিত রেখেছে। জানা গেছে, সম্পূরক শুল্ক মওকুফের পর গ্যাসের মূল্যের সাথে রেগুলেটরি কমিশন এখন সমন্বয়ের কাজ করছে। ফলে গ্যাস

In [19]:
from torch.optim.lr_scheduler import StepLR

# Hand the optimizer only the parameters that still require gradients.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = AdamW(trainable_params, lr=adam_opt_lr)

# Classification loss; it expects raw (unnormalised) scores from the model.
criterion = nn.CrossEntropyLoss()

# Multiply the learning rate by `scheduler_gamma` every `scheduler_step` scheduler steps.
scheduler = StepLR(optimizer, gamma=scheduler_gamma, step_size=scheduler_step)

In [20]:
def train(model, dataloader, optimizer, criterion, config):
    """Run one training pass over ``dataloader``.

    Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader: Iterable of ``(texts, labels)`` batches.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(model_output, labels)``.
        config: Keyword arguments forwarded to ``tokenizer.batch_encode_plus``.

    Returns:
        Sum over batches of ``loss * batch_size`` (divide by the dataset size
        to get the mean per-sample loss).
    """
    model.train()
    running_loss = 0

    for text, labels in tqdm(dataloader):
        model.zero_grad()

        encoded = tokenizer.batch_encode_plus(text, **config)
        model_inputs = {
            key: encoded[key].to(device)
            for key in ('input_ids', 'token_type_ids', 'attention_mask')
        }
        # The loss needs integer class indices.
        labels = labels.to(device, dtype=torch.long)

        scores = model(**model_inputs)
        loss = criterion(scores, labels)
        running_loss += loss.item() * model_inputs['input_ids'].size(0)

        loss.backward()
        # Cap the global gradient norm at 1.0 to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return running_loss

In [21]:
def evaluate(model, dataloader, criterion, config):
    """Evaluate ``model`` on ``dataloader`` and collect confusion counts.

    Runs in eval mode under ``torch.no_grad()`` so no autograd graph is built
    for the forward passes. Uses the module-level ``tokenizer`` and ``device``.

    Args:
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments; returns one score per class.
        dataloader: Iterable of ``(texts, labels)`` batches.
        criterion: Loss called as ``criterion(model_output, labels)``.
        config: Keyword arguments forwarded to ``tokenizer.batch_encode_plus``.

    Returns:
        A 15-tuple ``(total, correct, valid_loss, <TP, TN, FP, FN for label 0>,
        <same for label 1>, <same for label 2>)``. ``valid_loss`` is the sum of
        ``loss * batch_size`` over batches. Counts are one-vs-rest per label.
    """
    total = 0
    correct = 0
    valid_loss = 0.0
    # tallies[c] = [TP, TN, FP, FN] for class c, treating c as the positive class.
    tallies = {cls: [0, 0, 0, 0] for cls in (0, 1, 2)}

    model.eval()  # prep model for evaluation
    with torch.no_grad():  # evaluation never back-propagates
        for text, labels in dataloader:
            inputs = tokenizer.batch_encode_plus(
                text, **config
            )
            input_ids = inputs['input_ids'].to(device)
            token_type_ids = inputs['token_type_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            labels = labels.to(device, dtype=torch.long)

            output = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

            # Weight the batch loss by batch size so a partial last batch counts correctly.
            loss_p = criterion(output, labels)
            valid_loss += loss_p.item() * input_ids.size(0)

            # Predicted class = index of the highest score in each row.
            _, predicted = torch.max(output.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            for cls, tally in tallies.items():
                predicted_cls = predicted == cls
                actual_cls = labels == cls
                tally[0] += (predicted_cls & actual_cls).sum().item()    # TP
                tally[1] += (~predicted_cls & ~actual_cls).sum().item()  # TN
                tally[2] += (predicted_cls & ~actual_cls).sum().item()   # FP
                tally[3] += (~predicted_cls & actual_cls).sum().item()   # FN

    return (total, correct, valid_loss, *tallies[0], *tallies[1], *tallies[2])


In [22]:

# Keyword arguments passed to tokenizer.batch_encode_plus by train(), evaluate()
# and the inference cells: pad/truncate every text to `max_number_input_tokens`
# tokens and return PyTorch tensors.
tokenizer_config = dict(
    max_length=max_number_input_tokens,
    padding="max_length",
    return_tensors="pt",
    truncation=True,
    add_special_tokens=True,
    truncation_strategy="longest_first",
)

In [None]:
# Per-epoch loss history (stored as mean loss * 100) for the plot cell below.
train_loss_data, valid_loss_data = [], []
# np.inf, not np.Inf: the capitalised alias was removed in NumPy 2.0.
valid_loss_min = np.inf
since = time.time()
best_loss = np.inf
best_acc=0
# Small epsilon that keeps the per-class metric denominators non-zero.
sml = 1e-10
# Best macro F1 (labels 0 and 1) seen so far; a new best triggers a checkpoint save.
best_f1=0.0


def _precision_recall_f1(tp, fp, fn, eps=sml):
    """Return (precision, recall, F1) from counts, with `eps` added to each denominator."""
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * (precision * recall) / (precision + recall + eps)
    return precision, recall, f1


def _safe_ratio(numerator, denominator):
    """Plain division that yields 0.0 instead of raising when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    e_since = time.time()

    # One pass over the training data, then a full pass over the validation data.
    train_loss = 0.0 + train(model, train_dataloader, optimizer, criterion, tokenizer_config)
    out = evaluate(model, val_dataloader, criterion, tokenizer_config)
    total, correct, valid_loss = out[0], out[1], 0.0 + out[2]
    label_0_TP, label_0_TN, label_0_FP, label_0_FN = out[3:7]
    label_1_TP, label_1_TN, label_1_FP, label_1_FN = out[7:11]
    # The label-2 counts from evaluate() are deliberately not read (two-class task);
    # they stay at zero so the report below still prints a "Label 2" row.
    label_2_TP = label_2_TN = label_2_FP = label_2_FN = 0

    # Precision, recall and F1 for each class.
    label_0_precision, label_0_recall, label_0_f1_score = _precision_recall_f1(label_0_TP, label_0_FP, label_0_FN)
    label_1_precision, label_1_recall, label_1_f1_score = _precision_recall_f1(label_1_TP, label_1_FP, label_1_FN)
    label_2_precision, label_2_recall, label_2_f1_score = _precision_recall_f1(label_2_TP, label_2_FP, label_2_FN)

    # Macro F1 over the two real classes; this is the model-selection metric.
    combined_f1_score = (label_0_f1_score + label_1_f1_score) / 2

    # Micro-averaged counts over labels 0 and 1.
    micro_TP = label_0_TP + label_1_TP
    micro_TN = label_0_TN + label_1_TN
    micro_FP = label_0_FP + label_1_FP
    micro_FN = label_0_FN + label_1_FN

    # Micro precision, recall and F1, guarded against a zero denominator.
    micro_precision = _safe_ratio(micro_TP, micro_TP + micro_FP)
    micro_recall = _safe_ratio(micro_TP, micro_TP + micro_FN)
    micro_f1 = _safe_ratio(2 * (micro_precision * micro_recall), micro_precision + micro_recall)

    # Decay the learning rate once per epoch.
    scheduler.step()

    # train()/evaluate() return summed losses; convert to per-sample means.
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(val_dataloader.dataset)

    val_acc=correct / total * 100

    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    # Keep only the checkpoint with the best macro F1 so far; a later cell reloads it from ModelPath.
    if combined_f1_score > best_f1:
        best_f1 = combined_f1_score
        torch.save(model.state_dict(), ModelPath)
        print(f'saved on epoch: {epoch+1}')

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tVal Accuracy: {:.4f}".format(correct / total * 100))
    print("\tLabel 0 Precision: {:.4f}\tLabel 0 Recall: {:.4f}\tLabel 0 F1-score: {:.4f}\n"
      "\tLabel 1 Precision: {:.4f}\tLabel 1 Recall: {:.4f}\tLabel 1 F1-score: {:.4f}\n"
      "\tLabel 2 Precision: {:.4f}\tLabel 2 Recall: {:.4f}\tLabel 2 F1-score: {:.4f}\n"
      "\tCombined F1-score: {:.4f}".format(label_0_precision, label_0_recall, label_0_f1_score,
                                            label_1_precision, label_1_recall, label_1_f1_score,
                                            label_2_precision, label_2_recall, label_2_f1_score,
                                            combined_f1_score))
    print(f'micro precision: {micro_precision}, Micro recall: {micro_recall}, micro f1: {micro_f1}')

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/8


  0%|          | 0/5389 [00:00<?, ?it/s]

saved on epoch: 1
	Train loss:0.333732.. 	Valid Loss:0.344460.. 	Val Accuracy: 91.4150
	Label 0 Precision: 0.9806	Label 0 Recall: 0.6021	Label 0 F1-score: 0.7461
	Label 1 Precision: 0.9043	Label 1 Recall: 0.9969	Label 1 F1-score: 0.9483
	Label 2 Precision: 0.0000	Label 2 Recall: 0.0000	Label 2 F1-score: 0.0000
	Combined F1-score: 0.8472
micro precision: 0.9141496156760853, Micro recall: 0.9141496156760853, micro f1: 0.9141496156760852
Epoch: 2/8


  0%|          | 0/5389 [00:00<?, ?it/s]

saved on epoch: 2
	Train loss:0.282377.. 	Valid Loss:0.323623.. 	Val Accuracy: 92.4326
	Label 0 Precision: 0.9440	Label 0 Recall: 0.6791	Label 0 F1-score: 0.7899
	Label 1 Precision: 0.9208	Label 1 Recall: 0.9893	Label 1 F1-score: 0.9539
	Label 2 Precision: 0.0000	Label 2 Recall: 0.0000	Label 2 F1-score: 0.0000
	Combined F1-score: 0.8719
micro precision: 0.9243260798960702, Micro recall: 0.9243260798960702, micro f1: 0.9243260798960701
Epoch: 3/8


  0%|          | 0/5389 [00:00<?, ?it/s]

In [None]:
# Ask PyTorch's CUDA caching allocator to release the cached memory it is not currently using.
torch.cuda.empty_cache()

In [None]:
# torch.save(model.state_dict(), DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_finalhs_midnonfrozen_acc1_sub_finaluntested.pth")

In [None]:
from matplotlib import pyplot as plt

# Explicit figure/axes instead of the pyplot state machine, with labels so the
# figure can be read on its own. The histories hold mean loss * 100 per epoch.
fig, ax = plt.subplots()
ax.plot(train_loss_data, label="Training loss")
ax.plot(valid_loss_data, label="validation loss")
ax.set_xlabel("Epoch (0-based)")
ax.set_ylabel("Mean loss x 100")
ax.set_title("Training vs. validation loss")
ax.legend(frameon=False);

# Testing on test dataset

In [23]:
# Reload the best checkpoint written by the training loop. `map_location` remaps
# the saved tensors onto the current device, so a checkpoint saved on a GPU
# runtime also loads on a CPU-only runtime.
model.load_state_dict(torch.load(ModelPath, map_location=device))

<All keys matched successfully>

In [24]:
# Predicted and true labels for every test row, in file order.
all_preds = []
all_labels = []


df_test = pd.read_csv(TestPath)
test_data = NewsDatasets(df_test)
# shuffle=False keeps predictions aligned with the rows of df_test.
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=False)

# Inference only: switch to eval mode (this cell may run without the training
# cell having called evaluate()) and skip building autograd graphs.
model.eval()
with torch.no_grad():
    for text, labels in test_dataloader:
        inputs = tokenizer.batch_encode_plus(
            text, **tokenizer_config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the highest score in each row.
        preds = output.detach().cpu().numpy()
        preds = np.argmax(preds, axis = 1)
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# df_test['real_HS'] = all_labels
# df_test['predicted_HS'] = all_preds
# df_test.to_csv(DirPath+'nc/'+'test_HS_pred.csv')

In [25]:
# Number of predictions collected by the inference cell above.
print(len(all_preds))

9238


In [26]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the collected predictions, four decimals.
test_report = classification_report(all_labels, all_preds, digits=4)
print(test_report)

              precision    recall  f1-score   support

           0     0.9333    0.6937    0.7959      1936
           1     0.9240    0.9869    0.9544      7302

    accuracy                         0.9254      9238
   macro avg     0.9286    0.8403    0.8751      9238
weighted avg     0.9259    0.9254    0.9212      9238



In [None]:
# Predicted and true labels for every row of the final test file, in file order.
all_preds = []
all_labels = []


df_test = pd.read_csv(DirPath+'Dataset/final_test.xlsx - test.csv')
# Dedicated names: reusing `training_data` / `train_dataloader` here would make
# a later re-run of the training cell train on this file.
final_test_data = NewsDatasets(df_test)
# shuffle must stay False: the predictions are written back into df_test as
# columns below, so they have to come out in the same order as its rows.
final_test_dataloader = DataLoader(final_test_data, batch_size=batch_size_training, shuffle=False)

# Inference only: eval mode and no autograd graphs.
model.eval()
with torch.no_grad():
    for text, labels in final_test_dataloader:
        inputs = tokenizer.batch_encode_plus(
            text, **tokenizer_config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device)

        output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the highest score in each row.
        preds = output.detach().cpu().numpy()
        preds = np.argmax(preds, axis = 1)
        all_preds.extend(preds)
        all_labels.extend(labels.cpu().numpy())

# Store true and predicted labels next to the inputs and save the result to Drive.
df_test['real_HS'] = all_labels
df_test['predicted_HS'] = all_preds
df_test.to_csv(DirPath+'nc/'+'val_HS_pred.csv')

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 for the predictions collected by the cell above.
final_report = classification_report(all_labels, all_preds)
print(final_report)

<h1>Training the model on the full collected dataset with the selected model and hyperparameters (code not yet updated)</h1>

In [None]:
!pip install --quiet transformers

In [None]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

In [None]:
# Columns kept from each CSV: the input text and its label.
selected_columns = ['sentence','hate speech']
df_train = pd.read_csv('train.csv')[selected_columns]
df_val = pd.read_csv('val.csv')[selected_columns]
df_test = pd.read_csv('test.csv')[selected_columns]

# Fold validation and test rows into the training frame (this section trains on everything).
df_train = pd.concat([df_train, df_val, df_test], ignore_index=True)

# Row/column counts: combined training frame first, then the two original splits.
for frame in (df_train, df_val, df_test):
    print(frame.shape)

# Preview and numeric summary of the combined training frame.
print(df_train)
print(df_train.describe())

In [None]:
# Hyperparameters carried over from the earlier experiments.
# Maximum token length used as the default `max_length` of NewsDatasets below.
max_number_input_tokens=256
# Batch size shared by all DataLoaders in this section.
batch_size_training = 16
# Dropout probability applied to the pooled features before the classifier head.
first_dropout_rate = 0.3
# Base width used to size the Linear layer of the head (multiplied by 3 there).
hidden_output = 768
# Checkpoint name handed to `from_pretrained` for both encoder and tokenizer.
bert_model_name = "sagorsarker/bangla-bert-base"
# Learning rate for AdamW.
adam_opt_lr = 3e-5
# StepLR settings: multiply the learning rate by `scheduler_gamma` every `scheduler_step` scheduler steps.
scheduler_step = 1
scheduler_gamma = 0.8
# NOTE(review): no loop over `epochs` is visible in this section — presumably used further down.
epochs = 6
# Number of output classes of the classifier head.
classes = 2

In [None]:
from google.colab import drive

# Mount Google Drive and point DirPath at the project folder used by this section.
drive.mount('/content/drive')
DirPath = '/content/drive/My Drive/Test/'

In [None]:
class NewsDatasets(Dataset):
    """Map-style dataset yielding ``(sentence, label)`` pairs from a DataFrame.

    The text comes from the ``sentence`` column and the label from the
    ``hate speech`` column.
    """

    def __init__(self, data, max_length=max_number_input_tokens):
        """Store the frame and a tokenizer keyword-argument dict.

        Args:
            data: DataFrame with ``sentence`` and ``hate speech`` columns.
            max_length: Value stored under ``"max_length"`` in ``self.config``.
        """
        self.data = data
        # Kept on the instance only; no method of this class reads it.
        self.config = dict(
            max_length=max_length,
            padding="max_length",
            return_tensors="pt",
            truncation=True,
            add_special_tokens=True,
            truncation_strategy="longest_first",
        )

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(sentence, label)`` for positional row ``idx``."""
        row = self.data.iloc[idx]
        return row['sentence'], row['hate speech']

In [None]:
# Wrap each frame in the dataset class (train, validation, test order).
training_data, val_data, test_data = (
    NewsDatasets(frame) for frame in (df_train, df_val, df_test)
)

# All three loaders in this section shuffle and share the same batch size.
train_dataloader = DataLoader(training_data, shuffle=True, batch_size=batch_size_training)
val_dataloader = DataLoader(val_data, shuffle=True, batch_size=batch_size_training)
test_dataloader = DataLoader(test_data, shuffle=True, batch_size=batch_size_training)

In [None]:
class CustomBERTBengali(nn.Module):
    """Linear classifier on mean-pooled, max-pooled and pooler encoder features.

    ``forward`` returns raw, unnormalised scores. This section trains with
    ``nn.CrossEntropyLoss``, which applies log-softmax itself, so applying
    ``Softmax`` here as well would squash the scores twice and weaken the
    gradients. Call ``self.softmax`` on the output when probabilities are needed.
    The encoder must return ``hidden_states``.
    """

    def __init__(self, bert):
        """Store the encoder ``bert`` and build dropout plus the linear head."""
        super(CustomBERTBengali, self).__init__()
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.tanh = nn.Tanh()
        # Three pooled vectors are concatenated, hence hidden_output * 3 inputs.
        self.out = nn.Linear(hidden_output * 3, classes)
        # Not applied in forward(); available for turning scores into probabilities.
        self.softmax = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return unnormalised class scores with ``classes`` columns."""
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        last_hidden = outputs.hidden_states[-1]
        pooled_output = outputs.pooler_output
        # Pool over dim 1; padded positions are included in both reductions.
        apool = torch.mean(last_hidden, 1)
        mpool, _ = torch.max(last_hidden, 1)
        features = torch.cat((apool, mpool, pooled_output), 1)
        features = self.bert_drop(features)
        logits = self.out(features)
        return logits

In [None]:
# Pretrained encoder, configured to return every layer's hidden states
# (the classifier head reads hidden_states[-1]), and the tokenizer for the same checkpoint.
bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Wrap the encoder in the classification head and move the whole model to `device`.
model = CustomBERTBengali(bert)
model.to(device)

In [None]:
from torch.optim.lr_scheduler import StepLR

# Classification loss used by both train() and evaluate().
criterion = nn.CrossEntropyLoss()

# AdamW over every parameter of the model (encoder and classifier head).
optimizer = AdamW(model.parameters(), lr=adam_opt_lr)

# Multiplies the learning rate by `scheduler_gamma` once every
# `scheduler_step` calls to scheduler.step().
scheduler = StepLR(
    optimizer,
    step_size=scheduler_step,
    gamma=scheduler_gamma,
)

In [None]:
def train(model, dataloader, optimizer, criterion, config):
    """Run one optimisation pass over ``dataloader`` and return the summed loss.

    Every batch is a ``(text, labels)`` pair. The texts are tokenised with the
    notebook-level ``tokenizer`` using ``config`` as keyword arguments, and
    all tensors are moved to the notebook-level ``device``.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        dataloader: iterable yielding ``(text, labels)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss callable taking ``(model_output, labels)``.
        config: keyword arguments for ``tokenizer.batch_encode_plus``.

    Returns:
        The sum over batches of ``loss.item() * batch_size``. Divide by the
        dataset size to obtain the average per-sample loss.
    """
    model.train()  # switch layers such as dropout to training behaviour
    running_loss = 0

    for texts, targets in tqdm(dataloader):
        # Clear gradients left over from the previous step.
        model.zero_grad()

        encoded = tokenizer.batch_encode_plus(texts, **config)
        ids = encoded['input_ids'].to(device)
        segment_ids = encoded['token_type_ids'].to(device)
        mask = encoded['attention_mask'].to(device)
        # The loss is computed against integer class targets.
        targets = targets.to(device, dtype=torch.long)

        predictions = model(
            input_ids=ids,
            attention_mask=mask,
            token_type_ids=segment_ids,
        )

        batch_loss = criterion(predictions, targets)
        # Weight by batch size so a smaller final batch is accounted for.
        running_loss += batch_loss.item() * ids.size(0)
        batch_loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return running_loss

In [None]:
def evaluate(model, dataloader, criterion, config):
    """Score ``model`` on ``dataloader`` without updating any weights.

    Every batch is a ``(text, labels)`` pair. The texts are tokenised with the
    notebook-level ``tokenizer`` using ``config`` as keyword arguments, and
    all tensors are moved to the notebook-level ``device``.

    Args:
        model: module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments; its output is expected to
            hold one score per class along dim 1.
        dataloader: iterable yielding ``(text, labels)`` batches.
        criterion: loss callable taking ``(model_output, labels)``.
        config: keyword arguments for ``tokenizer.batch_encode_plus``.

    Returns:
        Tuple ``(total, correct, valid_loss)``: number of samples seen, number
        of samples whose argmax prediction equals the label, and the sum over
        batches of ``loss.item() * batch_size``.
    """
    total = 0
    correct = 0
    valid_loss = 0.0

    model.eval()  # switch layers such as dropout to inference behaviour
    # No gradients are needed here; disabling autograd avoids building a graph
    # for every batch, which saves memory and time.
    with torch.no_grad():
        for batch in dataloader:
            text, labels = batch
            inputs = tokenizer.batch_encode_plus(text, **config)
            input_ids = inputs['input_ids'].to(device)
            token_type_ids = inputs['token_type_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)
            # Same integer cast as in train(), so both code paths hand the
            # criterion class-index targets of the same dtype.
            labels = labels.to(device, dtype=torch.long)

            output = model(
                token_type_ids=token_type_ids,
                input_ids=input_ids,
                attention_mask=attention_mask,
            )

            loss_p = criterion(output, labels)
            # Weight by batch size so a smaller final batch is accounted for.
            valid_loss += loss_p.item() * input_ids.size(0)

            # Predicted class = index of the largest score along dim 1.
            _, predicted = torch.max(output, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return total, correct, valid_loss

In [None]:
# Keyword arguments that train() and evaluate() forward unchanged to
# ``tokenizer.batch_encode_plus``: every text is truncated/padded to
# ``max_number_input_tokens`` and returned as PyTorch tensors.
tokenizer_config = dict(
    max_length=max_number_input_tokens,
    padding="max_length",
    return_tensors="pt",
    truncation=True,
    add_special_tokens=True,
    truncation_strategy="longest_first",
)

In [None]:
# Per-epoch histories of the average train / validation loss (scaled by 100).
train_loss_data, valid_loss_data = [], []
# `np.Inf` was removed in NumPy 2.0; `np.inf` works on every NumPy version.
# `valid_loss_min` is not read in this cell; it is kept in case later cells use it.
valid_loss_min = np.inf
since = time.time()
# Lowest average validation loss seen so far; drives checkpointing below.
best_loss = np.inf

for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    e_since = time.time()

    # One optimisation pass over the training set, then score the validation set.
    train_loss = train(model, train_dataloader, optimizer, criterion, tokenizer_config)
    total, correct, valid_loss = evaluate(model, val_dataloader, criterion, tokenizer_config)

    # Decay the learning rate once per epoch.
    scheduler.step()

    # train()/evaluate() return losses summed over samples; convert to averages.
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(val_dataloader.dataset)

    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    # Only overwrite the checkpoint when the validation loss improved. The
    # previous `if True:` saved after every epoch, so the file always held the
    # weights of the LAST epoch, even if an earlier epoch had validated better.
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model.state_dict(), DirPath+bert_model_name+"_CustomBertBengaliFullDataset6epoch885044valacc.pth")
        print(f'epoch: {epoch+1}')

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tVal Accuracy: {:.4f}".format(correct / total * 100))

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))