<h1>loading train, validation and test data<h1>

In [1]:
!pip install --quiet transformers

In [2]:
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, BertForSequenceClassification, AutoModelForMaskedLM, AutoTokenizer

In [4]:
###Hyperparameter for the new model
#defining some hyperparameters
max_number_input_tokens=512
batch_size_training = 8
first_dropout_rate = 0.0
hidden_output = 768
bert_model_name = "csebuetnlp/banglabert"
adam_opt_lr = 3e-5
scheduler_step = 1
scheduler_gamma = 0.98
epochs = 8
classes = 4
model_layer = ''
name_change=''
headlineContentSeparator = ' \\\\ '

#other options
isSaveModel = False

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
DirPath = ('/kaggle/input/mis-dis-satire/')
TestPath = DirPath+'test_data.csv'
ValPath = DirPath+'val_data.csv'
TrainPath = DirPath+'train_data.csv'
ModelPath = DirPath+'Models/'+'FNBaseline_buetbertrHritThesis.pth'

In [6]:
#df loading
df_train = pd.read_csv(TrainPath) # [['sentence','hate speech']]
df_val = pd.read_csv(ValPath)#[['sentence','hate speech']]
df_test = pd.read_csv(TestPath)#[['sentence','hate speech']]

#concatenating all the data
# df_train = pd.concat([df_train, df_val, df_test], ignore_index=True)

print(df_train.shape)
print(df_val.shape)
print(df_test.shape)
display(df_train)
print(df_train.describe())
print(df_train['Label'].value_counts())
print(df_val['Label'].value_counts())
print(df_test['Label'].value_counts())

(43105, 3)
(9237, 3)
(9238, 3)


Unnamed: 0,Headline,Content,Label
0,বরিশাল কারাগারে হাজতির মৃত্যু,বরিশাল কেন্দ্রীয় কারাগারে খলিলুর রহমান নান্টু ...,3.0
1,সারা দেশে গণহারে গ্রেপ্তারের ঘটনা উদ্বেগের কারণ,সারা দেশে দলীয় নেতা–কর্মীদের গ্রেপ্তার প্রসঙ্গ...,3.0
2,মসজিদে নামাজরত অবস্থায় শাবি শিক্ষকের মৃত্যু,শাহজালাল বিজ্ঞান ও প্রযুক্তি বিশ্ববিদ্যালয়ের ম...,3.0
3,বাগেরহাটে আওয়ামী লীগের দু’গ্রুপে সংঘর্ষে নিহত ২,বাগেরহাটের মোরেলগঞ্জে আওয়ামী লীগের দুই গ্রুপের...,3.0
4,'ছবির ক্যাপশন অতিরঞ্জিত করা হয়েছে','এটি কনভোকেশনের সেরা ও শ্রেষ্ঠ ছবি। নিশ্চিত সে...,3.0
...,...,...,...
43100,খালেদার স্বাস্থ্য পরীক্ষার জন্য বিএসএমএমইউ গেল...,বেগম জিয়ার স্বাস্থ্য পরীক্ষার জন্য দ্বিতীয় দিন...,3.0
43101,পিভিসি পাইপে ইয়াবা,সিরাজগঞ্জে কুরিয়ার সার্ভিসের মাধ্যমে আসা পিভিস...,3.0
43102,ইয়াসের ক্ষতিপূরণ চেয়ে ভুরি ভুরি ‘ভুয়ো আবেদনপত্...,এ যেন সেই আমফানেরই পুনরাবৃত্তি। আমফানের পর ক্ষ...,2.0
43103,উত্তর কোরিয়াকে জ্বালানি দেয়ার মার্কিন অভিযোগ প...,নিষেধাজ্ঞা লঙ্ঘন করে উত্তর কোরিয়াকে জ্বালানি ত...,3.0


              Label
count  43103.000000
mean       2.663295
std        0.746240
min        0.000000
25%        3.000000
50%        3.000000
75%        3.000000
max        3.000000
Label
3.0    34092
2.0     5195
1.0     2130
0.0     1686
Name: count, dtype: int64
Label
3.0    7273
2.0    1115
1.0     489
0.0     360
Name: count, dtype: int64
Label
3.0    7313
2.0    1112
1.0     462
0.0     351
Name: count, dtype: int64


<h1>preparing training, validation and test data. Preparing model. Training model<h1>

In [7]:
class NewsDatasets(Dataset):
    def __init__(self, data, max_length=max_number_input_tokens):
        self.data = data

        self.config = {
            "max_length": max_length,
            "padding": "max_length",
            "return_tensors": "pt",
            "truncation": True,
            "add_special_tokens": True,
            "truncation_strategy":"longest_first"
        }

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        value = self.data.iloc[idx]
        # print(value["Headline"],value['Content']) #debugging dataloader
        return value['Headline']+headlineContentSeparator+value['Content'] , value['Label']

In [8]:
training_data = NewsDatasets(df_train)
train_dataloader = DataLoader(training_data, batch_size=batch_size_training, shuffle=True)

val_data = NewsDatasets(df_val)
val_dataloader = DataLoader(val_data, batch_size=batch_size_training, shuffle=False)

test_data = NewsDatasets(df_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=False)

In [9]:
class BERTBengali(nn.Module):
    def __init__(self, bert):
        super(BERTBengali, self).__init__()
        #self.bert = BertForMaskedLM.from_pretrained("sagorsarker/bangla-bert-base")
        self.bert = bert
        self.bert_drop = nn.Dropout(first_dropout_rate)
        self.out = nn.Linear(hidden_output, classes)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )
        bo = self.bert_drop(output[1])

        output = self.out(bo)
        return output

In [11]:
bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
model = BERTBengali(bert)
# model2Forlastlayers = CustomBERTBengali(bert)

model.to(device)
# model2Forlastlayers.to(device)
# model2Forlastlayers.load_state_dict(torch.load(DirPath+'Models by Sami/'+bert_model_name+"_modeltest.pth"))

# model.l0 = model2Forlastlayers.l0
# model.l2 = model2Forlastlayers.l1
# model.bert = model2Forlastlayers.bert

# model.load_state_dict(torch.load(DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_HScollected_lastfrozen_acc1_sub.pth"))

# for params in model.bert.parameters():
#   params.requires_grad = False
# for params in model.bert.embeddings.parameters():
#   params.requires_grad = True
# for params in model.bert.encoder.parameters():
#   params.requires_grad = False
# # for params in model.l2.parameters():
# #   params.requires_grad = True
# # for params in model.l1.parameters():
# #   params.requires_grad = True
# for params in model.l0.parameters():
#   params.requires_grad = True

for name, param in model.named_parameters():
  if param.requires_grad:
      print(f"name: {name} is trainable")
  else:
      print(f"name: {name} is non-trainable")

You are using a model of type electra to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertModel were not initialized from the model checkpoint at csebuetnlp/banglabert and are newly initialized: ['embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.layer.0.intermediate.dense.bias', 'encoder.layer.0.intermedi

name: bert.embeddings.word_embeddings.weight is trainable
name: bert.embeddings.position_embeddings.weight is trainable
name: bert.embeddings.token_type_embeddings.weight is trainable
name: bert.embeddings.LayerNorm.weight is trainable
name: bert.embeddings.LayerNorm.bias is trainable
name: bert.encoder.layer.0.attention.self.query.weight is trainable
name: bert.encoder.layer.0.attention.self.query.bias is trainable
name: bert.encoder.layer.0.attention.self.key.weight is trainable
name: bert.encoder.layer.0.attention.self.key.bias is trainable
name: bert.encoder.layer.0.attention.self.value.weight is trainable
name: bert.encoder.layer.0.attention.self.value.bias is trainable
name: bert.encoder.layer.0.attention.output.dense.weight is trainable
name: bert.encoder.layer.0.attention.output.dense.bias is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.weight is trainable
name: bert.encoder.layer.0.attention.output.LayerNorm.bias is trainable
name: bert.encoder.layer.0.inter

In [12]:
#testing if the input of model works before starting training
s = "আমি বাংলায় গান গাই। [SEP]"

# debugging dataloader
# i = 0
# for batch in train_dataloader:
#   text, labels = batch
#   for j in range(len(text)):
#     print(i+1)
#     i+=1

# s=headlineContentSeparator
# print(s)
t = tokenizer.encode_plus(s, return_tensors="pt").to(device)
print(t)
out = model(**t)
print(out)

{'input_ids': tensor([[   2,  857,    1, 1755, 3893,  205,    3,    3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[-0.2408, -0.2054,  0.0465,  0.1316]], grad_fn=<AddmmBackward0>)


In [14]:
print(df_train.iloc[94]['Headline'],df_train.iloc[94]['Content'])

বর্ণাঢ্য আয়োজনে উন্নয়ন মেলার শুরু, প্রথম দিনেই ভিড় রাজশাহী: রাজশাহীতে বর্ণাঢ্য আয়োজনে শুরু হয়েছে চতুর্থ জাতীয় উন্নয়ন মেলা-২০১৮। প্রথম দিনেই মেলায় উপচেপড়া ভিড় লক্ষ্য করা গেছে। বৃহস্পতিবার (০৪ অক্টোবর) দুপুরে প্রধানমন্ত্রী শেখ হাসিনা ভিডিও কনফান্সের মাধ্যমে দেশব্যাপী এ মেলার উদ্বোধন করেন। পরে রাজশাহী সিটি করপোরেশনের (রাসিক) মেয়র এএইচএম খায়রুজ্জামান লিটন প্রধান অতিথি হিসেবে উপস্থিত থেকে বেলুন ও পায়রা উড়িয়ে স্থানীয়ভাবে মেলার উদ্বোধন করেন। এসময় রাজশাহী জেলা প্রশাসক এসএম আব্দুল কাদেরের সভাপতিত্বে অনুষ্ঠানে আরও উপস্থিত ছিলেন-ভূমি সংরক্ষণ বিভাগের অতিরিক্ত সচিব সালমা আকতার জাহান, প্রধানমন্ত্রীর কার্যালয়ের এনজিও ব্যুরোর মহাপরিচালক কেএম আব্দুস সালাম, বিভাগীয় কমিশনার নূর-উর রহমান, মহানগর আওয়ামী লীগের সিনিয়র সহ-সভাপতি শাহীন আকতার রেনী, পুলিশের রাজশাহী রেঞ্জের ডিআইজি খুরশিদ হোসেন, মহানগর পুলিশ কমিশনার একেএম হাফিজ আক্তার, সংরক্ষিত আসনের সংসদ সদস্য আকতার জাহান, সাবেক প্রতিমন্ত্রী অধ্যাপিকা জিন্নাতুন নেসা তালুকদার, মহানগর আওয়ামী লীগের সাধারণ সম্পাদক ডাবলু সরকার, রাজশাহী কলেজ অধ্যক্ষ অধ্যাপক মুহা. হব

In [15]:
from torch.optim.lr_scheduler import StepLR

optimizer = AdamW(filter(lambda p: p.requires_grad, model.parameters()), lr=adam_opt_lr)
criterion = nn.CrossEntropyLoss()
scheduler = StepLR(optimizer, step_size=scheduler_step, gamma=scheduler_gamma)

In [16]:
def train(model, dataloader, optimizer, criterion, config):
    model.train()  # prep model for training
    train_loss = 0
    for batch in tqdm(dataloader):
        text, labels = batch

        model.zero_grad()

        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        #labels = labels.to(device)
        labels = labels.to(device, dtype=torch.long)  # Convert labels to torch.long

        # move things to model
        logs = model( input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)

        loss = criterion(logs, labels)
        #print("successfully calculated criterion in train!")
        train_loss += loss.item() * input_ids.size(0)
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

    return train_loss

In [17]:
def evaluate(model, dataloader, criterion, config):
    total = 0
    correct = 0
    valid_loss = 0.0
    label_0_TP = 0
    label_0_TN = 0
    label_0_FP = 0
    label_0_FN = 0

    label_1_TP = 0
    label_1_TN = 0
    label_1_FP = 0
    label_1_FN = 0

    label_2_TP = 0
    label_2_TN = 0
    label_2_FP = 0
    label_2_FN = 0

    model.eval()  # prep model for evaluation
    for batch in dataloader:
        text, labels = batch
        inputs = tokenizer.batch_encode_plus(
            text, **config
        )
        input_ids = inputs['input_ids'].to(device)
        token_type_ids = inputs['token_type_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)
        labels = labels.to(device, dtype=torch.long)

        # move things to model
        output = model(input_ids=input_ids, attention_mask=attention_mask,token_type_ids=token_type_ids)

        loss_p = criterion(output, labels)
        # update running validation loss
        valid_loss += loss_p.item() * input_ids.size(0)
        # calculate accuracy
        proba = torch.exp(output)
        top_p, top_class = proba.topk(1, dim=1)
        equals = top_class == labels.view(*top_class.shape)
        # accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

        _, predicted = torch.max(output.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        #print(f'predicted: {predicted} labels: {labels}')
        label_0_TP += ((predicted == 0) & (labels == 0)).sum().item()
        label_0_TN += ((predicted != 0) & (labels != 0)).sum().item()
        label_0_FP += ((predicted == 0) & (labels != 0)).sum().item()
        label_0_FN += ((predicted != 0) & (labels == 0)).sum().item()

        label_1_TP += ((predicted == 1) & (labels == 1)).sum().item()
        label_1_TN += ((predicted != 1) & (labels != 1)).sum().item()
        label_1_FP += ((predicted == 1) & (labels != 1)).sum().item()
        label_1_FN += ((predicted != 1) & (labels == 1)).sum().item()

        label_2_TP += ((predicted == 2) & (labels == 2)).sum().item()
        label_2_TN += ((predicted != 2) & (labels != 2)).sum().item()
        label_2_FP += ((predicted == 2) & (labels != 2)).sum().item()
        label_2_FN += ((predicted != 2) & (labels == 2)).sum().item()

    return total, correct, valid_loss, label_0_TP, label_0_TN, label_0_FP, label_0_FN, label_1_TP, label_1_TN, label_1_FP, label_1_FN, label_2_TP, label_2_TN, label_2_FP, label_2_FN


In [18]:

tokenizer_config = {
    "max_length": max_number_input_tokens,
    "padding": "max_length",
    "return_tensors": "pt",
    "truncation": True,
    "add_special_tokens": True,
     "truncation_strategy":"longest_first"
}

In [19]:
train_loss_data, valid_loss_data = [], []
valid_loss_min = np.Inf
since = time.time()
best_loss = np.inf
best_acc=0
sml = 1e-10
best_f1=0.0

for epoch in range(epochs):
    print("Epoch: {}/{}".format(epoch + 1, epochs))
    # monitor training loss
    train_loss = 0.0
    valid_loss = 0.0
    total = 0
    correct = 0
    label_0_TP = 0
    label_0_TN = 0
    label_0_FP = 0
    label_0_FN = 0

    label_1_TP = 0
    label_1_TN = 0
    label_1_FP = 0
    label_1_FN = 0

    label_2_TP = 0
    label_2_TN = 0
    label_2_FP = 0
    label_2_FN = 0


    e_since = time.time()

    # Train Model
    train_loss += train(model, train_dataloader, optimizer, criterion, tokenizer_config)
    # Now Evaluate
    out = evaluate(model, val_dataloader, criterion, tokenizer_config)
    total += out[0]
    correct += out[1]
    valid_loss += out[2]
    label_0_TP += out[3]
    label_0_TN += out[4]
    label_0_FP += out[5]
    label_0_FN += out[6]

    label_1_TP += out[7]
    label_1_TN += out[8]
    label_1_FP += out[9]
    label_1_FN += out[10]

    # label_2_TP += out[11]
    # label_2_TN += out[12]
    # label_2_FP += out[13]
    # label_2_FN += out[14]

    # Calculate precision, recall, and F1-score for each class
    label_0_precision = label_0_TP / (label_0_TP + label_0_FP+sml)
    label_0_recall = label_0_TP / (label_0_TP + label_0_FN+sml)
    label_0_f1_score = 2 * (label_0_precision * label_0_recall) / (label_0_precision + label_0_recall+sml)

    label_1_precision = label_1_TP / (label_1_TP + label_1_FP+sml)
    label_1_recall = label_1_TP / (label_1_TP + label_1_FN+sml)
    label_1_f1_score = 2 * (label_1_precision * label_1_recall) / (label_1_precision + label_1_recall+sml)

    label_2_precision = label_2_TP / (label_2_TP + label_2_FP+sml)
    label_2_recall = label_2_TP / (label_2_TP + label_2_FN+sml)
    label_2_f1_score = 2 * (label_2_precision * label_2_recall) / (label_2_precision + label_2_recall+sml)

    # Calculate combined F1-score
    combined_f1_score = (label_0_f1_score + label_1_f1_score) / 2

    # Calculate micro TP, TN, FP, FN values
    micro_TP = label_0_TP + label_1_TP
    micro_TN = label_0_TN + label_1_TN
    micro_FP = label_0_FP + label_1_FP
    micro_FN = label_0_FN + label_1_FN

    # Calculate micro precision, recall, and F1 score
    micro_precision = micro_TP / (micro_TP + micro_FP)
    micro_recall = micro_TP / (micro_TP + micro_FN)
    micro_f1 = 2 * (micro_precision * micro_recall) / (micro_precision + micro_recall)

    scheduler.step()

    # print training/validation statistics
    # calculate average loss over an epoch
    train_loss = train_loss / len(train_dataloader.dataset)
    valid_loss = valid_loss / len(val_dataloader.dataset)

    val_acc=correct / total * 100

    # calculate train loss and running loss
    train_loss_data.append(train_loss * 100)
    valid_loss_data.append(valid_loss * 100)

    if combined_f1_score > best_f1:
        best_f1 = combined_f1_score
        torch.save(model.state_dict(), ModelPath)
        print(f'saved on epoch: {epoch+1}')

    print("\tTrain loss:{:.6f}..".format(train_loss),
          "\tValid Loss:{:.6f}..".format(valid_loss),
          "\tVal Accuracy: {:.4f}".format(correct / total * 100))
    print("\tLabel 0 Precision: {:.4f}\tLabel 0 Recall: {:.4f}\tLabel 0 F1-score: {:.4f}\n"
      "\tLabel 1 Precision: {:.4f}\tLabel 1 Recall: {:.4f}\tLabel 1 F1-score: {:.4f}\n"
      "\tLabel 2 Precision: {:.4f}\tLabel 2 Recall: {:.4f}\tLabel 2 F1-score: {:.4f}\n"
      "\tCombined F1-score: {:.4f}".format(label_0_precision, label_0_recall, label_0_f1_score,
                                            label_1_precision, label_1_recall, label_1_f1_score,
                                            label_2_precision, label_2_recall, label_2_f1_score,
                                            combined_f1_score))
    print(f'micro precision: {micro_precision}, Micro recall: {micro_recall}, micro f1: {micro_f1}')

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(
    time_elapsed // 60, time_elapsed % 60))

Epoch: 1/8


  0%|          | 0/5389 [00:00<?, ?it/s]

TypeError: can only concatenate str (not "float") to str

In [None]:
torch.cuda.empty_cache()

In [None]:
# torch.save(model.state_dict(), DirPath+bert_model_name+"_lasttwopoolerf_contest_val_from_finalhs_midnonfrozen_acc1_sub_finaluntested.pth")

In [None]:
from matplotlib import pyplot as plt

plt.plot(train_loss_data, label="Training loss")
plt.plot(valid_loss_data, label="validation loss")
plt.legend(frameon=False)

# Testing on test dataset

In [None]:
model.load_state_dict(torch.load(ModelPath))

In [None]:
all_preds = []
all_labels = []


df_test = pd.read_csv(TestPath)
test_data = NewsDatasets(df_test)
test_dataloader = DataLoader(test_data, batch_size=batch_size_training, shuffle=False)

for batch in test_dataloader:
    text, labels = batch
    inputs = tokenizer.batch_encode_plus(
        text, **tokenizer_config
    )
    input_ids = inputs['input_ids'].to(device)
    token_type_ids = inputs['token_type_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    labels = labels.to(device)

    # move things to model
    output = model(token_type_ids=token_type_ids, input_ids=input_ids, attention_mask=attention_mask)
    preds = output.detach().cpu().numpy()
    preds = np.argmax(preds, axis = 1)
    all_preds.extend(preds)
    all_labels.extend(labels.cpu().numpy())

# df_test['real_HS'] = all_labels
# df_test['predicted_HS'] = all_preds
# df_test.to_csv(DirPath+'nc/'+'test_HS_pred.csv')

In [None]:
print(len(all_preds))

In [None]:
from sklearn.metrics import classification_report

# preds = np.argmax(preds, axis = 1)
print(classification_report(all_labels, all_preds, digits=4))