In [None]:
!pip install transformers



## **Load the Drive helper and mount**

In [None]:
# Colab-only helper: mounts the user's Google Drive under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## **Importing packages**

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from tqdm import tqdm
from tqdm import trange
import os

# Torch ML libraries
import transformers
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import AutoTokenizer, AutoModel
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

## **Reading dataset**

In [None]:
# Load the train / dev / test splits, in that order, from the local Data directory.
train_data, dev_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/train.csv', 'Data/dev.csv', 'Data/test.csv')
)

In [None]:
test_data.head()

Unnamed: 0,text
0,বাংলাদেশের হিন্দুরা নিজেদের জন্য আলাদা হিন্দু ...
1,মাইজদী - চৌমুহুনী - ফেনী মন্দিরে হামলা নিয়ে রি...
2,"দয়া করে পবিত্র কুরআনুল কারিম বলেন,,,,পবিত্র কথ..."
3,বিবিসি হলো সত্য কে বিনষ্টকারী আর মিথ্যা কে গ্র...
4,বুধবার কি তোরা মারা গেছিলি বিবিসি বাংলা


## **Getting the sentences and their labels from the dataframe**

In [None]:
# Pull the 'text' and 'label' columns out of each labelled split as plain Python lists.
train_texts, train_label = (train_data[column].tolist() for column in ('text', 'label'))
dev_texts, dev_label = (dev_data[column].tolist() for column in ('text', 'label'))


In [None]:
BATCH_SIZE = 32

# Number of distinct values found in the training 'label' column.
num_label = len(train_data['label'].unique())
print(num_label)

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

3
cuda


In [None]:
# Token length handed to the tokenizer as max_length (inputs are padded/truncated to it).
max_len = 250

## **Initializing tokenizer**

In [None]:
# Load the tokenizer published with the intfloat/multilingual-e5-base checkpoint.
tokenizer = AutoTokenizer.from_pretrained('intfloat/multilingual-e5-base')

Downloading (…)okenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [None]:
print(tokenizer)

XLMRobertaTokenizerFast(name_or_path='intfloat/multilingual-e5-base', vocab_size=250002, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False)}, clean_up_tokenization_spaces=True)


In [None]:
class BertInputItem(object):
  """Plain container for one tokenized example.

  The constructor stores its four arguments unchanged:
    text:       the input string that was tokenized (including the task prefix).
    input_ids:  token ids returned by the tokenizer.
    input_mask: attention mask returned by the tokenizer.
    label_id:   class label of the example.
  """
  def __init__(self, text, input_ids, input_mask, label_id):
    self.text = text
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.label_id = label_id

def convert_examples_to_inputs(example_texts, example_labels, max_seq_length, tokenizer, verbose=0):
  """Tokenize (text, label) pairs into a list of BertInputItem objects.

  Each text is prefixed with a fixed Bengali task instruction and then tokenized
  with padding='max_length' and truncation=True.

  Args:
    example_texts:  iterable of raw input strings.
    example_labels: iterable of labels, paired with example_texts via zip()
                    (so any surplus items on the longer side are ignored).
    max_seq_length: value passed to the tokenizer as max_length.
    tokenizer:      callable returning a mapping with 'input_ids' and 'attention_mask'.
    verbose:        unused; kept so existing calls keep working.

  Returns:
    A list with one BertInputItem per (text, label) pair, in input order.
  """
  # Task instruction prepended to every example before tokenization.
  prompt_prefix = 'পাঠ্য অংশের অনুভূতি শ্রেণীবদ্ধ করুন: '

  input_items = []
  for text, label in zip(example_texts, example_labels):
    text = prompt_prefix + text
    # Use the max_seq_length argument; the previous version silently read the
    # notebook-level global `max_len`, so the parameter had no effect.
    tokenizer_output = tokenizer(text, max_length=max_seq_length, padding='max_length', truncation=True)

    input_items.append(
      BertInputItem(
        text = text,
        input_ids = tokenizer_output['input_ids'],
        input_mask = tokenizer_output['attention_mask'],
        label_id = label,
      )
    )
  return input_items

## **Loading augmented data**

In [None]:
import pickle


def _load_augmented_texts(path):
    """Read one augmented-data pickle and return element [0] of every record in it."""
    # NOTE(security): unpickling can execute arbitrary code -- only load files you trust.
    # The context manager closes the file; the previous bare open() calls leaked the handles.
    with open(path, 'rb') as handle:
        records = pickle.load(handle)
    return [record[0] for record in records]


a0_v1 = _load_augmented_texts('Data/Augmented/category0_augmented.pickle')
a1_v1 = _load_augmented_texts('Data/Augmented/category1_augmented.pickle')
a2_v1 = _load_augmented_texts('Data/Augmented/category2_augmented.pickle')

# Append every augmented set to the training data, labelled with its category index.
# NOTE: train_texts / train_label are extended in place, so re-running this cell without
# rebuilding them first appends the augmented examples a second time.
for category_label, augmented_texts in enumerate((a0_v1, a1_v1, a2_v1)):
    train_texts.extend(augmented_texts)
    train_label.extend([category_label] * len(augmented_texts))

print(len(train_texts), len(train_label))

4089 4089


In [None]:
# Tokenize the train and dev splits, printing the example counts before and after.
print(len(train_texts))
train_features, dev_features = (
    convert_examples_to_inputs(texts, labels, max_len, tokenizer, verbose=0)
    for texts, labels in ((train_texts, train_label), (dev_texts, dev_label))
)
print(len(train_features), len(dev_features))

In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def get_data_loader(features, max_seq_length, batch_size, shuffle=True):
  """Pack tokenized features into a DataLoader of (input_ids, input_mask, label_id) batches.

  Args:
    features:       sequence of objects exposing input_ids, input_mask and label_id.
    max_seq_length: not used by this function.
    batch_size:     batch size given to the DataLoader.
    shuffle:        forwarded to the DataLoader.

  Returns:
    A DataLoader over a TensorDataset of three torch.long tensors.
  """
  def _as_long_tensor(attribute):
    # Collect one attribute across all features into a single int64 tensor.
    return torch.tensor([getattr(item, attribute) for item in features], dtype = torch.long)

  dataset = TensorDataset(
    _as_long_tensor('input_ids'),
    _as_long_tensor('input_mask'),
    _as_long_tensor('label_id'),
  )
  return DataLoader(dataset, shuffle=shuffle, batch_size = batch_size)

## **Creating dataloader**

In [None]:
# Training batches are reshuffled every epoch; dev batches keep their original order.
train_dataloader = get_data_loader(
    features=train_features,
    max_seq_length=max_len,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
dev_dataloader = get_data_loader(
    features=dev_features,
    max_seq_length=max_len,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

## **Initializing model**

In [None]:
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification

# Fine-tune the *base* checkpoint. The tokenizer, MODEL_NAME and OUTPUT_DIR in this notebook
# all refer to multilingual-e5-base, and the saved state dict is later reloaded into the
# architecture named by MODEL_NAME. Training the "large" checkpoint here (as before) produces
# weights whose shapes do not fit that architecture.
model = AutoModelForSequenceClassification.from_pretrained('intfloat/multilingual-e5-base', num_labels = num_label)
# Trailing semicolon keeps the very long module repr out of the cell output.
model.to(device);

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at intfloat/multilingual-e5-large and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=1024, out_fe

In [None]:
# NOTE(review): transformers' AdamW is deprecated upstream; it is kept here because the
# correct_bias argument below is specific to that implementation.
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
import math

GRADIENT_ACCUMULATION_STEPS = 1
EPOCHS = 15
LEARNING_RATE = 5e-5
WARMUP_PROPORTION = 0.1   # fraction of all optimizer steps spent on linear warm-up
MAX_GRAD_NORM = 5

# The training loop performs one optimizer/scheduler step every GRADIENT_ACCUMULATION_STEPS
# batches, so count batches (len(train_dataloader) includes the final partial batch).
# Dividing the dataset size by BATCH_SIZE undercounted the steps whenever the last batch was
# partial, which drove the learning rate to zero before training had finished.
steps_per_epoch = len(train_dataloader) // GRADIENT_ACCUMULATION_STEPS
num_train_steps = steps_per_epoch * EPOCHS
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

param_optimizer = list(model.named_parameters())
# Parameters whose names contain any of these substrings are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {'params' : [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params' : [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
]

optimizer = AdamW(optimizer_grouped_parameters, lr = LEARNING_RATE, correct_bias = False)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = num_warmup_steps, num_training_steps = num_train_steps)

## **Evaluate function**

In [None]:
def evaluate(model, dataloader):
  """Score a model on every batch of a dataloader without tracking gradients.

  The model is switched to eval mode and left in it. Batches are moved to the
  notebook-level `device` before the forward pass.

  Args:
    model:      called as model(input_ids, attention_mask=..., labels=...); its output is
                indexed as [0] for the loss and [1] for the logits.
    dataloader: yields (input_ids, input_mask, label_ids) tensor batches.

  Returns:
    (eval_loss, correct_labels, predicted_labels): the loss averaged over batches as a
    float, and two 1-D numpy arrays holding the gold labels and the arg-max predictions.

  Raises:
    ValueError: if the dataloader yields no batches (previously a ZeroDivisionError).
  """
  model.eval()

  total_loss = 0.0
  num_batches = 0
  predicted_batches, correct_batches = [], []

  with torch.no_grad():
    for batch in tqdm(dataloader, desc = "Evaluation iteration"):
      input_ids, input_mask, label_ids = (t.to(device) for t in batch)

      outputs = model(input_ids, attention_mask = input_mask, labels=label_ids)
      batch_loss, logits = outputs[0], outputs[1]

      # Take the arg-max in torch and convert once per batch, instead of applying
      # numpy functions to torch tensors and growing Python lists element by element.
      predicted_batches.append(logits.argmax(dim=1).cpu().numpy())
      correct_batches.append(label_ids.cpu().numpy())

      total_loss += batch_loss.mean().item()
      num_batches += 1

  if num_batches == 0:
    raise ValueError("evaluate() received a dataloader with no batches")

  eval_loss = total_loss / num_batches
  correct_labels = np.concatenate(correct_batches)
  predicted_labels = np.concatenate(predicted_batches)

  return eval_loss, correct_labels, predicted_labels


In [None]:
# Directory the best checkpoint is written to during training and read back from afterwards.
OUTPUT_DIR = 'BLP2023/Saved_dir/Task1/mulilingual-e5-base-w-aug-r1'
# Hub checkpoint name used to rebuild the architecture when the saved weights are reloaded.
MODEL_NAME = "intfloat/multilingual-e5-base"
# File name of the saved state dict inside OUTPUT_DIR.
MODEL_FILE_NAME = 'pytorch_model.bin'
# Early stopping: number of consecutive epochs without a new best dev loss before training stops.
PATIENCE = 2

## **Training loop**

In [None]:
train_loss_history = []
dev_loss_history = []

# torch.save does not create missing directories, so make sure the target exists up front
# rather than failing after the first full epoch.
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_model_dir = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)

no_improvement = 0
for e in trange(int(EPOCHS), desc="Epoch"):
    model.train()
    tr_loss = 0

    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        input_ids, input_mask, label_ids = (t.to(device) for t in batch)

        outputs = model(input_ids, attention_mask = input_mask, labels = label_ids)
        loss = outputs[0]

        # Log the unscaled loss so the history does not shrink with the accumulation factor.
        tr_loss += loss.item()

        # Scale before backward so accumulated gradients average over the micro-batches.
        if GRADIENT_ACCUMULATION_STEPS > 1:
            loss = loss/GRADIENT_ACCUMULATION_STEPS
        loss.backward()

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            # clip_grad_norm_ is the in-place, non-deprecated spelling of clip_grad_norm.
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    dev_loss, _, _ = evaluate(model, dev_dataloader)

    print("Loss history : ", dev_loss_history)

    # Compare against earlier epochs only, i.e. before this epoch's loss is recorded.
    is_best = len(dev_loss_history) == 0 or dev_loss < min(dev_loss_history)

    # Record both losses before the early-stopping check so the final epoch is not
    # dropped from the histories (and from the loss plot) when the loop breaks.
    dev_loss_history.append(dev_loss)
    train_loss_history.append(tr_loss/len(train_dataloader))

    if is_best:
        no_improvement = 0
        # Unwrap a DataParallel-style wrapper, if present, so the saved keys have no 'module.' prefix.
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), output_model_dir)
        print(f"Model saved at epoch {e}")
    else:
        no_improvement += 1

    if no_improvement >= PATIENCE:
        print("No improvement on development set. Finish training")
        break

## **Train-Val loss plot**

In [None]:
# Draw the recorded train and dev losses on one set of axes, indexed by epoch.
fig, ax = plt.subplots()
for history, series_label in (
    (train_loss_history, 'train_loss'),
    (dev_loss_history, 'validation loss'),
):
    ax.plot(history, label=series_label)

ax.set_title('Training history')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()

## **Loading saved model**

In [None]:
import os
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Read back the state dict written by the training loop; the identity map_location
# keeps every tensor in CPU storage while loading.
checkpoint_path = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
model_state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc:storage)

# Build the MODEL_NAME architecture and hand it the saved weights.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels = num_label,
    state_dict = model_state_dict,
)
model.to(device)

model.eval()


Downloading (…)lve/main/config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

## **Calculating evaluation metric**

In [None]:
#_, train_correct, train_predicted = evaluate(model, train_dataloader)
_, dev_correct, dev_predicted = evaluate(model, dev_dataloader)

#print("Training performance : ", precision_recall_fscore_support(train_correct, train_predicted, average='micro'))
# Macro-averaged precision / recall / F-score / support on the dev split.
dev_macro_scores = precision_recall_fscore_support(dev_correct, dev_predicted, average='macro')
print("Dev performance : ", dev_macro_scores)

from sklearn.metrics import f1_score
# Macro F1 computed separately and reported on its own line.
dev_macro_f1 = f1_score(dev_correct, dev_predicted, average='macro')
print("Macro F1-score = ", dev_macro_f1)

In [None]:
# Pair each dev text with the label the model predicted for it (not the gold label).
df = pd.DataFrame({'text': dev_texts, 'label': dev_predicted})
df.head()

Unnamed: 0,text,label
0,পাডা পুতার মাঝখানে পরে সাধারণ ২ মানুষের জিবন শ...,0
1,করোনার চাপে অনেক কিছু বন্ধ ও অনেক বিধি নিষেধ ক...,0
2,সঠিক তদন্ত করতে হবে। বিচারের আওতায় আনতে হবে য...,0
3,যে লোকটা মারা গেছে তার কি হবে তার দায়ভার কে ন...,0
4,নিউ মার্কেট এবং গুলিস্থান মার্কেটের ব্যবসায়ীর...,2


## **Saving dataframe to csv file. This contains model predictions during development phase.**

In [None]:
# Write next to the checkpoint by reusing OUTPUT_DIR instead of a second hard-coded copy
# of the same directory, so the two locations cannot drift apart.
df.to_csv(os.path.join(OUTPUT_DIR, 'e5_base_w_aug_t1.csv'), index=False)

## **Inference on test data**

In [None]:
text_list = test_data['text'].tolist()
model_pred = []

# Same task instruction that is prepended to the training examples before tokenization.
prompt_prefix = 'পাঠ্য অংশের অনুভূতি শ্রেণীবদ্ধ করুন: '

# Make sure dropout is disabled even if this cell is run straight after training.
model.eval()

# Inference only: no_grad avoids building an autograd graph for every forward pass.
with torch.no_grad():
    for raw_text in tqdm(text_list):
        text = prompt_prefix + raw_text
        tokenizer_output = tokenizer(text, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = tokenizer_output['input_ids'].to(device)
        input_mask = tokenizer_output['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask = input_mask)
        # One example per forward pass, so the arg-max over classes is a single integer.
        pred = outputs['logits'].argmax(dim=1).item()
        model_pred.append(pred)

print(len(text_list), len(model_pred))

100%|██████████| 2016/2016 [00:40<00:00, 50.07it/s]

2016 2016





## **Saving test data predictions to csv file during evaluation phase**

In [None]:
# Pair every test text with its predicted label and store the table next to the checkpoint.
df = pd.DataFrame({'text': text_list, 'label': model_pred})
# Reuse OUTPUT_DIR rather than repeating the directory as a second hard-coded string.
df.to_csv(os.path.join(OUTPUT_DIR, 'test_data_predictions.csv'), index=False)
df.head()

Unnamed: 0,text,label
0,বাংলাদেশের হিন্দুরা নিজেদের জন্য আলাদা হিন্দু ...,0
1,মাইজদী - চৌমুহুনী - ফেনী মন্দিরে হামলা নিয়ে রি...,0
2,"দয়া করে পবিত্র কুরআনুল কারিম বলেন,,,,পবিত্র কথ...",0
3,বিবিসি হলো সত্য কে বিনষ্টকারী আর মিথ্যা কে গ্র...,0
4,বুধবার কি তোরা মারা গেছিলি বিবিসি বাংলা,0
