In [None]:
!pip install transformers sentencepiece

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m54.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m24.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting

In [None]:
# Make Google Drive visible to this Colab runtime.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


## **Importing packages**

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from tqdm import tqdm
from tqdm import trange
import os

# Torch ML libraries
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
import torch

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Misc.
import warnings
warnings.filterwarnings('ignore')

## **Reading dataset**

In [None]:
# Every split is a tab-separated file stored in the same directory.
DATA_DIR = 'BLP2023/blp_task2/data'


def _read_split(file_name):
    """Read one tab-separated split from DATA_DIR into a DataFrame."""
    return pd.read_csv(f'{DATA_DIR}/{file_name}', sep='\t')


train_data = _read_split('blp23_sentiment_train.tsv')
dev_data = _read_split('blp23_sentiment_dev.tsv')
# NOTE: `test_data` holds the *dev-test* split; the final test split is `actual_test_data`.
test_data = _read_split('blp23_sentiment_dev_test.tsv')

actual_test_data = _read_split('blp23_sentiment_test.tsv')

In [None]:
# Preview the first rows of the split read from blp23_sentiment_test.tsv.
actual_test_data.head()

Unnamed: 0,id,text
0,7135,মুখস্ত শিক্ষা দিয়ে কি করবে এই জাতি ? বাংলাদেশ...
1,28949,জর্ডানের সাবেক যুবরাজ প্রিন্স হামজার ভিডিও বার...
2,10210,আমার ছেলের দুর্ভাগ্য না সৌভাগ্য জানিনা জ্বর এর...
3,9526,Pranoy Sen তখন পাকিস্তান ও আফগানিস্তান ভারতের ...
4,2142,আরো কত মিথ্যাচার করবে


In [None]:
# Plain Python lists of the `text` and `label` columns for the train and dev splits.
train_texts, train_label = train_data['text'].tolist(), train_data['label'].tolist()
dev_texts, dev_label = dev_data['text'].tolist(), dev_data['label'].tolist()

## **Creating mapping from label to ID**

In [None]:
# Label name -> integer class id used as the training target.
# NOTE(review): the pretrained checkpoint may ship its own id/label order;
# confirm whether this mapping should be aligned with it.
label_to_id = dict(Positive=1, Neutral=0, Negative=2)

# Reverse lookup (class id -> label name) used to decode predictions.
id_to_label = {class_id: class_name for class_name, class_id in label_to_id.items()}
print(id_to_label)

{1: 'Positive', 0: 'Neutral', 2: 'Negative'}


In [None]:
# Mini-batch size passed to the dataloaders.
BATCH_SIZE = 32

# Number of distinct values in the training split's `label` column.
num_label = len(pd.unique(train_data['label']))
print(num_label)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
print(device)

3
cuda


In [None]:
# Sequence length handed to the tokenizer as `max_length`; tokenizer calls in
# this notebook pad to it (padding='max_length') and truncate beyond it.
max_len = 250

In [None]:
# Checkpoint identifier passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'

## **Initializing tokenizer**

In [None]:
# Load the tokenizer that belongs to the MODEL_NAME checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

Downloading (…)lve/main/config.json:   0%|          | 0.00/841 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

In [None]:
class BertInputItem(object):
  """Container for one tokenized example.

  Attributes:
    text: the raw input string.
    input_ids: token ids returned by the tokenizer.
    input_mask: attention mask returned by the tokenizer.
    label_id: integer class id of the example.
  """

  def __init__(self, text, input_ids, input_mask, label_id):
    self.text = text
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.label_id = label_id


def convert_examples_to_inputs(example_texts, example_labels, max_seq_length, tokenizer, verbose=0, label_map=None):
  """Tokenize labelled texts into a list of `BertInputItem`.

  Args:
    example_texts: iterable of raw strings.
    example_labels: iterable of label names, aligned with `example_texts`.
    max_seq_length: length every example is padded / truncated to.
    tokenizer: callable accepting `(text, max_length=..., padding=..., truncation=...)`
      and returning a mapping with `input_ids` and `attention_mask`.
    verbose: unused; kept so existing call sites keep working.
    label_map: optional label-name -> id mapping. Defaults to the module-level
      `label_to_id`.

  Returns:
    A list with one `BertInputItem` per (text, label) pair.

  Raises:
    KeyError: if a label is missing from the mapping.
  """
  if label_map is None:
    label_map = label_to_id

  input_items = []
  for text, label in zip(example_texts, example_labels):
    # Honour the caller-supplied length rather than the notebook-level `max_len`.
    tokenizer_output = tokenizer(text, max_length=max_seq_length, padding='max_length', truncation=True)
    input_items.append(
        BertInputItem(
            text=text,
            input_ids=tokenizer_output['input_ids'],
            input_mask=tokenizer_output['attention_mask'],
            label_id=label_map[label],
        )
    )
  return input_items

## **Creating input features**

In [None]:
# Tokenize both labelled splits; the printed counts show how many examples
# went in and how many feature items came out.
print(len(train_texts))
train_features = convert_examples_to_inputs(
    example_texts=train_texts,
    example_labels=train_label,
    max_seq_length=max_len,
    tokenizer=tokenizer,
    verbose=0,
)
dev_features = convert_examples_to_inputs(
    example_texts=dev_texts,
    example_labels=dev_label,
    max_seq_length=max_len,
    tokenizer=tokenizer,
    verbose=0,
)
print(len(train_features), len(dev_features))

35266
35266 3934


In [None]:
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler

def get_data_loader(features, max_seq_length, batch_size, shuffle=True):
  """Wrap tokenized features in a DataLoader.

  Args:
    features: sequence of items exposing `input_ids`, `input_mask` and `label_id`.
    max_seq_length: not used by this function.
    batch_size: number of examples per batch.
    shuffle: whether the DataLoader shuffles the examples.

  Returns:
    A DataLoader over a TensorDataset whose batches are
    `(input_ids, input_mask, label_ids)` tensors of dtype `torch.long`.
  """
  def _as_long_tensor(attribute):
    # One row per feature item, taken from the named attribute.
    return torch.tensor([getattr(item, attribute) for item in features], dtype=torch.long)

  columns = [_as_long_tensor(name) for name in ('input_ids', 'input_mask', 'label_id')]
  return DataLoader(TensorDataset(*columns), batch_size=batch_size, shuffle=shuffle)

## **Creating dataloaders**

In [None]:
# Training batches are shuffled; dev batches keep the original example order.
train_dataloader = get_data_loader(
    features=train_features, max_seq_length=max_len, batch_size=BATCH_SIZE, shuffle=True)
dev_dataloader = get_data_loader(
    features=dev_features, max_seq_length=max_len, batch_size=BATCH_SIZE, shuffle=False)

## **Initializing model**

In [None]:
from transformers.models.auto.modeling_auto import AutoModelForSequenceClassification
# Load the checkpoint with a classification head sized for `num_label` classes,
# then move it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_label,
)
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

In [None]:
from transformers.optimization import AdamW, get_linear_schedule_with_warmup
import math


# --- Training hyper-parameters ---
GRADIENT_ACCUMULATION_STEPS = 1   # batches accumulated per optimizer update
EPOCHS = 15                       # upper bound; early stopping may end sooner
LEARNING_RATE = 5e-5
WARMUP_PROPORTION = 0.1           # fraction of all updates spent on LR warm-up
MAX_GRAD_NORM = 5                 # gradient-norm clipping threshold

# The training loop calls `scheduler.step()` once per optimizer update, i.e.
# once every GRADIENT_ACCUMULATION_STEPS batches. Count batches through
# `len(train_dataloader)` (which includes the final partial batch) so the
# schedule length matches the number of updates actually performed; deriving it
# from `len(dataset) / BATCH_SIZE` undercounts and drives the LR to zero early.
updates_per_epoch = len(train_dataloader) // GRADIENT_ACCUMULATION_STEPS
num_train_steps = updates_per_epoch * EPOCHS
num_warmup_steps = int(WARMUP_PROPORTION * num_train_steps)

param_optimizer = list(model.named_parameters())
# Parameters whose name contains one of these substrings get no weight decay.
no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']

optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0},
]

# `correct_bias` is specific to the transformers implementation of AdamW.
optimizer = AdamW(optimizer_grouped_parameters, lr=LEARNING_RATE, correct_bias=False)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_train_steps,
)

## **Evaluate function**

In [None]:
def evaluate(model, dataloader):
  """Evaluate `model` on every batch of `dataloader`.

  Args:
    model: model called as `model(input_ids, attention_mask=..., labels=...)`
      whose output exposes the loss at index 0 and the logits at index 1.
    dataloader: iterable of `(input_ids, input_mask, label_ids)` tensor batches.

  Returns:
    A tuple `(eval_loss, correct_labels, predicted_labels)`: the mean per-batch
    loss, and two NumPy arrays with the gold and predicted class ids.

  Raises:
    ZeroDivisionError: if `dataloader` yields no batches.
  """
  model.eval()

  eval_loss = 0
  nb_eval_steps = 0
  predicted_labels, correct_labels = [], []

  # No gradients are needed anywhere in evaluation.
  with torch.no_grad():
    for batch in tqdm(dataloader, desc = "Evaluation iteration"):
      input_ids, input_mask, label_ids = (t.to(device) for t in batch)

      outputs = model(input_ids, attention_mask = input_mask, labels=label_ids)
      tmp_eval_loss, logits = outputs[0], outputs[1]

      # Take the argmax in torch and convert once to plain ints, instead of
      # accumulating a Python list of 0-d tensors.
      predicted_labels.extend(logits.argmax(dim=1).cpu().tolist())
      correct_labels.extend(label_ids.cpu().tolist())

      eval_loss += tmp_eval_loss.mean().item()
      nb_eval_steps += 1

  eval_loss = eval_loss/nb_eval_steps
  correct_labels = np.array(correct_labels)
  predicted_labels = np.array(predicted_labels)

  return eval_loss, correct_labels, predicted_labels


In [None]:
# Directory and file name the training loop saves the best state dict to
# (and the loading cell reads it back from).
OUTPUT_DIR = 'BLP2023/Saved_dir/xlm-roberta-sentiment-task2'
MODEL_FILE_NAME = 'pytorch_model.bin'
# Number of consecutive epochs without a new best dev loss before training stops.
PATIENCE = 2

## **Training loop**

In [None]:
# Per-epoch mean training loss and dev loss, consumed by the plotting cell.
train_loss_history = []
dev_loss_history = []

# torch.save does not create missing directories, so make sure the target exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)
output_model_path = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)

best_dev_loss = float('inf')
no_improvement = 0
for e in trange(int(EPOCHS), desc="Epoch"):
    model.train()
    tr_loss = 0

    for step, batch in enumerate(tqdm(train_dataloader, desc="Training iteration")):
        batch = tuple(t.to(device) for t in batch)
        input_ids, input_mask, label_ids = batch

        outputs = model(input_ids, attention_mask = input_mask, labels = label_ids)
        batch_loss = outputs[0]

        # Log the unscaled loss so the reported training loss does not shrink
        # when gradient accumulation is enabled.
        tr_loss += batch_loss.item()

        # Scale only what is back-propagated, so accumulated gradients average
        # over the accumulation window.
        if GRADIENT_ACCUMULATION_STEPS > 1:
            batch_loss = batch_loss/GRADIENT_ACCUMULATION_STEPS
        batch_loss.backward()

        if (step + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            # In-place variant; the un-suffixed `clip_grad_norm` is deprecated.
            torch.nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()

    dev_loss, _, _ = evaluate(model, dev_dataloader)

    # Record the epoch before the early-stopping check so the last epoch is
    # not missing from the histories (and therefore from the loss plot).
    dev_loss_history.append(dev_loss)
    train_loss_history.append(tr_loss/len(train_dataloader))
    print("Loss history : ", dev_loss_history)

    if dev_loss < best_dev_loss:
        # New best dev loss: reset patience and checkpoint the weights.
        best_dev_loss = dev_loss
        no_improvement = 0
        model_to_save = model.module if hasattr(model, 'module') else model
        torch.save(model_to_save.state_dict(), output_model_path)
        print(f"Model saved at epoch {e}")
    else:
        no_improvement += 1

    if no_improvement >= PATIENCE:
        print("No improvement on development set. Finish training")
        break

## **Train-Val loss plot**

In [None]:
# Training loss and dev loss per epoch, drawn on one set of axes.
fig, ax = plt.subplots()
ax.plot(train_loss_history, label='train_loss')
ax.plot(dev_loss_history, label='validation loss')

ax.set_title('Training history')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend()

## **Loading saved model**

In [None]:
import os
from sklearn.metrics import classification_report, precision_recall_fscore_support

# Read back the state dict stored under OUTPUT_DIR / MODEL_FILE_NAME.
checkpoint_path = os.path.join(OUTPUT_DIR, MODEL_FILE_NAME)
# The map_location callback hands every storage back unchanged.
model_state_dict = torch.load(checkpoint_path, map_location=lambda storage, loc: storage)

# Rebuild the classifier and initialise it from the loaded weights.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_label,
    state_dict=model_state_dict,
)
model.to(device)

# Switch to evaluation mode for the inference cells that follow.
model.eval()


Downloading pytorch_model.bin:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768,

## **Inference during development phase**

In [None]:
# Ids and texts of the split loaded into `test_data` (the dev-test file).
id_list = test_data['id'].tolist()
text_list = test_data['text'].tolist()
model_pred = []

# Inference only: evaluation mode, and no autograd graph is built for the
# forward passes.
model.eval()
with torch.no_grad():
    for text in tqdm(text_list):
        tokenizer_output = tokenizer(text, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = tokenizer_output['input_ids'].to(device)
        input_mask = tokenizer_output['attention_mask'].to(device)

        logits = model(input_ids, attention_mask = input_mask)['logits']
        # One example per forward pass, so take the single predicted class id.
        pred = logits.argmax(dim=1)[0].item()
        model_pred.append(id_to_label[pred])

print(len(text_list), len(id_list), len(model_pred))

100%|██████████| 3426/3426 [01:17<00:00, 44.24it/s]

3426 3426 3426





In [None]:
# One row per example: its id and the predicted label name.
df = pd.DataFrame(dict(id=id_list, label=model_pred))
print(df.shape[0])
df.head()

3426


Unnamed: 0,id,label
0,30670,Negative
1,4125,Negative
2,27077,Negative
3,17552,Positive
4,4137,Negative


## **Saving dataframe to csv file**

In [None]:
# Write the current prediction frame as a tab-separated file, without the
# DataFrame index column.
df.to_csv('BLP2023/Saved_dir/xlm-roberta-sentiment-task2/task.tsv',sep='\t', index=False)

## **Inference on test set**

In [None]:
# Ids and texts of the split loaded into `actual_test_data`.
id_list = actual_test_data['id'].tolist()
text_list = actual_test_data['text'].tolist()
model_pred = []

# Inference only: evaluation mode, and no autograd graph is built for the
# forward passes.
model.eval()
with torch.no_grad():
    for text in tqdm(text_list):
        tokenizer_output = tokenizer(text, max_length=max_len, padding='max_length', truncation=True, return_tensors='pt')
        input_ids = tokenizer_output['input_ids'].to(device)
        input_mask = tokenizer_output['attention_mask'].to(device)

        logits = model(input_ids, attention_mask = input_mask)['logits']
        # One example per forward pass, so take the single predicted class id.
        pred = logits.argmax(dim=1)[0].item()
        model_pred.append(id_to_label[pred])

print(len(text_list), len(id_list), len(model_pred))

100%|██████████| 6707/6707 [02:11<00:00, 50.87it/s]

6707 6707 6707





In [None]:
# One row per example (id + predicted label name), saved as a tab-separated
# file without the DataFrame index and then previewed.
df = pd.DataFrame(dict(id=id_list, label=model_pred))
print(df.shape[0])
df.to_csv(
    'BLP2023/Saved_dir/xlm-roberta-sentiment-task2/test_set_pred_task.tsv',
    sep='\t',
    index=False,
)
df.head()

6707


Unnamed: 0,id,label
0,7135,Negative
1,28949,Positive
2,10210,Negative
3,9526,Neutral
4,2142,Negative
