<a href="https://colab.research.google.com/github/Triansh/bert/blob/main/BERT_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries
* Installing the transformers library 
* Importing the required libraries

In [None]:
!pip install transformers



In [None]:
import pandas as pd
import random
import gc
import torch
import torch.nn as nn
import torch.optim as optim
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

In [None]:
## Loading training and validation data for fine-tuning

In [None]:
def read_data(file_path):
  """Load a header-less, tab-separated file into a DataFrame.

  Args:
    file_path: path of the TSV file to read.

  Returns:
    DataFrame whose two columns are named 'query' and 'rating'.
  """
  column_names = ['query', 'rating']
  frame = pd.read_csv(file_path, sep='\t', header=None, names=column_names)
  return frame

In [None]:
# The train and dev splits are merged and used together for fine-tuning.
train_df = read_data('./train.tsv')
dev_df = read_data('./dev.tsv')
# DataFrame.append was removed in pandas 2.0; pd.concat with ignore_index=True
# gives the same stacked frame with a fresh 0..n-1 index.
train_df = pd.concat([train_df, dev_df], ignore_index=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21250 entries, 0 to 21249
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   query   21250 non-null  object 
 1   rating  21250 non-null  float64
dtypes: float64(1), object(1)
memory usage: 332.2+ KB


In [None]:
# Held-out split used for validation below; note that it is read from test.tsv.
validation_df = read_data('./test.tsv')
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3850 entries, 0 to 3849
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   query   3850 non-null   object 
 1   rating  3850 non-null   float64
dtypes: float64(1), object(1)
memory usage: 60.3+ KB


## Setting Random SEED and Devices

In [None]:
# Single seed shared by every random number generator seeded below.
SEED = 12345

# Seed Python's `random` module and PyTorch's default and CUDA generators.
random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if cuda.is_available() else 'cpu'
# Uncomment the next line to force CPU execution.
# device = 'cpu'
device

'cuda'

## Dataset and Dataloader instances
* Defines `QWFDataset` (Query Well-Formedness Dataset), a subclass of PyTorch's `Dataset`
* Uses PyTorch's `DataLoader` to load batches from the dataset

In [None]:
class QWFDataset(Dataset):
  """Query well-formedness dataset that tokenizes queries for BERT.

  Each item is a dict with three tensors:
    'ids'     -- token ids, flattened to 1-D
    'mask'    -- attention mask, flattened to 1-D
    'targets' -- scalar long tensor: 1 if the rating reaches
                 WELL_FORMED_THRESHOLD, otherwise 0
  """

  # Ratings at or above this value are labelled as class 1.
  WELL_FORMED_THRESHOLD = 0.8

  def __init__(self, df, maxlen):
    """Store a copy of the data and load the BERT tokenizer.

    Args:
      df: DataFrame with a 'query' column (text) and a 'rating' column (numeric).
      maxlen: length every tokenized query is padded or truncated to.
    """
    self.df = df.copy()
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.maxlen = maxlen

  def __len__(self):
    """Return the number of rows in the wrapped DataFrame."""
    return len(self.df)

  def __getitem__(self, index):
    """Tokenize the query at position `index` and build its binary label."""
    # Positional access: a DataLoader hands out positions 0..len-1, which only
    # coincide with index labels when the DataFrame has a default RangeIndex.
    query = self.df['query'].iloc[index]
    rating = self.df['rating'].iloc[index]
    label = 1 if rating >= self.WELL_FORMED_THRESHOLD else 0

    # Calling the tokenizer directly replaces the deprecated encode_plus.
    encoded = self.tokenizer(
        query,                         # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'
        max_length=self.maxlen,        # Pad & truncate all sentences.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,    # Construct attn. masks.
        return_tensors='pt',           # Return pytorch tensors.
    )
    # Flatten away the leading batch dimension so that the DataLoader can
    # stack items into (batch, maxlen) tensors.
    ids = encoded['input_ids'].reshape(-1)
    mask = encoded['attention_mask'].reshape(-1)

    return {
        'ids': ids,
        'mask': mask,
        'targets': torch.tensor(label, dtype=torch.long)
    }


In [None]:
# Batch size used by the training DataLoader.
TRAIN_BATCH_SIZE = 32
# Batch size used by the validation DataLoader.
VALIDATION_BATCH_SIZE = 8
# Token length passed to QWFDataset as `maxlen` (queries are padded/truncated to it).
MAX_LEN = 32

In [None]:
# Keyword arguments for the training DataLoader; batches are reshuffled every epoch.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 1
                }

# Keyword arguments for the validation DataLoader. Evaluation does not need
# shuffling, and a fixed order keeps the batch composition (and therefore the
# mean of per-batch losses) identical from run to run.
validation_params = {'batch_size': VALIDATION_BATCH_SIZE,
                'shuffle': False,
                'num_workers': 1
                }

In [None]:
# Report the size of each split, then wrap both DataFrames in a QWFDataset
# that pads/truncates every query to MAX_LEN tokens.
print(f"Training Dataset: {train_df.shape}")
print(f"VALIDATION Dataset: {validation_df.shape}")

train_set =  QWFDataset(train_df, maxlen=MAX_LEN)
validation_set =  QWFDataset(validation_df, maxlen=MAX_LEN)

Training Dataset: (21250, 2)
VALIDATION Dataset: (3850, 2)


In [None]:
# Batch the two datasets using the keyword arguments defined in
# train_params / validation_params.
train_loader = DataLoader(train_set, **train_params)
validation_loader = DataLoader(validation_set, **validation_params)

## Fine-tuning the pretrained model

In [None]:
class QWFClassificationModel(nn.Module):
  """Pretrained BERT encoder followed by a small feed-forward head.

  The head maps the 768-wide encoder vector to a single raw logit per
  input sequence (no sigmoid is applied here).
  """

  def __init__(self):
    super().__init__()

    # Encoder.
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    # Classification head: 768 -> 512 -> 1 with ReLU and dropout in between.
    self.pre_classifier = nn.Linear(768, 512)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(0.3)
    self.classifier = nn.Linear(512, 1)

  def forward(self, tokens, attention_mask):
    """Return one logit per sequence, shaped (batch, 1).

    Args:
      tokens: token ids fed to the encoder.
      attention_mask: mask forwarded to the encoder alongside the tokens.
    """
    sequence_output, _ = self.bert(tokens, attention_mask=attention_mask, return_dict=False)
    # Keep only the vector at position 0 of every sequence
    # (presumably the [CLS] token -- confirm against the tokenizer settings).
    first_token = sequence_output[:, 0]
    hidden = self.dropout(self.relu(self.pre_classifier(first_token)))
    return self.classifier(hidden)

In [None]:
def get_correct(outputs, targets):
  """Count how many binary predictions match their targets.

  Args:
    outputs: raw logits, one per example; any shape holding N values
      (for example (N,), (N, 1), or a 0-dim tensor when N == 1).
    targets: 0/1 labels holding the same N values.

  Returns:
    Number of examples (int) whose thresholded prediction equals the target.
  """
  # Flatten both sides: comparing (N, 1) logits with (N,) targets would
  # otherwise broadcast to an N x N matrix and over-count matches.
  probs = torch.sigmoid(outputs.reshape(-1))
  labels = (probs >= 0.5).long()
  n_correct = (labels == targets.reshape(-1)).sum().item()
  return n_correct

In [None]:
# Run Python's garbage collector and release PyTorch's cached CUDA memory
# before the model is instantiated in the next cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Build the classifier and move it to the selected device.
model = QWFClassificationModel()
# The result is bound to a throwaway name so the cell does not echo the module repr.
x = model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Step size for the Adam optimizer below.
learning_rate = 1e-5
# The model emits one raw logit per query, so the sigmoid is applied inside the loss.
criterion = nn.BCEWithLogitsLoss()
# All model parameters (BERT encoder and classification head) are updated.
optimizer = optim.Adam(params = model.parameters(), lr = learning_rate)
# Number of passes over the training loader.
epochs = 2

## Training 

In [None]:
def train(epoch, data_loader):
  """Run one training epoch and print the mean loss and accuracy.

  Relies on the notebook-level `model`, `optimizer`, `criterion` and `device`.

  Args:
    epoch: epoch number, used only for the progress bar and printed summary.
    data_loader: iterable of dict batches with 'ids', 'mask' and 'targets'.

  Returns:
    Tuple (epoch_loss, epoch_accu): mean of the per-batch losses and the
    accuracy in percent over all examples seen.
  """
  training_loss = 0
  training_count = 0
  total_correct = 0

  model.train()

  loop = tqdm(data_loader, leave=True)
  loop.set_description(f'Epoch {epoch}')

  for batch in loop:

    optimizer.zero_grad()

    ids = batch['ids'].to(device, dtype = torch.long)
    mask = batch['mask'].to(device, dtype = torch.long)
    targets = batch['targets'].to(device, dtype = torch.long)

    # Drop only the trailing logit dimension: a bare squeeze() would also
    # remove the batch dimension of a final batch of size 1 and make the
    # loss fail on the shape mismatch with `targets`.
    outputs = model(ids, mask).squeeze(-1)
    loss = criterion(outputs, targets.float())

    training_loss += loss.item()
    total_correct += get_correct(outputs.detach(), targets)
    training_count += targets.size(0)

    loss.backward()
    optimizer.step()

    loop.set_postfix(loss=loss.item())

  epoch_loss = training_loss / len(data_loader)
  epoch_accu = total_correct * 100 / training_count
  print(f"The Mean loss for Epoch: {epoch_loss}")
  print(f"The Total Accuracy for Epoch {epoch}: {epoch_accu}")

  return epoch_loss, epoch_accu

In [None]:
# Fine-tune for `epochs` passes over the training loader.
for epoch in range(epochs):
  train(epoch, train_loader)
# NOTE(review): the model is not saved here; torch.save is called in the final cell.

Epoch 0: 100%|██████████| 665/665 [04:23<00:00,  2.53it/s, loss=1.14]


The Mean loss for Epoch: 0.4594027056953961
The Total Accuracy for Epoch 0: 77.41176470588235


Epoch 1: 100%|██████████| 665/665 [04:22<00:00,  2.53it/s, loss=0.24]

The Mean loss for Epoch: 0.3388124349422025
The Total Accuracy for Epoch 1: 85.6





## Validation

In [None]:
def validate(model, validation_loader):
  """Evaluate `model` on `validation_loader` without gradient tracking.

  Relies on the notebook-level `criterion` and `device`. The model is left
  in eval mode afterwards.

  Args:
    model: module called as model(ids, mask), returning one logit per example.
    validation_loader: iterable of dict batches with 'ids', 'mask' and 'targets'.

  Returns:
    Accuracy in percent over all examples in the loader.
  """
  model.eval()

  validation_loss = 0
  validation_count = 0
  total_correct = 0

  with torch.no_grad():
    loop = tqdm(validation_loader, leave=True)

    for data in loop:

      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.long)

      # Drop only the trailing logit dimension so that a final batch of
      # size 1 keeps shape (1,) and still matches `targets`.
      outputs = model(ids, mask).squeeze(-1)

      loss = criterion(outputs, targets.float())
      validation_loss += loss.item()

      total_correct += get_correct(outputs, targets)

      validation_count += targets.size(0)

    accuracy = total_correct * 100 / validation_count
    print(f"Validation Loss : {validation_loss / len(validation_loader)}")
    print(f"Validation Accuracy : {accuracy}")
    return accuracy

In [None]:
# Evaluate the fine-tuned model; the accuracy (in percent) is reused in the saved file name.
accuracy = validate(model, validation_loader)

100%|██████████| 482/482 [00:22<00:00, 21.80it/s]

Validation Loss : 0.39552819328884986
Validation Accuracy : 82.44155844155844





In [None]:
# Persist the fine-tuned model; the file name records the validation accuracy and the seed.
# NOTE(review): this serializes the whole module object rather than model.state_dict();
# saving the state_dict is usually more portable -- confirm what the loading code expects.
torch.save(model, f'./model-{str(accuracy)}-{SEED}')