<a href="https://colab.research.google.com/github/Triansh/bert/blob/main/BERT_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries
* Installing the transformers library 
* Importing the required libraries

In [None]:
# !pip install transformers

In [None]:
import pandas as pd
import random
import torch
import gc
import torch.nn as nn
import torch.optim as optim
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel


In [None]:
def read_data(file_path):
  """Load a headerless, tab-separated file into a two-column DataFrame.

  Args:
    file_path: Path of the TSV file to read.

  Returns:
    A pandas DataFrame whose columns are named 'query' and 'rating'.
  """
  column_names = ['query', 'rating']
  frame = pd.read_csv(file_path, sep='\t', names=column_names, header=None)
  return frame

In [None]:
# Load the training split and print its column/dtype summary.
train_df =  read_data('./train.tsv')
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17500 entries, 0 to 17499
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   query   17500 non-null  object 
 1   rating  17500 non-null  float64
dtypes: float64(1), object(1)
memory usage: 273.6+ KB


In [None]:
# Load the development (validation) split and print its column/dtype summary.
dev_df = read_data('./dev.tsv')
dev_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3750 entries, 0 to 3749
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   query   3750 non-null   object 
 1   rating  3750 non-null   float64
dtypes: float64(1), object(1)
memory usage: 58.7+ KB


In [None]:
# test_df = read_data('./test.tsv')
# test_df.info()

In [None]:
# Single seed shared by every random number generator this notebook touches.
SEED = 14345

random.seed(SEED)
torch.manual_seed(SEED)
# Seed every visible GPU rather than only the current one; this call is
# silently ignored when CUDA is not available.
torch.cuda.manual_seed_all(SEED)
# Ask cuDNN for deterministic kernels and keep its autotuner off, since the
# autotuner may select different algorithms from one run to the next.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

## Dataset Creation
* Creates a QWF (Query Well-Formedness) dataset class that inherits from PyTorch's `Dataset`

In [None]:
class QWFDataset(Dataset):
  """Query well-formedness dataset that tokenizes each query on access.

  Every item is a dict with three tensors:
    'ids':     flattened token ids (long)
    'mask':    flattened attention mask (long)
    'targets': scalar long tensor, 1 when the rating reaches `threshold`, else 0
  """

  def __init__(self, df, maxlen, tokenizer=None, threshold=0.8):
    """Store a private copy of the frame and the tokenization settings.

    Args:
      df: DataFrame with a 'query' column (text) and a 'rating' column (number).
      maxlen: Length each encoded query is padded or truncated to.
      tokenizer: Optional callable tokenizer. When omitted, the pretrained
        'bert-base-uncased' BertTokenizer is loaded, as before.
      threshold: Minimum rating that is labelled as well-formed (1).
    """
    self.df = df.copy()
    if tokenizer is None:
      tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.tokenizer = tokenizer
    self.maxlen = maxlen
    self.threshold = threshold

  def __len__(self):
    """Return the number of rows in the wrapped frame."""
    return len(self.df)

  def __getitem__(self, index):
    """Encode the query at position `index` and return its tensors and label."""
    # Positional lookup: samplers hand out 0..len-1, which matches label-based
    # lookup only when the frame happens to carry a default RangeIndex.
    query = self.df['query'].iloc[index]
    rating = self.df['rating'].iloc[index]
    label = 1 if rating >= self.threshold else 0

    encoded = self.tokenizer(
        query,                         # Sentence to encode.
        add_special_tokens=True,       # Add '[CLS]' and '[SEP]'.
        max_length=self.maxlen,        # Pad & truncate all sentences.
        padding='max_length',
        truncation=True,
        return_attention_mask=True,    # Construct attention masks.
        return_tensors='pt',           # Return pytorch tensors.
    )

    # The tokenizer returns a leading batch axis of size 1; flatten it away.
    # Casting the existing tensors avoids re-wrapping them with torch.tensor(),
    # which copies again and emits a UserWarning.
    ids = encoded['input_ids'].reshape(-1).to(dtype=torch.long)
    mask = encoded['attention_mask'].reshape(-1).to(dtype=torch.long)

    return {
        'ids': ids,
        'mask': mask,
        'targets': torch.tensor(label, dtype=torch.long),
    }


In [None]:
# Batch sizes used for the training and validation DataLoaders.
TRAIN_BATCH_SIZE = 32
VALIDATION_BATCH_SIZE = 8
# Passed to QWFDataset as `maxlen`: the padded/truncated token length per query.
MAX_LEN = 64

In [None]:
# Keyword arguments forwarded to the training DataLoader.
train_params = dict(
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=1,
)

# Keyword arguments forwarded to the validation DataLoader.
validation_params = dict(
    batch_size=VALIDATION_BATCH_SIZE,
    shuffle=True,
    num_workers=1,
)

In [None]:
# Report the (rows, columns) shape of each split before wrapping it.
print(f"Training Dataset: {train_df.shape}")
print(f"VALIDATION Dataset: {dev_df.shape}")

# Wrap both frames in QWFDataset so queries are tokenized to MAX_LEN on access.
train_set =  QWFDataset(train_df, maxlen=MAX_LEN)
validation_set =  QWFDataset(dev_df, maxlen=MAX_LEN)

Training Dataset: (17500, 2)
VALIDATION Dataset: (3750, 2)


In [None]:
# Batch each dataset using its keyword-argument dict (train_params / validation_params).
train_loader = DataLoader(train_set, **train_params)
validation_loader = DataLoader(validation_set, **validation_params)

In [None]:
class QWFClassificationModel(nn.Module):
  """Pretrained BERT encoder topped with a small head that emits one raw logit."""

  def __init__(self):
    super().__init__()

    # Encoder loaded from the 'bert-base-uncased' checkpoint.
    self.bert = BertModel.from_pretrained('bert-base-uncased')
    # Head: Linear(768 -> 512), ReLU, Dropout(0.3), Linear(512 -> 1).
    # No sigmoid here: the output is left as a raw logit.
    self.pre_classifier = nn.Linear(768, 512)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(0.3)
    self.classifier = nn.Linear(512, 1)

  def forward(self, tokens, attention_mask):
    """Return the head's output for the first-position encoder state.

    Args:
      tokens: Token-id tensor passed to the encoder
        (presumably (batch, seq_len) - confirm against the caller).
      attention_mask: Attention mask passed to the encoder alongside `tokens`.

    Returns:
      Output of the final Linear(512, 1) layer, i.e. one raw logit per row.
    """
    sequence_states, _unused = self.bert(
        tokens,
        attention_mask=attention_mask,
        return_dict=False,
    )
    # Keep only position 0 of the sequence axis as the sentence representation.
    first_token_state = sequence_states[:, 0]
    features = self.pre_classifier(first_token_state)
    features = self.dropout(self.relu(features))
    return self.classifier(features)

In [None]:
def get_correct(outputs, targets):
  """Count how many thresholded predictions equal the binary targets.

  Args:
    outputs: Tensor of raw logits, one per example. Any shape is accepted
      (for example (N,), (N, 1) or a 0-dim tensor); it is flattened first.
    targets: Tensor of 0/1 labels holding the same number of elements.

  Returns:
    Number of examples whose prediction (sigmoid(logit) >= 0.5) matches the
    target, as a Python int.

  Raises:
    ValueError: If `outputs` and `targets` hold different numbers of elements.
  """
  # Flatten both sides so (N, 1) logits against (N,) targets are compared
  # element-wise instead of broadcasting to an (N, N) grid and over-counting.
  flat_outputs = outputs.reshape(-1)
  flat_targets = targets.reshape(-1)
  if flat_outputs.numel() != flat_targets.numel():
    raise ValueError(
        f"outputs has {flat_outputs.numel()} elements but targets has "
        f"{flat_targets.numel()}"
    )

  probs = torch.sigmoid(flat_outputs)
  labels = (probs >= 0.5).long()
  n_correct = (labels == flat_targets).sum().item()
  return n_correct

In [None]:
# Run Python's garbage collector, then release PyTorch's cached CUDA memory,
# before the model is built in the next cell.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Build the classifier and move it to the selected device.
model = QWFClassificationModel()
# NOTE(review): `x` is not used again in this notebook; binding the result
# presumably just keeps the cell from echoing the model's repr.
x = model.to(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Optimisation settings read as globals by train() and validate().
learning_rate = 1e-5
# BCEWithLogitsLoss applies the sigmoid itself, so the model's raw logits are passed in directly.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(params = model.parameters(), lr = learning_rate)
# Number of passes made over the training loader.
epochs = 2

In [None]:
def train(epoch, data_loader):
  """Run one training epoch over `data_loader`, printing running metrics.

  Relies on the notebook-level `model`, `criterion`, `optimizer` and `device`.

  Args:
    epoch: Epoch number; only used in the printed summary.
    data_loader: Iterable of batches, each a dict holding 'ids', 'mask' and
      'targets' tensors.

  Returns:
    None. Loss and accuracy are printed every 100 steps (as running averages
    since the start of the epoch) and once more at the end of the epoch.
  """
  tr_loss = 0
  n_correct = 0
  nb_tr_steps = 0
  nb_tr_examples = 0
  model.train()
  for step, data in enumerate(data_loader, 1):
    ids = data['ids'].to(device, dtype = torch.long)
    mask = data['mask'].to(device, dtype = torch.long)
    targets = data['targets'].to(device, dtype = torch.long)

    # Drop only the trailing logit axis. A bare squeeze() would also remove
    # the batch axis of a size-1 batch, and the loss would then reject the
    # mismatched input/target shapes.
    outputs = model(ids, mask).squeeze(-1)

    loss = criterion(outputs, targets.float())
    tr_loss += loss.item()
    # detach() keeps the accuracy bookkeeping out of the autograd graph.
    n_correct += get_correct(outputs.detach(), targets)

    nb_tr_steps += 1
    nb_tr_examples += targets.size(0)

    if step % 100 == 0:
      loss_step = tr_loss/nb_tr_steps
      accu_step = (n_correct*100)/nb_tr_examples
      print(f"Training Loss per 100 steps: {loss_step}")
      print(f"Training Accuracy per 100 steps: {accu_step}")

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # An empty loader would otherwise end in a ZeroDivisionError below.
  if nb_tr_examples == 0:
    print(f"Epoch {epoch}: the data loader produced no examples; nothing was trained.")
    return

  print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
  epoch_loss = tr_loss/nb_tr_steps
  epoch_accu = (n_correct*100)/nb_tr_examples
  print(f"Training Loss Epoch: {epoch_loss}")
  print(f"Training Accuracy Epoch: {epoch_accu}")

  return

In [None]:
# Fine-tune for `epochs` passes over the training loader.
for epoch in range(epochs):
  train(epoch, train_loader)
# The model is not saved here; it is written to disk after validation.



Training Loss per 100 steps: 0.5966095149517059
Training Accuracy per 100 steps: 65.84375
Training Loss per 100 steps: 0.5425497534871101
Training Accuracy per 100 steps: 71.296875
Training Loss per 100 steps: 0.5134759996334711
Training Accuracy per 100 steps: 73.63541666666667
Training Loss per 100 steps: 0.4944518867135048
Training Accuracy per 100 steps: 75.1015625
Training Loss per 100 steps: 0.47874504745006563
Training Accuracy per 100 steps: 76.35
The Total Accuracy for Epoch 0: 76.74857142857142
Training Loss Epoch: 0.4723832393695057
Training Accuracy Epoch: 76.74857142857142




Training Loss per 100 steps: 0.3481288143992424
Training Accuracy per 100 steps: 85.25
Training Loss per 100 steps: 0.3558354529738426
Training Accuracy per 100 steps: 85.046875
Training Loss per 100 steps: 0.3588961311181386
Training Accuracy per 100 steps: 84.6875
Training Loss per 100 steps: 0.35528641287237406
Training Accuracy per 100 steps: 84.9453125
Training Loss per 100 steps: 0.35056389644742014
Training Accuracy per 100 steps: 85.225
The Total Accuracy for Epoch 1: 85.30857142857143
Training Loss Epoch: 0.3491549556172526
Training Accuracy Epoch: 85.30857142857143


In [None]:
def validate(model, validation_loader):
  """Evaluate `model` on `validation_loader` and return its accuracy in percent.

  Relies on the notebook-level `criterion` and `device`.

  Args:
    model: Module called as `model(ids, mask)`; put into eval mode here.
    validation_loader: Iterable of batches, each a dict holding 'ids', 'mask'
      and 'targets' tensors.

  Returns:
    Accuracy over all examples seen, as a percentage (0-100). Returns 0.0 when
    the loader yields no examples.
  """
  model.eval()
  n_correct = 0
  val_loss = 0
  nb_val_steps = 0
  nb_val_examples = 0
  with torch.no_grad():
    for step, data in enumerate(validation_loader, 1):
      ids = data['ids'].to(device, dtype = torch.long)
      mask = data['mask'].to(device, dtype = torch.long)
      targets = data['targets'].to(device, dtype = torch.long)

      # Drop only the trailing logit axis. A bare squeeze() would also remove
      # the batch axis of a size-1 batch and break the loss shape check.
      outputs = model(ids, mask).squeeze(-1)
      loss = criterion(outputs, targets.float())
      val_loss += loss.item()
      n_correct += get_correct(outputs, targets)

      nb_val_steps += 1
      nb_val_examples += targets.size(0)

      if step % 100 == 0:
        loss_step = val_loss/nb_val_steps
        accu_step = (n_correct*100)/nb_val_examples
        print(f"Validation Loss per 100 steps: {loss_step}")
        print(f"Validation Accuracy per 100 steps: {accu_step}")

  # An empty loader would otherwise end in a ZeroDivisionError below.
  if nb_val_examples == 0:
    print("Validation loader produced no examples; nothing was evaluated.")
    return 0.0

  epoch_loss = val_loss/nb_val_steps
  epoch_accu = (n_correct*100)/nb_val_examples
  print(f"Validation Loss Epoch: {epoch_loss}")
  print(f"Validation Accuracy Epoch: {epoch_accu}")
  return epoch_accu

In [None]:
# Evaluate the fine-tuned model on the dev loader; validate() returns accuracy as a percentage.
accuracy = validate(model, validation_loader)



Validation Loss per 100 steps: 0.45467622943222524
Validation Accuracy per 100 steps: 79.125
Validation Loss per 100 steps: 0.4443891962431371
Validation Accuracy per 100 steps: 79.8125
Validation Loss per 100 steps: 0.451128485960265
Validation Accuracy per 100 steps: 79.375
Validation Loss per 100 steps: 0.44184628397226333
Validation Accuracy per 100 steps: 79.96875
Validation Loss Epoch: 0.4397104711198349
Validation Accuracy Epoch: 80.05333333333333


In [None]:
# Persist the whole module object; the file name records the dev accuracy and
# the seed. NOTE(review): loading this file requires the
# QWFClassificationModel class to be importable at load time.
torch.save(model, f'./model-{accuracy}-{SEED}')