<a href="https://colab.research.google.com/github/Triansh/bert/blob/main/MLM_BERT_Assignment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries
* Installing the transformers library 
* Importing the required libraries

In [None]:
!pip install transformers



In [None]:
import pandas as pd
import random
import gc
import torch.nn as nn
import torch.optim as opt
import re

from torch import backends, manual_seed, rand, flatten, cuda, tensor, long
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForMaskedLM
from transformers import AdamW
from tqdm import tqdm

## Setting  Random SEED and Devices

In [None]:
SEED = 12345

random.seed(SEED)
manual_seed(SEED)
cuda.manual_seed(SEED)
backends.cudnn.deterministic = True

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'
# device = 'cpu'
device

'cuda'

## MLM

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
with open('./wiki.train.tokens', 'r') as f:
  texts = [ line.strip() for line in f if line.strip() != '' and '=' not in line ][:35000]
text = []
reg = re.compile(r"\.|\?|\!")
for t in texts:
  text.extend(reg.split(t))
len(text)

202163

In [None]:
inputs = tokenizer(
                        text,                        # Sentence to encode.
                        add_special_tokens = True,    # Add '[CLS]' and '[SEP]'
                        max_length = 32,     # Pad & truncate all sentences.
                        padding = 'max_length',
                        truncation=True,
                        return_attention_mask = True, # Construct attn. masks.
                        return_tensors = 'pt',        # Return pytorch tensors.
                    )

In [None]:
inputs['labels'] = inputs.input_ids.detach().clone()
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [None]:
# create mask array
# Now sentence have a [MASK] with 15 % probability
# Donot mask [PAD], [CLS], [SEP]
randoms = rand(inputs.input_ids.shape)
mask_array = (randoms < 0.15) * (inputs.input_ids != 101) * \
           (inputs.input_ids != 102) * (inputs.input_ids != 0)

In [None]:
selection = []

for i in range(inputs.input_ids.shape[0]):
    selection.append(flatten(mask_array[i].nonzero()).tolist())

In [None]:
# mark true indexes as [MASK]
for i in range(inputs.input_ids.shape[0]):
    inputs.input_ids[i, selection[i]] = 103

In [None]:
class Wiki103Datset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [None]:
dataset = Wiki103Datset(inputs)

In [None]:
loader = DataLoader(dataset, batch_size=32, shuffle=True)

In [None]:
gc.collect()
cuda.empty_cache()

In [None]:
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [None]:
optimizer_1 = AdamW(model.parameters(), lr=1e-5)

In [None]:
epochs = 1

for epoch in range(epochs):
    model.train()
    # setup loop with TQDM and dataloader
    loop = tqdm(loader, leave=True)
    for batch in loop:
        # initialize calculated gradients (from prev step)
        optimizer_1.zero_grad()
        # pull all tensor batches required for training
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        # process
        outputs = model(input_ids, attention_mask=attention_mask,
                        labels=labels)
        # extract loss
        loss = outputs.loss
        # calculate loss for every parameter that needs grad update
        loss.backward()
        # update parameters
        optimizer_1.step()
        # print relevant info to progress bar
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  """
Epoch 0: 100%|██████████| 6318/6318 [52:51<00:00,  1.99it/s, loss=0.192]


In [None]:
from torch import save
save(model, './bt-0.1')

## Loading training and validation data for fine-tuning

In [None]:
def read_data(file_path):
  return pd.read_csv(file_path, sep='\t', header=None, names=['query', 'rating'])

In [None]:
train_df =  read_data('./train.tsv')
dev_df = read_data('./dev.tsv')
train_df= train_df.append(dev_df).reset_index(drop=True)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21250 entries, 0 to 21249
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   query   21250 non-null  object 
 1   rating  21250 non-null  float64
dtypes: float64(1), object(1)
memory usage: 332.2+ KB


In [None]:
validation_df = read_data('./test.tsv')
validation_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3850 entries, 0 to 3849
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   query   3850 non-null   object 
 1   rating  3850 non-null   float64
dtypes: float64(1), object(1)
memory usage: 60.3+ KB


## Dataset Creation
* Creates a QWF (Query well-formedness Dataset) inherited from the pytorch's dataset

In [None]:
class QWFDataset(Dataset):
  
  def __init__(self, tokenizer, df, maxlen):
    self.df = df.copy()
    self.tokenizer = tokenizer
    self.maxlen = maxlen

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    query = self.df['query'][index]
    label = 1 if self.df['rating'][index] >= 0.8 else 0

    encoded_dict = self.tokenizer.encode_plus(
                        query,                        # Sentence to encode.
                        add_special_tokens = True,    # Add '[CLS]' and '[SEP]'
                        max_length = self.maxlen,              # Pad & truncate all sentences.
                        padding = 'max_length',
                        truncation=True,
                        return_attention_mask = True, # Construct attn. masks.
                        return_tensors = 'pt',        # Return pytorch tensors.
                    )
    ids = encoded_dict['input_ids'].detach().clone().reshape((-1,))
    mask = encoded_dict['attention_mask'].detach().clone().reshape((-1,))

    # print("Shape of ids: ", ids.shape, mask.shape)

    val = {
        'ids': ids,
        'mask': mask,
        'targets': tensor(label, dtype=long)
    } 
    # print(val)
    return val


In [None]:
TRAIN_BATCH_SIZE = 32
VALIDATION_BATCH_SIZE = 8
MAX_LEN = 32

In [None]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 1
                }

validation_params = {'batch_size': VALIDATION_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 1
                }

In [None]:
print(f"Training Dataset: {train_df.shape}")
print(f"VALIDATION Dataset: {validation_df.shape}")

train_set =  QWFDataset(tokenizer, train_df, maxlen=MAX_LEN)
validation_set =  QWFDataset(tokenizer, validation_df, maxlen=MAX_LEN)

Training Dataset: (21250, 2)
VALIDATION Dataset: (3850, 2)


In [None]:
train_loader = DataLoader(train_set, **train_params)
validation_loader = DataLoader(validation_set, **validation_params)

In [None]:
class QWFClassificationModel(nn.Module):

  def __init__(self, model):
    super(QWFClassificationModel, self).__init__()

    self.bert = model
    self.pre_classifier = nn.Linear(768, 512)
    self.relu = nn.ReLU()
    self.dropout = nn.Dropout(0.3)
    self.classifier = nn.Linear(512, 1)

  def forward(self, tokens, attention_mask):
    output = self.bert(tokens, attention_mask = attention_mask, output_hidden_states=True, return_dict=True)
    hidden_state = output['hidden_states'][0]
    pooler = hidden_state[:, 0]
    pooler = self.pre_classifier(pooler)
    pooler = self.relu(pooler)
    pooler = self.dropout(pooler)
    outputs = self.classifier(pooler)
    return outputs

In [None]:
from torch import sigmoid
def get_correct(outputs, targets):
  probs = sigmoid(outputs)
  labels = (probs >= 0.5).long()
  correct = (labels==targets).sum().item()
  return correct

In [None]:
# gc.collect()
# cuda.empty_cache()

In [None]:
# from torch import load
# model = load('./bt-0.1')

In [None]:
finetuning_model = QWFClassificationModel(model)
finetuning_model.to(device)

QWFClassificationModel(
  (bert): BertForMaskedLM(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
   

In [None]:
learning_rate = 1e-5
criterion = nn.BCEWithLogitsLoss()
optimizer = opt.Adam(params = finetuning_model.parameters(), lr = learning_rate)
epochs = 1

In [None]:
def train(epoch, data_loader):

  training_loss = 0
  training_count = 0
  total_correct = 0

  finetuning_model.train()

  loop = tqdm(data_loader,leave=True)

  for batch in loop:

    optimizer.zero_grad()

    ids = batch['ids'].to(device, dtype = long)
    mask = batch['mask'].to(device, dtype = long)
    targets = batch['targets'].to(device, dtype = long)

    outputs = finetuning_model(ids, mask).squeeze()
    loss = criterion(outputs, targets.float())

    training_loss += loss.item()
    total_correct += get_correct(outputs.data, targets)
    training_count+=targets.size(0)

    loss.backward()
    optimizer.step()

    loop.set_description(f'Epoch {epoch}')
    loop.set_postfix(loss=loss.item())

  epoch_loss = training_loss / len(data_loader)
  epoch_accu = total_correct * 100 / training_count
  print(f"The Mean loss for Epoch: {epoch_loss}")
  print(f"The Total Accuracy for Epoch {epoch}: {epoch_accu}")

  return 

In [None]:
for epoch in range(epochs):
  train(epoch, train_loader)

Epoch 0: 100%|██████████| 665/665 [02:29<00:00,  4.44it/s, loss=0.503]

The Mean loss for Epoch: 0.6697806567177736
The Total Accuracy for Epoch 0: 60.97882352941176





In [None]:
from torch import no_grad

def validate(model, validation_loader):
  finetuning_model.eval()

  validation_loss = 0
  validation_count = 0
  total_correct = 0

  with no_grad():
    loop = tqdm(validation_loader,leave=True)

    for batch in loop:

      ids = batch['ids'].to(device, dtype = long)
      mask = batch['mask'].to(device, dtype = long)
      targets = batch['targets'].to(device, dtype = long)

      outputs = finetuning_model(ids, mask).squeeze()

      loss = criterion(outputs, targets.float())
      validation_loss += loss.item()

      total_correct += get_correct(outputs.data, targets)

      validation_count += targets.size(0)

    accuracy = total_correct * 100 / validation_count
    print(f"Validation Loss : {validation_loss / len(validation_loader)}")
    print(f"Validation Accuracy : {accuracy}")
    return accuracy

In [None]:
accuracy = validate(model, validation_loader)

100%|██████████| 482/482 [00:26<00:00, 17.99it/s]

Validation Loss : 0.6659648516737079
Validation Accuracy : 61.55844155844156



