In [34]:
import torchtext
import torch
import numpy as np
import pandas as pd
import random
import time
import math
from tqdm.notebook import tqdm

from torch import nn
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

from typing import Tuple, List

In [2]:
import sys
sys.path.insert(0, "..")

In [3]:
df = pd.read_csv("../data/IMDB.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## BERT Dataset

In [4]:
MASK_PERCENTANGE = 0.15
MASK_PROBABILITY = 0.80

In [5]:
sents = df.iloc[random.randint(0, df.shape[0]), 0].split(". ")
idx = random.randint(0, len(sents)-1)
sent1 = sents[idx]

sent1 = sent1.split(" ")
mask_idx = random.randint(0, len(sent1)-1)
if random.random() < MASK_PROBABILITY:
    sent1[mask_idx] = "[MASK]"
    print("Masked")
    print(" ".join(sent1))
else:
    rand_token = random.randint(0, len(sent1)-1)
    sent1[mask_idx] = sent1[rand_token]
    print("Replaced with random token")
    print(" ".join(sent1))

if random.random() <= 0.5:
    nsp_sents = ". [SEP] ".join(sents[idx: idx+2])
    print("NSP: 1")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))
else:
    nsp_sents = sents[idx] + ". [SEP] " + sents[random.randint(idx+1, len(sents))-1]
    print("NSP: 0")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))

Masked
My husband received DVD of OBWAT for Christmas and it was the best gift we received! We watch it every time we need to laugh and so far we have viewed it 12 times!The scenery in this movie is beautiful and the music is outstanding!We also purchased the soundtrack and we play it in our vehicles and at home when ever we need a pick me up and that too is daily!If anyone needs a suggestion for a good gift for movie lovers this movie is it!The characters are hilarious , charming , and their facial expressions are too funny to [MASK] have always been a fan of George Clooney but now I am also a fan of Tim Blake Nelson(Delmar ) and John Turturro (Pete)and am now looking for them in other movies! You gotta see this movie!!!
NSP: 1
My husband received DVD of OBWAT for Christmas and it was the best gift we received! We watch it every time we need to laugh and so far we have viewed it 12 times!The scenery in this movie is beautiful and the music is outstanding!We also purchased the soundtra

In [6]:
CLS_TOKEN = 0
SEP_TOKEN = 1
MASK_TOKEN = 2
PAD_TOKEN = 3 
UNK_TOKEN = 4

In [8]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

def build_vocab(data_iter):
    for sentence in data_iter:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(build_vocab(df["review"].to_list()),
                                  min_freq=2,
                                  specials=["[CLS]", "[SEP]", "[MASK]", "[PAD]", "<UNK>"],
                                  special_first=True)
vocab.set_default_index(UNK_TOKEN)

In [9]:
len(vocab)

89854

In [10]:
less_ = 0
more_ = 0
for _ in range(10000000):
    if random.random()<=0.5:
        less_ += 1
    else:
        more_ += 1
less_, more_

(4995712, 5004288)

`random.random()` generates an equal distribution and it can be used for the NSP task

In [36]:
class IMDBBERTDataset(Dataset):

    NSP_PERCENTAGE = 0.50

    CLS_TOKEN = 0
    SEP_TOKEN = 1
    MASK_TOKEN = 2
    PAD_TOKEN = 3 
    UNK_TOKEN = 4

    def __init__(self,
                 path: str,
                 max_sent_len: int=50) -> None:
        super().__init__()
        self.df = pd.read_csv(path)
        self.tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
        self.vocab = build_vocab_from_iterator(self._build_vocab(self.df["review"].to_list()),
                                               min_freq=2,
                                               specials=["[CLS]", "[SEP]", "[MASK]", "[PAD]", "<UNK>"],
                                               special_first=True)
        self.vocab.set_default_index(self.UNK_TOKEN)
        self.max_sent_len = max_sent_len
        self.token_ids = []
        self.masked_token = []
        self.masked_idx = []
        self.is_next = []
        self.segment_tokens = []
        self._prepare_data()
        

    def _prepare_data(self) -> None:
        for i in range(self.df.shape[0]):
            try:
                sentences = self.df.iloc[i, 0].split(". ")
                if random.random() <= self.NSP_PERCENTAGE:
                    rand_idx = random.randint(0, len(sentences)-2)
                    sentences = sentences[rand_idx:rand_idx+2]
                    is_next = 0
                else:
                    rand_idx = random.randint(1, len(sentences)-1)
                    sentences = [sentences[rand_idx], sentences[rand_idx-1]]
                    is_next = 1
                
                sentences = ["[CLS]"] + self.tokenizer(sentences[0]) + ["[SEP]"] + self.tokenizer(sentences[1])
                if len(sentences) < self.max_sent_len:
                    while len(sentences) < self.max_sent_len:
                        sentences += ["[PAD]"]
                else:
                    sentences = sentences[:self.max_sent_len]
                
                sep_idx = sentences.index("[SEP]")
                segment_token = [0]*(sep_idx+1) + [1]*(len(sentences)-1-sep_idx)

                # assert len(segment_token) == len(sentences), f"Length not equal, sep_idx: {sep_idx} "

                token_ids = self.vocab(sentences)
                mask_token, mask_idx = self.SEP_TOKEN, -1
                while mask_token == self.SEP_TOKEN:
                    mask_idx = random.randint(1, len(token_ids)-1)
                    mask_token = token_ids[mask_idx]
                token_ids[mask_idx] = self.MASK_TOKEN
                self.token_ids.append(token_ids)
                self.masked_token.append(mask_token)
                self.masked_idx.append(mask_idx)
                self.segment_tokens.append(segment_token)
                self.is_next.append(is_next)
            except:
                pass
 
        self.bert_df = pd.DataFrame(data={
            "token_ids" : self.token_ids,
            "segment_tokens" : self.segment_tokens,
            "masked_token" : self.masked_token,
            "masked_idx" : self.masked_idx,
            "is_next" : self.is_next
        })
        
    def _build_vocab(self, data_iter):
        for sentence in data_iter:
            yield self.tokenizer(sentence)
    
    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        token_ids = self.token_ids[index]
        segment_tokens = self.segment_tokens[index]
        masked_token = self.masked_token[index]
        masked_idx = self.masked_idx[index]
        is_next = self.is_next[index]
        return torch.tensor(token_ids), torch.tensor(segment_tokens), torch.tensor(masked_token), torch.tensor(masked_idx), torch.tensor(is_next)
    
    def __len__(self) -> int:
        return self.bert_df.shape[0]

## BERT EMBEDDING

In [17]:
ds = IMDBBERTDataset("../data/IMDB.csv")
ds.bert_df

Unnamed: 0,token_ids,segment_tokens,masked_token,masked_idx,is_next
0,"[0, 28, 109, 445, 14, 148, 236, 3404, 87, 23, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",15,24,1
1,"[0, 28, 133, 12, 4391, 6, 29, 5, 448, 12, 2044...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,12,0
2,"[0, 363, 35, 638, 11, 79, 115, 170, 103, 475, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",54,47,1
3,"[0, 28, 383, 35, 6, 23, 10720, 579, 6, 64, 50,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",275,26,0
4,"[0, 69, 140, 22, 46, 549, 6, 1512, 206, 4470, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,41,0
...,...,...,...,...,...
42011,"[0, 59, 22, 8, 242, 35704, 275, 1, 5, 64, 157,...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ...",3,45,0
42012,"[0, 6805, 87, 6, 105, 14, 1613, 19, 3216, 55, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",144,34,1
42013,"[0, 14, 2, 8, 4212, 4674, 13, 49834, 9209, 633...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",263,2,0
42014,"[0, 69, 12, 8, 391, 993, 6, 13181, 4418, 2031,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",10,34,0


In [18]:
VOCAB_SIZE = len(ds.vocab)
VOCAB_SIZE

89854

In [28]:
class BERTEmbedding(nn.Module):

    def __init__(self,
                 d_model: int=512,
                 vocab_size: int=1000,
                 max_seq_len: int=100,
                 dropout: float=0.1) -> None:
        
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=d_model)
        self.pe = torch.zeros(size=(max_seq_len, d_model),
                              requires_grad=False)
        
        for pos in range(max_seq_len):
            for dim in range(d_model):
                if pos%2==0:
                    self.pe[pos, dim] = math.sin(pos//(10000**(2*dim//d_model)))
                else:
                    self.pe[pos, dim] = math.cos(pos//(10000**(2*dim//d_model)))

        self.segment_embedding = nn.Embedding(num_embeddings=2,
                                              embedding_dim=d_model)
        self.dropout = nn.Dropout(p=dropout)
    
    def __repr__(self) -> str:
        return f"BERTEmbedding(d_model={self.d_model}, vocab_size={self.vocab_size})"

    def __str__(self) -> str:
        return f"BERTEmbedding(d_model={self.d_model}, vocab_size={self.vocab_size})"

    def forward(self, 
                x: torch.Tensor,
                segment_tokens: torch.Tensor) -> torch.Tensor:
        # x -> [batch_size, max_seq_len]
        token_embeddings = self.token_embedding(x)
        position_encoding = self.pe[:x.shape[1], :].unsqueeze(0) # positional_encoding -> [1, max_seq_len, d_model]
        segment_embedding = self.segment_embedding(segment_tokens)
        return self.dropout(token_embeddings + position_encoding + segment_embedding)

In [25]:
from torch.utils.data import DataLoader
data_loader = DataLoader(dataset=ds,
                         batch_size=32,
                         shuffle=True)
batch = next(iter(data_loader))
bert_embedding = BERTEmbedding(d_model=512,
                               vocab_size=VOCAB_SIZE)
with torch.inference_mode():
    token_ids, segment_tokens = batch[0], batch[1]
    embedding = bert_embedding(token_ids, segment_tokens)
    print(f"token_ids shape: {token_ids.shape}")
    print(f"embedding shape: {embedding.shape}")

token_ids shape: torch.Size([32, 50])
embedding shape: torch.Size([32, 50, 512])


## BERT 

* BERT model is made up of just the encoder part of the transformer architecture.
* I am going to build the BERT-Base model in this kernel
* Parameters for BERT-Base: 
    * Number of encoder layers   **L : 12**
    * Model Dimension            **H : 768**
    * Number of Attention heads  **A : 12** 

In [14]:
batch = torch.rand(32, 50, 768)
masked_idx = torch.tensor([random.randint(0, 49) for _ in range(32)])
masked_tokens = batch[range(len(masked_idx)), masked_idx]
masked_tokens.shape

torch.Size([32, 768])

In [37]:
from scripts.scripts import TransformerEncoder, create_padding_mask

In [33]:
class BERT(nn.Module):
    def __init__(self,
                 num_layers: int=12,
                 d_model: int=768,
                 num_heads: int=12,
                 vocab_size: int=1000,
                 d_ff: int=2048,
                 attn_dropout: float=0.1,
                 ff_dropout: float=0.1) -> None:
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.bert_encoder = TransformerEncoder(num_encoders=num_layers,
                                               d_model=d_model,
                                               num_heads=num_heads,
                                               d_ff=d_ff,
                                               attn_dropout=attn_dropout,
                                               ff_dropout=ff_dropout)
        self.masked_block = nn.Linear(in_features=d_model,
                                      out_features=vocab_size)
        self.nsp_block = nn.Linear(in_features=d_model,
                                        out_features=2)
    

    def __repr__(self) -> str:
        return f"BERT(num_layers={self.num_layers}, d_model={self.d_model}, num_heads={self.num_heads})"
    
    def __str__(self) -> str:
        return f"BERT(num_layers={self.num_layers}, d_model={self.d_model}, num_heads={self.num_heads})"

    def forward(self,
                x: torch.Tensor,
                mask: torch.Tensor,
                masked_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        
        """
            x -> Input embedding, shape: [batch_size, max_seq_len, d_model]
            mask -> Mask for padding, shape: [batch_size, 1, 1, max_seq_len]
            masked_idx -> For each sequence a different index position has been masked
            and the encoder's contextual representation of the masked token will be
            used to predict the true token. To extract the representation for each 
            sequence the different index positions are passed in masked_idx which will 
            be used for indexing the representation. Shape: [batch_size]
        """
        x = self.bert_encoder(x, mask) # [batch_size, max_seq_len, d_model]
        masked_tokens = x[range(len(masked_idx)), masked_idx]

        # For NSP prediction the BERT paper uses the '[CLS]' token which is the 
        # 0th index in each sequence and it is accessed by indexing '0' along the
        # first dimension
        nsp_logits = self.nsp_block(x[:, 0, :]) # x[:, 0, :] -> [batch_size, d_model]
        masked_tokens_logits = self.masked_block(masked_tokens)
        return masked_tokens_logits, nsp_logits

In [None]:
def train_bert(bert_model: BERT,
               bert_embedding: BERTEmbedding,
               dataloader: torch.utils.data.DataLoader,
               optimizer: torch.optim.Optimizer,
               masked_loss_fn: torch.nn.Module,
               nsp_loss_fn: torch.nn.Module,
               epochs: int=10,
               device: str="cpu") -> Tuple[float, float]:
    
    bert_model.to(device)
    bert_embedding.to(device)
    bert_model.train()

    bert_results = {
        "masked_tokens_losses" : [],
        "nsp_losses" : [],
        "masked_tokens_acc" : 0,
        "nsp_acc" : 0,
    }

    for epoch in tqdm(epochs):
        for token_ids, segment_tokens, masked_token, masked_idx, is_next in tqdm(dataloader):
            
            token_ids, segment_tokens = token_ids.to(device), segment_tokens.to(device)
            masked_tokens, masked_idx = masked_token.to(device), masked_idx.to(device)
            is_next = is_next.to(device)
            
            mask = create_padding_mask(token_ids)
            tokens_embedded = bert_embedding(token_ids, segment_tokens)
            masked_tokens_logits, nsp_logits = bert_model(tokens_embedded, mask)
            masked_tokens_loss = masked_loss_fn(masked_tokens_logits, masked_tokens)
            nsp_loss = nsp_loss_fn(nsp_logits, is_next)
            total_loss = masked_tokens_loss + nsp_loss

            optimizer.zero_grad()
            total_loss.backward()
            optimizer.step()
