In [1]:
import torchtext
import torch
import numpy as np
import pandas as pd
import random
import time

from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

from typing import Tuple, List

In [27]:
import sys
sys.path.insert(0, '..')

In [2]:
df = pd.read_csv("../data/IMDB.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Preparing NSP dataset and Masked dataset

In [3]:
MASK_PERCENTANGE = 0.15
MASK_PROBABILITY = 0.80

In [4]:
sents = df.iloc[random.randint(0, df.shape[0]), 0].split(". ")
idx = random.randint(0, len(sents)-1)
sent1 = sents[idx]

sent1 = sent1.split(" ")
mask_idx = random.randint(0, len(sent1)-1)
if random.random() < MASK_PROBABILITY:
    sent1[mask_idx] = "[MASK]"
    print("Masked")
    print(" ".join(sent1))
else:
    rand_token = random.randint(0, len(sent1)-1)
    sent1[mask_idx] = sent1[rand_token]
    print("Replaced with random token")
    print(" ".join(sent1))

if random.random() <= 0.5:
    nsp_sents = ". [SEP] ".join(sents[idx: idx+2])
    print("NSP: 1")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))
else:
    nsp_sents = sents[idx] + ". [SEP] " + sents[random.randint(idx+1, len(sents))-1]
    print("NSP: 0")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))

Replaced with random token
It shows how someone grew up in an environment that but a rich and powerful man but unfortunately because of his ambition and the people around him it led to his destruction
NSP: 0
It shows how someone grew up in an environment that created a rich and powerful man but unfortunately because of his ambition and the people around him it led to his destruction. [SEP] She also knew that business was business and a dangerous one
44


In [5]:
CLS_TOKEN = 0
SEP_TOKEN = 1
MASK_TOKEN = 2
PAD_TOKEN = 3 
UNK_TOKEN = 4

In [6]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

def build_vocab(data_iter):
    for sentence in data_iter:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(build_vocab(df["review"].to_list()),
                                  min_freq=2,
                                  specials=["[CLS]", "[SEP]", "[MASK]", "[PAD]", "<UNK>"],
                                  special_first=True)
vocab.set_default_index(UNK_TOKEN)

In [7]:
len(vocab)

89854

In [8]:
less_ = 0
more_ = 0
for _ in range(10000000):
    if random.random()<=0.5:
        less_ += 1
    else:
        more_ += 1
less_, more_

(4999562, 5000438)

In [10]:
%%writefile ./data_setup.py
import torch
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import pandas as pd
import random
from typing import Tuple


class IMDBBERTDataset(Dataset):

    NSP_PERCENTAGE = 0.50

    CLS_TOKEN = 0
    SEP_TOKEN = 1
    MASK_TOKEN = 2
    PAD_TOKEN = 3 
    UNK_TOKEN = 4

    def __init__(self,
                 path: str,
                 max_sent_len: int=50) -> None:
        super().__init__()
        self.df = pd.read_csv(path)
        self.tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
        self.vocab = build_vocab_from_iterator(self._build_vocab(self.df["review"].to_list()),
                                               min_freq=2,
                                               specials=["[CLS]", "[SEP]", "[MASK]", "[PAD]", "<UNK>"],
                                               special_first=True)
        self.vocab.set_default_index(self.UNK_TOKEN)
        self.max_sent_len = max_sent_len
        self.token_ids = []
        self.masked_token = []
        self.masked_idx = []
        self.is_next = []
        self.segment_tokens = []
        self._prepare_data()
        

    def _prepare_data(self) -> None:
        for i in range(self.df.shape[0]):
            try:
                sentences = self.df.iloc[i, 0].split(". ")
                if random.random() <= self.NSP_PERCENTAGE:
                    rand_idx = random.randint(0, len(sentences)-2)
                    sentences = sentences[rand_idx:rand_idx+2]
                    is_next = 0
                else:
                    rand_idx = random.randint(1, len(sentences)-1)
                    sentences = [sentences[rand_idx], sentences[rand_idx-1]]
                    is_next = 1
                
                sentences = ["[CLS]"] + self.tokenizer(sentences[0]) + ["[SEP]"] + self.tokenizer(sentences[1])
                if len(sentences) < self.max_sent_len:
                    while len(sentences) < self.max_sent_len:
                        sentences += ["[PAD]"]
                else:
                    sentences = sentences[:self.max_sent_len]
                
                sep_idx = sentences.index("[SEP]")
                segment_token = [0]*(sep_idx+1) + [1]*(len(sentences)-1-sep_idx)

                # assert len(segment_token) == len(sentences), f"Length not equal, sep_idx: {sep_idx} "

                token_ids = self.vocab(sentences)
                mask_token, mask_idx = self.SEP_TOKEN, -1
                while mask_token == self.SEP_TOKEN:
                    mask_idx = random.randint(1, len(token_ids)-1)
                    mask_token = token_ids[mask_idx]
                token_ids[mask_idx] = self.MASK_TOKEN
                self.token_ids.append(token_ids)
                self.masked_token.append(mask_token)
                self.masked_idx.append(mask_idx)
                self.segment_tokens.append(segment_token)
                self.is_next.append(is_next)
            except:
                pass
 
        self.bert_df = pd.DataFrame(data={
            "token_ids" : self.token_ids,
            "segment_tokens" : self.segment_tokens,
            "masked_token" : self.masked_token,
            "masked_idx" : self.masked_idx,
            "is_next" : self.is_next
        })
        
    def _build_vocab(self, data_iter):
        for sentence in data_iter:
            yield self.tokenizer(sentence)
    
    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, int, int, int]:
        token_ids = self.token_ids[index]
        segment_tokens = self.segment_tokens[index]
        masked_token = self.masked_token[index]
        masked_idx = self.masked_idx[index]
        is_next = self.is_next[index]
        return torch.tensor(token_ids), torch.tensor(segment_tokens), masked_token, masked_idx, is_next
    
    def __len__(self) -> int:
        return self.bert_df.shape[0]

if __name__ == "__main__":
    ds = IMDBBERTDataset(path="../data/IMDB.csv")
    print(f"Shape: {ds.bert_df.shape}")
    print(ds.bert_df.head(10))

Overwriting ./data_setup.py


## BERT EMBEDDING

Bert embedding -> Segment Embedding + Positional Embedding + Token Embedding

In [33]:
%%writefile bert_embedding.py
import torch
from torch import nn
import math

class BERTEmbedding(nn.Module):

    def __init__(self,
                 d_model: int=512,
                 vocab_size: int=1000,
                 max_seq_len: int=100,
                 dropout: float=0.1) -> None:
        
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=d_model)
        self.pe = torch.zeros(size=(max_seq_len, d_model),
                              requires_grad=False)
        
        for pos in range(max_seq_len):
            for dim in range(d_model):
                if pos%2==0:
                    self.pe[pos, dim] = math.sin(pos//(10000**(2*dim//d_model)))
                else:
                    self.pe[pos, dim] = math.cos(pos//(10000**(2*dim//d_model)))

        self.segment_embedding = nn.Embedding(num_embeddings=2,
                                              embedding_dim=d_model)
        self.dropout = nn.Dropout(p=dropout)
    
    def __repr__(self) -> str:
        return f"BERTEmbedding(d_model={self.d_model}, vocab_size={self.vocab_size})"

    def __str__(self) -> str:
        return f"BERTEmbedding(d_model={self.d_model}, vocab_size={self.vocab_size})"

    def forward(self, 
                x: torch.Tensor,
                segment_tokens: torch.Tensor) -> torch.Tensor:
        # x -> [batch_size, max_seq_len]
        token_embeddings = self.token_embedding(x)
        position_encoding = self.pe[:x.shape[1], :].unsqueeze(0) # positional_encoding -> [1, max_seq_len, d_model]
        segment_embedding = self.segment_embedding(segment_tokens)
        return self.dropout(token_embeddings + position_encoding + segment_embedding)

Writing bert_embedding.py


In [9]:
from data_setup import IMDBBERTDataset
ds = IMDBBERTDataset("../data/IMDB.csv")
ds.bert_df

Unnamed: 0,token_ids,segment_tokens,masked_token,masked_idx,is_next
0,"[0, 351, 10, 5, 99, 2132, 54, 1096, 16, 142, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",4279,40,0
1,"[0, 111, 113, 21, 121, 1964, 76, 294, 396, 25,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5,17,0
2,"[0, 28, 133, 12, 4391, 6, 29, 5, 448, 12, 2044...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",35,14,1
3,"[0, 10662, 35, 35401, 206, 7484, 50, 13, 174, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...",74,40,1
4,"[0, 151, 251, 872, 12, 36, 659, 5, 147, 305, 1...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",21,31,1
...,...,...,...,...,...
41969,"[0, 159, 35, 3899, 8999, 25, 85, 448, 52, 823,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",51,18,1
41970,"[0, 159, 35, 1053, 10, 99, 17, 1049, 17, 1362,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",3,38,0
41971,"[0, 7432, 2472, 12, 106, 6, 83, 235, 12, 2, 84...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6,9,0
41972,"[0, 6805, 87, 6, 105, 14, 2, 19, 3216, 55, 11,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...",1613,6,1


In [10]:
VOCAB_SIZE = len(ds.vocab)
VOCAB_SIZE

89854

In [11]:
from torch.utils.data import DataLoader
from bert_embedding import BERTEmbedding
data_loader = DataLoader(dataset=ds,
                         batch_size=32,
                         shuffle=True)
batch = next(iter(data_loader))
bert_embedding = BERTEmbedding(d_model=512,
                               vocab_size=VOCAB_SIZE)
with torch.inference_mode():
    token_ids, segment_tokens = batch[0], batch[1]
    embedding = bert_embedding(token_ids, segment_tokens)
    print(f"token_ids shape: {token_ids.shape}")
    print(f"embedding shape: {embedding.shape}")

token_ids shape: torch.Size([32, 50])
embedding shape: torch.Size([32, 50, 512])


## BERT 

* BERT model is made up of just the encoder part of the transformer architecture.
* I am going to build the BERT-Base model in this kernel
* Parameters for BERT-Base: 
    * Number of encoder layers   **L : 12**
    * Model Dimension            **H : 768**
    * Number of Attention heads  **A : 12** 

In [26]:
%%writefile -a ../scripts/scripts.py

def create_padding_mask(batch: torch.Tensor,
                        padding_idx: int=0) -> torch.Tensor:
    # batch -> [batch_size, max_seq_len]
    mask = batch != padding_idx
    return mask.unsqueeze(1).unsqueeze(2) # mask -> [batch_size, 1, 1, max_seq_len] 

Appending to ../scripts/scripts.py


In [36]:
from scripts.scripts import TransformerEncoderLayer, TransformerEncoder

In [None]:
import torch
from torch import nn

import sys
sys.path.insert(0, '..')
from scripts.scripts import TransformerEncoderLayer, TransformerEncoder

class BERT(nn.Module):
    def __init__(self,
                 num_layers: int=12,
                 d_model: int=768,
                 num_heads: int=12,
                 vocab_size: int=1000,
                 d_ff: int=2048,
                 attn_dropout: float=0.1,
                 ff_dropout: float=0.1) -> None:
        super().__init__()
        self.bert_encoder = TransformerEncoder(num_encoders=num_layers,
                                               d_model=d_model,
                                               num_heads=num_heads,
                                               d_ff=d_ff,
                                               attn_dropout=attn_dropout,
                                               ff_dropout=ff_dropout)
        self.masked_block = nn.Linear(in_features=d_model,
                                      out_features=vocab_size)
        self.nsp_block = nn.Linear(in_features=d_model,
                                        out_features=2)
    
    def forward(self,
                x: torch.Tensor,
                mask: torch.Tensor,
                masked_idx: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        
        x = self.bert_encoder(x, mask)
        nsp_logits = self.nsp_block(x[:, 0, :])
        
        

In [48]:
idxs = torch.tensor([
    [[1]],
    [[2]],
    [[3]],
    [[4]]
])
tensor = torch.rand(size=(32, 10, 768))
# tensor[:, idxs, :].shape

In [49]:
torch.gather(tensor, 1, idxs)

tensor([[[0.5899]],

        [[0.9063]],

        [[0.8251]],

        [[0.8510]]])

In [52]:
tensor[0, 1, 0]

tensor(0.5899)