In [1]:
import torchtext
import torch
import numpy as np
import pandas as pd
import random
import time

from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

from typing import Tuple, List

In [2]:
df = pd.read_csv("../data/IMDB.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Preparing NSP dataset and Masked dataset

In [3]:
MASK_PERCENTANGE = 0.15
MASK_PROBABILITY = 0.80

In [4]:
sents = df.iloc[random.randint(0, df.shape[0]), 0].split(". ")
idx = random.randint(0, len(sents)-1)
sent1 = sents[idx]

sent1 = sent1.split(" ")
mask_idx = random.randint(0, len(sent1)-1)
if random.random() < MASK_PROBABILITY:
    sent1[mask_idx] = "[MASK]"
    print("Masked")
    print(" ".join(sent1))
else:
    rand_token = random.randint(0, len(sent1)-1)
    sent1[mask_idx] = sent1[rand_token]
    print("Replaced with random token")
    print(" ".join(sent1))

if random.random() <= 0.5:
    nsp_sents = ". [SEP] ".join(sents[idx: idx+2])
    print("NSP: 1")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))
else:
    nsp_sents = sents[idx] + ". [SEP] " + sents[random.randint(idx+1, len(sents))-1]
    print("NSP: 0")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))

Masked
The are always chances a movie won't hold ones beliefs as well as another, but I felt that rhythm of this picture and the timing was [MASK]
NSP: 0
The are always chances a movie won't hold ones beliefs as well as another, but I felt that rhythm of this picture and the timing was excellent. [SEP] Washington and of course Christopher Walken they both exceed the challenge of showing the darkest sides of humanity trying to move to the light.
52


In [5]:
CLS_TOKEN = 0
SEP_TOKEN = 1
MASK_TOKEN = 2
PAD_TOKEN = 3 
UNK_TOKEN = 4

In [6]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

def build_vocab(data_iter):
    for sentence in data_iter:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(build_vocab(df["review"].to_list()),
                                  min_freq=2,
                                  specials=["[CLS]", "[SEP]", "[MASK]", "[PAD]", "<UNK>"],
                                  special_first=True)
vocab.set_default_index(UNK_TOKEN)

In [7]:
len(vocab)

89854

In [8]:
less_ = 0
more_ = 0
for _ in range(10000000):
    if random.random()<=0.5:
        less_ += 1
    else:
        more_ += 1
less_, more_

(4999551, 5000449)

In [10]:
%%writefile ./data_setup.py
import torch
from torch.utils.data import Dataset
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

import pandas as pd
import random
from typing import Tuple


class IMDBBERTDataset(Dataset):

    NSP_PERCENTAGE = 0.50

    CLS_TOKEN = 0
    SEP_TOKEN = 1
    MASK_TOKEN = 2
    PAD_TOKEN = 3 
    UNK_TOKEN = 4

    def __init__(self,
                 path: str,
                 max_sent_len: int=50) -> None:
        super().__init__()
        self.df = pd.read_csv(path)
        self.tokenizer = get_tokenizer("spacy", language="en_core_web_sm")
        self.vocab = build_vocab_from_iterator(self._build_vocab(self.df["review"].to_list()),
                                               min_freq=2,
                                               specials=["[CLS]", "[SEP]", "[MASK]", "[PAD]", "<UNK>"],
                                               special_first=True)
        self.vocab.set_default_index(self.UNK_TOKEN)
        self.max_sent_len = max_sent_len
        self.token_ids = []
        self.masked_token = []
        self.masked_idx = []
        self.is_next = []
        self.segment_tokens = []
        self._prepare_data()
        

    def _prepare_data(self) -> None:
        for i in range(self.df.shape[0]):
            try:
                sentences = self.df.iloc[i, 0].split(". ")
                if random.random() <= self.NSP_PERCENTAGE:
                    rand_idx = random.randint(0, len(sentences)-2)
                    sentences = sentences[rand_idx:rand_idx+2]
                    is_next = 0
                else:
                    rand_idx = random.randint(1, len(sentences)-1)
                    sentences = [sentences[rand_idx], sentences[rand_idx-1]]
                    is_next = 1
                
                sentences = ["[CLS]"] + self.tokenizer(sentences[0]) + ["[SEP]"] + self.tokenizer(sentences[1])
                if len(sentences) < self.max_sent_len:
                    while len(sentences) < self.max_sent_len:
                        sentences += ["[PAD]"]
                else:
                    sentences = sentences[:self.max_sent_len]
                
                sep_idx = sentences.index("[SEP]")
                segment_token = [0]*(sep_idx+1) + [1]*(len(sentences)-1-sep_idx)

                # assert len(segment_token) == len(sentences), f"Length not equal, sep_idx: {sep_idx} "

                token_ids = self.vocab(sentences)
                mask_token, mask_idx = self.SEP_TOKEN, -1
                while mask_token == self.SEP_TOKEN:
                    mask_idx = random.randint(1, len(token_ids)-1)
                    mask_token = token_ids[mask_idx]
                token_ids[mask_idx] = self.MASK_TOKEN
                self.token_ids.append(token_ids)
                self.masked_token.append(mask_token)
                self.masked_idx.append(mask_idx)
                self.segment_tokens.append(segment_token)
                self.is_next.append(is_next)
            except:
                pass
 
        self.bert_df = pd.DataFrame(data={
            "token_ids" : self.token_ids,
            "segment_tokens" : self.segment_tokens,
            "masked_token" : self.masked_token,
            "masked_idx" : self.masked_idx,
            "is_next" : self.is_next
        })
        
    def _build_vocab(self, data_iter):
        for sentence in data_iter:
            yield self.tokenizer(sentence)
    
    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, int, int, int]:
        token_ids = self.token_ids[index]
        segment_tokens = self.segment_tokens[index]
        masked_token = self.masked_token[index]
        masked_idx = self.masked_idx[index]
        is_next = self.is_next[index]
        return torch.tensor(token_ids), torch.tensor(segment_tokens), masked_token, masked_idx, is_next
    
    def __len__(self) -> int:
        return self.bert_df.shape[0]

if __name__ == "__main__":
    ds = IMDBBERTDataset(path="../data/IMDB.csv")
    print(f"Shape: {ds.bert_df.shape}")
    print(ds.bert_df.head(10))

Overwriting ./data_setup.py


## BERT EMBEDDING

Bert embedding -> Segment Embedding + Positional Embedding + Token Embedding

In [11]:
%%writefile bert_embedding.py
import torch
from torch import nn
import math

class BERTEmbedding(nn.Module):

    def __init__(self,
                 d_model: int=512,
                 vocab_size: int=1000,
                 max_seq_len: int=100,
                 dropout: float=0.1) -> None:
        
        super().__init__()
        self.d_model = d_model
        self.vocab_size = vocab_size
        self.token_embedding = nn.Embedding(num_embeddings=vocab_size,
                                            embedding_dim=d_model)
        self.pe = torch.zeros(size=(max_seq_len, d_model),
                              requires_grad=False)
        
        for pos in range(max_seq_len):
            for dim in range(d_model):
                if pos%2==0:
                    self.pe[pos, dim] = math.sin(pos//(10000**(2*dim//d_model)))
                else:
                    self.pe[pos, dim] = math.cos(pos//(10000**(2*dim//d_model)))

        self.segment_embedding = nn.Embedding(num_embeddings=2,
                                              embedding_dim=d_model)
        self.dropout = nn.Dropout(p=dropout)
    
    def __repr__(self) -> str:
        return f"BERTEmbedding(d_model={self.d_model}, vocab_size={self.vocab_size})"

    def __str__(self) -> str:
        return f"BERTEmbedding(d_model={self.d_model}, vocab_size={self.vocab_size})"

    def forward(self, 
                x: torch.Tensor,
                segment_tokens: torch.Tensor) -> torch.Tensor:
        # x -> [batch_size, max_seq_len]
        token_embeddings = self.token_embedding(x)
        position_encoding = self.pe[:x.shape[1], :].unsqueeze(0) # positional_encoding -> [1, max_seq_len, d_model]
        segment_embedding = self.segment_embedding(segment_tokens)
        return self.dropout(token_embeddings + position_encoding + segment_embedding)

In [13]:
from data_setup import IMDBBERTDataset
ds = IMDBBERTDataset("../data/IMDB.csv")
ds.bert_df

Unnamed: 0,token_ids,segment_tokens,masked_token,masked_idx,is_next
0,"[0, 229, 35, 240, 6, 23, 19, 12, 659, 68, 643,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",53,20,1
1,"[0, 59, 320, 30, 299, 1927, 9, 299, 5152, 6, 6...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,16,1
2,"[0, 3256, 70, 18, 8, 269, 145, 8, 146, 495, 32...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",38,19,0
3,"[0, 12238, 1586, 228, 8, 4619, 2, 53, 440, 488...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, ...",3601,6,0
4,"[0, 861, 9894, 56, 267, 996, 1848, 11, 86, 2, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",603,9,1
...,...,...,...,...,...
41871,"[0, 340, 4809, 320, 8, 7590, 47, 12, 11974, 44...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",45,29,0
41872,"[0, 7262, 38, 107, 4830, 1802, 35, 1015, 14, 2...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",9,30,1
41873,"[0, 453, 933, 443, 1, 453, 2020, 418, 3, 3, 3,...","[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3,14,1
41874,"[0, 14, 217, 19, 24, 88, 8, 207, 240, 63, 324,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ...",47,24,0


In [16]:
VOCAB_SIZE = len(ds.vocab)
VOCAB_SIZE

89854

In [32]:
from torch.utils.data import DataLoader
data_loader = DataLoader(dataset=ds,
                         batch_size=32,
                         shuffle=True)
batch = next(iter(data_loader))
bert_embedding = BERTEmbedding(d_model=512,
                               vocab_size=VOCAB_SIZE)
with torch.inference_mode():
    token_ids, segment_tokens = batch[0], batch[1]
    embedding = bert_embedding(token_ids, segment_tokens)
    print(f"token_ids shape: {token_ids.shape}")
    print(f"embedding shape: {embedding.shape}")

token_ids shape: torch.Size([32, 50])
embedding shape: torch.Size([32, 50, 512])
