In [28]:
import torchtext
import torch
import numpy as np
import pandas as pd
import random
import time

from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader

from typing import Tuple, List

In [3]:
df = pd.read_csv("../data/IMDB.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Preparing NSP dataset and Masked dataset

In [4]:
MASK_PERCENTANGE = 0.15
MASK_PROBABILITY = 0.80

In [27]:
sents = df.iloc[random.randint(0, df.shape[0]), 0].split(". ")
idx = random.randint(0, len(sents)-1)
sent1 = sents[idx]

sent1 = sent1.split(" ")
mask_idx = random.randint(0, len(sent1)-1)
if random.random() < MASK_PROBABILITY:
    sent1[mask_idx] = "[MASK]"
    print("Masked")
    print(" ".join(sent1))
else:
    rand_token = random.randint(0, len(sent1)-1)
    sent1[mask_idx] = sent1[rand_token]
    print("Replaced with random token")
    print(" ".join(sent1))

if random.random() <= 0.5:
    nsp_sents = ". [SEP] ".join(sents[idx: idx+2])
    print("NSP: 1")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))
else:
    nsp_sents = sents[idx] + ". [SEP] " + sents[random.randint(idx+1, len(sents))-1]
    print("NSP: 0")
    print(nsp_sents)
    print(len(nsp_sents.split(" ")))

Masked
However, I [MASK] consider it marvelous entertainment typical of Hollywood's golden age.
NSP: 1
However, I still consider it marvelous entertainment typical of Hollywood's golden age.
12


In [6]:
CLS_TOKEN = 0
SEP_TOKEN = 1
MASK_TOKEN = 2
PAD_TOKEN = 3 
UNK_TOKEN = 4

In [7]:
tokenizer = get_tokenizer("spacy", language="en_core_web_sm")

def build_vocab(data_iter):
    for sentence in data_iter:
        yield tokenizer(sentence)

vocab = build_vocab_from_iterator(build_vocab(df["review"].to_list()),
                                  min_freq=2,
                                  specials=["[CLS]", "[SEP]", "[MASK]", "[PAD]", "<UNK>"],
                                  special_first=True)
vocab.set_default_index(UNK_TOKEN)

In [8]:
len(vocab)

89854

In [16]:
less_ = 0
more_ = 0
for _ in range(10000000):
    if random.random()<=0.5:
        less_ += 1
    else:
        more_ += 1
less_, more_

(4998650, 5001350)

In [40]:
class IMDBBERTDataset(Dataset):

    NSP_PERCENTAGE = 0.50

    CLS_TOKEN = 0
    SEP_TOKEN = 1
    MASK_TOKEN = 2
    PAD_TOKEN = 3 
    UNK_TOKEN = 4

    def __init__(self,
                 path: str,
                 tokenizer,
                 vocab,
                 max_sent_len: int=50) -> None:
        super().__init__()
        self.df = pd.read_csv(path)
        self.tokenizer = tokenizer
        self.vocab = vocab
        self.max_sent_len = max_sent_len
        self.df["token_ids"] = 0 # token_ids = 2
        self.df["masked_token"] = 0 # masked_token = 3
        self.df["masked_idx"] = 0 # masked_idx = 4
        self.df["is_next"] = 0 # is_next = 5

        self._prepare_data()
        

    def _prepare_data(self) -> None:
        token_ids_list = []
        for i in range(self.df.shape[0]):
            sentences = self.df.iloc[i, 0].split(". ")
            if random.random() <= self.NSP_PERCENTAGE:
                self.df.iat[i, 5] = 0
                rand_idx = random.randint(0, len(sentences)-2)
                sentences = sentences[rand_idx:rand_idx+2]
            else:
                self.df.iat[i, 5] = 1
                rand_idx = random.randint(1, len(sentences)-1)
                sentences = [sentences[rand_idx], sentences[rand_idx-1]]
            
            sentences = "[CLS] " + sentences[0] + ". [SEP] " + sentences[1]
            
            if len(sentences.split(" ")) < self.max_sent_len:
                while len(sentences.split(" ")) < self.max_sent_len:
                    sentences += " [PAD]"
            else:
                sentences = " ".join(sentences.split(" ")[:self.max_sent_len])
            
            token_ids = self.vocab(self.tokenizer(sentences))
            mask_token, mask_idx = self.SEP_TOKEN, -1
            while mask_token == self.SEP_TOKEN:
                mask_idx = random.randint(1, len(token_ids)-1)
                mask_token = token_ids[mask_idx]
            token_ids[mask_idx] = self.MASK_TOKEN
            token_ids_list.append(token_ids)
            self.df.iat[i, 3] = mask_token
            self.df.iat[i, 4] = mask_idx
        
        self.df["token_ids"] = token_ids_list
        
    
    def __getitem__(self, index: int) -> Tuple[torch.Tensor, int, int, int]:
        token_ids = self.df.iloc[index, 2]
        masked_token = self.df.iloc[index, 3]
        masked_idx = self.df.iloc[index, 4]
        is_next = self.df.iloc[index, 5]
        return token_ids, masked_token, masked_idx, is_next
    
    def __len__(self) -> int:
        return self.df.shape[0]


In [None]:
ds = IMDBBERTDataset(path="../data/IMDB.csv",
                     tokenizer=tokenizer,
                     vocab=vocab)