In [11]:
import random
from typing import Optional, Tuple

import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator


In [3]:
# Load the IMDB CSV into a DataFrame and show it as the cell output
# (the bare `df` on the last line triggers the notebook's rich display).
df = pd.read_csv("../data/IMDB.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [28]:
class IMDBMaskedBertDataset(Dataset):
    """Single-token masked-language-model dataset built from a CSV of texts.

    Construction pipeline:

    1. Read the CSV at ``path``; every string in its first column is split on
       ``'. '`` into candidate sentences.
    2. Keep candidates whose spaCy token count ``n`` satisfies
       ``1 < n <= max_len``.
    3. Build a vocabulary over the kept sentences (``min_freq=2``, the
       ``SPECIALS`` first, unknown tokens mapped to ``UNK_TOKEN``).
    4. Replace one randomly chosen token per sentence with ``[MASK]``, wrap
       the sentence in ``[CLS]`` / ``[SEP]`` and right-pad with ``[PAD]`` to
       ``max_len + 2`` tokens.
    5. Convert everything to vocabulary ids and store it as tensors.

    Each item is a tuple ``(sentence_ids, masked_token_id, mask_position)``:
    the padded id sequence of length ``max_len + 2``, a one-element tensor
    with the id of the token that was masked out, and a scalar tensor with
    the position of ``[MASK]`` inside the padded sequence (``[CLS]`` is
    position 0).
    """

    # Vocabulary ids of the special tokens; they index into SPECIALS below.
    PAD_TOKEN = 0
    CLS_TOKEN = 1
    SEP_TOKEN = 2
    MASK_TOKEN = 3
    UNK_TOKEN = 4

    SPECIALS = ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]

    def __init__(self,
                 path: str,
                 max_len: int = 10,
                 seed: Optional[int] = None) -> None:
        """Read the CSV and eagerly build the whole masked dataset.

        Args:
            path: Location of the CSV file; its first column must hold the texts.
            max_len: Maximum number of tokens (specials excluded) a sentence
                may have to be kept.
            seed: Optional seed for choosing the masked positions. With the
                default ``None`` the global ``random`` state is used, exactly
                as before; an integer makes the masking reproducible without
                touching the global state.
        """
        super().__init__()
        self.max_len = max_len
        # Only the seed is stored (not an RNG object) so the dataset stays
        # cheap to copy/pickle for DataLoader workers.
        self.seed = seed
        self.df = pd.read_csv(path)

        self.tokenizer = get_tokenizer(tokenizer="spacy",
                                       language="en_core_web_sm")

        self.masked_tokens = []
        self.masked_token_idxs = []
        self.sentences = []

        self._prepare_data()
        self.sentences = torch.tensor(self.sentences)
        self.masked_token_idxs = torch.tensor(self.masked_token_idxs)
        self.masked_tokens = torch.tensor(self.masked_tokens)

    def __len__(self) -> int:
        """Return the number of masked sentences."""
        return len(self.sentences)

    def __getitem__(self, index: int) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Return ``(sentence_ids, masked_token_id, mask_position)`` for ``index``."""
        return self.sentences[index], self.masked_tokens[index], self.masked_token_idxs[index]

    def _as_tokens(self, sent):
        """Return ``sent`` as a token list, tokenizing only if it is still a raw string."""
        return self.tokenizer(sent) if isinstance(sent, str) else sent

    def _build_vocab(self, data_iter):
        """Yield one token list per element of ``data_iter`` (raw strings or token lists)."""
        for sent in data_iter:
            yield self._as_tokens(sent)

    def _prepare_data(self):
        """Collect, filter, mask and numericalize the sentences.

        Fills ``self.sentences`` and ``self.masked_tokens`` with lists of
        vocabulary ids and ``self.masked_token_idxs`` with mask positions;
        also creates ``self.vocab``.
        """
        kept = []
        for review in self.df.iloc[:, 0]:
            # Missing cells are read by pandas as float NaN; they have no text to split.
            if not isinstance(review, str):
                continue
            for sent in review.split('. '):
                # Tokenize once here and keep the tokens, so the vocabulary
                # and masking steps do not have to run spaCy again.
                tokens = self.tokenizer(sent)
                if 1 < len(tokens) <= self.max_len:
                    kept.append(tokens)
        self.sentences = kept

        # The vocabulary is built from the unmasked sentences.
        self.vocab = build_vocab_from_iterator(self._build_vocab(self.sentences),
                                               min_freq=2,
                                               special_first=True,
                                               specials=self.SPECIALS)
        self.vocab.set_default_index(self.UNK_TOKEN)

        self._mask_data()

        # Calling the vocab on a token list maps every token to its id.
        self.sentences = [self.vocab(tokens) for tokens in self.sentences]
        self.masked_tokens = [self.vocab(tokens) for tokens in self.masked_tokens]

    def _mask_data(self):
        """Mask one random token per sentence and frame/pad it in place.

        For every entry of ``self.sentences`` (raw string or token list) this
        appends the mask position (shifted by one for the leading ``[CLS]``)
        to ``self.masked_token_idxs``, appends the original token (as a
        one-element list) to ``self.masked_tokens`` and replaces the entry
        with ``[CLS] tokens [SEP] [PAD]...`` of length ``max_len + 2``.
        """
        # A private generator keeps seeded runs reproducible without
        # reseeding the global `random` module.
        rng = random if self.seed is None else random.Random(self.seed)

        pad = self.SPECIALS[self.PAD_TOKEN]
        cls = self.SPECIALS[self.CLS_TOKEN]
        sep = self.SPECIALS[self.SEP_TOKEN]
        mask = self.SPECIALS[self.MASK_TOKEN]
        width = self.max_len + 2  # room for [CLS] and [SEP]

        for i, sent in enumerate(self.sentences):
            tokens = list(self._as_tokens(sent))  # copy: never mutate the input list
            mask_idx = rng.randint(0, len(tokens) - 1)
            self.masked_token_idxs.append(mask_idx + 1)
            self.masked_tokens.append([tokens[mask_idx]])
            tokens[mask_idx] = mask

            framed = [cls] + tokens + [sep]
            # A non-positive multiplier yields [], i.e. no padding is added.
            framed += [pad] * (width - len(framed))
            self.sentences[i] = framed

In [29]:
# Build the masked dataset from the IMDB CSV using the class default max_len.
# All tokenization, vocabulary building and masking happens inside the constructor.
masked_ds = IMDBMaskedBertDataset(path="../data/IMDB.csv")

In [37]:
# Spot-check one randomly chosen item:
# (padded sentence ids, id of the masked-out token, position of the mask).
masked_ds[random.randrange(len(masked_ds))]

(tensor([ 1,  3, 14,  2,  0,  0,  0,  0,  0,  0,  0,  0]),
 tensor([13174]),
 tensor(1))