In [6]:
import torchtext
import torch
import numpy as np
import pandas as pd
import random
import time

from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [3]:
# Read the IMDB CSV (path is relative to the notebook's working directory)
# into a DataFrame and display it as the cell output.
df = pd.read_csv(filepath_or_buffer="../data/IMDB.csv")
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


## Preparing the NSP (next sentence prediction) dataset and the masked-token dataset

In [11]:
# Masking configuration shared by the cells below.
# MASK_PERCENTAGE is the correctly spelled name; it is not referenced by any
# cell visible in this section of the notebook.
MASK_PERCENTAGE = 0.15
# Backward-compatible alias for the original (misspelled) constant name, kept
# so that any cell still referring to MASK_PERCENTANGE keeps working.
MASK_PERCENTANGE = MASK_PERCENTAGE
# Probability that a selected token is replaced by "[MASK]"; otherwise it is
# replaced by another token (see the masking cells further down).
MASK_PROBABILITY = 0.80

In [9]:
# Take the cell at row 0 / column 0 of the frame (a review text) and cut it
# into rough "sentences" on the literal separator ". ".
sents = df.iloc[0, 0].split(". ")

In [12]:
# Pick a random sentence position that still has a successor: the range stops
# at len(sents)-1 (exclusive), so idx+1 is always a valid index into sents.
idx = random.choice(range(len(sents)-1))
# Display the drawn index as the cell output.
idx

5

In [13]:
# Slice out the sentence at idx together with the one that follows it.
# A slice never raises: if idx were the last position it would simply yield a
# single-element list.
two_sents = sents[idx:idx+2]
# Display the pair as the cell output.
two_sents

['It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda',
 "Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare"]

In [14]:
# masked_sent: the first sentence of the pair (the one that gets masked in a later cell).
# nsp_sent: all sentences of the pair glued together with a single space.
masked_sent, nsp_sent = two_sents[0], " ".join(two_sents)
(masked_sent, nsp_sent)

('It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda',
 "It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare")

In [37]:
# Corrupt exactly one whitespace-delimited token of the first sentence:
#   * with probability MASK_PROBABILITY the token becomes "[MASK]";
#   * otherwise it is overwritten by a token drawn at random from the same
#     sentence (there is no vocabulary to sample from in this notebook yet).
# After this cell masked_sent is the list of tokens, as before.
masked_sent = two_sents[0].split(" ")

# randrange excludes its upper bound. The previous randint(0, len(tokens)) was
# inclusive on both ends and could return len(tokens), raising IndexError.
mask_idx = random.randrange(len(masked_sent))

# random.random() is in [0, 1), so a strict "<" yields exactly
# MASK_PROBABILITY; this also matches the comparison used in the combined
# sampling cell further down.
if random.random() < MASK_PROBABILITY:
    masked_sent[mask_idx] = "[MASK]"
else:
    rand_token = random.choice(masked_sent)
    print(f"Rand token: {rand_token}")
    masked_sent[mask_idx] = rand_token
print(" ".join(masked_sent))

[MASK] focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda


In [39]:
# Build one next-sentence-prediction example from sents / idx:
#   NSP: 1 -> sentence idx followed by its real successor,
#   NSP: 0 -> sentence idx followed by some other sentence of the same review.
#
# A valid negative must be neither the anchor itself nor its true successor.
# The previous randint(idx+1, len(sents)) could return idx+1 (a real successor
# labelled as negative) or len(sents) (IndexError).
negative_candidates = [i for i in range(len(sents)) if i not in (idx, idx + 1)]

# If the review is too short to offer any negative, fall back to the positive
# pair rather than emitting a mislabelled example.
if random.random() <= 0.5 or not negative_candidates:
    nsp_sents = ". [SEP] ".join(sents[idx:idx+2])
    print("NSP: 1")
else:
    random_sent_idx = random.choice(negative_candidates)
    nsp_sents = sents[idx] + ". [SEP] " + sents[random_sent_idx]
    print("NSP: 0")
print(nsp_sents)

NSP: 1
It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. [SEP] Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fact that it goes where other shows wouldn't dare


In [53]:
# End-to-end sample: draw a random review, corrupt one token of a random
# sentence, then build an NSP pair anchored on that same (uncorrupted) sentence.

# Draw reviews until one has at least two sentences, since an NSP pair needs
# an anchor and a successor. randrange excludes its upper bound; the previous
# randint(0, df.shape[0]) could index one past the last row.
# NOTE(review): this loops forever if no review contains ". " at all.
while True:
    sents = df.iloc[random.randrange(df.shape[0]), 0].split(". ")
    if len(sents) >= 2:
        break

# The anchor must have a successor, so the last sentence is never chosen.
# (Previously idx could be the last sentence and the "positive" pair then
# contained a single sentence and no [SEP].)
idx = random.randrange(len(sents) - 1)
sent1 = sents[idx].split(" ")

# --- token corruption -------------------------------------------------------
mask_idx = random.randrange(len(sent1))
if random.random() < MASK_PROBABILITY:
    sent1[mask_idx] = "[MASK]"
    print("Masked")
else:
    # rand_token is the *position* of the replacement token within the sentence.
    rand_token = random.randrange(len(sent1))
    sent1[mask_idx] = sent1[rand_token]
    print("Replaced with random token")
print(" ".join(sent1))

# --- next sentence prediction pair -----------------------------------------
# A negative must be neither the anchor nor its true successor. The previous
# index expression randint(idx+1, len(sents))-1 could select either of them.
negative_candidates = [i for i in range(len(sents)) if i not in (idx, idx + 1)]

# With exactly two sentences there is no valid negative: keep the positive pair.
if random.random() <= 0.5 or not negative_candidates:
    nsp_sents = ". [SEP] ".join(sents[idx: idx+2])
    print("NSP: 1")
else:
    nsp_sents = sents[idx] + ". [SEP] " + sents[random.choice(negative_candidates)]
    print("NSP: 0")
print(nsp_sents)

Masked
Cartoon characters keep popping up as bit players and [MASK] drawing unintentional laughs from the premiere audience
NSP: 0
Cartoon characters keep popping up as bit players and extras, drawing unintentional laughs from the premiere audience. [SEP] The biggest sin is that the audience is supposed to feel sympathy for kids who gun down old ladies, run over puppies chained to a tree, rob and steal, all the while complaining about their sad, sorry lives
