In [1]:
import torchtext
from torchtext.datasets import multi30k,Multi30k
import torch
from typing import Iterable, List
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.nn import Transformer
import math
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import IMDB,PennTreebank
import time



import warnings
def warn(*args,**kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings('ignore')


In [2]:
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB()

In [3]:
label,sample = next(iter(train_iter))

In [4]:
label,sample

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

In [5]:
label_list =([label for label,_ in train_iter])
text_list = ([text for _,text in train_iter])

In [6]:
len(label_list)

12500

In [7]:
len(text_list)

12500

In [25]:
import random
idx = random.randint(0,len(text_list))

text_list[idx],label_list[idx]

('I just saw DreamGirls yesterday, and I was REALLY underimpressed. Despite all the Oscar buzz, this is nothing special. Anyone who was really impressed by this film has never bothered to see any of the true movie musical classics. Except for Eddie Murphy\'s great musical and dramatic performance, Dreamgirls is just a glorified TV movie with no style or flair. Just a bunch of amateurs singing AT each other!<br /><br />Now, the first half hour was good, but I was irritated at how Eddie Murphy\'s terrific raveup performances were truncated and interrupted by montages. Those were easily the best songs and best performances in the film. And the "rise to the top" portion of the film was the only part of the film that had a consistent point of view or any momentum. The remaining hour and 45 minutes was a formless, rambling mess that was neither realistic nor fantastic enough to be interesting. It was also visually dull and included too many sound-alike tunes.<br /><br />Condon didn\'t try to

The data has 12500 test sample data. Here each one is labelled as one. So each movie got a positive review somehow

## Data Processing

In [9]:
UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2
special_symbols = ['<unk>','<pad>','<|endoftext|>']

In [10]:
# set the tokenizer
tokenizer = get_tokenizer('basic_english')

def yield_token(dataset):
    for _,text in dataset:
        yield tokenizer(text)

# Create the vocabulary
vocab = build_vocab_from_iterator(yield_token(train_iter), specials = special_symbols,special_first=True)
vocab.set_default_index(vocab['<unk>'])

In [11]:
# Text to index, Index to text

# input--> token; output--> index
text_to_idx = lambda text: [vocab[token] for token in tokenizer(text)]

# input --> index; output--> token
idx_to_text = lambda seq_en: " ".join([vocab.get_itos()[idx] for idx in seq_en])

In [12]:
a = torch.tensor([12,24,36,48,60,72,84],dtype = torch.int64)
#index_to_text = idx_to_text([12,24,36,48,60,72,84])
index_to_text = idx_to_text(a)

index_to_text

'it t ! or has my been'

In [13]:
items = list(vocab.get_itos())[:10]
items

['<unk>', '<pad>', '<|endoftext|>', '.', 'the', ',', 'a', 'and', "'", 'of']

In [14]:
text = idx_to_text(torch.tensor([0,1,2]))
text

'<unk> <pad> <|endoftext|>'

## Collate Function

Collate function shapes how the dataloader perceives the data. To pass the data through the model each string should have equal length

In [27]:
def get_sample(block_size, text):
    """The goal of the function is to generate a training sample. The input and the target dataset. It's done because the model learns from the input
    and tries to predict the output.
    
    Parameters(block_size,text):
        block_size: actually indicates the context size. At how many tokens the model can look at once. 
        text: the full dataset. A list of long tokens
        """


    src_seq,trg_seq = [],[]
    sample_len = len(text) # gives the length of the total length of the input

    # Calculating the stopping point for randomly selecting a sample
    # This ensures the selected sample doesn't exceed the text
    random_sample_stop = sample_len - block_size

    # random sampling should be starting in this limit

    if random_sample_stop >=1:

        # pick the starting point
        random_start = torch.randint(0,random_sample_stop, size = (1,)).item()

        stop = random_start + block_size

        # Create the input and the target sequence
        src_seq = text[random_start:stop]

        trg_seq = text[random_start+1:stop+1]



    # Handle the case where the text sequence is exactly same as the context size
    elif random_sample_stop <= 0:

        random_start = 0
        stop = sample_len

        src_seq = text[random_start:stop]

        trg_seq = text[random_start+1:stop]

        # Append an empty string to maintain the sequence alignment
        trg_seq.append('<|endoftext|>')

    return src_seq,trg_seq
    

In [28]:
# Gather the training data tokens into a list
BATCH_SIZE = 1 # for each batch its return a list of tokens which could be used for training

batch_of_tokens = []

for i in range(BATCH_SIZE):
    label,text = next(iter(train_iter))
    batch_of_tokens.append(tokenizer(text))

In [29]:
len(batch_of_tokens[0])

317

In [30]:
# Check how is the input and the output sequence would be feed to the training model

# create the whole dataset for training of 100 tokens
text = batch_of_tokens[0][0:100]
block_size = 10

src_seq, trg_seq = get_sample(block_size,text)

In [31]:
src_seq,trg_seq

(['.',
  'in',
  'particular',
  'she',
  'wants',
  'to',
  'focus',
  'her',
  'attentions',
  'to'],
 ['in',
  'particular',
  'she',
  'wants',
  'to',
  'focus',
  'her',
  'attentions',
  'to',
  'making'])

In [32]:
vocab(src_seq),vocab(trg_seq)

([3, 14, 979, 68, 518, 10, 1216, 57, 12246, 10],
 [14, 979, 68, 518, 10, 1216, 57, 12246, 10, 233])

In [33]:
src_batch,trg_batch = [],[]

# Define the batch size
BATCH_SIZE = 2
block_size = 10
# Loop to create src batch and target batch
for i in range(BATCH_SIZE):
    _,text = next(iter(train_iter)) # Get the first sample every time, as we convert the train data iterable in every turn


    # turn the text into tokenizer and then send them
    text_tokens = tokenizer(text)

    # Generate source and target tokens
    src_tokens,trg_tokens = get_sample(block_size,text_tokens) # block_size is previously declared; block_size = 10

    # Get the indices of that tokens
    src_indices,trg_indices = vocab(src_tokens),vocab(trg_tokens)

    # Turn the indices into tensors
    src_sequence = torch.tensor(src_indices)
    trg_sequence = torch.tensor(trg_indices)
    

    # print the output 
    print(f"Sample: {i}")
    print(f"Source Sequence (Text): {src_tokens}")
    print(f"Source Sequence (Indices): {src_indices}")
    print(f"Source Sequence (Shape): {len(src_tokens)}")
    print(f"Target Sequence (Text): {trg_tokens}")
    print(f"Target Sequence (Indices): {trg_indices}")
    print(f"Target Sequence (Shape): {len(trg_tokens)}")

Sample: 0
Source Sequence (Text): ['wanting', 'to', 'study', 'the', 'meat', 'and', 'potatoes', '(', 'no', 'pun']
Source Sequence (Indices): [1798, 10, 2307, 4, 2876, 7, 14661, 29, 56, 4419]
Source Sequence (Shape): 10
Target Sequence (Text): ['to', 'study', 'the', 'meat', 'and', 'potatoes', '(', 'no', 'pun', 'intended']
Target Sequence (Indices): [10, 2307, 4, 2876, 7, 14661, 29, 56, 4419, 1218]
Target Sequence (Shape): 10
Sample: 1
Source Sequence (Text): ['between', 'asking', 'politicians', 'and', 'ordinary', 'denizens', 'of', 'stockholm', 'about', 'their']
Source Sequence (Indices): [259, 1743, 7457, 7, 2318, 29828, 9, 16111, 52, 80]
Source Sequence (Shape): 10
Target Sequence (Text): ['asking', 'politicians', 'and', 'ordinary', 'denizens', 'of', 'stockholm', 'about', 'their', 'opinions']
Target Sequence (Indices): [1743, 7457, 7, 2318, 29828, 9, 16111, 52, 80, 4554]
Target Sequence (Shape): 10


## Collate Function

In [34]:
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cuda'
DEVICE

'mps'

In [35]:
BLOCK_SIZE = 30

def collate_batch(batch):
    src_batch,trg_batch = [],[]

    for _,text in batch:

        tokens = tokenizer(text)

        src_tokens,trg_tokens = get_sample(BLOCK_SIZE,tokens)

        src_indices,trg_indices = vocab(src_tokens),vocab(trg_tokens)

        src_seq,trg_seq = torch.tensor(src_indices,dtype = torch.int64),torch.tensor(trg_indices,dtype = torch.int64)

        src_batch.append(src_seq)
        trg_batch.append(trg_seq)


    src_batch = pad_sequence(src_batch,padding_value = PAD_IDX, batch_first = False)
    trg_batch = pad_sequence(trg_batch,padding_value = PAD_IDX, batch_first = False)

    return src_batch.to(DEVICE),trg_batch.to(DEVICE)

## Create DataLoader

In [36]:
from torch.utils.data import DataLoader,Dataset

train_dataloader = DataLoader(train_iter,
                       batch_size = BATCH_SIZE,
                       shuffle = True,
                       collate_fn = collate_batch) # collate batch will define how the data will be retuerned

test_dataloader = DataLoader(test_iter,
                            batch_size = BATCH_SIZE,
                            shuffle = True,
                            collate_fn = collate_batch)

## Iterating Through Data Samples

In [47]:
dataset = iter(train_dataloader)

for sample in range(5):
    src, tgr = next(dataset)

    for i in range(BATCH_SIZE):
        print(f"sample: {sample}")
        print(f"Source shape: {src.shape}")
        print(f"source: {idx_to_text(src[:,i])}") # take the indices as the intput and return the word as the output
        print("\n")
        print(f"Target: {idx_to_text(tgr[:,i])}")
        print("\n")

sample: 0
Source shape: torch.Size([30, 2])
source: had better effects when they aired their first episodes in 1993 . that was 4 years before sg-1 started . and they did ' nt have the apparent two million


Target: better effects when they aired their first episodes in 1993 . that was 4 years before sg-1 started . and they did ' nt have the apparent two million dollar


sample: 0
Source shape: torch.Size([30, 2])
source: as other male reviewers have , i did enjoy seeing karen allen ' s cute , petite body . i ' ll give the movie four stars two of them


Target: other male reviewers have , i did enjoy seeing karen allen ' s cute , petite body . i ' ll give the movie four stars two of them are


sample: 1
Source shape: torch.Size([30, 2])
source: historically inaccurate and ridiculous that he refused , and also demanded they stop using his name as a source ( it embarrassed him to think people would think he was


Target: inaccurate and ridiculous that he refused , and also demanded they s

## **MASKING**