In [2]:
import torchtext
from torchtext.datasets import multi30k,Multi30k
import torch
from typing import Iterable, List
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.nn import Transformer
import math
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import IMDB,PennTreebank
import time



import warnings
def warn(*args,**kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings('ignore')


In [3]:
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB()

In [4]:
label,sample = next(iter(train_iter))

In [5]:
label,sample

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

In [6]:
label_list =([label for label,_ in train_iter])
text_list = ([text for _,text in train_iter])

In [7]:
len(label_list)

12500

In [8]:
len(text_list)

12500

In [9]:
import random
idx = random.randint(0,len(text_list))

text_list[idx],label_list[idx]

("I can't believe that in the 34 prior comments, nobody mentioned that this film is a blatant rip-off of Born Yesterday. A man is hired to bring an ostensibly dumb blonde up to the requirements of a gangster. Hired gun and blonde fall in love and live happily ever after. Gangster is left in the lurch. But Born Yesterday was an intelligent treatment whereas this is just so much fluff. Technicolor transfer to DVD is deplorable. Natalie Kalmus would be rolling over in her grave. Check out the paperboy. Recognize him? But, it's historically interesting to see the roots of Rock 'n Roll. Also interesting is Ewell's introduction to CinemaScope, a new format at the time.",
 1)

The data has 12500 test sample data. Here each one is labelled as one. So each movie got a positive review somehow

## Data Processing

In [10]:
UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2
special_symbols = ['<unk>','<pad>','<|endoftext|>']

In [11]:
# set the tokenizer
tokenizer = get_tokenizer('basic_english')

def yield_token(dataset):
    for _,text in dataset:
        yield tokenizer(text)

# Create the vocabulary
vocab = build_vocab_from_iterator(yield_token(train_iter), specials = special_symbols,special_first=True)
vocab.set_default_index(vocab['<unk>'])

In [12]:
# Text to index, Index to text

# input--> token; output--> index
text_to_idx = lambda text: [vocab[token] for token in tokenizer(text)]

# input --> index; output--> token
idx_to_text = lambda seq_en: " ".join([vocab.get_itos()[idx] for idx in seq_en])

In [13]:
items = list(vocab.get_itos())[:10]
items

['<unk>', '<pad>', '<|endoftext|>', '.', 'the', ',', 'a', 'and', "'", 'of']

In [14]:
text = idx_to_text(torch.tensor([0,1,2]))
text

'<unk> <pad> <|endoftext|>'

## Collate Function

Collate function shapes how the dataloader perceives the data. To pass the data through the model each string should have equal length

In [15]:
def get_sample(block_size, text):
    """The goal of the function is to generate a training sample. The input and the target dataset. It's done because the model learns from the input
    and tries to predict the output.
    
    Parameters(block_size,text):
        block_size: actually indicates the context size. At how many tokens the model can look at once. 
        text: the full dataset. A list of long tokens
        """


    src_seq,trg_seq = [],[]
    sample_len = len(text) # gives the length of the total length of the input

    # Calculating the stopping point for randomly selecting a sample
    # This ensures the selected sample doesn't exceed the text
    random_sample_stop = sample_len - block_size

    # random sampling should be starting in this limit

    if random_sample_stop >=1:

        # pick the starting point
        random_start = torch.randint(0,random_sample_stop, size = (1,)).item()

        stop = random_start + block_size

        # Create the input and the target sequence
        src_seq = text[random_start:stop]

        trg_seq = text[random_start+1:stop+1]



    # Handle the case where the text sequence is exactly same as the context size
    elif random_sample_stop <= 0:

        random_start = 0
        stop = sample_len

        src_seq = text[random_start:stop]

        trg_seq = text[random_start+1:stop]

        # Append an empty string to maintain the sequence alignment
        trg_seq.append('<|endoftext|>')

    return src_seq,trg_seq
    

In [16]:
# Gather the training data tokens into a list
BATCH_SIZE = 1 # for each batch its return a list of tokens which could be used for training

batch_of_tokens = []

for i in range(BATCH_SIZE):
    label,text = next(iter(train_iter))
    batch_of_tokens.append(tokenizer(text))

In [17]:
len(batch_of_tokens[0])

317

In [18]:
# Check how is the input and the output sequence would be feed to the training model

# create the whole dataset for training of 100 tokens
text = batch_of_tokens[0][0:100]
block_size = 10

src_seq, trg_seq = get_sample(block_size,text)

In [19]:
src_seq,trg_seq

(['at', 'first', 'it', 'was', 'seized', 'by', 'u', '.', 's', '.'],
 ['first', 'it', 'was', 'seized', 'by', 'u', '.', 's', '.', 'customs'])

In [20]:
vocab(src_seq),vocab(trg_seq)

([38, 98, 12, 18, 17608, 46, 1466, 3, 17, 3],
 [98, 12, 18, 17608, 46, 1466, 3, 17, 3, 11063])

In [24]:
src_batch,trg_batch = [],[]

# Define the batch size
BATCH_SIZE = 2
block_size = 10
# Loop to create src batch and target batch
for i in range(BATCH_SIZE):
    _,text = next(iter(train_iter)) # Get the first sample every time, as we convert the train data iterable in every turn


    # turn the text into tokenizer and then send them
    text_tokens = tokenizer(text)

    # Generate source and target tokens
    src_tokens,trg_tokens = get_sample(block_size,text_tokens) # block_size is previously declared; block_size = 10

    # Get the indices of that tokens
    src_indices,trg_indices = vocab(src_tokens),vocab(trg_tokens)

    # Turn the indices into tensors
    src_sequence = torch.tensor(src_indices)
    trg_sequence = torch.tensor(trg_indices)
    

    # print the output 
    print(f"Sample: {i}")
    print(f"Source Sequence (Text): {src_tokens}")
    print(f"Source Sequence (Indices): {src_indices}")
    print(f"Source Sequence (Shape): {len(src_tokens)}")
    print(f"Target Sequence (Text): {trg_tokens}")
    print(f"Target Sequence (Indices): {t_indices}")
    print(f"Target Sequence (Shape): {len(tgt_tokens)}")

Sample: 0
Source Sequence (Text): ['fact', 'that', 'any', 'sex', 'shown', 'in', 'the', 'film', 'is', 'shown']
Source Sequence (Indices): [198, 16, 93, 338, 693, 14, 4, 25, 11, 693]
Source Sequence (Shape): 10
Target Sequence (Text): ['that', 'any', 'sex', 'shown', 'in', 'the', 'film', 'is', 'shown', 'for']


NameError: name 'tgt_indices' is not defined

In [None]:
vocab(['i','don','t','give','a','flying','fuck'])