## Importing Libraries

In [1]:
import math
import random
import time
import warnings

import torch
import torch.nn as nn
import torchtext
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset,DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.datasets import IMDB
from torchtext.vocab import build_vocab_from_iterator

def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments and discards them."""
    pass

# Bind the function object itself. Calling it here (``warn()``) would store
# its return value, None, and any later ``warnings.warn(...)`` call would
# raise ``TypeError: 'NoneType' object is not callable``.
warnings.warn = warn
warnings.filterwarnings('ignore')

## Dataset

In [3]:
## Loading dataset
# Request both splits explicitly (this is also the default of IMDB()).
train_iter, test_iter = IMDB(split=('train', 'test'))
(train_iter, test_iter)

(ShardingFilterIterDataPipe, ShardingFilterIterDataPipe)

In [4]:
# Quantitative and qualitative information about the training data
def _split_pairs(pairs):
    """Materialise an iterable of (label, text) pairs into two parallel lists."""
    labels, texts = [], []
    for pair_label, pair_text in pairs:
        labels.append(pair_label)
        texts.append(pair_text)
    return labels, texts


start_time = time.time()

# Training split: sample count and the distinct label values that occur
label, text_list = _split_pairs(train_iter)
print(len(label))
print(list(set(label)))
print(f"Total sample:{len(text_list)}")

# Test split: same summary, followed by the time taken to read both splits
test_label, test_text_list = _split_pairs(test_iter)

end_time = time.time()
duration = end_time - start_time
print(len(test_label))
print(list(set(test_label)))
print(f"Total sample in testdataset:{len(test_text_list)}")
print(f"Time required: {duration:.2f} seconds")

12500
[1]
Total sample:12500
25000
[1, 2]
Total sample in testdataset:25000
Time required: 0.63 seconds


The training dataset contains 12,500 text samples and the test dataset contains 25,000 text samples.

In [7]:
# Pull the first (label, text) pair from a fresh iterator over each split
first_train_pair = next(iter(train_iter))
first_test_pair = next(iter(test_iter))

_, text = first_train_pair
_test, text_test = first_test_pair

print(f"First Train Text example: {text}\n")
print(f"First Test Text example: {text_test}")

First Train Text example: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes ar

## Data Processing

In [8]:
## Create tokens
tokenizer = get_tokenizer('basic_english')

# These indices must match the positions of the entries in `special_symbols`
# below, because the specials are inserted at the front of the vocabulary.
UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2

special_symbols = ['<unk>','<pad>','<|endoftext|>']

# Stream the tokenised training reviews into the vocabulary builder; a
# generator avoids holding every raw review in a list first.
vocab = build_vocab_from_iterator(
    (tokenizer(text) for _, text in train_iter),
    specials = special_symbols,
    special_first = True,
)

# The vocabulary is built from the training split only. Without a default
# index, looking up a token that is not in the vocabulary (e.g. a word that
# only occurs in the test split) raises an error instead of mapping to <unk>.
vocab.set_default_index(UNK_IDX)


In [9]:
len(vocab)

68813

In [10]:
vocab['drink']

2435

In [11]:
## Sample accumulation
"""In decoder you have to define at how many blocks the model will look at once, This could be mentioned as context."""

def get_sample(context_size, text):
    """Draw one random (source, target) next-token pair from a token list.

    The target is the source shifted one position to the right, so that
    ``trg[i]`` is the token following ``src[i]``.

    Args:
        context_size: The number of tokens the model will look at once.
        text: The whole list of text tokens from which the source and target
            windows are cut. It is not modified.

    Returns:
        A ``(src, trg)`` tuple of token lists of equal length.

        * ``len(text) > context_size``: both lists hold exactly
          ``context_size`` tokens taken from a random window.
        * ``0 < len(text) <= context_size``: ``src`` is the tail of ``text``
          from a random start, and ``trg`` is the same tail shifted by one
          with ``'<|endoftext|>'`` appended.
        * ``len(text) == 0``: two empty lists.
    """
    sample_len = len(text)

    # Nothing to sample from; torch.randint(0, 0) would raise.
    if sample_len == 0:
        return [], []

    if sample_len > context_size:
        # `high` is exclusive, so start + context_size is always a valid index
        # and the shifted target window never runs past the end of `text`.
        start = torch.randint(low = 0, high = sample_len - context_size, size = (1,)).item()
        end = start + context_size

        src = text[start:end]
        trg = text[start + 1:end + 1]
    else:
        # The text fits inside the context window (this includes the case
        # sample_len == context_size, which previously returned empty lists).
        # Take everything from a random start to the end and close the target
        # with the end-of-text marker so both sequences keep the same length.
        start = torch.randint(low = 0, high = sample_len, size = (1,)).item()

        src = text[start:]
        trg = list(text[start + 1:])
        trg.append('<|endoftext|>')

    return src, trg
        

In [12]:
BATCH_SIZE = 2
CONTEXT_SIZE = 20
# Use a dedicated loop index: the original reused `_` for both the loop
# counter and the label, so "Sample:" printed the label instead of the index.
for i in range(BATCH_SIZE):
    # iter() is called anew on every pass, so next() restarts from the
    # beginning of the pipe each time rather than advancing through it.
    _, text = next(iter(train_iter))

    src, trg = get_sample(context_size = CONTEXT_SIZE, text = tokenizer(text))

    print(f"Sample: {i}")
    print(f"Source: {src}")
    print(f"Target: {trg}")
    

Sample: 1
Source: ['potatoes', '(', 'no', 'pun', 'intended', ')', 'of', 'swedish', 'cinema', '.', 'but', 'really', ',', 'this', 'film', 'doesn', "'", 't', 'have', 'much']
Target: ['(', 'no', 'pun', 'intended', ')', 'of', 'swedish', 'cinema', '.', 'but', 'really', ',', 'this', 'film', 'doesn', "'", 't', 'have', 'much', 'of']
Sample: 1
Source: ['like', 'some', 'cheaply', 'made', 'porno', '.', 'while', 'my', 'countrymen', 'mind', 'find', 'it', 'shocking', ',', 'in', 'reality', 'sex', 'and', 'nudity', 'are']
Target: ['some', 'cheaply', 'made', 'porno', '.', 'while', 'my', 'countrymen', 'mind', 'find', 'it', 'shocking', ',', 'in', 'reality', 'sex', 'and', 'nudity', 'are', 'a']


## INDEX TO ENGLISH & ENGLISH TO INDEX

In [13]:
def idx_to_eng(seq):
    """Convert an iterable of vocabulary indices into a space-joined string of tokens."""
    # Fetch the index-to-token list once per call rather than once per
    # element of `seq`.
    itos = vocab.get_itos()
    return " ".join([itos[idx] for idx in seq])


def eng_to_idx(text):
    """Tokenise `text` and return the list of vocabulary indices of its tokens."""
    return [vocab[token] for token in tokenizer(text)]

In [14]:
BATCH_SIZE = 5
CONTEXT_SIZE = 20
src_batch = []
trg_batch = []

for i in range(BATCH_SIZE):
    # Take the first text sample of the training data
    _, text = next(iter(train_iter))

    # Cut a random (source, target) window out of the tokenised review
    src, trg = get_sample(context_size=CONTEXT_SIZE, text=tokenizer(text))

    # Tokens -> vocabulary indices -> int64 tensors
    src_vocab, trg_vocab = vocab(src), vocab(trg)
    src_tensors = torch.tensor(src_vocab, dtype=torch.int64)
    trg_tensors = torch.tensor(trg_vocab, dtype=torch.int64)

    src_batch.append(src_tensors)
    trg_batch.append(trg_tensors)

    # Shows everything accumulated so far, not only the current sample
    print(f"sample: {i}")
    print(f"Source: {src_batch}")
    print(f"Target: {trg_batch}")

sample: 0
Source: [tensor([  230, 24141,    11,     6,    61,    25,    20,   248,  1798,    10,
         2307,     4,  2876,     7, 14661,    29,    56,  4419,  1218,    27])]
Target: [tensor([24141,    11,     6,    61,    25,    20,   248,  1798,    10,  2307,
            4,  2876,     7, 14661,    29,    56,  4419,  1218,    27,     9])]
sample: 1
Source: [tensor([  230, 24141,    11,     6,    61,    25,    20,   248,  1798,    10,
         2307,     4,  2876,     7, 14661,    29,    56,  4419,  1218,    27]), tensor([  248,  1798,    10,  2307,     4,  2876,     7, 14661,    29,    56,
         4419,  1218,    27,     9,  3994,   534,     3,    21,    69,     5])]
Target: [tensor([24141,    11,     6,    61,    25,    20,   248,  1798,    10,  2307,
            4,  2876,     7, 14661,    29,    56,  4419,  1218,    27,     9]), tensor([ 1798,    10,  2307,     4,  2876,     7, 14661,    29,    56,  4419,
         1218,    27,     9,  3994,   534,     3,    21,    69,     5,    15

## Create Custom Collate Function

In [15]:
def collate_function(batch):
    """Turn a batch of (label, text) pairs into padded source/target index tensors.

    For every text in the batch:
      1. tokenise it with `tokenizer`,
      2. cut a random (source, target) token window with `get_sample`,
         using `CONTEXT_SIZE` as the context size,
      3. map the tokens to indices with `vocab` and wrap them in int64 tensors.

    The per-sample tensors are then padded with `PAD_IDX` to a common length.
    The label of each pair is ignored.

    Returns:
        (src_batch, trg_batch): the padded source and target tensors, built
        with ``batch_first = False``.
    """
    sources, targets = [], []

    for _label, review in batch:
        src_tokens, trg_tokens = get_sample(CONTEXT_SIZE, tokenizer(review))

        src_ids = vocab(src_tokens)
        trg_ids = vocab(trg_tokens)

        sources.append(torch.tensor(src_ids, dtype=torch.int64))
        targets.append(torch.tensor(trg_ids, dtype=torch.int64))

    # Pad every sequence up to the longest one in this batch
    padded_src = pad_sequence(sources, batch_first=False, padding_value=PAD_IDX)
    padded_trg = pad_sequence(targets, batch_first=False, padding_value=PAD_IDX)
    return padded_src, padded_trg

## Create DataLoaders

In [16]:
# Both loaders share the same settings: one review per batch, shuffled,
# collated into padded (source, target) tensors by collate_function.
_loader_kwargs = dict(batch_size=1, shuffle=True, collate_fn=collate_function)

train_dataloader = DataLoader(dataset=train_iter, **_loader_kwargs)
test_dataloader = DataLoader(dataset=test_iter, **_loader_kwargs)

## Iterating through Data Samples

In [17]:
# One iterator shared across the loop, so each next() yields a new batch
batch = iter(train_dataloader)

for sample in range(10):
    pair = next(batch)
    src, trg = pair

    # Decode the index tensors back to text for a readable comparison
    print(f"sample: {sample}")
    print(f"Source: {idx_to_eng(src)}")
    print(f"Target: {idx_to_eng(trg)}")

sample: 0
Source: snippets of a group doing a bad jackson family imitation and eddie murphy morphing from little richard to james brown
Target: of a group doing a bad jackson family imitation and eddie murphy morphing from little richard to james brown to
sample: 1
Source: is made out here to be an apparent dumb-red head , but by film ' s end , she is
Target: made out here to be an apparent dumb-red head , but by film ' s end , she is the
sample: 2
Source: ( and mood ) of a wake . and , as an obviously low budget film , the cinematography and
Target: and mood ) of a wake . and , as an obviously low budget film , the cinematography and acting
sample: 3
Source: getting in touch with his inner female , for goof value . but , even such accidental humor is hard
Target: in touch with his inner female , for goof value . but , even such accidental humor is hard to
sample: 4
Source: this movie has to be january jones ' emotionless performance . . . i guess a pretty face really is
Target: movie 

## Positional Embedding

In [40]:
class PositionalEmbedding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A table of shape ``[max_len, 1, emb_dim]`` is precomputed once: even
    embedding columns hold ``sin(pos * den)`` and odd columns hold
    ``cos(pos * den)``, where ``den`` is a decreasing set of frequencies.
    The table is stored as a buffer, so it is not a trainable parameter.

    Args:
        emb_dim: Size of the embedding dimension.
        dropout: Dropout probability applied after adding the positions.
        max_len: Number of positions to precompute; inputs to ``forward``
            must not be longer than this along dimension 0.
    """

    def __init__(self,
                emb_dim: int,
                dropout: float,
                max_len: int = 5000):

        super().__init__()

        # One frequency per even column index: exp(-2i * ln(10000) / emb_dim)
        den = torch.exp(-torch.arange(0, emb_dim, 2) * math.log(10000) / emb_dim)
        # Column vector of positions so that pos * den broadcasts to
        # [max_len, ceil(emb_dim / 2)]
        pos = torch.arange(0, max_len).reshape(max_len, 1)

        pos_embedding = torch.zeros(size = (max_len, emb_dim))

        angles = pos * den
        pos_embedding[:, 0::2] = torch.sin(angles)
        # With an odd emb_dim there is one fewer odd column than even columns,
        # so trim the cosine block to the number of odd columns.
        pos_embedding[:, 1::2] = torch.cos(angles)[:, :emb_dim // 2]

        # pos_embedding shape here: [max_len, emb_dim]

        # Insert a singleton axis before the embedding axis so the table
        # broadcasts over dimension 1 of the input: [max_len, 1, emb_dim]
        pos_embedding = pos_embedding.unsqueeze(dim = -2)

        self.dropout = nn.Dropout(dropout)

        # Positional embedding is a non-learnable tensor: registering it as a
        # buffer keeps it out of the module's parameters.
        self.register_buffer('pos_embedding', pos_embedding)


    def forward(self, token_embedding):
        """Add the first ``token_embedding.size(0)`` position rows, then apply dropout."""
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0)])

## Masking

In [36]:
def create_mask(src_seq_len):
    """Build a square additive causal mask of side `src_seq_len`.

    Entry (i, j) is 0.0 when j <= i and -inf when j > i, i.e. the diagonal
    and everything below it are 0.0 and everything above it is -inf.
    """
    blocked = torch.full((src_seq_len, src_seq_len), float('-inf'))
    # Keep -inf strictly above the diagonal; triu zeroes the rest.
    return torch.triu(blocked, diagonal=1)

In [37]:
# Experiment
# Boolean 3x3 matrix that is True on and below the diagonal
triu = torch.tril(torch.ones(3, 3, dtype=torch.bool))
# 0.0 where True, -inf everywhere else
mask = torch.zeros(3, 3).masked_fill(~triu, float('-inf'))
mask

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])

In [38]:
def generate_mask(src):
    """Return the causal mask and the padding mask for `src`.

    The causal mask comes from `create_mask` and is sized by the first
    dimension of `src`. The padding mask is True wherever `src` equals
    `PAD_IDX`, with dimensions 0 and 1 swapped.
    """
    causal_mask = create_mask(src.shape[0])

    is_pad = src == PAD_IDX
    padding_mask = is_pad.transpose(0, 1)

    return causal_mask, padding_mask

In [39]:
## Test
# Random 5x5 input, used only to inspect the two masks that come back
src_t = torch.rand(size=(5, 5))
m = generate_mask(src=src_t)
m

(tensor([[0., -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf],
         [0., 0., 0., 0., -inf],
         [0., 0., 0., 0., 0.]]),
 tensor([[False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]))