## Importing Library

In [36]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from torchtext.datasets import IMDB
import time
import random
import warnings
def warn(*args,**kwargs):
    pass

warnings.warn = warn()
warnings.filterwarnings('ignore')

## Dataset

In [24]:
train_iter,test_iter = IMDB()

label,text_list = [],[]
test_label,test_text_list = [],[]
start_time = time.time()
for _,text in train_iter:
    label.append(_)
    text_list.append(text)

print(len(label))
print(list(set(label)))
print(f"Total sample:{len(text_list)}")


for _,text in test_iter:
    test_label.append(_)
    test_text_list.append(text)

end_time = time.time()
duration = end_time-start_time
print(len(test_label))
print(list(set(test_label)))
print(f"Total sample in testdataset:{len(test_text_list)}")
print(f"Time required: {duration:.2f} seconds")

12500
[1]
Total sample:12500
25000
[1, 2]
Total sample in testdataset:25000
Time required: 1.00 seconds


The training dataset has 12500 text block and the test dataset has 25000 text block

In [22]:
_,text = next(iter(train_iter))
_test,text_test = next(iter(test_iter))

print(f"First Train Text example: {text}\n")
print(f"First Test Text example: {text_test}")

First Train Text example: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes ar

## Data Processing

In [26]:
## Create tokens
tokenizer = get_tokenizer('basic_english')

UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2

special_symbols = ['<unk>','<pad>','<|endoftext|>']

vocab = build_vocab_from_iterator(map(tokenizer,[text for _,text in train_iter]), specials = special_symbols,special_first = True)


In [27]:
len(vocab)

68813

In [29]:
vocab['drink']

2435

In [32]:
## Sample accumulation
"""In decoder you have to define at how many blocks the model will look at once, This could be mentioned as context."""

def get_sample(context_size, text):
    """Context_size: The number of tokens the model will look at once
    text: the whole text_tokens, From where the model will retrieve the tokens and assign to the source and target"""
    sample_len = len(text)

    src,trg = [],[]
    if sample_len>context_size:
        start = torch.randint(low = 0,high = sample_len-context_size, size = (1,)).item()
        end = start+context_size

        src = text[start:end]
        trg = text[start+1:end+1]


    elif sample_len< context_size:
        start = torch.randint(0,sample_len,size = (1,)).item()
        end = start+context_size

        src = text[start:end]
        trg = text[start+1:end]

        trg.append('<|endoftext|>')

    return src, trg
        

In [33]:
BATCH_SIZE = 2
CONTEXT_SIZE = 20
for _ in range(BATCH_SIZE):
    _,text = next(iter(train_iter))

    src,trg = get_sample(context_size = CONTEXT_SIZE, text = tokenizer(text))

    print(f"Sample: {_}")
    print(f"Source: {src}")
    print(f"Target: {trg}")
    

Sample: 1
Source: ['to', 'enter', 'this', 'country', ',', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', 'controversial', 'i', 'really', 'had', 'to', 'see', 'this', 'for']
Target: ['enter', 'this', 'country', ',', 'therefore', 'being', 'a', 'fan', 'of', 'films', 'considered', 'controversial', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myself']
Sample: 1
Source: ['the', 'controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', 'that', 'at', 'first', 'it']
Target: ['controversy', 'that', 'surrounded', 'it', 'when', 'it', 'was', 'first', 'released', 'in', '1967', '.', 'i', 'also', 'heard', 'that', 'at', 'first', 'it', 'was']


## INDEX TO ENGLISH & ENGLISH TO INDEX

In [41]:
idx_to_eng = lambda seq : " ".join([vocab.get_itos()[idx] for idx in seq])
eng_to_idx = lambda text: [vocab[token] for token in tokenizer(text)]

## Create Custom Collate Function

In [37]:
def collate_function(batch):
    """The collate_batch function prepares batches of source and target sequences for training by processing each text sample in a given batch. 
    It generates source and target sequences using the get_sample function with a specified block size, converts these sequences to indices using a vocabulary, and transforms them into PyTorch tensors. 
    The sequences are then padded to ensure uniform length across the batch. Finally, it returns the padded source and target batches, ready for training on the specified device (DEVICE)."""

    src_batch,trg_batch = [],[]
    for _,text in batch:

        token_text = tokenizer(text)

        src,trg = get_sample(context_size,token_text)

        src_indices, trg_indices = vocab(src),vocab(trg)

        src_seq,trg_seq = torch.tensor(src_indices,dtype = torch.int64),torch.tensor(trg_indices,dtype = torch.int64)

        
        src_batch.append(src_seq)
        trg_batch.append(trg_seq)



    src_batch = pad_sequence(src_batch,padding_value = PAD_IDX, batch_first = False)
    trg_batch = pad_sequence(trg_batch,padding_value = PAD_IDX, batch_first = False)
    return src_batch,trg_batch

## Create DataLoaders

In [None]:
train_dataloader = DataLoader(dataset = train_iter,
                             batch_size = 1,
                             shuffle = True,
                             collate_fn = collate_ba)

## Iterating through Data sample