## Importing Libraries

In [43]:
import torchtext
from torchtext.datasets import multi30k,Multi30k
import torch
from typing import Iterable, List
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.nn import Transformer
import math
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import IMDB,PennTreebank
import time



import warnings
def warn(*args,**kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings('ignore')


In [3]:
train_iter, valid_iter = IMDB()

In [4]:
data_iter = iter(train_iter)
next(data_iter)
next(data_iter)
next(data_iter)

(1,
 "If only to avoid making this type of film in the future. This film is interesting as an experiment but tells no cogent story.<br /><br />One might feel virtuous for sitting thru it because it touches on so many IMPORTANT issues but it does so without any discernable motive. The viewer comes away with no new perspectives (unless one comes up with one while one's mind wanders, as it will invariably do during this pointless film).<br /><br />One might better spend one's time staring out a window at a tree growing.<br /><br />")

### Data Processing

* Special Symbols and Indices
    * UNK_IDX
    * PAD_IDX
    * EOS_IDX

* yield tokens
* create vocabs
* Default index for unknown tokens
* Text to index
* Index to en


In [5]:
UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2
special_symbols = ['<unk>', '<pad>', '<|endoftext|>']

In [6]:
tokenizer = get_tokenizer('basic_english')

In [7]:
def yield_tokens(data_iter):
    for _,data_sample in data_iter:
        yield tokenizer(data_sample)


vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials = special_symbols, special_first=True)
vocab.set_default_index(UNK_IDX)

In [10]:
text_to_index = lambda text: [vocab[token] for token in tokenizer(text)]
index_to_english = lambda seq_en:" ".join([vocab.get_itos()[index] for index in seq_en])

In [11]:
index_to_english(torch.tensor([0,1,2]))

'<unk> <pad> <|endoftext|>'

In [14]:
items = list(vocab.get_itos())[:30]
items

['<unk>',
 '<pad>',
 '<|endoftext|>',
 '.',
 'the',
 ',',
 'a',
 'and',
 "'",
 'of',
 'to',
 'is',
 'it',
 'i',
 'in',
 'this',
 'that',
 's',
 'was',
 'movie',
 'for',
 'but',
 'with',
 'as',
 't',
 'film',
 'you',
 ')',
 'on',
 '(']

### Collate Function

In [17]:
def get_sample(block_size, text):
    # block size: Context size, how many tokens/indices the model will look after at a time
    # text: text is the long sequence of indices.
    # Determine the length of the input text
    sample_leg = len(text) # Get the total number of tokens in the text

    # Calculate the stopping point for randomly selecting a sample
    # This ensures the selected sample doesn't exceed the text
    random_sample_stop = sample_leg - block_size
    # You're calculating how far you can go into the text while still being able to extract a full block_size chunk.


    if random_sample_stop >= 1:
        random_start = torch.randint(0,random_sample_stop,size = (1,)).item()

        stop = random_start + block_size

        # Create the input and the target sequence
        src_seq = text[random_start:stop]

        trg_seq = text[random_start+1:stop+1]


    # Handle the case where the text length is exactly equal to the block size or less than that

    elif random_sample_stop<=0:

        random_start = 0
        stop = sample_leg

        src_seq = text[random_start:stop]
        trg_seq = text[random_start+1:stop]

        # Append an empty string to maintain the sequence alignmenbt
        trg_seq.append('<|endoftext>|')


    return src_seq,trg_seq    

In [18]:
BATCH_SIZE = 1
batch_of_tokens = []


for i in range(BATCH_SIZE):
    _,text = next(iter(train_iter))
    batch_of_tokens.append(tokenizer(text))

In [22]:
len(batch_of_tokens)

1

In [23]:
text = batch_of_tokens[0][0:100]
text[0:100]
batch_of_tokens

[['i',
  'rented',
  'i',
  'am',
  'curious-yellow',
  'from',
  'my',
  'video',
  'store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when',
  'it',
  'was',
  'first',
  'released',
  'in',
  '1967',
  '.',
  'i',
  'also',
  'heard',
  'that',
  'at',
  'first',
  'it',
  'was',
  'seized',
  'by',
  'u',
  '.',
  's',
  '.',
  'customs',
  'if',
  'it',
  'ever',
  'tried',
  'to',
  'enter',
  'this',
  'country',
  ',',
  'therefore',
  'being',
  'a',
  'fan',
  'of',
  'films',
  'considered',
  'controversial',
  'i',
  'really',
  'had',
  'to',
  'see',
  'this',
  'for',
  'myself',
  '.',
  'the',
  'plot',
  'is',
  'centered',
  'around',
  'a',
  'young',
  'swedish',
  'drama',
  'student',
  'named',
  'lena',
  'who',
  'wants',
  'to',
  'learn',
  'everything',
  'she',
  'can',
  'about',
  'life',
  '.',
  'in',
  'particular',
  'she',
  'wants',
  'to',
  'focus',
  'her',
  'attentions',
  'to',
  'making',
 

In [24]:
len(text)

100

In [25]:
len(batch_of_tokens[0])

317

In [31]:
block_size = 10
src_seq,trg_seq = get_sample(block_size,text)

In [32]:
src_seq,trg_seq

(['video',
  'store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it'],
 ['store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when'])

In [33]:
print(f"src: {src_seq}")
print(f"trg: {trg_seq}")

src: ['video', 'store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it']
trg: ['store', 'because', 'of', 'all', 'the', 'controversy', 'that', 'surrounded', 'it', 'when']


In [36]:
# Initialize empty lists to store source and target
src_batch,trg_batch = [],[]

# Define batch size
BATCH_SIZE = 2

# Loop to create batches of source and target sequence
for i in range(BATCH_SIZE):
    _, text = next(iter(train_iter))

    # Generate source and target sequences using the get_sample 
    src_sequence_text, tgt_sequence_text = get_sample(block_size,tokenizer(text))


    # Convert source and target sequences to tokenized vocabulary indices
    src_sequence_indices = vocab(src_sequence_text)

    tgt_sequence_indices = vocab(tgt_sequence_text)

    # Convert the sequences to PyTorch Tensors with dtype int64
    src_sequence = torch.tensor(src_sequence_indices)
    tgt_sequence = torch.tensor(tgt_sequence_indices)

    # Append the source and target sequences to their respective batches
    src_batch.append(src_sequence)
    trg_batch.append(tgt_sequence)

    # Print the output for every 2nd sample
    print(f"Sample {i}")
    print(f"Source Sequence (Text): {src_sequence_text}")
    print(f"Source Sequence (Indices): {src_sequence_indices}")
    print(f"Source Sequence (Shape): {src_sequence.shape}")
    print(f"Target Sequence (Text): {tgt_sequence_text}")
    print(f"Target Sequence (Indices): {tgt_sequence_indices}")
    print(f"Target Sequence (Shape): {tgt_sequence.shape}")

Sample 0
Source Sequence (Text): ['seized', 'by', 'u', '.', 's', '.', 'customs', 'if', 'it', 'ever']
Source Sequence (Indices): [17608, 46, 1466, 3, 17, 3, 11063, 51, 12, 124]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['by', 'u', '.', 's', '.', 'customs', 'if', 'it', 'ever', 'tried']
Target Sequence (Indices): [46, 1466, 3, 17, 3, 11063, 51, 12, 124, 608]
Target Sequence (Shape): torch.Size([10])
Sample 1
Source Sequence (Text): ['scenes', 'in', 'his', 'films', '.', 'i', 'do', 'commend', 'the', 'filmmakers']
Source Sequence (Indices): [144, 14, 39, 129, 3, 13, 81, 11638, 4, 839]
Source Sequence (Shape): torch.Size([10])
Target Sequence (Text): ['in', 'his', 'films', '.', 'i', 'do', 'commend', 'the', 'filmmakers', 'for']
Target Sequence (Indices): [14, 39, 129, 3, 13, 81, 11638, 4, 839, 20]
Target Sequence (Shape): torch.Size([10])


In [51]:
BLOCK_SIZE = 30

def collate_batch(batch):

    src_batch,tgt_batch = [], []

    for _,text in batch:
        tokens = tokenizer(text)
        src_tokens,tgt_tokens = get_sample(block_size,tokens)
        src_indices,tgt_indices = vocab(src_tokens),vocab(tgt_tokens)
        src_sequences,tgt_sequences = torch.tensor(src_indices,dtype = torch.int64),torch.tensor(tgt_indices,dtype = torch.int64)
        src_batch.append(src_sequences)
        tgt_batch.append(tgt_sequences)


    src_batch = pad_sequence(src_batch,padding_value = PAD_IDX, batch_first = False)
    tgt_batch = pad_sequence(tgt_batch,padding_value = PAD_IDX, batch_first = False)

    return src_batch.to(DEVICE),tgt_batch.to(DEVICE)

In [52]:
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'
DEVICE

'mps'

In [53]:
BATCH_SIZE = 1
from torch.utils.data import Dataset,DataLoader

dataloader = DataLoader(train_iter,
                       batch_size =BATCH_SIZE,
                       shuffle = True,
                       collate_fn = collate_batch)

val_dataloader = DataLoader(valid_iter,
                       batch_size =BATCH_SIZE,
                       shuffle = True,
                       collate_fn = collate_batch)


## Iterating Through Data Samples

`dataset`, an iterator is formed to run over the dataloader, which in return provide src and tgt pair.

In [57]:
dataset = iter(dataloader)
for sample in range(10):
    src,tgt = next(dataset)
    print("sample",sample)
    print("sorce:",index_to_en(src))
    print("\n")
    print("target:",index_to_en(tgt))
    print("\n")

sample 0
sorce: s keen aspiration to pay homage to ozu on his


target: keen aspiration to pay homage to ozu on his centenary


sample 1
sorce: to be an ankylosaur seriously ? and the tyrannosaur seems


target: be an ankylosaur seriously ? and the tyrannosaur seems rooted


sample 2
sorce: annoying . and the saddest thing is the movie is


target: . and the saddest thing is the movie is too


sample 3
sorce: this move had , the reality was disappointing . while


target: move had , the reality was disappointing . while it


sample 4
sorce: i kept watching this hoping that i could see why


target: kept watching this hoping that i could see why it


sample 5
sorce: mr . dark , what happened to mr . coogan


target: . dark , what happened to mr . coogan on


sample 6
sorce: dont know who wrote the script but i bet they


target: know who wrote the script but i bet they got


sample 7
sorce: . it seems to me whomever made this movie is


target: it seems to me whomever made this movie is

In [55]:
def index_to_en(batch):
    return " ".join([vocab.get_itos()[idx] for idx in batch])

## Masking

In [58]:
def generate_square_subsequent_mask(sz,device = DEVICE):
    mask = (torch.triu(torch.ones((sz,sz),device = device))==1).transpose(0,1)
    mask = mask.float().masked_fill(mask ==0,float ('-inf')).masked_fill(mask ==1, float(0.0))
    return mask

In [67]:
def create_mask(src, device = DEVICE):
    src_seq_len = src.shape[0]
    src_mask = generate_square_subsequent_mask(src_seq_len)
    src_padding_mask = (src ==PAD_IDX).transpose(0,1)
    return src_mask,src_padding_mask

## Positional Encoding

In [89]:
class PositionalEncoding(nn.Module):

    def __init__(self,
                emb_dim: int,
                dropout: float,
                maxlen: int = 5000):

        super().__init__()

        den = torch.exp(-torch.arange(0,emb_dim,2)*math.log(10000)/emb_dim)
        pos = torch.arange(0,maxlen).reshape(maxlen,1)
        pos_embedding = torch.zeros(size = (maxlen,emb_dim))

        pos_embedding[:,0::2] = torch.sin(pos*den)
        pos_embedding[:,1::2] = torch.cos(pos*den)

        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)
        self.register_buffer('pos_embedding', pos_embedding)

    def forward(self,token_embedding):
        return self.dropout(token_embedding +self.pos_embedding[:token_embedding.size(0),:])

## Token Embedding

In [90]:
class TokenEmbedding(nn.Module):

    def __init__(self,vocab_size,emb_dim):

        super().__init__()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.emb_dim = emb_dim

    def forward(self,tokens):

        return self.embedding(tokens.long())* math.sqrt(self.emb_dim)

## Custom GPT model architecture

* Initialization (__init__): embed_size, vocab_size, num_heads,num_layers,max_seq_len and dropout
* lm_head: generating logits over the vocabulary

* **Weight initialization** initializes the weights of the model for better training convergence. The Xavier uniform initialization is used, which is a common practice for initializing weights in deep learning
* **Decoder** Although named `'decoder'`, this method currently functions as the forward pass through the transformer encoder layers, followed by the generation of `logits` for the language modelling task. It handles the addition of positional encodings to the embeddings and applies a mask if necessary.
* **Forward pass**: This method is similar to the `decoder` method, and defines the forward computation of the model. It produces the input through embedding layers, positional encoding, transformer encoder layers, and produces the final output using the `lm_head`
* **Mask Generation:** Both **decoder** and **forward** methods contain logic to generate a square causal mask if no source mask is provided.
mask ensures that the prediction for a position does not depend on the future tokens in the sequence, which is important for the autoregressive nature of gpt models.
* **Commented Out Decoder**: A section of the code is commented out, suggesting an initial design where a transformer decoder layer was considered. However, the final implementation uses only encoder layers, which is a common simplification for models focusing on language modeling and generation

In [91]:
class CustomGPTModel(nn.Module):

    def __init__(self,emb_dim:int,
                vocab_size: int,
                num_heads: int,
                num_layers: int,
                max_seq_len = 500,
                 dropout = 0.1):
        super().__init__()

        self.init_weights()

        self.embed = nn.Embedding(vocab_size, emb_dim)

        self.positional_encoding = PositionalEncoding(emb_dim = emb_dim,
                                                     dropout = dropout)

        print(emb_dim)

        # Remaining layers are part of the TransformerDecoder
        encoder_layers = nn.TransformerEncoderLayer(d_model = emb_dim,
                                                   nhead = num_heads,
                                                   dropout = dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer=encoder_layers,
                                                        num_layers=num_layers)
        self.emb_dim = emb_dim
        self.lm_head = nn.Linear(emb_dim, vocab_size) # produce the final output, the final logits over the vocabulary




    def init_weights(self):
        
        for p in self.parameters():
            if p.dim()>1:
                nn.init.xavier_uniform_(p)


    def create_mask(src, device = DEVICE):
        src_seq_len = src.shape[0]
        src_mask = nn.Transformer.generate_square_subsequent_mask(src_seq_len)
        src_padding_mask = (src ==PAD_IDX).transpose(0,1)
        return src_mask, src_padding_mask

    def decoder(self, x, src_mask):
        seq_len = x.size(0)


        # Add positional embeddings to the input embeddings
        x = self.embed(x)*math.sqrt(self.emb_dim)

        x = self.positional_encoding(x)


        if src_mask is None:
            """Generate a square casual mask for the sequence. The masked positions are filled with -inf and the unmasked postions will be filled with 0"""

            src_mask, src_padding_mask =create_mask(x)

        output = self.transformer_encoder(x,src_mask)
        logits = self.lm_head(output)
        return logits


    def forward(self,x, src_mask = None, key_padding_mask = None):

        seq_length = x.size(0)

        embedding = self.embed(x) * math.sqrt(self.emb_dim)
        x = self.positional_encoding(embedding)

        if src_mask is None:
            src_mask,src_padding_mask = create_mask(x)

        output = self.transformer_encoder(x,src_mask, key_padding_mask)

        x = self.lm_head(x)

        return x
        

## Model Configaration and initializaion
Here, we configure and instantiate a Custom GPT Model with the following specifications:

- `ntokens`: The total number of unique tokens in the vocabulary, which the model will use to represent words.
- `emsize`: The size of each embedding vector. In this model, each word will be represented by a 200-dimensional vector.
- `nlayers`: The number of transformer encoder layers in the model. We are using two layers in this configuration.
- `nhead`: The number of attention heads in the multi-head attention mechanism. The model will use two attention heads.
- `dropout`: A regularization technique where randomly selected neurons are ignored during training to prevent overfitting. Here, we set the dropout probability to 0.2.

After setting these hyperparameters, we create an instance of `CustomGPTModel` by passing in the embedding size, number of attention heads, number of layers, vocabulary size, and dropout probability. The model is then moved to the specified `DEVICE`, which could be a CPU or GPU, for training or inference.


In [92]:
ntokens = len(vocab)
emb_dim = 200 
nlayers = 2 
nhead = 2
dropout = 0.2

model = CustomGPTModel(emb_dim = emb_dim,
                      vocab_size = ntokens,
                      num_heads=nhead,
                      num_layers=nlayers,
                      dropout = dropout).to(DEVICE)

200


## Prompting

In [93]:
def encode_prompt(prompt, block_size = BLOCK_SIZE):

    # Handle none prompt
    while prompt is None:
        prompt = input("Sorry, prompt cannot be empty. Please enter a valid prompt")

    tokens = tokenizer(prompt)
    number_of_tokens = len(tokens)

    # Handle long prompt
    if number_of_tokens>block_size:
        tokens = tokens[-block_size:]

    prompt_indices = vocab(tokens)

    prompt_encoded = torch.tensor(prompt_indices,dtype = torch.int64).reshape(-1,1)
    

    return prompt_encoded

In [94]:
print(index_to_en(encode_prompt(None)))

Sorry, prompt cannot be empty. Please enter a valid prompt Neyman is the best footballer in the world


<unk> is the best footballer in the world


In [98]:
print(encode_prompt("This is my last message to you all"))

tensor([[ 15],
        [ 11],
        [ 72],
        [256],
        [806],
        [ 10],
        [ 26],
        [ 40]])


In [112]:
prompt_encoded = encode_prompt("Neymar is the best").to(DEVICE)
prompt_encoded

tensor([[  0],
        [ 11],
        [  4],
        [178]], device='mps:0')

In [113]:
logits = model.decoder(prompt_encoded,src_mask=None).to(DEVICE)

In [114]:
logits.shape

torch.Size([4, 1, 68813])

In [115]:
logits = logits.transpose(0,1)
logits.shape

torch.Size([1, 4, 68813])

In [116]:
logit_prediction = logits[:,-1]
logit_prediction.shape

torch.Size([1, 68813])

In [117]:
_,next_word_index = torch.max(logit_prediction, dim = 1)
next_word_index

tensor([52875], device='mps:0')

In [118]:
index_to_en(next_word_index)

'lesser-sitcom'

## Autoregressive text generation

In [119]:
prompt = "This is the beginning of"

In [121]:
prompt_encoded = encode_prompt(prompt).to(DEVICE)
print("Device for prompt encoded:",prompt_encoded.shape)

Device for prompt encoded: torch.Size([5, 1])


In [122]:
max_new_tokens = 10

for i in range(max_new_tokens):

    logits = model.decoder(prompt_encoded,src_mask = None)
    logits = logits.transpose(0,1)

    print(" ")
    print(f"Shape of logits at step {i} {logits.shape}")

    logit_prediction = logits[:,-1]
    print(f"Shape of logit prediction at step {i}: {logit_prediction.shape}")

    next_token_encoded = torch.argmax(logit_prediction,dim = 1).reshape(-1,1)
    print(f"Shape of next token encoded at step {i} : {next_token_encoded.shape}")

    prompt_encoded = torch.cat((prompt_encoded, next_token_encoded),dim = 0).to(DEVICE)
    print(f"Sequence for step{i}: {[index_to_en(j) for j in prompt_encoded]}")
    print(f"Shape: {prompt_encoded.shape}")

 
Shape of logits at step 0 torch.Size([1, 5, 68813])
Shape of logit prediction at step 0: torch.Size([1, 68813])
Shape of next token encoded at step 0 : torch.Size([1, 1])
Sequence for step0: ['this', 'is', 'the', 'beginning', 'of', '*martin']
Shape: torch.Size([6, 1])
 
Shape of logits at step 1 torch.Size([1, 6, 68813])
Shape of logit prediction at step 1: torch.Size([1, 68813])
Shape of next token encoded at step 1 : torch.Size([1, 1])
Sequence for step1: ['this', 'is', 'the', 'beginning', 'of', '*martin', 'voyeuristic']
Shape: torch.Size([7, 1])
 
Shape of logits at step 2 torch.Size([1, 7, 68813])
Shape of logit prediction at step 2: torch.Size([1, 68813])
Shape of next token encoded at step 2 : torch.Size([1, 1])
Sequence for step2: ['this', 'is', 'the', 'beginning', 'of', '*martin', 'voyeuristic', 'rickety']
Shape: torch.Size([8, 1])
 
Shape of logits at step 3 torch.Size([1, 8, 68813])
Shape of logit prediction at step 3: torch.Size([1, 68813])
Shape of next token encoded at s

In [123]:
UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2

# Make sure the tokens are in order of their indices to properly insert them in vocab
special_symbols = ['<unk>','<pad>','<|endoftext|>']
BLOCK_SIZE

30

In [None]:
# auto regressive language model text generation
def generate(model, prompt = None, max_new_tokens = 500, block_size = BLOCK_SIZE,vocab = vocab, tokenizer = tokenizer):
    model.to(DEVICE)

    # Encode the input prompt
    prompt_encoded = encode_prompt(prompt).to(DEVICE) # return the indices number in a tensor form

    tokens = []

    # Generate new tokens up to max_new_tokens
    for _ in range(max_new_tokens):

        # Decode the input prompt using the provided encode_prompt function
        logits = model(prompt_encoded, src_mask = None, key_padding_mask = None)

        # Transpose the logits to bring the sequence length to the first diemnsion
        logits = logits.transpose(0,1)

        # select the logits of the last token in the sequence
        logit_prediction = logits[:,-1]

        # choose the most probable next token from the logits(greedy decoding)
        next_token_encoded = torch.argmax(logit_prediction, dim = -1).reshape(-1,1)

        # if the next token is the end of sequence(EOS) token, stop generation
        if next_token_encoded.item() == EOS_IDX:
            break

        # Append the next token to the prompt encoded and keep only the last 'block_size' tokens
        prompt_encoded = torch.cat((prompt_encoded, next_token_encoded),dim = 0) [-block_size:]

        # Convert the next token index to a token string using c
        

    

## Decoder model

In [124]:
# load the dataset
# import the libraries
import torch
import torchtext
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer
from torch.utils.data import Dataset,DataLoader


In [126]:
## collect the dataset
from torchtext.datasets import IMDB

In [127]:
train_iter, test_iter = IMDB()

In [128]:
train_data_sample = next(iter(train_iter))
train_data_sample

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

In [130]:
def yield_tokens(texts):
    for _,text in texts:
        yield tokenizer(text)

In [131]:
tokens = yield_tokens(train_iter)

In [32]:
#next(tokens)

In [138]:
# Create vocabulary from the tokens
vocab = build_vocab_from_iterator(yield_tokens(train_iter))


In [140]:
len(vocab.get_itos())

68810

In [None]:
# Create the tokens of the whole dataset
# get the vocab 
# create the embedding
class Embedding(nn.Module):
    def __i

In [60]:
one_t = torch.ones(3,3)
one_t

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])

In [61]:
upper_t = torch.triu(one_t)
upper_t

tensor([[1., 1., 1.],
        [0., 1., 1.],
        [0., 0., 1.]])

In [62]:
tru = upper_t==1
tru

tensor([[ True,  True,  True],
        [False,  True,  True],
        [False, False,  True]])

In [65]:
tru_trans = tru.transpose(0,1)
tru_trans

tensor([[ True, False, False],
        [ True,  True, False],
        [ True,  True,  True]])

## Decoder WorkPipeline

In [1]:
### Importing Libraries

In [38]:
import torch
import torchtext
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import DataLoader,Dataset
from torchtext.datasets import Multi30k,multi30k
from torchtext.vocab import build_vocab_from_iterator
from torchtext.data.utils import get_tokenizer


import nltk
from torch.nn.utils.rnn import pad_sequence

import matplotlib.pyplot as plt


import warnings
def warn(*args,**kwargs):
    pass
warnings.warn = warn()
warnings.filterwarnings('ignore')

## Datasets

In [11]:
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB()

In [26]:
label,sample = next(iter(train_iter))

In [27]:
label,sample

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

The dataset contains reviews about a number a movies. Reviewers labelled the movie as 1 or 0. while 1 indicates that the movie review is tend to positive and 0 indicates that the movie review is not that good


In [30]:
label_list =set([label for label,_ in train_iter])

    


In [31]:
label_list

{1}

## Data Processing

* Padding
* Unk_idx, pad_idx,
* index to text
* text to idx

In [None]:
UNK_IDX,PAD_IDX,EOS_IDX = []

In [33]:
# Declare tokenizer
tokenizer = get_tokenizer('basic_english')
## Tokenization
def yield_token(dataset):
    for label,text in dataset:
        yield tokenizer(text)

In [37]:
train_tokens = yield_token(train_iter)

In [36]:
# Create Vocabulary
vocab = build_vocab_from_iterator()

['i',
 'rented',
 'i',
 'am',
 'curious-yellow',
 'from',
 'my',
 'video',
 'store',
 'because',
 'of',
 'all',
 'the',
 'controversy',
 'that',
 'surrounded',
 'it',
 'when',
 'it',
 'was',
 'first',
 'released',
 'in',
 '1967',
 '.',
 'i',
 'also',
 'heard',
 'that',
 'at',
 'first',
 'it',
 'was',
 'seized',
 'by',
 'u',
 '.',
 's',
 '.',
 'customs',
 'if',
 'it',
 'ever',
 'tried',
 'to',
 'enter',
 'this',
 'country',
 ',',
 'therefore',
 'being',
 'a',
 'fan',
 'of',
 'films',
 'considered',
 'controversial',
 'i',
 'really',
 'had',
 'to',
 'see',
 'this',
 'for',
 'myself',
 '.',
 'the',
 'plot',
 'is',
 'centered',
 'around',
 'a',
 'young',
 'swedish',
 'drama',
 'student',
 'named',
 'lena',
 'who',
 'wants',
 'to',
 'learn',
 'everything',
 'she',
 'can',
 'about',
 'life',
 '.',
 'in',
 'particular',
 'she',
 'wants',
 'to',
 'focus',
 'her',
 'attentions',
 'to',
 'making',
 'some',
 'sort',
 'of',
 'documentary',
 'on',
 'what',
 'the',
 'average',
 'swede',
 'thought',
