## Importing Library

In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset,DataLoader
from torch.nn.utils.rnn import pad_sequence
import torchtext
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

from torchtext.datasets import IMDB
import time
import random
import math
import warnings
def warn(*args,**kwargs):
    pass

warnings.warn = warn
warnings.filterwarnings('ignore')

## Dataset

In [2]:
## Loading dataset
train_iter,test_iter = IMDB()
train_iter,test_iter

(ShardingFilterIterDataPipe, ShardingFilterIterDataPipe)

In [3]:
# Quantitative and qualitative information about the training data
label,text_list = [],[]
test_label,test_text_list = [],[]
start_time = time.time()
for _,text in train_iter:
    label.append(_)
    text_list.append(text)

print(len(label))
print(list(set(label)))
print(f"Total sample:{len(text_list)}")


for _,text in test_iter:
    test_label.append(_)
    test_text_list.append(text)

end_time = time.time()
duration = end_time-start_time
print(len(test_label))
print(list(set(test_label)))
print(f"Total sample in testdataset:{len(test_text_list)}")
print(f"Time required: {duration:.2f} seconds")

12500
[1]
Total sample:12500
25000
[1, 2]
Total sample in testdataset:25000
Time required: 0.92 seconds


The training dataset has 12500 text block and the test dataset has 25000 text block

In [4]:
_,text = next(iter(train_iter))
_test,text_test = next(iter(test_iter))

print(f"First Train Text example: {text}\n")
print(f"First Test Text example: {text_test}")

First Train Text example: I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes ar

## Data Processing

In [5]:
## Create tokens
tokenizer = get_tokenizer('basic_english')

UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2

special_symbols = ['<unk>','<pad>','<|endoftext|>']

vocab = build_vocab_from_iterator(map(tokenizer,[text for _,text in train_iter]), specials = special_symbols,special_first = True)


In [6]:
len(vocab)

68813

In [7]:
vocab['drink']

2435

In [8]:
## Sample accumulation
"""In decoder you have to define at how many blocks the model will look at once, This could be mentioned as context."""

def get_sample(context_size, text):
    """Context_size: The number of tokens the model will look at once
    text: the whole text_tokens, From where the model will retrieve the tokens and assign to the source and target"""
    sample_len = len(text)

    src,trg = [],[]
    if (sample_len-context_size)>=1:
        start = torch.randint(low = 0,high = sample_len-context_size, size = (1,)).item()
        end = start+context_size

        src = text[start:end]
        trg = text[start+1:end+1]


    elif (sample_len-context_size)<=0:
        start = torch.randint(0,sample_len,size = (1,)).item()
        end = start+context_size

        src = text[start:end]
        trg = text[start+1:end]

        trg.append('<|endoftext|>')

    return src, trg
        

In [9]:
BATCH_SIZE = 2
CONTEXT_SIZE = 20
for _ in range(BATCH_SIZE):
    _,text = next(iter(train_iter))

    src,trg = get_sample(context_size = CONTEXT_SIZE, text = tokenizer(text))

    print(f"Sample: {_}")
    print(f"Source: {src}")
    print(f"Target: {trg}")
    

Sample: 1
Source: ['to', 'learn', 'everything', 'she', 'can', 'about', 'life', '.', 'in', 'particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making', 'some', 'sort']
Target: ['learn', 'everything', 'she', 'can', 'about', 'life', '.', 'in', 'particular', 'she', 'wants', 'to', 'focus', 'her', 'attentions', 'to', 'making', 'some', 'sort', 'of']
Sample: 1
Source: ['years', 'ago', ',', 'this', 'was', 'considered', 'pornographic', '.', 'really', ',', 'the', 'sex', 'and', 'nudity', 'scenes', 'are', 'few', 'and', 'far', 'between']
Target: ['ago', ',', 'this', 'was', 'considered', 'pornographic', '.', 'really', ',', 'the', 'sex', 'and', 'nudity', 'scenes', 'are', 'few', 'and', 'far', 'between', ',']


## INDEX TO ENGLISH & ENGLISH TO INDEX

In [10]:
idx_to_eng = lambda seq : " ".join([vocab.get_itos()[idx] for idx in seq])
eng_to_idx = lambda text: [vocab[token] for token in tokenizer(text)]

In [11]:
BATCH_SIZE  = 5
CONTEXT_SIZE = 20
src_batch,trg_batch = [],[]

for i in range(BATCH_SIZE):
    _,text = next(iter(train_iter)) # Take the first text sample of the training data
    src,trg = get_sample(context_size = CONTEXT_SIZE,text = tokenizer(text))
    src_vocab,trg_vocab = vocab(src),vocab(trg)
    src_tensors,trg_tensors = torch.tensor(src_vocab,dtype = torch.int64),torch.tensor(trg_vocab,dtype = torch.int64)

    src_batch.append(src_tensors),trg_batch.append(trg_tensors)

    print(f"sample: {i}")
    print(f"Source: {src_batch}")
    print(f"Target: {trg_batch}")

sample: 0
Source: [tensor([   58, 12567,  3389,     5,  5375,    80,  1311,    10,    61,   186,
          494,   386,  1978,     5,    70,   338,   144,    14,    39,   129])]
Target: [tensor([12567,  3389,     5,  5375,    80,  1311,    10,    61,   186,   494,
          386,  1978,     5,    70,   338,   144,    14,    39,   129,     3])]
sample: 1
Source: [tensor([   58, 12567,  3389,     5,  5375,    80,  1311,    10,    61,   186,
          494,   386,  1978,     5,    70,   338,   144,    14,    39,   129]), tensor([   25,    20,   248,  1798,    10,  2307,     4,  2876,     7, 14661,
           29,    56,  4419,  1218,    27,     9,  3994,   534,     3,    21])]
Target: [tensor([12567,  3389,     5,  5375,    80,  1311,    10,    61,   186,   494,
          386,  1978,     5,    70,   338,   144,    14,    39,   129,     3]), tensor([   20,   248,  1798,    10,  2307,     4,  2876,     7, 14661,    29,
           56,  4419,  1218,    27,     9,  3994,   534,     3,    21,    69

## Create Custom Collate Function

In [12]:
def collate_function(batch):
    """The collate_batch function prepares batches of source and target sequences for training by processing each text sample in a given batch. 
    It generates source and target sequences using the get_sample function with a specified block size, converts these sequences to indices using a vocabulary, and transforms them into PyTorch tensors. 
    The sequences are then padded to ensure uniform length across the batch. Finally, it returns the padded source and target batches, ready for training on the specified device (DEVICE)."""

    src_batch,trg_batch = [],[]
    for _,text in batch:

        token_text = tokenizer(text)

        src,trg = get_sample(CONTEXT_SIZE,token_text)

        src_indices, trg_indices = vocab(src),vocab(trg)

        src_seq,trg_seq = torch.tensor(src_indices,dtype = torch.int64),torch.tensor(trg_indices,dtype = torch.int64)

        
        src_batch.append(src_seq)
        trg_batch.append(trg_seq)



    src_batch = pad_sequence(src_batch,padding_value = PAD_IDX, batch_first = False)
    trg_batch = pad_sequence(trg_batch,padding_value = PAD_IDX, batch_first = False)
    return src_batch,trg_batch

## Create DataLoaders

In [13]:
train_dataloader = DataLoader(dataset = train_iter,
                             batch_size = 1,
                             shuffle = True,
                             collate_fn = collate_function)
test_dataloader = DataLoader(dataset = test_iter,
                            batch_size = 1,
                            shuffle = True,
                            collate_fn = collate_function)

## Iterating through Data sample

In [14]:
batch = iter(train_dataloader)

for sample in range(10):
    src,trg = next(batch)

    print(f"sample: {sample}")
    print(f"Source: {idx_to_eng(src)}")
    print(f"Target: {idx_to_eng(trg)}")

sample: 0
Source: . and saying that it is the same director as the boogeyman , when a new version of that just
Target: and saying that it is the same director as the boogeyman , when a new version of that just came
sample: 1
Source: cop tying the authority , the criminals and the police together - check . slow motion and/or jerky frame rates
Target: tying the authority , the criminals and the police together - check . slow motion and/or jerky frame rates for
sample: 2
Source: , is the pretentious way the episodes are titled . truly great shows are still funny after many , repeated
Target: is the pretentious way the episodes are titled . truly great shows are still funny after many , repeated viewings
sample: 3
Source: something as an outsider . that may be part of the reason for my disappointment . i was expecting more
Target: as an outsider . that may be part of the reason for my disappointment . i was expecting more action
sample: 4
Source: aware of , such as 1 ) does money buy happi

## Device Agnostic Code

In [15]:
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cpu'
DEVICE

'mps'

## Positional Embedding

In [16]:
class PositionalEmbedding(nn.Module):

    def __init__(self,
                emb_dim: int,
                dropout: float,
                max_len = 5000):

        super().__init__()

        den = torch.exp(-torch.arange(0,emb_dim,2)*math.log(10000)/emb_dim)
        pos  = torch.arange(0,max_len).reshape(max_len,1)

        pos_embedding = torch.zeros(size = (max_len,emb_dim))


        pos_embedding[:,0::2] = torch.sin(pos*den)
        pos_embedding[:,1::2] = torch.cos(pos*den)


        # Pos_embedding_shape = [seq_len,emb_dim]

        # add the batch_size to the sequence
        pos_embedding = pos_embedding.unsqueeze(dim = -2)

        self.dropout = nn.Dropout(dropout)

        # Positional embedding is a non-learnable parameter. It won't be updated with time
        
        self.register_buffer('pos_embedding',pos_embedding)


    def forward(self,token_embedding):
        return self.dropout(token_embedding + self.pos_embedding[:token_embedding.size(0)])

## Masking

In [17]:
def create_mask(src_seq_len):

    mask = (torch.triu(torch.ones(size = (src_seq_len,src_seq_len)))==1).transpose(0,1)
    mask = mask.float().masked_fill(mask==1,float(0.0)).masked_fill(mask ==0,float('-inf'))

    return mask

In [18]:
# Experiment
triu = torch.triu(torch.ones(size = (3,3))==1).transpose(0,1)
mask = triu.float().masked_fill(triu==1,float(0.0)).masked_fill(triu ==0,float('-inf'))
mask

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])

In [19]:
def generate_mask(src):
    src_seq_len = src.shape[0]
    src_mask = create_mask(src_seq_len)
    src_padding_mask = (src==PAD_IDX).transpose(0,1)

    return src_mask, src_padding_mask

In [20]:
## Test
src_t = torch.rand(5,5)
m =generate_mask(src_t)
m

(tensor([[0., -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf],
         [0., 0., 0., 0., -inf],
         [0., 0., 0., 0., 0.]]),
 tensor([[False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]))

## Custom GPT Model Architecture

In [21]:
class CustomGPTModel(nn.Module):

    def __init__(self,
                emb_dim: int,
                vocab_size: int,
                num_head: int,
                num_layers: int,
                max_seq_len = 500,
                dropout = 0.1):
        super().__init__()

        self.init_weights()

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.positional_encoding = PositionalEmbedding(emb_dim,dropout,max_len = max_seq_len)
        self.emb_dim  = emb_dim

        # Encoder layers
        encoder_layers = nn.TransformerEncoderLayer(d_model = emb_dim,nhead = num_head,dropout = dropout)

        self.transformer_encoder = nn.TransformerEncoder(encoder_layer = encoder_layers,num_layers = num_layers)

        self.lm_head = nn.Linear(emb_dim,vocab_size) # produce the final outputs, the final logits over vocabulary


    def init_weights(self):

        for p in self.parameters():
            if p.dim>1:
                nn.init_xavier_uniform_(p)


    def create_mask(self,source):
        src_seq_len = source.shape[0]

        src_mask = nn.Transformer.generate_square_subsequent_mask(src_seq_len)

        src_padding_mask = (src==PAD_IDX).transpose(0,1)

        return src_mask, src_padding_mask

    def decoder(self,x,src_mask):
        

        x = x.to(DEVICE)

        seq_len = x.shape[0]


        # Add positional encodding to the input embedding
        x = self.embedding(x)*math.sqrt(self.emb_dim)

        x = self.positional_encoding(x)

        if src_mask is None:
            src_mask, src_padding_mask = generate_mask(x)


        output = self.transformer_encoder(x,src_mask)

        logits = self.lm_head(x)

        return logits



    def forward(self,x,src_mask= None, key_padding_mask = None):

        x.to(DEVICE)

        seq_length = x.shape[0]

        # add positional embeddings to the embedding
        x = self.embedding(x)*math.sqrt(self.emb_dim)
        x = self.positional_encoding(x)

        if src_mask is None:
            src_mask, src_padding_mask = generate_mask(x)


        output = self.transformer_encoder(x,src_mask, key_padding_mask)

        x = self.lm_head(x)

        return x
        

        

In [22]:
emb_dim = 500
num_head = 2
num_layers = 2
vocab_size = len(vocab)

model = CustomGPTModel(emb_dim = emb_dim,
                       vocab_size = vocab_size,
                       num_head = num_head,
                       num_layers = num_layers).to(DEVICE)

## Prompting

#### PARAMETERS

In [23]:
BLOCK_SIZE = 20


In [24]:

def prompting(prompt=None,block_size=20):

    while prompt is None:
        prompt = input("Prompt can't be empty. Please enter a valid prompt")

    prompt_tokens = tokenizer(prompt)

    if len(prompt_tokens)>block_size:
        prompt_tokens = prompt_tokens[-block_size:]


    prompt_indices = vocab(prompt_tokens)
    prompt_tensors = torch.tensor(prompt_indices,dtype = torch.int64).reshape(-1,1).to(DEVICE)

    return prompt_tensors
    

In [25]:
prompt = prompting()
prompt

Prompt can't be empty. Please enter a valid prompt football is


tensor([[2233],
        [  11]], device='mps:0')

In [26]:
prompt_tensor = prompting('The sky is')
prompt_tensor

tensor([[   4],
        [2290],
        [  11]], device='mps:0')

## Output Check

In [27]:
def output(prompt):
    prompt_logit = prompting(prompt) # prompt_shape : [seq_len,batch_size]
    logits = model(prompt_logit)

    print(f"output_shape:{logits.shape}")

    logits = logits.transpose(0,1)

    logit_prediction = logits[:,-1]
    print(f"logit prediction dimension:{logit_prediction.shape}")

    next_token_encoded = torch.argmax(logit_prediction,dim = -1).reshape(-1,1)

    return next_token_encoded


result = output("The sky is")
predicted_word = idx_to_eng(result)
predicted_word

output_shape:torch.Size([3, 1, 68813])
logit prediction dimension:torch.Size([1, 68813])


'concept--the'

## Autoregressive Text Generation

In [28]:
# Declaring prompt 
prompt = "The sky is"
prompt_tokens = prompting(prompt)
## By using model output
max_new_tokens = 20 # how many words you may allow your model to generate

for i in range(max_new_tokens):
    

    logit = model(prompt_tokens)
    logit_reshape = logit.transpose(0,1)

    logit_prediction = logit_reshape[:,-1]

    next_token_encoded = torch.argmax(logit_prediction, dim = -1).reshape(-1,1)

    prompt_tokens = torch.cat((prompt_tokens,next_token_encoded),dim =0)

    print(f"input_prompt_shape: {prompt_tokens.shape}")
    print(f"logit shape: {logit.shape}")
    print(f"next_token_shape: {next_token_encoded.shape}")
    print(f"output:{" ".join([idx_to_eng(tokens) for tokens in prompt_tokens])}")

input_prompt_shape: torch.Size([4, 1])
logit shape: torch.Size([3, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is concept--the
input_prompt_shape: torch.Size([5, 1])
logit shape: torch.Size([4, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is concept--the terpsichorean
input_prompt_shape: torch.Size([6, 1])
logit shape: torch.Size([5, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is concept--the terpsichorean bangster
input_prompt_shape: torch.Size([7, 1])
logit shape: torch.Size([6, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is concept--the terpsichorean bangster acturly
input_prompt_shape: torch.Size([8, 1])
logit shape: torch.Size([7, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is concept--the terpsichorean bangster acturly spanish
input_prompt_shape: torch.Size([9, 1])
logit shape: torch.Size([8, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is concept--the terpsichorean ban

In [29]:
# By using model decoder
# Declaring prompt 
prompt = "The sky is"
prompt_tokens = prompting(prompt)
## By using model output
max_new_tokens = 20 # how many words you may allow your model to generate

for i in range(max_new_tokens):
    

    logit = model.decoder(prompt_tokens,src_mask = None)
    logit_reshape = logit.transpose(0,1)

    logit_prediction = logit_reshape[:,-1]

    next_token_encoded = torch.argmax(logit_prediction, dim = -1).reshape(-1,1)

    prompt_tokens = torch.cat((prompt_tokens,next_token_encoded),dim =0)

    print(f"input_prompt_shape: {prompt_tokens.shape}")
    print(f"logit shape: {logit.shape}")
    print(f"next_token_shape: {next_token_encoded.shape}")
    print(f"output:{" ".join([idx_to_eng(tokens) for tokens in prompt_tokens])}")

input_prompt_shape: torch.Size([4, 1])
logit shape: torch.Size([3, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is gangbanger
input_prompt_shape: torch.Size([5, 1])
logit shape: torch.Size([4, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is gangbanger henriksons
input_prompt_shape: torch.Size([6, 1])
logit shape: torch.Size([5, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is gangbanger henriksons celebrated
input_prompt_shape: torch.Size([7, 1])
logit shape: torch.Size([6, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is gangbanger henriksons celebrated tehran
input_prompt_shape: torch.Size([8, 1])
logit shape: torch.Size([7, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is gangbanger henriksons celebrated tehran gq
input_prompt_shape: torch.Size([9, 1])
logit shape: torch.Size([8, 1, 68813])
next_token_shape: torch.Size([1, 1])
output:the sky is gangbanger henriksons celebrated tehran gq crummier
i

# Generation function

In [30]:

def generate(model, prompt=None, max_new_tokens = 15, block_size = BLOCK_SIZE, vocab= vocab, tokenizer=tokenizer):

    model.to(DEVICE)

    prompt_encoded = prompting(prompt).to(DEVICE)
    

    for i in range(max_new_tokens):

        logits = model(prompt_encoded).transpose(0,1)

        logit_prediction = logits[:,-1]

        encoded_logit = torch.argmax(logit_prediction, dim = -1).reshape(-1,1)


        # if the next token is end of text, then stop the generation
        if encoded_logit == EOS_IDX:
            break

        prompt_encoded = torch.cat((prompt_encoded,encoded_logit),dim = 0)[-block_size:]


        print(f"prompt_encoded_shape: {prompt_encoded.shape}")

        

    tokens=[idx_to_eng(tokens) for  tokens in prompt_encoded]

    return " ".join(tokens)

In [31]:
prompt_output = generate(model,prompt = "My love")

prompt_encoded_shape: torch.Size([3, 1])
prompt_encoded_shape: torch.Size([4, 1])
prompt_encoded_shape: torch.Size([5, 1])
prompt_encoded_shape: torch.Size([6, 1])
prompt_encoded_shape: torch.Size([7, 1])
prompt_encoded_shape: torch.Size([8, 1])
prompt_encoded_shape: torch.Size([9, 1])
prompt_encoded_shape: torch.Size([10, 1])
prompt_encoded_shape: torch.Size([11, 1])
prompt_encoded_shape: torch.Size([12, 1])
prompt_encoded_shape: torch.Size([13, 1])
prompt_encoded_shape: torch.Size([14, 1])
prompt_encoded_shape: torch.Size([15, 1])
prompt_encoded_shape: torch.Size([16, 1])
prompt_encoded_shape: torch.Size([17, 1])


In [32]:
prompt_output

'my love burbridge bad-hairdewed performances stalky bauchau bolivia millimeter tasked quip perú fart-jokes pick-pockets slurp naïf use'

In [33]:
main_lis =[]
tokens = torch.randint(0,20,size = (20,1))
main_lis.append([idx_to_eng(tok) for tok in tokens])
main_lis

[['was',
  'a',
  '<pad>',
  'to',
  'movie',
  's',
  '.',
  '<pad>',
  'the',
  'that',
  ',',
  'i',
  's',
  '<|endoftext|>',
  'is',
  'that',
  'that',
  '.',
  '<|endoftext|>',
  '.']]

## Training & Testing

The main difference in training and inferencing lies in the inputs to the decoder.
during training, the decoder has the access 0f the ground truth (receiving the exact target sequence tokens incrementally through a technique known as `teacher forcing`)

In [34]:
src,trg = next(iter(train_dataloader))


In [35]:
src,src.shape,trg.shape,trg

(tensor([[    5],
         [  835],
         [ 7090],
         [    5],
         [  291],
         [ 3064],
         [    4],
         [  203],
         [    9],
         [    4],
         [  812],
         [    5],
         [ 6431],
         [ 2091],
         [   12],
         [   67],
         [    5],
         [    7],
         [55310],
         [    4]]),
 torch.Size([20, 1]),
 torch.Size([20, 1]),
 tensor([[  835],
         [ 7090],
         [    5],
         [  291],
         [ 3064],
         [    4],
         [  203],
         [    9],
         [    4],
         [  812],
         [    5],
         [ 6431],
         [ 2091],
         [   12],
         [   67],
         [    5],
         [    7],
         [55310],
         [    4],
         [ 9424]]))

In [36]:
mask,padding_mask =  generate_mask(src)

In [37]:
mask,padding_mask

(tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0., 0., 0., 0., 0., -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf, -inf],
         [0., 0., 0., 0.

In [38]:
logit = model(src.to(DEVICE),src_mask = mask.to(DEVICE),key_padding_mask = padding_mask.to(DEVICE))
print(logit.shape)

torch.Size([20, 1, 68813])


In [39]:
print(f"output shape{logit.shape}")
print(f"source shape {src.shape}")

output shapetorch.Size([20, 1, 68813])
source shape torch.Size([20, 1])


In [40]:
print(f"Target shape: {trg.shape}")

Target shape: torch.Size([20, 1])


In [41]:
src,trg

(tensor([[    5],
         [  835],
         [ 7090],
         [    5],
         [  291],
         [ 3064],
         [    4],
         [  203],
         [    9],
         [    4],
         [  812],
         [    5],
         [ 6431],
         [ 2091],
         [   12],
         [   67],
         [    5],
         [    7],
         [55310],
         [    4]]),
 tensor([[  835],
         [ 7090],
         [    5],
         [  291],
         [ 3064],
         [    4],
         [  203],
         [    9],
         [    4],
         [  812],
         [    5],
         [ 6431],
         [ 2091],
         [   12],
         [   67],
         [    5],
         [    7],
         [55310],
         [    4],
         [ 9424]]))

In [42]:
print(logit.reshape(-1,logit.shape[-1]).shape)
print(trg.reshape(-1).shape)

torch.Size([20, 68813])
torch.Size([20])


## Loss Function

In [43]:
from torch.nn import CrossEntropyLoss
loss_fn = CrossEntropyLoss(ignore_index=PAD_IDX).to(DEVICE)

In [44]:
loss = loss_fn(logit.reshape(-1,logit.shape[-1]).to(DEVICE),trg.reshape(-1).to(DEVICE))
print(loss.item())

58.5584831237793


In [45]:
def evaluate(model,logit,target,device):

    model.eval()
    total_loss = 0.

    with torch.no_grad():
        for src,trg in eval_data:

            trg = trg.to(device)

            logit = model(src,src_mask = None,key_padding_mask = None).to(device) # logit shape: [seq_len(context size),batch_size,embed_dim] --> [20,1,68813]

            total_loss += loss_fn(logit.reshape(-1,logit.shape[-1]),trg.reshape(-1)) 
            
            # logit_shape: [context_size,embed_dim] 
            # target shape: [context size]
            
            

    return total_loss/ (len(list(eval_data))-1)

## Training the Model

#### Optimizer, scheduler, & Loss function

Absolutely — you're working with some important hyperparameters here that can significantly affect how well and how fast your model trains. Let's go through them one by one:

---

### ✅ `weight_decay = 0.01`

This is **L2 regularization**.

* **What it does:** Adds a penalty to large weights by modifying the loss function:

  $$
  \text{Loss}_{\text{new}} = \text{Loss}_{\text{original}} + \lambda \sum w^2
  $$

  Where `λ` is `weight_decay`.

* **Why use it:** It helps prevent **overfitting** by discouraging the model from assigning large weights to any particular input feature.

* **Typical range:** `1e-5` to `1e-2`. Your `0.01` is on the strong side — that’s fine for small models, but may be too strong for larger ones.

---

### ✅ `betas = (0.9, 0.999)`

These are hyperparameters specific to the **Adam optimizer**.

Adam uses **exponentially decaying averages** of past gradients and squared gradients:

* **β₁ (0.9):** Decay rate for the first moment (mean of gradients).
* **β₂ (0.999):** Decay rate for the second moment (variance of gradients).

#### What they mean:

* `β₁ = 0.9`: Keeps a memory of the last 10 gradients (roughly), helps stabilize updates.
* `β₂ = 0.999`: Keeps a much longer memory (approx. 1000 past squared gradients), helps adapt learning rate per parameter.

#### TL;DR:

| Beta       | Controls              | Purpose          |
| ---------- | --------------------- | ---------------- |
| β₁ (0.9)   | Mean of gradients     | Momentum         |
| β₂ (0.999) | Variance of gradients | Adaptive scaling |

These are **standard default values**. Only tweak if you're doing advanced optimization.

---

### ✅ `scheduler = StepLR(optimizer, step_size=10000, gamma=0.9)`

This is a **learning rate scheduler**.

#### What it does:

* Every **10,000 steps**, it **multiplies** the current learning rate by **0.9** (i.e., decays it by 10%).

$$
\text{new\_lr} = \text{old\_lr} \times \gamma
$$

#### Why use it:

* Helps the model **start learning fast**, but then **refine slowly** as training progresses.
* Avoids overshooting minima in later stages of training.

---

## 🔁 How They Work Together

During training:

1. **Adam** adapts per-parameter updates using running averages.
2. **Weight decay** keeps weights small, reducing overfitting.
3. **StepLR scheduler** gradually lowers the learning rate to improve convergence.

---

## 🔍 Example Flow

Let’s say you're training for 50,000 steps:

* At step 0: `lr = 0.01`
* At step 10,000: `lr = 0.009`
* At step 20,000: `lr = 0.0081`
* ...
* Learning rate gets smaller, allowing more fine-tuned adjustments.

---

## 🧠 TL;DR Table

| Hyperparameter        | Meaning                  | Purpose                    | Tip                           |
| --------------------- | ------------------------ | -------------------------- | ----------------------------- |
| `weight_decay=0.01`   | L2 penalty               | Prevent overfitting        | Reduce if underfitting        |
| `betas=(0.9, 0.999)`  | Exponential averages     | Stability + adaptivity     | Default is good               |
| `gamma=0.9` in StepLR | Learning rate multiplier | Refines training over time | Try `0.95` if decay too sharp |

Let me know if you want to try more **adaptive schedulers** like `CosineAnnealingLR` or `ReduceLROnPlateau`.


In [46]:
optimizer = torch.optim.Adam(model.parameters(),
                            lr = 1e-2,
                            weight_decay = 0.01,
                            betas = (0.9,0.999))

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 10000, gamma = 0.9)
loss_fn = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

In [49]:
def train(model:nn.Module,
         train_data):

    model.train() # enables the model for batch normalization and dropout

    total_loss = 0.
    log_interval = 10000
    start_time = time.time()

    num_batches = len(list(train_data))//BLOCK_SIZE

    for batch,srctrg in enumerate(train_data):
        src = srctrg[0]
        trg = srctrg[1]

        src,trg = src.to(DEVICE),trg.to(DEVICE)
        #print(f"Input shape: {src.shape}")

        logits = model(src,src_mask = None)
        logits_flat = logits.reshape(-1,logits.shape[-1]).to(DEVICE)
        loss = loss_fn(logits_flat,trg.reshape(-1).to(DEVICE))

        optimizer.zero_grad()
        loss.backward()
        # clip the gradient
        torch.nn.utils.clip_grad_norm_(model.parameters(),0.5)
        optimizer.step()
        

        total_loss += loss.item()

        if (batch % log_interval == 0 and batch >0) or batch==42020:
            lr = scheduler.get_last_lr()[0] # get the last learning rate
            ms_per_batch = (time.time()-start_time)*1000/log_interval

            cur_loss = total_loss/batch
            ppl = math.exp(cur_loss)
            
            print(f"epoch {epoch:.3d} | {batch//block_size:5d}/{num_batches:5d} batches | "
                 f'lr {lr:02.4f} | ms/batch {ms_per_batch:5.2f} | '
                  f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
            start_time = time.time()


    return total_loss
            

In [50]:
best_val_loss = float('inf')

epochs = 30
train_losses = []
val_losses = []

for epoch in range(1,epochs+1):
    epoch_start_time = time.time()
    train_loss = train(model,train_dataloader)
    val_loss = evaluate(model,test_dataloader)
    val_ppl = math.exp(val_loss)
    train_losses.append(train_loss)
    val_losses.append(val_loss)


    elapsed = time.time()-epoch_start_time
    print('-'*80)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
        f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print('_'*80)

    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.save_dict(),'model_best_val_loss.pt')

ValueError: Precision not allowed in integer format specifier

## Plotting

In [None]:
num_epochs = len(train_losses)

# create a figure and a set of subplots
fig,ax = plt.subplots()

ax.plot(range(num_epochs),train_losses,label = "Train_losses")
ax.plot(range(num_epochs),val_losses,label = 'Validation losses')

ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')

ax.set_title('Training and Validation Losses')
ax.legend()
plt.show()

In [None]:
!wget 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/kyn1_OsXrzjef0xihlsXmg.pt'
model.load_state_dict(torch.load('kyn1_OsXrzjef0xihlsXmg.pt',map_location=torch.device('cpu')))

## Loading GPT2 model from HuggingFace

In [None]:
# load the tokenizer and the model
tokenizer1 = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# define the input prompt
input_text = 'The world is full of'

# tokenize the input text and prepare the input for the model 
input_ids = tokenizer1.encode(input_text,return_tensors= 'pt')


# Generate text using the model
# set the desired legth of the generated text (max_length),
# and other generation parameters like temperature, top_k and top_p

max_length = 15
temperature = 0.7
top_k = 50
top_p = 0.95



generated_ids = model.generate(
    input_ids,
    max_length = max_length,
    temperature = temperature,
    top_k = top_k,
    top_p = top_p,
    pad_token_id = tokenizer1.eos_token_id,
)

# decode the generated text
generated_text = tokenizer1.decode(generated_ids[0])

# print the input prompt and the generated text
print(f"Input: {input_text}")
print(f"Generated_text: {generated_text}")

Great question! These three parameters—**temperature**, **top-k**, and **top-p (nucleus sampling)**—control how your language model (like GPT-2) generates text. They influence the **creativity vs. predictability** of the output.

Let’s break them down simply:

---

### 🔥 `temperature`

* Controls **randomness** in token selection.
* It's a **scaling factor** applied to the predicted probabilities.
* **Lower values (e.g., 0.2–0.5)** make the model more **confident** and deterministic (boring/repetitive).
* **Higher values (e.g., 1.0–1.5)** make it more **creative**, allowing it to pick less likely words.

**Example:**

```plaintext
temperature = 0.1 → "The world is full of hope and love."
temperature = 1.2 → "The world is full of jellyfish pirates and cheese storms."
```

---

### 🔢 `top_k`

* Limits the model to the **top `k` most likely words** at each step.
* Randomly samples **only from those `k` words**.
* Helps avoid very low-probability tokens (i.e., nonsense).
* **Lower `k`** = safer, more predictable.
* **Higher `k`** = more varied outputs.

**Example:**

```plaintext
top_k = 5 → "The world is full of beauty and grace."
top_k = 50 → "The world is full of complexity, whispers, and algorithms."
```

---

### 🎯 `top_p` (aka nucleus sampling)

* Picks the **smallest set of words** whose combined probability is **≥ `p`** (e.g., 95%).
* Chooses randomly **only from that set**.
* More dynamic than top-k since the number of candidate tokens varies depending on the distribution.

**Example:**

```plaintext
top_p = 0.9 → Usually safe, avoids odd tokens.
top_p = 0.7 → More conservative, tighter sampling.
```

---

### ⚖️ Summary Table

| Parameter     | Range   | Effect                          | Recommendation       |
| ------------- | ------- | ------------------------------- | -------------------- |
| `temperature` | 0.1–2.0 | Creativity vs. certainty        | 0.7–1.0 for balance  |
| `top_k`       | 0–100+  | Limit by top-k probabilities    | 40–100 for diversity |
| `top_p`       | 0–1     | Limit by cumulative probability | 0.9–0.95 for fluency |

---

### ✅ Default/Good Starting Point

```python
temperature = 0.7
top_k = 50
top_p = 0.95
```

This strikes a **good balance** between fluency and creativity.

Let me know if you’d like to see visualizations or real examples for different settings!


In [None]:
Create an instance with the following parameters:

embedding size = 200
number of layers = 2
number of attention heads = 2
dropout probability = 0.2
Create a prompt

Pass the prompt to model to generate text with a maximum length of 15