In [1]:
import torchtext
from torchtext.datasets import multi30k,Multi30k
import torch
from typing import Iterable, List
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.nn import Transformer
import math
from torchtext.vocab import Vocab
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.nn.utils.rnn import pad_sequence
from torchtext.datasets import IMDB,PennTreebank
import time



import warnings
def warn(*args,**kwargs):
    pass
warnings.warn = warn
warnings.filterwarnings('ignore')


In [2]:
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB()

In [3]:
label,sample = next(iter(train_iter))

In [4]:
label,sample

(1,
 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far betwee

In [5]:
label_list =([label for label,_ in train_iter])
text_list = ([text for _,text in train_iter])

In [6]:
len(label_list)

12500

In [7]:
len(text_list)

12500

In [8]:
import random
idx = random.randint(0,len(text_list))

text_list[idx],label_list[idx]

('One of the more \'literate\' Lone Stars, with time spent on character development and interaction, dialog and acting business. The opening scene sets the stage (literally) for the personalities of the gambler, Kansas Charlie (Eddy Chandler), and his buddy, John Scott (John Wayne) the rodeo (say Roh-Day-oh) star, both of whom are slightly randy. The film follows their adventures, as they try to best each other in the pursuit of the Mexican Juanita, and later in their pursuit of perky Mary Kornman, who has the inevitable evil brother (though he\'d been led astray by the real villain, and wants to repent). And oh, of course, they\'re being wrongly accused of two crimes and have to serve jail time before escaping and being exonerated at the end.<br /><br />The high point is Scott continually and deliberately ogling Mary\'s butt in her grocery store, and knocking away the ladder she\'s standing on so he can catch her and grab her as she falls. It all seems a little contemporary for a 30s 

The data has 12500 test sample data. Here each one is labelled as one. So each movie got a positive review somehow

## Data Processing

In [9]:
UNK_IDX, PAD_IDX, EOS_IDX = 0,1,2
special_symbols = ['<unk>','<pad>','<|endoftext|>']

In [10]:
# set the tokenizer
tokenizer = get_tokenizer('basic_english')

def yield_token(dataset):
    for _,text in dataset:
        yield tokenizer(text)

# Create the vocabulary
vocab = build_vocab_from_iterator(yield_token(train_iter), specials = special_symbols,special_first=True)
vocab.set_default_index(vocab['<unk>'])

In [11]:
# Text to index, Index to text

# input--> token; output--> index
text_to_idx = lambda text: [vocab[token] for token in tokenizer(text)]

# input --> index; output--> token
idx_to_text = lambda seq_en: " ".join([vocab.get_itos()[idx] for idx in seq_en])

In [12]:
a = torch.tensor([12,24,36,48,60,72,84],dtype = torch.int64)
#index_to_text = idx_to_text([12,24,36,48,60,72,84])
index_to_text = idx_to_text(a)

index_to_text

'it t ! or has my been'

In [13]:
items = list(vocab.get_itos())[:10]
items

['<unk>', '<pad>', '<|endoftext|>', '.', 'the', ',', 'a', 'and', "'", 'of']

In [14]:
text = idx_to_text(torch.tensor([0,1,2]))
text

'<unk> <pad> <|endoftext|>'

## Collate Function

Collate function shapes how the dataloader perceives the data. To pass the data through the model each string should have equal length

In [15]:
def get_sample(block_size, text):
    """The goal of the function is to generate a training sample. The input and the target dataset. It's done because the model learns from the input
    and tries to predict the output.
    
    Parameters(block_size,text):
        block_size: actually indicates the context size. At how many tokens the model can look at once. 
        text: the full dataset. A list of long tokens
        """


    src_seq,trg_seq = [],[]
    sample_len = len(text) # gives the length of the total length of the input

    # Calculating the stopping point for randomly selecting a sample
    # This ensures the selected sample doesn't exceed the text
    random_sample_stop = sample_len - block_size

    # random sampling should be starting in this limit

    if random_sample_stop >=1:

        # pick the starting point
        random_start = torch.randint(0,random_sample_stop, size = (1,)).item()

        stop = random_start + block_size

        # Create the input and the target sequence
        src_seq = text[random_start:stop]

        trg_seq = text[random_start+1:stop+1]



    # Handle the case where the text sequence is exactly same as the context size
    elif random_sample_stop <= 0:

        random_start = 0
        stop = sample_len

        src_seq = text[random_start:stop]

        trg_seq = text[random_start+1:stop]

        # Append an empty string to maintain the sequence alignment
        trg_seq.append('<|endoftext|>')

    return src_seq,trg_seq
    

In [16]:
# Gather the training data tokens into a list
BATCH_SIZE = 1 # for each batch its return a list of tokens which could be used for training

batch_of_tokens = []

for i in range(BATCH_SIZE):
    label,text = next(iter(train_iter))
    batch_of_tokens.append(tokenizer(text))

In [17]:
len(batch_of_tokens[0])

317

In [18]:
# Check how is the input and the output sequence would be feed to the training model

# create the whole dataset for training of 100 tokens
text = batch_of_tokens[0][0:100]
block_size = 10

src_seq, trg_seq = get_sample(block_size,text)

In [19]:
src_seq,trg_seq

(['store',
  'because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when'],
 ['because',
  'of',
  'all',
  'the',
  'controversy',
  'that',
  'surrounded',
  'it',
  'when',
  'it'])

In [20]:
vocab(src_seq),vocab(trg_seq)

([1024, 87, 9, 40, 4, 7333, 16, 2975, 12, 66],
 [87, 9, 40, 4, 7333, 16, 2975, 12, 66, 12])

In [21]:
src_batch,trg_batch = [],[]

# Define the batch size
BATCH_SIZE = 2
block_size = 10
# Loop to create src batch and target batch
for i in range(BATCH_SIZE):
    _,text = next(iter(train_iter)) # Get the first sample every time, as we convert the train data iterable in every turn


    # turn the text into tokenizer and then send them
    text_tokens = tokenizer(text)

    # Generate source and target tokens
    src_tokens,trg_tokens = get_sample(block_size,text_tokens) # block_size is previously declared; block_size = 10

    # Get the indices of that tokens
    src_indices,trg_indices = vocab(src_tokens),vocab(trg_tokens)

    # Turn the indices into tensors
    src_sequence = torch.tensor(src_indices)
    trg_sequence = torch.tensor(trg_indices)
    

    # print the output 
    print(f"Sample: {i}")
    print(f"Source Sequence (Text): {src_tokens}")
    print(f"Source Sequence (Indices): {src_indices}")
    print(f"Source Sequence (Shape): {len(src_tokens)}")
    print(f"Target Sequence (Text): {trg_tokens}")
    print(f"Target Sequence (Indices): {trg_indices}")
    print(f"Target Sequence (Shape): {len(trg_tokens)}")

Sample: 0
Source Sequence (Text): ['controversial', 'i', 'really', 'had', 'to', 'see', 'this', 'for', 'myself', '.']
Source Sequence (Indices): [3535, 13, 69, 70, 10, 77, 15, 20, 479, 3]
Source Sequence (Shape): 10
Target Sequence (Text): ['i', 'really', 'had', 'to', 'see', 'this', 'for', 'myself', '.', 'the']
Target Sequence (Indices): [13, 69, 70, 10, 77, 15, 20, 479, 3, 4]
Target Sequence (Shape): 10
Sample: 1
Source Sequence (Text): ['scenes', 'in', 'his', 'films', '.', 'i', 'do', 'commend', 'the', 'filmmakers']
Source Sequence (Indices): [144, 14, 39, 129, 3, 13, 81, 11638, 4, 839]
Source Sequence (Shape): 10
Target Sequence (Text): ['in', 'his', 'films', '.', 'i', 'do', 'commend', 'the', 'filmmakers', 'for']
Target Sequence (Indices): [14, 39, 129, 3, 13, 81, 11638, 4, 839, 20]
Target Sequence (Shape): 10


## Collate Function

In [22]:
DEVICE = 'mps' if torch.backends.mps.is_available() else 'cuda'
DEVICE

'mps'

In [23]:
BLOCK_SIZE = 30

def collate_batch(batch):
    src_batch,trg_batch = [],[]

    for _,text in batch:

        tokens = tokenizer(text)

        src_tokens,trg_tokens = get_sample(BLOCK_SIZE,tokens)

        src_indices,trg_indices = vocab(src_tokens),vocab(trg_tokens)

        src_seq,trg_seq = torch.tensor(src_indices,dtype = torch.int64),torch.tensor(trg_indices,dtype = torch.int64)

        src_batch.append(src_seq)
        trg_batch.append(trg_seq)


    src_batch = pad_sequence(src_batch,padding_value = PAD_IDX, batch_first = False)
    trg_batch = pad_sequence(trg_batch,padding_value = PAD_IDX, batch_first = False)

    return src_batch.to(DEVICE),trg_batch.to(DEVICE)

## Create DataLoader

In [24]:
from torch.utils.data import DataLoader,Dataset

train_dataloader = DataLoader(train_iter,
                       batch_size = BATCH_SIZE,
                       shuffle = True,
                       collate_fn = collate_batch) # collate batch will define how the data will be retuerned

test_dataloader = DataLoader(test_iter,
                            batch_size = BATCH_SIZE,
                            shuffle = True,
                            collate_fn = collate_batch)

## Iterating Through Data Samples

In [25]:
dataset = iter(train_dataloader)

for sample in range(5):
    src, tgr = next(dataset)

    for i in range(BATCH_SIZE):
        print(f"sample: {sample}")
        print(f"Source shape: {src.shape}")
        print(f"source: {idx_to_text(src[:,i])}") # take the indices as the intput and return the word as the output
        print("\n")
        print(f"Target: {idx_to_text(tgr[:,i])}")
        print("\n")

sample: 0
Source shape: torch.Size([30, 2])
source: . for 1931 , maybe . for 2004 , not acceptable . some of the actors performed well . sadly , the indians always get the short end in these


Target: for 1931 , maybe . for 2004 , not acceptable . some of the actors performed well . sadly , the indians always get the short end in these early


sample: 0
Source shape: torch.Size([30, 2])
source: mcdowell isn ' t much of an actress to begin with , but given the non-existent plot ( i hate to even refer to it as a plot ) in


Target: isn ' t much of an actress to begin with , but given the non-existent plot ( i hate to even refer to it as a plot ) in this


sample: 1
Source shape: torch.Size([30, 2])
source: ' t be surprised if the father of the baby had about eight girlfriends in the first edition of the script . stacy ' s ( the carrier of the


Target: t be surprised if the father of the baby had about eight girlfriends in the first edition of the script . stacy ' s ( the carrier of the 

## **MASKING**

Create masking so that the decoder while predicts the next word can only take words before the output words as the context, not the whole dataset


In [26]:
def generate_sqr_mask(sz,device = DEVICE):
    mask = (torch.triu(torch.ones((sz,sz),device = device))==1)

In [27]:
a = torch.ones(3,3)
a = (torch.triu(a)==1).transpose(0,1)
a = a.float().masked_fill( a ==0,float('-inf')).masked_fill(a ==1,float(0.0))
a

tensor([[0., -inf, -inf],
        [0., 0., -inf],
        [0., 0., 0.]])

In [28]:
def generate_mask(sz,device):
    mask = (torch.triu(torch.ones((sz,sz),device =device))==1).transpose(0,1)
    mask = mask.float().masked_fill(mask ==0,float('-inf')).masked_fill(mask==1,float(0.0))

    return mask

In [29]:
def create_mask(src,device = DEVICE):
    src_seq_len = src.shape[0]
    src_mask = generate_mask(src_seq_len,device = DEVICE)
    src_padding_mask = (src == PAD_IDX).transpose(0,1)
    return src_mask,src_padding_mask

In [30]:
## Test
src_t = torch.rand(5,5)
m =create_mask(src_t,device = DEVICE)
m

(tensor([[0., -inf, -inf, -inf, -inf],
         [0., 0., -inf, -inf, -inf],
         [0., 0., 0., -inf, -inf],
         [0., 0., 0., 0., -inf],
         [0., 0., 0., 0., 0.]], device='mps:0'),
 tensor([[False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False],
         [False, False, False, False, False]]))

## Positional Encoding

In [31]:
class PositionalEncoding(nn.Module):

    def __init__(self,
                emb_dim: int,
                dropout: float,
                maxlen: int = 5000):

        super().__init__()

        den = torch.exp(-torch.arange(0,emb_dim,2)*math.log(10000)/emb_dim)

        pos = torch.arange(0,maxlen).reshape(maxlen,1)

        pos_embedding = torch.zeros(size = (maxlen,emb_dim))


        pos_embedding[:,0::2] = torch.sin(pos*den)
        pos_embedding[:,1::2] = torch.cos(pos*den)


        pos_embedding = pos_embedding.unsqueeze(-2)

        self.dropout = nn.Dropout(dropout)

        self.register_buffer('pos_embedding',pos_embedding)

    def forward(self,token_embedding):
        return self.dropout(token_embedding+self.pos_embedding[:token_embedding.size(0)])

## Token Embedding

In [32]:
class TokenEmbedding(nn.Module):

    def __init__(self,
                vocab_size,
                emb_dim):

        super().__init__()

        self.embedding = nn.embedding(vocab_size,emb_dim)
        self.emb_dim = emb_dim

    def forward(self,tokens):
        return self.embedding(tokens.long())*math.sqrt(self.emb_dim)
        """PyTorch nn.Embedding expects input as LongTensor (i.e., dtype=torch.int64).
        This ensures that the token indices are valid integer indices for lookup.
        If your tokens are accidentally float32, you'll get a type error."""

        

## Custom GPT Model Architecture

* Initialization(init): embedding_dimension, vocab_size, num_heads, num_layers, max_sequence_length, and dropout
* lm_head: Generates logits over the vocabulary

* weight_initialization: Initializes the weights for better training convergence. The Xavier uniform initialization is used, which is a common practice for initializing weights in deep learning.
* Decoder: method currently functions the forward pass, through the transformer encoder layers, followed by the generation of logits for the language modeling task.
1) got the output from the transformer encoder layers
2) Generation of logits for the language modelling task

* Forward Pass: This method is similar to Decoder method, and defines the forward computation of the model. It produces the input through embedding layers, positional encoding, transformer encoded layers, and produces the final output using lm_head
* mask generation: Both decoder and forward methods contain logic to generate a square casual mask if no source mask is provided. Mask ensures that the prediction for a position does not depend on the future tokens in the sequences, which is important for auto regressive nature of gpt models
* Commented out decoder: A section of the code is commented out, suggesting an initial design where a transformer decoder layer was considered. However, the final implementation uses only encoder layers only, which is a common simplification for models focusing on language modeling and generation

In [82]:
class CustomGPTModel(nn.Module):

    def __init__(self,
                emb_dim: int,
                vocab_size: int,
                num_head: int,
                num_layers: int,
                max_seq_len: 500,
                dropout = 0.1):

        super().__init__()
        
        self.init_weights()

        self.embed = nn.Embedding(vocab_size,emb_dim)

        self.positional_encoding = PositionalEncoding(emb_dim = emb_dim,dropout = dropout)

        print(f"EMBEDDING DIMENSION: {emb_dim}")

        # Remaining layers are part of the TransformerDecoder
        encoder_layers = nn.TransformerEncoderLayer(d_model = emb_dim,
                                              nhead = num_head,
                                              dropout = dropout)

        self.transformer_encoder = nn.TransformerEncoder(encoder_layer = encoder_layers,
                                                        num_layers = num_layers)

        self.emb_dim = emb_dim
        self.lm_head = nn.Linear(emb_dim, vocab_size) # produce the final output, the final logits over the vocabulary


    def init_weights(self):

        for p in self.parameters():
            if p.dim()>1:
                nn.init.xavier_uniform_(p)


    def create_mask(src,device = DEVICE):
        src_seq_len = src.shape[0] # src_shape: [seq_len,batch_size,emb_dim]
        src_mask = nn.Transformer.generate_square_subsequent_mask(src_seq_len)
        src_padding_mask = (src ==PAD_IDX).transpose(0,1)
        return src_mask ,src_padding_mask

    def decoder(self,x,src_mask):

        x = x.to(DEVICE)

        seq_len = x.size(0)


        # Add positional arguments to the input embeddings

        x = self.embed(x)*math.sqrt(self.emb_dim)

        x = self.positional_encoding(x)

        if src_mask is None:
            "Generate a square casual mask for the sequence. The masked positions are filled with -inf and the unmasked positions will be filled with 0"

            src_mask,src_padding_mask = create_mask(x)

        output = self.transformer_encoder(x,src_mask)
        logits = self.lm_head(output)
        return logits


    def forward(self,x,src_mask = None, key_padding_mask = None):

        print(f"Forward input shape: {x.shape}")

        seq_len = x.size(0)

        # add positional embeddings to the input embeddings

        embedding = self.embed(x) * math.sqrt(self.emb_dim)
        x = self.positional_encoding(x)

        if src_mask is None:
            src_mask, src_padding_mask= create_mask(x)
            

        output =self.transformer_encoder(x,src_mask,key_padding_mask)

        x = self.lm_head(x)

        return x

        
        
        
        

## Model Configaration and Initialization

* **`STEPS`**:
  1) Create the tokens of the datasets, and vocabularies
  2) Create the collate funcion; which will return `source` and `target`
  3) create positional encodings (source)
  4) create embeddings (source)
  5) create mask
  6) create transformer layers
  7) create transformer encoder
  8) pass the output of the transformer encoder throught the linear logit


**`Configure and instantiate a custom gpt model with the following specification`**
* ntokens: Its basically the `vocab_size`, total number of unique tokens in the vocabulary, which the model will use to represent the word
* emsize: The size of each embedding vector. In this model, each word will be represented by a 200 dimensional vector.
* n_players: the number of transformer encoder layers in the model. We are using two layers in this configuaration
* n_head: the number of attention heads in the multi head attention mechanism. The model will use two attention heads.
* dropout: A regularization technique which randomly select neurons are ignored during training to prevent overfitting.  Here, we set the dropout to 0.2
  

In [83]:
ntokens = len(vocab) 
emb_dim = 200
n_layers = 2
nhead = 2
dropout = 0.2

model =  CustomGPTModel(emb_dim = emb_dim,
                       vocab_size=ntokens,
                       num_head = nhead,
                       num_layers = n_layers,
                       max_seq_len=5000).to(DEVICE)

EMBEDDING DIMENSION: 200


## Prompting

In [79]:
def encode_prompt(prompt,block_size = BLOCK_SIZE):

    """take the blocksize and fit it for proper input"""
    while prompt is None:
        prompt = input("Sorry, prompt can be empty. Please enter a valid prompt: \n")

    tokens = tokenizer(prompt)

    number_of_tokens = len(tokens)

    if number_of_tokens>block_size:
        tokens = tokens[-block_size:] # taken into account the last tokens

    prompt_indices = vocab(tokens)

    prompt_encoded = torch.tensor(prompt_indices, dtype = torch.int64).reshape(-1,1)
    print(f"The shape of the prompt tensor: {prompt_encoded.shape}")

    return prompt_encoded

    

In [80]:
prompt_name = encode_prompt(None)

Sorry, prompt can be empty. Please enter a valid prompt: 
 The sky is


The shape of the prompt tensor: torch.Size([3, 1])


## Output (Logits)

In [81]:
logits = model.decoder(prompt_name,src_mask = None)
logits.shape

torch.Size([3, 1, 68813])

In [73]:
len(vocab)

68813

In [74]:
logits = logits.transpose(0,1)
print(f"The new shape of the logit: {logits.shape}")

logit_prediction = logits[:,-1] # take all the rows but the last column
logit_prediction.shape

The new shape of the logit: torch.Size([1, 3, 68813])


torch.Size([1, 68813])

In [75]:
_,next_word_index = torch.max(logit_prediction, dim = 1)
next_word_index

tensor([35805], device='mps:0')

In [76]:
predicted_word = idx_to_text(next_word_index)

predicted_word

'valcos'

## Autoregressive Text Generation`

In [77]:
prompt = 'mother is the best warrior'

start_seq = encode_prompt(prompt).to(DEVICE)

print(f"Start Sequence Shape:{start_seq.shape}")




Start Sequence Shape:torch.Size([5, 1])


In [78]:
max_new_tokens = 10 # the number of words you want to show as the output

for i in range(max_new_tokens):

    logits = model.decoder(start_seq,src_mask = None)

    logits = logits.transpose(0,1) # interchange the first two dimensions of the matrix

    print(" ")
    print(f"Shape of logits at step {i}: {logits.shape}")
    
    logit_prediction = logits[:,-1]

    next_token_encoded = torch.argmax(logit_prediction,dim = 1).reshape(-1,1)


    start_seq = torch.cat((start_seq,next_token_encoded),dim = 0).to(DEVICE)
    print(f"Probable Line: {[idx_to_text(j) for j in start_seq]}")

 
Shape of logits at step 0: torch.Size([1, 5, 68813])
Probable Line: ['mother', 'is', 'the', 'best', 'warrior', 'raining']
 
Shape of logits at step 1: torch.Size([1, 6, 68813])
Probable Line: ['mother', 'is', 'the', 'best', 'warrior', 'raining', 'nondenominational']
 
Shape of logits at step 2: torch.Size([1, 7, 68813])
Probable Line: ['mother', 'is', 'the', 'best', 'warrior', 'raining', 'nondenominational', 'hispanic']
 
Shape of logits at step 3: torch.Size([1, 8, 68813])
Probable Line: ['mother', 'is', 'the', 'best', 'warrior', 'raining', 'nondenominational', 'hispanic', 'sticked']
 
Shape of logits at step 4: torch.Size([1, 9, 68813])
Probable Line: ['mother', 'is', 'the', 'best', 'warrior', 'raining', 'nondenominational', 'hispanic', 'sticked', 'wholeheartedly']
 
Shape of logits at step 5: torch.Size([1, 10, 68813])
Probable Line: ['mother', 'is', 'the', 'best', 'warrior', 'raining', 'nondenominational', 'hispanic', 'sticked', 'wholeheartedly', 'klane']
 
Shape of logits at ste

In [90]:
# Auto generation function

def auto_generate(model,prompt = None, max_new_tokens = 500,block_size = BLOCK_SIZE,vocab = vocab, tokenizer = tokenizer):

    model.to(DEVICE)
    """architecture: prompt will converted to tensor eventually"""

    encoded_prompt = encode_prompt(prompt).to(DEVICE)

    tokens = []

    for _ in range(max_new_tokens):

        print(f"encoded prompt shape: {encoded_prompt.shape}")

        # decode the encoded prompt using the model's decoder

        logits = model(encoded_prompt,src_mask = None, key_padding_mask = None)

        print(f"Shape of the logits {logits.shape}")

        logits_reshaped = logits.transpose(0,1) # Shape before [seq_len,batch_size,vocab_size] --> shape now[batch_size,seq_len,vocab_size]

        logits_prediction = logits_reshaped[:,-1]

        predicted_logit = torch.argmax(logits_prediction, dim =-1).reshape(-1,1)

        # if the next token is the end of sequence(EOS) token, stop generation
        
        if predicted_logit.item() == EOS_IDX:
            break

        # Append the next token to the prompt encoded and keep only the last block size tokens
        prompt_encoded = torch.cat((prompt,predicted_logit),dim = 0)[-block_size:]


        #convert the token index to a token string
    return [idx_to_text(tokens) for tokens in prompt_encoded]

    

In [91]:
auto_generate_text = auto_generate(model = model, prompt = "I'd love to take the pictures")

The shape of the prompt tensor: torch.Size([8, 1])
encoded prompt shape: torch.Size([8, 1])
Forward input shape: torch.Size([8, 1])
Shape of the logits torch.Size([8, 8, 68813])


RuntimeError: a Tensor with 8 elements cannot be converted to Scalar

In [84]:
prompt = "I'd love to take the pictures"
encoded_prompt = encode_prompt(prompt)
encoded_prompt,encoded_prompt.shape

The shape of the prompt tensor: torch.Size([8, 1])


(tensor([[  13],
         [   8],
         [ 216],
         [ 174],
         [  10],
         [ 205],
         [   4],
         [1509]]),
 torch.Size([8, 1]))

In [85]:
logit.shape

torch.Size([8, 1, 68813])

## Again start from the beginning

In [115]:
# get the dataset
from torchtext.datasets import IMDB
train_iter, test_iter = IMDB()

# set the vocab and tokenizer
tokenizer = get_tokenizer('basic_english')

def yield_tokens(data_iter):
    for _,text in data_iter:
        yield tokenizer(text)


UNK_IDX,PAD_IDX,EOS_IDX = 0,1,2
special_symbols = ['<unk>','<pad>',"<|endoftext|>"]
vocab = build_vocab_from_iterator(yield_tokens(train_iter),specials = special_symbols,special_first=True)
print(len(vocab))


def get_sample2(context_size, text_tokens):

    src_seq,trg_seq = [],[]

    if len(text_tokens)>context_size:
        start_ceiling = len(text_tokens)-context_size
        start = torch.randint(0,start_ceiling,size=(1,))
        end = start+context_size
        src_seq = text_tokens[start:end]
        trg_seq = text_tokens[start+1:end+1]

    elif context_size> len(text_tokens):
        start = torch.randint(0,len(text_tokens),size = (1,))

        end = start+context_size

        src_seq = text_tokens[start:end]
        end_seq = text_tokens[start+1:end]

        
        end_seq.append('')


    return src_seq,trg_seq

    


68813


In [124]:
src_batch,tgr_batch = [],[]

BATCH_SIZE  = 2

for i in range(BATCH_SIZE):
    _,text = next(iter(train_iter))
    text_tokens = tokenizer(text)

    src_seq,trg_seq = get_sample2(context_size=20,text_tokens=text_tokens)

    src_indices,trg_indices = vocab(src_seq),vocab(trg_seq)

    src_tensors, trg_tensors = torch.tensor(src_indices,dtype = torch.int64),torch.tensor(trg_indices,dtype = torch.int64)

    

    src_batch.append(src_seq)
    trg_batch.append(trg_seq)

    print(f"Sample: {i}\n")
    print(f"source sequence: {src_seq}\n")
    print(f"Source tensors: {src_tensors}\n")
    print(f"target sequence: {trg_seq}\n")
    print(f"target tensors: {trg_tensors}\n")
    

Sample: 0

source sequence: ['s', 'not', 'shot', 'like', 'some', 'cheaply', 'made', 'porno', '.', 'while', 'my', 'countrymen', 'mind', 'find', 'it', 'shocking', ',', 'in', 'reality', 'sex']

Source tensors: tensor([   17,    30,   308,    43,    55,  4965,    96,  2755,     3,   155,
           72, 24085,   369,   196,    12,  1582,     5,    14,   788,   338])

target sequence: ['not', 'shot', 'like', 'some', 'cheaply', 'made', 'porno', '.', 'while', 'my', 'countrymen', 'mind', 'find', 'it', 'shocking', ',', 'in', 'reality', 'sex', 'and']

target tensors: tensor([   30,   308,    43,    55,  4965,    96,  2755,     3,   155,    72,
        24085,   369,   196,    12,  1582,     5,    14,   788,   338,     7])

Sample: 1

source sequence: ['swede', 'thought', 'about', 'certain', 'political', 'issues', 'such', 'as', 'the', 'vietnam', 'war', 'and', 'race', 'issues', 'in', 'the', 'united', 'states', '.', 'in']

Source tensors: tensor([22750,   207,    52,   907,  1171,  1462,   152,    23

In [125]:
BLOCK_SIZE = 30

def collate_batch(batch):
    src_batch,trg_batch = [],[]

    for _,text in batch:
        src_seq,trg_seq = get_sample2(BLOCK_SIZE,tokenizer(text))
        src_sequence = vocab(src_seq)
        trg_sequence = vocab(trg_seq)

        src_sequence = torch.tensor(src_sequence,dtype = torch.int64)
        trg_sequence = torch.tensor(tgr_sequence,dtype = torch.int64)

        src_batch.append(src_sequence)
        tgr_batch.append(trg_sequence)


    src_batch = pad_sequence(src_batch,padding_value = PAD_IDX, batch_first = False)
    trg_batch = pad_sequence(trg_batch,padding_value = PAD_IDX, batch_first = False)


    return src_batch.to(DEVICE),trg_batch.to(DEVICE)

In [119]:
label,data = next(iter(train_iter))
data

'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far between, ev

In [111]:
for i in range(2):
    _,text = next(iter(train_iter))

    output = tokenizer(text)

In [113]:
type(output),len(output)

(list, 317)

In [101]:
len(batch_of_tokens[0])

317

In [102]:
batch_of_tokens[0][:10]

['i',
 'rented',
 'i',
 'am',
 'curious-yellow',
 'from',
 'my',
 'video',
 'store',
 'because']

In [108]:
src_batch,trg_batch = [],[]


([['i',
   'rented',
   'i',
   'am',
   'curious-yellow',
   'from',
   'my',
   'video',
   'store',
   'because',
   'of',
   'all',
   'the',
   'controversy',
   'that',
   'surrounded',
   'it',
   'when',
   'it',
   'was',
   'first',
   'released',
   'in',
   '1967',
   '.',
   'i',
   'also',
   'heard',
   'that',
   'at',
   'first',
   'it',
   'was',
   'seized',
   'by',
   'u',
   '.',
   's',
   '.',
   'customs',
   'if',
   'it',
   'ever',
   'tried',
   'to',
   'enter',
   'this',
   'country',
   ',',
   'therefore',
   'being',
   'a',
   'fan',
   'of',
   'films',
   'considered',
   'controversial',
   'i',
   'really',
   'had',
   'to',
   'see',
   'this',
   'for',
   'myself',
   '.',
   'the',
   'plot',
   'is',
   'centered',
   'around',
   'a',
   'young',
   'swedish',
   'drama',
   'student',
   'named',
   'lena',
   'who',
   'wants',
   'to',
   'learn',
   'everything',
   'she',
   'can',
   'about',
   'life',
   '.',
   'in',
   'particul