In [3]:
!pip install numpy pandas datasets sentencepiece torch matplotlib transformers



In [4]:
from datasets import load_from_disk
import pandas as pd
import sentencepiece as spm
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms
import torch
import torch.nn.utils.rnn as rnn_utils
from io import BytesIO
import io
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import math
from torch import nn
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2TokenizerFast
from torch.optim.lr_scheduler import StepLR

  from .autonotebook import tqdm as notebook_tqdm


ModuleNotFoundError: No module named 'sentencepiece'

In [None]:
dataset = load_dataset("roneneldan/TinyStories")

Repo card metadata block was not found. Setting CardData to empty.


In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [None]:
train_data = dataset['train']
validation_data = dataset['validation']

train_df = train_data.to_pandas()
validation_df = validation_data.to_pandas()

In [None]:
train_df.head()
train_df = train_df[:4000]

# Tokenizer

In [None]:
# Load the pre-trained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [None]:
# Get the vocabulary size
vocab_size = len(tokenizer)
print("Vocabulary size:", vocab_size)

Vocabulary size: 50257


In [None]:
#tokenizing the ids of the captions and including it into df
train_df['text_tokens_ids'] = train_df['text'].apply(lambda x: tokenizer.encode(x))

Token indices sequence length is longer than the specified maximum sequence length for this model (1106 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   text             4000 non-null   object
 1   text_tokens_ids  4000 non-null   object
dtypes: object(2)
memory usage: 62.6+ KB


In [None]:
train_df.head()

Unnamed: 0,text,text_tokens_ids
0,"One day, a little girl named Lily found a need...","[3198, 1110, 11, 257, 1310, 2576, 3706, 20037,..."
1,"Once upon a time, there was a little car named...","[7454, 2402, 257, 640, 11, 612, 373, 257, 1310..."
2,"One day, a little fish named Fin was swimming ...","[3198, 1110, 11, 257, 1310, 5916, 3706, 4463, ..."
3,"Once upon a time, in a land full of trees, the...","[7454, 2402, 257, 640, 11, 287, 257, 1956, 133..."
4,"Once upon a time, there was a little girl name...","[7454, 2402, 257, 640, 11, 612, 373, 257, 1310..."


# Dataloader

In [None]:
#Dataset class
class MyDataset(Dataset):
    def __init__(self, df, max_seq_len=20):
        self.df = df
        self.max_seq_len = max_seq_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        token_ids = self.df['text_tokens_ids'][idx] 

        # #Preprocess the captions for gpt-2
        SOS = tokenizer.bos_token_id
        EOS = tokenizer.eos_token_id
        
        input_text = token_ids.copy()
        input_text.insert(0, SOS)

        target_text = token_ids.copy()
        target_text.append(EOS)
        
        cap_len = len(input_text)
        pad_len = self.max_seq_len - cap_len
        mask = []


        if pad_len > 0:
            zero_pad = [0] * pad_len
            input_text.extend(zero_pad)
            input_text_padded = input_text
            
            target_text.extend(zero_pad)
            target_text_padded = target_text

            mask.extend([1] * cap_len)
            mask.extend([0] * pad_len)
        else:
            input_text_padded = input_text[:self.max_seq_len]
            target_text_padded = target_text[:self.max_seq_len]
            mask.extend([1] * self.max_seq_len)

        return {
            'input_tokens' : torch.tensor(input_text_padded),
            'target_tokens': torch.tensor(target_text_padded),
            'mask'         : torch.tensor(mask)
        }

# Transformer

In [None]:
def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def scaled_dot_product(q, k, v, pad_mask=None, atn_mask=False):
    d_k = q.size()[-1]
    
    # Move q, k, and v tensors to the same device
    q, k, v = q.to(get_device()), k.to(get_device()), v.to(get_device())
    
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if atn_mask:
        dia_mask = torch.full(scaled.size(), float('-inf'), device=get_device())
        dia_mask = torch.triu(dia_mask, diagonal=1)
        scaled += dia_mask
    attention = F.softmax(scaled, dim=-1)
    if pad_mask is not None:
        pad_mask = pad_mask.unsqueeze(1).unsqueeze(1) * pad_mask.unsqueeze(1).unsqueeze(3)
        # Move pad_mask to the same device
        pad_mask = pad_mask.to(get_device())
        attention = attention.masked_fill(pad_mask==0, 0)
    values = torch.matmul(attention, v)
    return values, attention

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self, device=torch.device('cpu')):  # Pass device as an argument
        even_i = torch.arange(0, self.d_model, 2).float().to(device)  # Move tensor to device
        denominator = torch.pow(10000, even_i / self.d_model)
        position = torch.arange(self.max_sequence_length, device=device).reshape(self.max_sequence_length, 1)  # Move tensor to device
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
    
    def forward(self, x, pad_mask=None, atn_mask=False):
        batch_size, sequence_length, d_model = x.shape
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, pad_mask, atn_mask = atn_mask)
        values = values.reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out


class LayerNormalization(nn.Module):
    def __init__(self, parameters_shape, eps=1e-5):
        super().__init__()
        self.parameters_shape=parameters_shape
        self.eps=eps
        self.gamma = nn.Parameter(torch.ones(parameters_shape))
        self.beta =  nn.Parameter(torch.zeros(parameters_shape))

    def forward(self, inputs):
        dims = [-(i + 1) for i in range(len(self.parameters_shape))]
        mean = inputs.mean(dim=dims, keepdim=True)
        var = ((inputs - mean) ** 2).mean(dim=dims, keepdim=True)
        std = (var + self.eps).sqrt()
        y = (inputs - mean) / std
        out = self.gamma * y + self.beta
        return out

  
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x


class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        self.self_attention1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.self_attention2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)
    
    # Override the forward method to handle None values for pad_mask
    def forward(self, y, pad_mask, atn_mask):
        _y = y
        
        # Check if pad_mask is None before attempting to move it to device
        if pad_mask is not None:
            pad_mask = pad_mask.to(get_device())
        
        y = self.self_attention1(y, pad_mask, atn_mask)
        y = self.dropout1(y) 
        y = self.norm1(y + _y) 
        _y = y
        
        y = self.ffn(y) 
        y = self.dropout3(y) 
        y = self.norm3(y + _y) 
        return y

class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        y, pad_mask, atn_mask = inputs
        for module in self._modules.values():
            y = module(y, pad_mask, atn_mask)
        return y

class Decoder(nn.Module):
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers):
        super().__init__()
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])

    def forward(self, y, pad_mask = None, atn_mask = True):
        y = self.layers(y, pad_mask, atn_mask)
        return y

class Transformer(nn.Module):
    def __init__(self, 
                d_model, 
                ffn_hidden, 
                num_heads, 
                drop_prob, 
                num_layers,
                vocab_size               
                ):
        super().__init__()
        self.d_model = d_model

        self.dec_embedding = nn.Embedding(vocab_size, d_model)
        self.dec_pos_encoding = PositionalEncoding(d_model, 1)
        
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers)
        self.linear = nn.Linear(d_model, vocab_size)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    def forward(self, input_tokens, pad_mask=None, atn_mask=True):
        print("Input Tokens Shape:", input_tokens.shape)

        # Move input tensors to the appropriate device
        input_tokens = input_tokens.to(self.device)
        pad_mask = pad_mask.to(self.device) if pad_mask is not None else None
        max_sequence_length = input_tokens.shape[1]  # Fix this line to get the correct sequence length
        print("Max Sequence Length:", max_sequence_length)

        # Compute token embeddings
        token_embeddings = self.dec_embedding(input_tokens) 
        print("Token Embeddings Shape:", token_embeddings.shape)

        self.dec_pos_encoding = PositionalEncoding(d_model, max_sequence_length)
        token_pos_encodings = self.dec_pos_encoding(device=self.device)  # Pass device argument
        print("Token Position Encodings Shape:", token_pos_encodings.shape)

        token_embeddings_with_pos = token_embeddings + token_pos_encodings.unsqueeze(0)
        print("Token Embeddings with Position Shape:", token_embeddings_with_pos.shape)

        # Perform the rest of the forward pass
        out = self.decoder(token_embeddings_with_pos, pad_mask, atn_mask)
        print("Decoder Output Shape:", out.shape)

        out = self.linear(out)
        print("Linear Layer Output Shape:", out.shape)

        return out


In [None]:
d_model = 8
num_heads = 8
drop_prob = 0.1
batch_size = 2
ffn_hidden = 8
num_layers = 2
vocab_size = vocab_size
num_epochs = 1


NameError: name 'vocab_size' is not defined

In [None]:
# Define a function to get the device
def get_device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Move the model to the appropriate device
transformer = Transformer(d_model, ffn_hidden, num_heads, drop_prob, num_layers, vocab_size)
transformer.to(get_device())  # Move the model to GPU if available

Transformer(
  (dec_embedding): Embedding(50257, 8)
  (dec_pos_encoding): PositionalEncoding()
  (decoder): Decoder(
    (layers): SequentialDecoder(
      (0): DecoderLayer(
        (self_attention1): MultiHeadAttention(
          (qkv_layer): Linear(in_features=8, out_features=24, bias=True)
          (linear_layer): Linear(in_features=8, out_features=8, bias=True)
        )
        (norm1): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (self_attention2): MultiHeadAttention(
          (qkv_layer): Linear(in_features=8, out_features=24, bias=True)
          (linear_layer): Linear(in_features=8, out_features=8, bias=True)
        )
        (norm2): LayerNorm((8,), eps=1e-05, elementwise_affine=True)
        (dropout2): Dropout(p=0.1, inplace=False)
        (ffn): PositionwiseFeedForward(
          (linear1): Linear(in_features=8, out_features=8, bias=True)
          (linear2): Linear(in_features=8, out_features=8, bias=Tru

In [None]:
dataset = MyDataset(train_df[:4], max_seq_len=128)
dataloader = DataLoader(dataset, batch_size=batch_size)

NameError: name 'MyDataset' is not defined

In [None]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(transformer):,} trainable parameters')

The model has 855,905 trainable parameters


In [None]:
# Define the optimizer and scheduler
optim = torch.optim.Adam(transformer.parameters(), lr=0.001, betas=(0.9, 0.98), eps=1e-9)
#scheduler = StepLR(optim, step_size=20, gamma=0.1)  # Learning rate decreases by a factor of 0.1 every 5 epochs

criterion = nn.CrossEntropyLoss()

# Iterate over epochs
for epoch in range(num_epochs):
    transformer.train()  # Set the model to training mode
    total_loss = 0
    for batch_num, batch in enumerate(dataloader):
        input_tokens, target_tokens, pad_mask = batch['input_tokens'], batch['target_tokens'], batch['mask']
        
        # Move tensors to GPU if available
        input_tokens = input_tokens.to(get_device())
        target_tokens = target_tokens.to(get_device())
        pad_mask = pad_mask.to(get_device())
        
        optim.zero_grad()
        predictions = transformer(input_tokens, pad_mask)
        loss = criterion(predictions.view(-1, vocab_size), target_tokens.view(-1))
        loss.backward()
        optim.step()
        total_loss += loss.item()
        
        # Print loss after each batch
        print_every = 50  # Define how often to print the loss
        if batch_num % print_every == 0:
           print(f'Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_num + 1}/{len(dataloader)}], Loss: {loss.item()}')
            
    
    # Print total loss after each epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Total Loss: {total_loss/len(dataloader)}')
    
    # Update the learning rate
    #scheduler.step()
    
    # Save the model after each epoch
    model_path = '/workspace/transformer.pth'
    torch.save(transformer.state_dict(), model_path)

Input Tokens Shape: torch.Size([2, 128])
Max Sequence Length: 128
Token Embeddings Shape: torch.Size([2, 128, 8])
Token Position Encodings Shape: torch.Size([128, 8])
Token Embeddings with Position Shape: torch.Size([2, 128, 8])
Decoder Output Shape: torch.Size([2, 128, 8])
Linear Layer Output Shape: torch.Size([2, 128, 50257])
Epoch [1/100], Batch [1/2], Loss: 11.043807029724121
Input Tokens Shape: torch.Size([2, 128])
Max Sequence Length: 128
Token Embeddings Shape: torch.Size([2, 128, 8])
Token Position Encodings Shape: torch.Size([128, 8])
Token Embeddings with Position Shape: torch.Size([2, 128, 8])
Decoder Output Shape: torch.Size([2, 128, 8])
Linear Layer Output Shape: torch.Size([2, 128, 50257])
Epoch [1/100], Total Loss: 11.014247417449951
Input Tokens Shape: torch.Size([2, 128])
Max Sequence Length: 128
Token Embeddings Shape: torch.Size([2, 128, 8])
Token Position Encodings Shape: torch.Size([128, 8])
Token Embeddings with Position Shape: torch.Size([2, 128, 8])
Decoder Outp

In [None]:
def inference(transformer, tokenizer, starting_word, max_length, temperature=1.0):
    
    transformer.eval()
    # Convert starting and ending words to token IDs
    starting_token_ids = tokenizer.encode(starting_word)

    # Convert token IDs to tensor
    input_tensor = torch.tensor(starting_token_ids).unsqueeze(0).to(get_device())

    # Generate tokens until ending word is reached or maximum length is reached
    with torch.no_grad():
        for _ in range(max_length):
            # Forward pass through the model
            output = transformer(input_tensor)

            # Apply temperature scaling to the logits
            scaled_output = output / temperature

            # Get the last predicted token
            last_token = scaled_output.argmax(dim=-1)[:, -1]

            # Append the last token to the input tensor
            last_token = last_token.unsqueeze(0).to(input_tensor.device)  # Ensure last_token is on the same device as input_tensor
            input_tensor = torch.cat([input_tensor, last_token], dim=-1)

            # Check if the ending word is reached
            if (last_token == tokenizer.eos_token_id):
                break

    # Decode the generated tokens
    generated_text = tokenizer.decode(input_tensor.squeeze().tolist())

    return generated_text

# Define the path where the model is saved
model_path = '/workspace/transformer.pth'

# Load the model from the saved file
transformer.load_state_dict(torch.load(model_path))

# Now you can use the inference function with the loaded and evaluated model
starting_word = "once upon a time, there was a"
max_length = 43
temperature = 0.8

generated_sequence = inference(transformer, tokenizer, starting_word
                               , max_length, temperature)
print("Generated sequence:", generated_sequence)


Input Tokens Shape: torch.Size([1, 8])
Max Sequence Length: 8
Token Embeddings Shape: torch.Size([1, 8, 8])
Token Position Encodings Shape: torch.Size([8, 8])
Token Embeddings with Position Shape: torch.Size([1, 8, 8])
Decoder Output Shape: torch.Size([1, 8, 8])
Linear Layer Output Shape: torch.Size([1, 8, 50257])
Input Tokens Shape: torch.Size([1, 9])
Max Sequence Length: 9
Token Embeddings Shape: torch.Size([1, 9, 8])
Token Position Encodings Shape: torch.Size([9, 8])
Token Embeddings with Position Shape: torch.Size([1, 9, 8])
Decoder Output Shape: torch.Size([1, 9, 8])
Linear Layer Output Shape: torch.Size([1, 9, 50257])
Input Tokens Shape: torch.Size([1, 10])
Max Sequence Length: 10
Token Embeddings Shape: torch.Size([1, 10, 8])
Token Position Encodings Shape: torch.Size([10, 8])
Token Embeddings with Position Shape: torch.Size([1, 10, 8])
Decoder Output Shape: torch.Size([1, 10, 8])
Linear Layer Output Shape: torch.Size([1, 10, 50257])
Input Tokens Shape: torch.Size([1, 11])
Max S