In [1]:
#!pip install numpy pandas datasets sentencepiece torch matplotlib transformers

In [2]:
from datasets import load_from_disk
import pandas as pd
import sentencepiece as spm
from PIL import Image
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import matplotlib.pyplot as plt
from torchvision import transforms
import torch
import torch.nn.utils.rnn as rnn_utils
from io import BytesIO
import io
import torch.nn.functional as F
from tqdm import tqdm
from torch.utils.data import DataLoader, Dataset
import math
from torch import nn
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2TokenizerFast
from torch.optim.lr_scheduler import StepLR

In [3]:
dataset = load_dataset("roneneldan/TinyStories")

Repo card metadata block was not found. Setting CardData to empty.


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 2119719
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 21990
    })
})

In [5]:
train_data = dataset['train']
validation_data = dataset['validation']

train_df = train_data.to_pandas()
validation_df = validation_data.to_pandas()
df = pd.concat([train_df, validation_df])

In [6]:
df = df[:10000]
df.head()

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."


# Tokenizer

In [7]:
# Load the pre-trained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')


from transformers import BertTokenizer

# Load the BERT tokenizer
#tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [8]:
# Get the vocabulary size
vocab_size = len(tokenizer)
print("Vocabulary size:", vocab_size)

Vocabulary size: 50257


In [9]:
#tokenizing the ids of the captions and including it into df
df['text_tokens_ids'] = df['text'].apply(lambda x: tokenizer.encode(x))

Token indices sequence length is longer than the specified maximum sequence length for this model (1106 > 1024). Running this sequence through the model will result in indexing errors


In [10]:
df.head()

Unnamed: 0,text,text_tokens_ids
0,"One day, a little girl named Lily found a need...","[3198, 1110, 11, 257, 1310, 2576, 3706, 20037,..."
1,"Once upon a time, there was a little car named...","[7454, 2402, 257, 640, 11, 612, 373, 257, 1310..."
2,"One day, a little fish named Fin was swimming ...","[3198, 1110, 11, 257, 1310, 5916, 3706, 4463, ..."
3,"Once upon a time, in a land full of trees, the...","[7454, 2402, 257, 640, 11, 287, 257, 1956, 133..."
4,"Once upon a time, there was a little girl name...","[7454, 2402, 257, 640, 11, 612, 373, 257, 1310..."


# Dataloader

In [11]:
#Dataset class
class MyDataset(Dataset):
    def __init__(self, df, max_seq_len=20):
        self.df = df
        self.max_seq_len = max_seq_len
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, idx):
        token_ids = self.df['text_tokens_ids'][idx] 

        # #Preprocess the captions
        SOS = tokenizer.bos_token_id
        EOS = tokenizer.eos_token_id

        #SOS = tokenizer.cls_token_id
        #EOS = tokenizer.sep_token_id
        

          # # Define custom BOS and EOS tokens
        # BOS_TOKEN = "<s>"
        # EOS_TOKEN = "</s>"
        
        # # Get token IDs for BOS and EOS tokens
        # SOS = tokenizer.PieceToId(BOS_TOKEN)
        # EOS = tokenizer.PieceToId(EOS_TOKEN)
        

        
        input_text = token_ids.copy()
        input_text.insert(0, SOS)

        target_text = token_ids.copy()
        target_text.append(SOS)
        
        cap_len = len(input_text)
        pad_len = self.max_seq_len - cap_len
        mask = []


        if pad_len > 0:
            zero_pad = [0] * pad_len
            input_text.extend(zero_pad)
            input_text_padded = input_text
            
            target_text.extend(zero_pad)
            target_text_padded = target_text

            mask.extend([1] * cap_len)
            mask.extend([0] * pad_len)
        else:
            input_text_padded = input_text[:self.max_seq_len]
            target_text_padded = target_text[:self.max_seq_len]
            mask.extend([1] * self.max_seq_len)

        return {
            'input_tokens' : torch.tensor(input_text_padded),
            'target_tokens': torch.tensor(target_text_padded),
            'mask'         : torch.tensor(mask)
        }

# Transformer

In [12]:
def get_device():
    return torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

def scaled_dot_product(q, k, v, pad_mask=None, atn_mask=False):
    d_k = q.size()[-1]
    
    # Move q, k, and v tensors to the same device
    q, k, v = q.to(get_device()), k.to(get_device()), v.to(get_device())
    
    scaled = torch.matmul(q, k.transpose(-1, -2)) / math.sqrt(d_k)
    if atn_mask:
        dia_mask = torch.full(scaled.size(), float('-inf'), device=get_device())
        dia_mask = torch.triu(dia_mask, diagonal=1)
        scaled += dia_mask
    attention = F.softmax(scaled, dim=-1)
    if pad_mask is not None:
        pad_mask = pad_mask.unsqueeze(1).unsqueeze(1) * pad_mask.unsqueeze(1).unsqueeze(3)
        # Move pad_mask to the same device
        pad_mask = pad_mask.to(get_device())
        attention = attention.masked_fill(pad_mask==0, 0)
    values = torch.matmul(attention, v)
    return values, attention

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_sequence_length):
        super().__init__()
        self.max_sequence_length = max_sequence_length
        self.d_model = d_model

    def forward(self, device=torch.device('cpu')):  # Pass device as an argument
        even_i = torch.arange(0, self.d_model, 2).float().to(device)  # Move tensor to device
        denominator = torch.pow(10000, even_i / self.d_model)
        position = torch.arange(self.max_sequence_length, device=device).reshape(self.max_sequence_length, 1)  # Move tensor to device
        even_PE = torch.sin(position / denominator)
        odd_PE = torch.cos(position / denominator)
        stacked = torch.stack([even_PE, odd_PE], dim=2)
        PE = torch.flatten(stacked, start_dim=1, end_dim=2)
        return PE


class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        self.qkv_layer = nn.Linear(d_model , 3 * d_model)
        self.linear_layer = nn.Linear(d_model, d_model)
    
    def forward(self, x, pad_mask=None, atn_mask=False):
        batch_size, sequence_length, d_model = x.shape
        qkv = self.qkv_layer(x)
        qkv = qkv.reshape(batch_size, sequence_length, self.num_heads, 3 * self.head_dim)
        qkv = qkv.permute(0, 2, 1, 3)
        q, k, v = qkv.chunk(3, dim=-1)
        values, attention = scaled_dot_product(q, k, v, pad_mask, atn_mask = atn_mask)
        values = values.reshape(batch_size, sequence_length, self.num_heads * self.head_dim)
        out = self.linear_layer(values)
        return out

  
class PositionwiseFeedForward(nn.Module):
    def __init__(self, d_model, hidden, drop_prob=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.linear1 = nn.Linear(d_model, hidden)
        self.linear2 = nn.Linear(hidden, d_model)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=drop_prob)

    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.linear2(x)
        return x


class DecoderLayer(nn.Module):
    def __init__(self, d_model, ffn_hidden, num_heads, drop_prob):
        super(DecoderLayer, self).__init__()
        # self.norm0 = nn.LayerNorm(d_model)
        self.self_attention1 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.dropout1 = nn.Dropout(p=drop_prob)
        self.self_attention2 = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout2 = nn.Dropout(p=drop_prob)
        self.ffn = PositionwiseFeedForward(d_model=d_model, hidden=ffn_hidden, drop_prob=drop_prob)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout3 = nn.Dropout(p=drop_prob)
    
    # Override the forward method to handle None values for pad_mask
    def forward(self, y, pad_mask, atn_mask):
        _y = y
        
        # Check if pad_mask is None before attempting to move it to device
        if pad_mask is not None:
            pad_mask = pad_mask.to(get_device())
        
        # y = self.norm0(y)
        y = self.self_attention1(y, pad_mask, atn_mask)
        y = self.dropout1(y) 
        y = self.norm1(y + _y) 
        _y = y
        
        # Check if pad_mask is None before attempting to move it to device
        # if pad_mask is not None:
        #     pad_mask = pad_mask.to(get_device())
        
        #y = self.self_attention2(y)         
        #y = self.dropout2(y)
        #y = self.norm2(y + _y)  

        #_y = y  
        y = self.ffn(y) 
        y = self.dropout3(y) 
        y = self.norm3(y + _y) 
        return y



class SequentialDecoder(nn.Sequential):
    def forward(self, *inputs):
        y, pad_mask, atn_mask = inputs
        for module in self._modules.values():
            y = module(y, pad_mask, atn_mask) #30 x 200 x 512
        return y

class Decoder(nn.Module):
    def __init__(self, 
                 d_model, 
                 ffn_hidden, 
                 num_heads, 
                 drop_prob, 
                 num_layers):
        super().__init__()
        self.layers = SequentialDecoder(*[DecoderLayer(d_model, ffn_hidden, num_heads, drop_prob) for _ in range(num_layers)])

    def forward(self, y, pad_mask = None, atn_mask = True):
        y = self.layers(y, pad_mask, atn_mask)
        return y



import torch
import torch.nn as nn
from transformers import GPT2Model

class Transformer(nn.Module):
    def __init__(self, 
                d_model, 
                ffn_hidden, 
                num_heads, 
                drop_prob, 
                num_layers,
                vocab_size,
                freeze_gpt2=True          # Whether to freeze the weights of the GPT-2 model
                ):
        super().__init__()
        self.d_model = d_model

        # Load the pre-trained GPT-2 model
        self.gpt2 = GPT2Model.from_pretrained('gpt2')
        
        # Freeze the weights of the GPT-2 model if specified
        if freeze_gpt2:
            for param in self.gpt2.parameters():
                param.requires_grad = False
        
        # Decoder layers
        self.decoder = Decoder(d_model, ffn_hidden, num_heads, drop_prob, num_layers)
        self.linear = nn.Linear(d_model, vocab_size)
        self.dec_pos_encoding = PositionalEncoding(d_model, 1)  
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def forward(self, input_tokens, pad_mask=None, atn_mask=True):
        # Move input tensors to the appropriate device
        input_tokens = input_tokens.to(self.device)
        pad_mask = pad_mask.to(self.device) if pad_mask is not None else None
        max_sequence_length = input_tokens.shape[1]  # Fix this line to get the correct sequence length

        # Get token embeddings from GPT-2 model and detach gradients
        with torch.no_grad():
            token_embeddings = self.gpt2(input_tokens)[0]
        
        # Add positional encodings
        self.dec_pos_encoding = PositionalEncoding(d_model, max_sequence_length)
        token_pos_encodings = self.dec_pos_encoding(device=self.device)
        token_embeddings_with_pos = token_embeddings + token_pos_encodings.unsqueeze(0)

        # Perform the rest of the forward pass
        out = self.decoder(token_embeddings_with_pos, pad_mask, atn_mask)
        out = self.linear(out)
        return out


In [13]:
# from transformers import BertTokenizer, BertModel
# bert_model = BertModel.from_pretrained('bert-base-uncased')

from transformers import GPT2Tokenizer, GPT2Model

# Load pre-trained GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# # Load pre-trained GPT-2 model
# gpt2_model = GPT2Model.from_pretrained('gpt2')


In [14]:
d_model = 768
num_heads = 8
drop_prob = 0.1
batch_size = 128
ffn_hidden = 768
num_layers = 4
vocab_size = vocab_size
num_epochs = 100
temperature = 1


In [15]:
# Define a function to get the device
def get_device():
    return torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [16]:
# Move the model to the appropriate device
transformer = Transformer(d_model, ffn_hidden, num_heads, drop_prob, num_layers, vocab_size)
transformer.to(get_device())  # Move the model to GPU if available

Transformer(
  (gpt2): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): Decoder(
    (layers): SequentialDecoder(
      (0): DecoderLayer(
        (self_attention1): MultiHeadAttention(
          (qkv_layer): Linear(in_features=768, out_features=2304, b

In [17]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(transformer):,} trainable parameters')

The model has 62,289,745 trainable parameters


In [18]:
dataset = MyDataset(df, max_seq_len=128)
dataloader = DataLoader(dataset, batch_size=batch_size)

In [19]:
# Define the optimizer and scheduler
optim = torch.optim.Adam(transformer.parameters(), lr=1e-4)
#scheduler = StepLR(optim, step_size=15, gamma=0.1)  # Learning rate decreases by a factor of 0.1 every 5 epochs

criterion = nn.CrossEntropyLoss()

# Iterate over epochs
for epoch in range(num_epochs):
    transformer.train()  # Set the model to training mode
    total_loss = 0
    for batch_num, batch in enumerate(dataloader):
        input_tokens, target_tokens, pad_mask = batch['input_tokens'], batch['target_tokens'], batch['mask']
        
        # Move tensors to GPU if available
        input_tokens = input_tokens.to(get_device())
        target_tokens = target_tokens.to(get_device())
        pad_mask = pad_mask.to(get_device())
        
        optim.zero_grad()
        predictions = transformer(input_tokens, pad_mask)
        loss = criterion(predictions.view(-1, vocab_size), target_tokens.view(-1))
        loss.backward()
        optim.step()
        total_loss += loss.item()
        
        # Print loss after each batch
        print_every = 50  # Define how often to print the loss
        if batch_num % print_every == 0:
           print(f'Epoch [{epoch + 1}/{num_epochs}], Batch [{batch_num + 1}/{len(dataloader)}], Loss: {loss.item()}')
    
    # Print total loss after each epoch
    print(f'Epoch [{epoch + 1}/{num_epochs}], Total Loss: {total_loss/len(dataloader)}')
    
    # Update the learning rate
    #scheduler.step()
    
    # Save the model after each epoch
    model_path = '/workspace/transformer_v2.pth'
    torch.save(transformer.state_dict(), model_path)

Epoch [1/100], Batch [1/79], Loss: 10.951462745666504
Epoch [1/100], Batch [51/79], Loss: 6.035002708435059
Epoch [1/100], Total Loss: 6.85837719108485
Epoch [2/100], Batch [1/79], Loss: 5.588253498077393
Epoch [2/100], Batch [51/79], Loss: 5.212686538696289
Epoch [2/100], Total Loss: 5.151911494098132
Epoch [3/100], Batch [1/79], Loss: 4.780664920806885
Epoch [3/100], Batch [51/79], Loss: 4.5213470458984375
Epoch [3/100], Total Loss: 4.42597576032711
Epoch [4/100], Batch [1/79], Loss: 4.200444221496582
Epoch [4/100], Batch [51/79], Loss: 4.117523193359375
Epoch [4/100], Total Loss: 3.9842699962326242
Epoch [5/100], Batch [1/79], Loss: 3.8674051761627197
Epoch [5/100], Batch [51/79], Loss: 3.858919382095337
Epoch [5/100], Total Loss: 3.7057466808753676
Epoch [6/100], Batch [1/79], Loss: 3.6400296688079834
Epoch [6/100], Batch [51/79], Loss: 3.6512222290039062
Epoch [6/100], Total Loss: 3.4937721173974534
Epoch [7/100], Batch [1/79], Loss: 3.431352376937866
Epoch [7/100], Batch [51/79],

In [24]:
def inference(transformer, tokenizer, starting_word, max_length, temperature=1.0):
    
    transformer.eval()
    # Convert starting and ending words to token IDs
    starting_token_ids = tokenizer.encode(starting_word)

    # Convert token IDs to tensor
    input_tensor = torch.tensor(starting_token_ids).unsqueeze(0).to(get_device())

    # Generate tokens until ending word is reached or maximum length is reached
    with torch.no_grad():
        for _ in range(max_length):
            # Forward pass through the model
            output = transformer(input_tensor)

            # Apply temperature scaling to the logits
            scaled_output = output / temperature

            # Get the last predicted token
            last_token = scaled_output.argmax(dim=-1)[:, -1]

            # Append the last token to the input tensor
            last_token = last_token.unsqueeze(0).to(input_tensor.device)  # Ensure last_token is on the same device as input_tensor
            input_tensor = torch.cat([input_tensor, last_token], dim=-1)

            # Check if the ending word is reached
            if (last_token == tokenizer.eos_token_id):
                break

    # Decode the generated tokens
    generated_text = tokenizer.decode(input_tensor.squeeze().tolist())

    return generated_text

# Define the path where the model is saved
model_path = '/workspace/transformer_v2.pth'

# Load the model from the saved file
transformer.load_state_dict(torch.load(model_path))

# Now you can use the inference function with the loaded and evaluated model
starting_word = "once upon a time,"
max_length = 100
temperature = 0.8

generated_sequence = inference(transformer, tokenizer, starting_word
                               , max_length, temperature)
print("Generated sequence:", generated_sequence)


Generated sequence: once upon a time, Anna got to try a smelly blouse and a swimsuit! She started to dance and dance and dance and dance!"

Anna didn't know what to do. She ran to her mom and said, "I know I don't want to dance."

Mom finally came running and helped her dance. She didn't dance! She never did!

Once again, Anna felt ashamed. She ran to her mom and saw a big rainbow. She couldn't stop. She ran
