In [4]:
from datasets import load_dataset
import sentencepiece as spm
# Load the trained SentencePiece model and the TinyStories corpus.
sp = spm.SentencePieceProcessor()
sp.load('tinystorycustom.model')
dataset = load_dataset("roneneldan/TinyStories")
train_data = dataset["train"]["text"]

# Length (in token ids) of the longest training story; stays 0 if there are no stories.
max_length = max((len(sp.encode_as_ids(story)) for story in train_data), default=0)
print(max_length)

Repo card metadata block was not found. Setting CardData to empty.


1194


In [78]:
# Inspect which piece string the tokenizer maps id 1 to.
sp.IdToPiece(1)

'<s>'

In [80]:
# Inspect which id the tokenizer assigns to the '</s>' piece.
sp.PieceToId('</s>')

2

In [86]:
# Resolve the ids of the special pieces used when terminating and padding sequences.
# NOTE(review): confirm '<pad>' is actually part of the trained vocabulary.
eos_token_id = sp.PieceToId('</s>')
pad_token_id = sp.PieceToId('<pad>')
print(eos_token_id)

2


In [79]:
def encode_and_pad(text, max_length, pad_token_id, eos_id=None, tokenizer=None):
    """Tokenise ``text``, append the end-of-sequence id and right-pad the result.

    Args:
        text: Raw string to encode.
        max_length: Maximum number of content tokens kept; longer encodings
            are truncated so every returned row has the same length.
        pad_token_id: Id used to fill the tail of the sequence.
        eos_id: End-of-sequence id appended after the content tokens.
            Defaults to the notebook-level ``eos_token_id``.
        tokenizer: Object exposing ``encode_as_ids(text)``. Defaults to the
            notebook-level ``sp`` processor.

    Returns:
        A flat list of exactly ``max_length + 1`` token ids
        (content tokens, one EOS id, then padding).
    """
    if tokenizer is None:
        tokenizer = sp
    if eos_id is None:
        eos_id = eos_token_id

    # Truncate so that over-long inputs cannot produce ragged rows.
    encoded_text = list(tokenizer.encode_as_ids(text))[:max_length]
    padding = [pad_token_id] * (max_length - len(encoded_text))

    # Return the list itself (a stray trailing comma previously wrapped it in a tuple).
    return encoded_text + [eos_id] + padding

# Encode every training story and pad it to the common length.
padded_stories = []
for story in dataset["train"]["text"]:
    padded_stories.append(encode_and_pad(story, max_length, pad_token_id))



In [None]:
import torch
import torch.nn as nn
import math

class TransformerBlock(nn.Module):
    """One post-norm transformer layer: multi-head self-attention + feed-forward.

    Args:
        input_dim: Size of the model (embedding) dimension; must be divisible
            by ``num_heads``.
        num_heads: Number of attention heads.
        dim_feedforward: Hidden width of the position-wise feed-forward net.
        dropout: Dropout probability used after attention and feed-forward.
    """

    def __init__(self, input_dim, num_heads, dim_feedforward, dropout=0.1):
        super(TransformerBlock, self).__init__()
        if input_dim % num_heads != 0:
            raise ValueError(
                f"input_dim ({input_dim}) must be divisible by num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.dim_feedforward = dim_feedforward

        self.qkv_proj = nn.Linear(input_dim, 3 * input_dim)
        self.fc_out = nn.Linear(input_dim, input_dim)
        self.norm1 = nn.LayerNorm(input_dim)
        self.norm2 = nn.LayerNorm(input_dim)

        self.dropout = nn.Dropout(dropout)
        self.feed_forward = nn.Sequential(
            nn.Linear(input_dim, dim_feedforward),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(dim_feedforward, input_dim),
        )

    def scaled_dot_product_attention(self, q, k, v, mask=None):
        """Return softmax(q k^T / sqrt(d_k)) v.

        Args:
            q, k, v: Tensors whose last two dimensions are (seq, head_dim).
            mask: Optional boolean tensor broadcastable to the attention
                logits; ``True`` marks positions that must not be attended to.
        """
        d_k = q.size(-1)
        attn_logits = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            # -inf logits receive zero weight after the softmax.
            attn_logits = attn_logits.masked_fill(mask, float("-inf"))
        attention = torch.softmax(attn_logits, dim=-1)
        values = torch.matmul(attention, v)
        return values

    def forward(self, x, mask=None):
        """Apply the layer to ``x`` of shape (batch, seq, dim).

        Args:
            x: Input activations.
            mask: Optional boolean attention mask (``True`` = blocked),
                forwarded to ``scaled_dot_product_attention``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        batch_size, seq_length, dim = x.size()
        head_dim = dim // self.num_heads

        # Linear projection splits Q, K, and V, and then reshape and transpose for multi-head attention
        qkv = self.qkv_proj(x)
        q, k, v = qkv.chunk(3, dim=-1)
        q = q.view(batch_size, seq_length, self.num_heads, head_dim).transpose(1, 2)
        k = k.view(batch_size, seq_length, self.num_heads, head_dim).transpose(1, 2)
        v = v.view(batch_size, seq_length, self.num_heads, head_dim).transpose(1, 2)

        # Scaled dot-product attention
        attn = self.scaled_dot_product_attention(q, k, v, mask)
        attn = attn.transpose(1, 2).contiguous().view(batch_size, seq_length, dim)
        # Mix the concatenated heads with the output projection (previously
        # created but never applied).
        attn = self.fc_out(attn)
        attn = self.dropout(attn)

        # Add & Norm
        x = x + attn
        x = self.norm1(x)

        # Feed-forward network
        ff_out = self.feed_forward(x)
        ff_out = self.dropout(ff_out)

        # Add & Norm
        x = x + ff_out
        x = self.norm2(x)

        return x


class TransformerModel(nn.Module):
    """Token-level language model built from ``TransformerBlock`` layers.

    Args:
        vocab_size: Number of token ids (embedding rows and output logits).
        input_dim: Model (embedding) dimension.
        num_heads: Attention heads per layer.
        dim_feedforward: Hidden width of each feed-forward net.
        num_layers: Number of stacked ``TransformerBlock`` layers.
        dropout: Dropout probability for the embeddings and every layer.
        max_len: Number of learned positions, i.e. the longest accepted sequence.
        causal: When ``True`` (default) each position only attends to itself
            and earlier positions, as required for next-token prediction.
    """

    def __init__(self, vocab_size, input_dim, num_heads, dim_feedforward, num_layers,
                 dropout=0.1, max_len=512, causal=True):
        super(TransformerModel, self).__init__()
        self.max_len = max_len
        self.causal = causal
        self.embed = nn.Embedding(vocab_size, input_dim)
        self.pos_encoder = nn.Parameter(torch.zeros(1, max_len, input_dim))
        # Registered as a sub-module so that model.eval() actually disables it.
        self.dropout = nn.Dropout(dropout)
        self.layers = nn.ModuleList([TransformerBlock(input_dim, num_heads, dim_feedforward, dropout) for _ in range(num_layers)])
        self.fc_out = nn.Linear(input_dim, vocab_size)

    def forward(self, src):
        """Map token ids of shape (batch, seq) to logits of shape (batch, seq, vocab_size).

        Raises:
            ValueError: If the sequence is longer than ``max_len``.
        """
        seq_length = src.size(1)
        if seq_length > self.max_len:
            raise ValueError(
                f"sequence length {seq_length} exceeds max_len {self.max_len}"
            )

        src = self.embed(src) * math.sqrt(self.embed.embedding_dim)
        src = src + self.pos_encoder[:, :seq_length]
        src = self.dropout(src)

        mask = None
        if self.causal:
            # True strictly above the diagonal: position i may not see j > i.
            mask = torch.triu(
                torch.ones(seq_length, seq_length, dtype=torch.bool, device=src.device),
                diagonal=1,
            )

        for layer in self.layers:
            src = layer(src, mask)

        output = self.fc_out(src)
        return output

# Hyper-parameters for the example model.
vocab_size = 16000      # number of token ids
input_dim = 512         # embedding / model width
num_heads = 8           # attention heads per block
dim_feedforward = 2048  # hidden width of the feed-forward net
num_layers = 1          # number of stacked transformer blocks

# Build the model from the settings above.
transformer_model = TransformerModel(
    vocab_size=vocab_size,
    input_dim=input_dim,
    num_heads=num_heads,
    dim_feedforward=dim_feedforward,
    num_layers=num_layers,
)

from torch.utils.data import DataLoader, TensorDataset

# Example data (token IDs)



def train_model(model, data_loader, epochs, criterion, optimizer):
    """Train ``model`` for next-token prediction and print the mean loss per epoch.

    Args:
        model: Module mapping token ids (batch, seq) to logits (batch, seq, vocab).
        data_loader: Iterable of ``(batch, tgt)`` pairs of equal shape, where
            ``tgt`` is ``batch`` shifted left by one position (as produced by
            ``torch.roll(batch, shifts=-1, dims=1)``). Must support ``len()``.
        epochs: Number of passes over ``data_loader``.
        criterion: Loss taking ``(logits[N, vocab], targets[N])``.
        optimizer: Optimizer already bound to ``model``'s parameters.
    """
    model.train()
    num_batches = len(data_loader)
    for epoch in range(epochs):
        total_loss = 0.0
        for batch, tgt in data_loader:
            optimizer.zero_grad()

            # The last column of a rolled target is the wrapped-around first
            # token, so drop the final position from both tensors.
            inputs = batch[:, :-1]
            targets = tgt[:, :-1].reshape(-1)

            # Forward pass through the model that was passed in (not a global).
            output = model(inputs)
            output = output.reshape(-1, output.size(-1))

            # Calculate loss
            loss = criterion(output, targets)
            total_loss += loss.item()

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # Avoid a ZeroDivisionError when the loader yields no batches.
        avg_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

# `padded_stories` is expected to be a list of equal-length lists of token ids.
# The model only has `pos_encoder.size(1)` learned positions, and the training
# loop feeds all but the last column, so keep at most that many tokens plus one.
max_positions = transformer_model.pos_encoder.size(1)
input_sequences = torch.tensor(
    [story[:max_positions + 1] for story in padded_stories],
    dtype=torch.long,
)
# Next-token targets: the same ids shifted left by one (last column wraps around).
target_sequences = torch.roll(input_sequences, shifts=-1, dims=1)

# Create dataset and data loader
# NOTE: this rebinds `dataset`, shadowing the Hugging Face dataset loaded in an
# earlier cell; re-run that cell before re-running anything that indexes it.
dataset = TensorDataset(input_sequences, target_sequences)
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Padding positions carry no signal, so exclude them from the loss.
# NOTE(review): confirm `pad_token_id` is a dedicated id in the vocabulary.
criterion = nn.CrossEntropyLoss(ignore_index=pad_token_id)
# `optim` was never imported; reach Adam through the already-imported torch.
optimizer = torch.optim.Adam(transformer_model.parameters())

epochs = 5  # Number of epochs to train
train_model(transformer_model, data_loader, epochs, criterion, optimizer)
