# Imports

In [39]:
import os
import json
import time
import torch

from torch import nn
from tqdm import tqdm
from collections import Counter
from transformers import AdamW, get_scheduler
from torch.utils.data import DataLoader, Dataset

# Settings

In [40]:
VOCAB_SIZE = 30000  
EMBED_DIM = 128
NUM_LAYERS = 6 
NUM_HEADS = 4 

HIDDEN_DIM = 256
BATCH_SIZE = 32
SEQ_LEN = 128  

EPOCHS = 100
LEARNING_RATE = 5e-4

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"\nWorking on : {DEVICE}\n")

# Data Dictory
MODEL_SAVE_DIR = "../Models"
DATA_PATH = "../Data/shakespeare.txt"


Working on : cuda



# Loading the Data

In [41]:
class TextDataset(Dataset):

    def __init__(self, file_path, tokenizer, seq_len=SEQ_LEN):

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        self.tokens = tokenizer(text)
        self.seq_len = seq_len

    def __len__(self):
        return len(self.tokens) // self.seq_len

    def __getitem__(self, idx):

        start = idx * self.seq_len
        end = start + self.seq_len

        input_ids = self.tokens[start:end]
        target_ids = input_ids[1:] + [0]
        
        return torch.tensor(input_ids), torch.tensor(target_ids)

# Building the Vocabulary

In [42]:
def build_vocab(file_path, vocab_size):

    tokenizer = lambda text: text.split()
    counter = Counter()

    with open(file_path, 'r', encoding='utf-8') as f:
        counter.update(tokenizer(f.read()))

    vocab = {word: i for i, (word, _) in enumerate(counter.most_common(vocab_size))}
    
    return vocab

vocab = build_vocab(DATA_PATH, VOCAB_SIZE)
word_to_id = vocab

id_to_word = {i: word for word, i in vocab.items()}

def tokenize(text):
    return [word_to_id.get(word, 0) for word in text.split()]

# Instanciating the Dataset

In [43]:
data = TextDataset(DATA_PATH, tokenize)
dataloader = DataLoader(data, batch_size=BATCH_SIZE, shuffle=True)

# Building the LLM Model

In [44]:
class SimpleTransformer(nn.Module):

    def __init__(self, vocab_size, embed_dim, num_heads, hidden_dim, num_layers, seq_len):

        super(SimpleTransformer, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(seq_len, embed_dim)

        self.transformer_blocks = nn.ModuleList([
                nn.TransformerEncoderLayer(
                    d_model=embed_dim, nhead=num_heads, dim_feedforward=hidden_dim)
                        for _ in range(num_layers)])
                        
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):

        positions = torch.arange(0, x.size(1), device=x.device).unsqueeze(0)
        x = self.embedding(x) + self.position_embedding(positions)

        for block in self.transformer_blocks:
            x = block(x)
            
        return self.fc(x)

# Instanciating the LLM Model

In [45]:
model = SimpleTransformer(

    vocab_size=VOCAB_SIZE,

    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,

    hidden_dim=HIDDEN_DIM,
    num_layers=NUM_LAYERS,
    
    seq_len=SEQ_LEN).to(DEVICE)

# Training the Model

In [46]:
loss_fn = nn.CrossEntropyLoss(ignore_index=0)
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

num_training_steps = len(dataloader) * EPOCHS
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

model.train()
max_grad_norm = 1.0

for epoch in range(EPOCHS):
    loop = tqdm(dataloader, leave=True)

    for batch in loop:

        input_ids, target_ids = batch
        input_ids = input_ids.to(DEVICE)
        target_ids = target_ids.to(DEVICE)

        outputs = model(input_ids)
        loss = loss_fn(outputs.view(-1, VOCAB_SIZE), target_ids.view(-1))

        optimizer.zero_grad()
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)

        optimizer.step()
        lr_scheduler.step()

        loop.set_description(f"Epoch {epoch}")
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|██████████| 236/236 [00:25<00:00,  9.13it/s, loss=7.74]
Epoch 1: 100%|██████████| 236/236 [00:15<00:00, 14.82it/s, loss=7.5] 
Epoch 2: 100%|██████████| 236/236 [00:14<00:00, 16.21it/s, loss=7.02]
Epoch 3: 100%|██████████| 236/236 [00:13<00:00, 17.99it/s, loss=6.9] 
Epoch 4: 100%|██████████| 236/236 [00:12<00:00, 18.28it/s, loss=6.39]
Epoch 5: 100%|██████████| 236/236 [00:12<00:00, 18.22it/s, loss=6.98]
Epoch 6: 100%|██████████| 236/236 [00:12<00:00, 18.22it/s, loss=6.64]
Epoch 7: 100%|██████████| 236/236 [00:12<00:00, 18.18it/s, loss=6.83]
Epoch 8: 100%|██████████| 236/236 [00:12<00:00, 18.15it/s, loss=6.48]
Epoch 9: 100%|██████████| 236/236 [00:13<00:00, 18.14it/s, loss=6.06]
Epoch 10: 100%|██████████| 236/236 [00:13<00:00, 18.04it/s, loss=6.7] 
Epoch 11: 100%|██████████| 236/236 [00:13<00:00, 18.08it/s, loss=6.3] 
Epoch 12: 100%|██████████| 236/236 [00:13<00:00, 18.08it/s, loss=5.97]
Epoch 13: 100%|██████████| 236/236 [00:13<00:00, 18.08it/s, loss=6.36]
Epoch 14: 100%|█

# Saving the LLM Model

In [47]:
model_settings = {
    "VOCAB_SIZE": VOCAB_SIZE,
    "EMBED_DIM": EMBED_DIM,

    "NUM_LAYERS": NUM_LAYERS,
    "NUM_HEADS": NUM_HEADS,

    "HIDDEN_DIM": HIDDEN_DIM,
    "BATCH_SIZE": BATCH_SIZE,

    "SEQ_LEN": SEQ_LEN,
    "EPOCHS": EPOCHS,
    "LEARNING_RATE": LEARNING_RATE}

def save_model_settings(model, model_save_dir, settings):

    timestamp = time.strftime("%Y%m%d-%H%M%S")
    model_filename = f"shakespeare_LLM_{timestamp}.pth"
    settings_filename = f"shakespeare_LLM_{timestamp}_settings.json"

    # Save the model's state dict
    torch.save(model.state_dict(), os.path.join(model_save_dir, model_filename))

    # Save the hyperparameters/configuration
    with open(os.path.join(model_save_dir, settings_filename), "w") as f:
        json.dump(settings, f)


    print(f"Model saved: {os.path.join(model_save_dir, model_filename)}")
    print(f"Settings saved: {os.path.join(model_save_dir, settings_filename)}")

save_model_settings(model, MODEL_SAVE_DIR, model_settings)

Model saved: ../Models\shakespeare_LLM_20250417-123315.pth
Settings saved: ../Models\shakespeare_LLM_20250417-123315_settings.json
