# Importing libraries

In [1]:
import os
from dotenv import load_dotenv
from typing import Tuple
import torch
from torch.utils.data import Dataset, DataLoader
import wandb
from src.utils import set_seed, load_text, split_text
from src.config import ModelConfig, TrainConfig, GenerationConfig
from src.tokenizer import CharTokenizer
from models.GPT2 import GPT
from src.train import Trainer

# Configuration

In [2]:
model_config = ModelConfig(
    max_seq_len=256,
    d_embed=128,
    n_layers=4,
    n_heads=4,
    d_head=32,
    d_ff=512,
    router_free=True,
    n_experts=4,
    n_activated_experts=1
)

train_config = TrainConfig(
    debug=False,
    wandb_project="nanoGPT",
    per_device_train_batch_size=256,
    per_device_eval_batch_size=512,
    gradient_accumulation_steps=512 // 256,
    num_train_epochs=1,
    learning_rate=2e-3,
    eval_steps=100,
    mixed_precision=False,
    matmul_precision="high",
)

generation_config = GenerationConfig(
    max_new_tokens=1000
)

## Weights & Biases

In [3]:
load_dotenv()
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Utils

## Reproducibility

In [4]:
set_seed(train_config.seed)

Random seed set to 42


## Device

In [5]:
device = torch.device("cuda")
torch.set_float32_matmul_precision(train_config.matmul_precision)  # Tensor Cores

# Dataset

In [6]:
shakespeare_text = load_text("../datasets/Shakespeare/shakespeare.txt")

Loaded text data from ../datasets/Shakespeare/shakespeare.txt (length: 1115394 characters).


In [7]:
if train_config.debug:
    print(shakespeare_text[:1000])

# Tokenizer

In [8]:
char_tokenizer = CharTokenizer()
char_tokenizer.build_vocab(text=shakespeare_text)
model_config.vocab_size = char_tokenizer.vocab_size

Unique characters: 65


In [9]:
if train_config.debug:
    print(f"Vocabulary size: {char_tokenizer.vocab_size}")
    print("Vocabulary:", char_tokenizer.char2idx)

# Preprocessing

In [10]:
train_text, val_text = split_text(shakespeare_text, val_size=0.1)

In [11]:
class TextDataset(Dataset):
    def __init__(self, text: str, tokenizer: CharTokenizer, max_seq_len: int):
        self.encoded = tokenizer.encode(text)
        self.max_seq_len = max_seq_len

    def __len__(self) -> int:
        return len(self.encoded) - self.max_seq_len

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        input_ids = self.encoded[idx:idx + self.max_seq_len]
        target_ids = self.encoded[idx + 1:idx + self.max_seq_len + 1]
        return input_ids, target_ids

def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    target_ids = torch.stack([item[1] for item in batch])
    return {"input_ids": input_ids, "target_ids": target_ids}

train_dataset = TextDataset(train_text, char_tokenizer, model_config.max_seq_len)
val_dataset = TextDataset(val_text, char_tokenizer, model_config.max_seq_len)

train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    batch_size=train_config.per_device_eval_batch_size,
    shuffle=True,
    num_workers=4
)
val_loader = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    batch_size=train_config.per_device_eval_batch_size,
    shuffle=False,
    num_workers=4
)

In [12]:
if train_config.debug:
    sample_batch = next(iter(train_loader))
    print(f"Sample input IDs: {sample_batch['input_ids'][:5]}")
    print(f"Sample target IDs: {sample_batch['target_ids'][:5]}")

## Model

In [13]:
# Initialize the model
model = GPT(model_config).to(device)
model = torch.compile(model)
print(model)
print(f"Number of parameters: {model.num_params() / 1e6:.2f}M")
print(f"Number of active parameters: {model.num_active_params() / 1e6:.2f}M")

OptimizedModule(
  (_orig_mod): GPT(
    (embedding): Embedding(65, 128)
    (positional_encoding): Embedding(256, 128)
    (dropout): Dropout(p=0.1, inplace=False)
    (blocks): ModuleList(
      (0-3): 4 x Block(
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadAttention(
          (qkv_proj): Linear(in_features=128, out_features=384, bias=True)
          (out_proj): Linear(in_features=128, out_features=128, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): RouterFreeMoE(
          (experts): ModuleList(
            (0-3): 4 x Expert(
              (fc1): Linear(in_features=128, out_features=512, bias=True)
              (activation): GELU(approximate='none')
              (fc2): Linear(in_features=512, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
  

## Speedometer

# Training

In [14]:
trainer = Trainer(
        train_config=train_config,
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        device=device
)
trainer.train()

Training: 100%|██████████| 980/980 [07:11<00:00,  2.27it/s, grad_norm=0.0893, loss=1.4213, lr=0.000000]  


0,1
Epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Grad Norm,▄▄▁▁▂▂█▂▂▂▄▂▂▂▂▃▃▂▂▂▂▂▂▂▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁
Learning Rate,▂▃▅███████████▇▇▇▆▆▆▅▅▄▄▄▄▃▃▃▂▂▂▂▂▁▁▁▁▁▁
Train Loss,██▇▆▅▅▅▅▅▄▄▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Val Loss,█▆▃▂▂▁▁▁▁▁
Val Perplexity,█▅▃▂▁▁▁▁▁▁

0,1
Epoch,1.0
Grad Norm,0.0893
Learning Rate,0.0
Train Loss,1.42131
Val Loss,1.53283
Val Perplexity,4.63125


# Inference

In [15]:
user_prompt = "To be, or not to be, that is the question"
input_ids = char_tokenizer.encode(user_prompt).unsqueeze(0).to(device)
output = model.generate(
    input_ids,
    max_new_tokens=generation_config.max_new_tokens,
    temperature=generation_config.temperature,
    top_k=generation_config.top_k
)
response = char_tokenizer.decode(output[0].squeeze().cpu().numpy())

In [16]:
print("=" * 50)
print("User prompt: ")
print(user_prompt)
print("-" * 50)
print("🤖 Model Response:")
print(response)

User prompt: 
To be, or not to be, that is the question
--------------------------------------------------
🤖 Model Response:
To be, or not to be, that is the question.

BRUTUS:
I do, thank he's so one coil oath? Isable many was she
as Hath run blord in't. For his every person; till wront they
see to shall I have any natured of a matter's sacred
with disposed the king to gates of knee, good strew,
Which is news your bones on to prope ourselves,
So well not them plotesy soul and Angelo alive,
Porney, and he his nature petty is devost,
Masters death in queen command to in the time
That sun swigh-bove went that thou damn'dst,
and the listle of this particular,
And not live mine only furit day.
I'll know what we that cried you?

EDWARD:
His dayst thou decking only good?

BALTHASAR:
Here mayor, be'n away;
Which he hastes fearful Parience to die Coriolanus?

HORTENSIO:
But, give my great send of not countriar.

GLOUCESTER:

PeternIO:
By I do your premoutation.

QUEEN MARGARET:
Unless that did