# Importing libraries

In [None]:
import os
from dotenv import load_dotenv
from typing import Tuple
import torch
from torch.utils.data import Dataset, DataLoader
import wandb
from src.utils import set_seed, load_text, split_text
from src.config import ModelConfig, TrainConfig, GenerationConfig
from src.tokenizer import CharTokenizer
from models.GPT2 import GPT
from src.train import Trainer

# Configuration

In [None]:
# Architecture hyperparameters handed to the GPT constructor further down.
# `vocab_size` is not set here; it is assigned once the tokenizer has built
# its vocabulary.
model_config = ModelConfig(
    # Network size
    n_layers=4,
    d_embed=128,
    d_ff=512,
    # Attention heads
    n_heads=4,
    d_head=32,
    # Window length; also used as the sample length of the datasets
    max_seq_len=256,
    # Expert-related settings (semantics are defined by src.config.ModelConfig)
    n_experts=4,
    n_activated_experts=1,
    router_free=True,
)

# Run settings consumed by the Trainer; `debug` also gates the notebook's
# diagnostic print cells.
train_config = TrainConfig(
    # Run identification
    wandb_project="nanoGPT",
    model_name="nanoGPT",
    debug=False,
    # Batching: 256 samples per step, accumulated over 512 // 256 = 2 steps
    per_device_train_batch_size=256,
    gradient_accumulation_steps=512 // 256,
    per_device_eval_batch_size=512,
    # Schedule
    num_train_epochs=1,
    learning_rate=2e-3,
    eval_steps=100,
    # Numerics; `matmul_precision` is forwarded to
    # torch.set_float32_matmul_precision in the device cell
    mixed_precision=False,
    matmul_precision="high",
)

# Only the generation length is overridden; `temperature` and `top_k`, which
# the inference cell reads, keep whatever defaults GenerationConfig defines.
generation_config = GenerationConfig(max_new_tokens=1000)

In [None]:
# Load variables from a .env file into the process environment, then log in to
# Weights & Biases with the API key found there (None when the variable is unset).
load_dotenv()
wandb.login(key=os.getenv("WANDB_API_KEY"))

# Utils

## Reproducibility

In [None]:
# Fix the random seed for reproducibility. `seed` is not passed to TrainConfig
# in this notebook, so the value comes from TrainConfig's own default.
# NOTE(review): which RNGs get seeded is decided inside src.utils.set_seed — confirm there.
set_seed(train_config.seed)

## Device

In [None]:
# Prefer the GPU, but fall back to the CPU so the notebook still runs on
# machines without CUDA (torch.cuda.get_device_name raises there).
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Device: {torch.cuda.get_device_name(device)}")
else:
    device = torch.device("cpu")
    print("Device: cpu (CUDA not available)")
# Float32 matmul precision as configured in TrainConfig.
torch.set_float32_matmul_precision(train_config.matmul_precision)  # Tensor Cores

# Dataset

In [None]:
# Read the training corpus from disk. The result is sliced and printed like a
# string below and passed to the tokenizer as `text`.
# NOTE(review): presumably a single `str` — confirm in src.utils.load_text.
shakespeare_text = load_text("datasets/Shakespeare/shakespeare.txt")

In [None]:
# Debug aid: show the first 1000 characters of the corpus.
if train_config.debug:
    preview = shakespeare_text[:1000]
    print(preview)

# Tokenizer

In [None]:
# Build the tokenizer's vocabulary from the full corpus (before the
# train/validation split, so both splits share one vocabulary).
char_tokenizer = CharTokenizer()
char_tokenizer.build_vocab(text=shakespeare_text)
# The vocabulary size is only known at this point, so it is written into the
# model configuration here rather than in the configuration cell.
model_config.vocab_size = char_tokenizer.vocab_size

In [None]:
# Debug aid: report the vocabulary size and the tokenizer's `char2idx` table.
if train_config.debug:
    print(f"Vocabulary size: {char_tokenizer.vocab_size}")
    vocab_table = char_tokenizer.char2idx
    print("Vocabulary:", vocab_table)

# Preprocessing

In [None]:
# Split the corpus with val_size=0.1 and report the size of each part.
train_text, val_text = split_text(shakespeare_text, val_size=0.1)
print(
    f"Training text length: {len(train_text)} characters",
    f"Validation text length: {len(val_text)} characters",
    sep="\n",
)

In [None]:
class TextDataset(Dataset):
    """Sliding-window dataset for next-token prediction over one encoded text.

    Item ``i`` pairs the window ``encoded[i:i + max_seq_len]`` with the same
    window shifted one position to the right, so each input position has the
    token that follows it as its target.

    Args:
        text: Raw text to encode.
        tokenizer: Object whose ``encode(text)`` returns a sliceable sequence
            of token ids (the rest of this file treats it as a tensor).
        max_seq_len: Number of tokens in every input/target window.
    """

    def __init__(self, text: str, tokenizer: "CharTokenizer", max_seq_len: int):
        self.encoded = tokenizer.encode(text)
        self.max_seq_len = max_seq_len

    def __len__(self) -> int:
        """Return the number of complete (input, target) windows."""
        # The last start index must leave room for the shifted target window.
        # Clamp at zero so a text no longer than one window gives an empty
        # dataset instead of a negative length, which makes len() raise.
        return max(0, len(self.encoded) - self.max_seq_len)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``idx``-th window and its one-step-shifted target.

        Negative indices count from the end, as for built-in sequences.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        if idx < 0:
            idx += size
        # Slicing never fails on its own, so without this check out-of-range
        # indices would silently return truncated windows and plain iteration
        # over the dataset would never terminate.
        if not 0 <= idx < size:
            raise IndexError("TextDataset index out of range")
        stop = idx + self.max_seq_len
        input_ids = self.encoded[idx:stop]
        target_ids = self.encoded[idx + 1:stop + 1]
        return input_ids, target_ids

def collate_fn(batch):
    """Stack per-sample (input, target) pairs into batched tensors.

    Args:
        batch: Sequence of samples; element 0 of each sample is the input
            tensor and element 1 the target tensor.

    Returns:
        Dict with keys ``"input_ids"`` and ``"target_ids"``, each holding the
        corresponding tensors stacked along a new leading dimension.
    """
    field_names = ("input_ids", "target_ids")
    return {
        name: torch.stack([sample[position] for sample in batch])
        for position, name in enumerate(field_names)
    }

# Wrap each split in a sliding-window dataset whose window length matches the
# model's maximum sequence length.
train_dataset = TextDataset(train_text, char_tokenizer, model_config.max_seq_len)
val_dataset = TextDataset(val_text, char_tokenizer, model_config.max_seq_len)

train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    # Use the *train* batch size here. The eval batch size (512) would double
    # the effective batch implied by gradient_accumulation_steps (512 // 256).
    batch_size=train_config.per_device_train_batch_size,
    shuffle=True,
    num_workers=4
)
# Validation keeps a fixed order and uses the eval batch size.
val_loader = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    batch_size=train_config.per_device_eval_batch_size,
    shuffle=False,
    num_workers=4
)

In [None]:
# Debug aid: pull one batch from the training loader and show the first five
# rows of its inputs and targets.
if train_config.debug:
    sample_batch = next(iter(train_loader))
    print(
        f"Sample input IDs: {sample_batch['input_ids'][:5]}",
        f"Sample target IDs: {sample_batch['target_ids'][:5]}",
        sep="\n",
    )

# Model

In [None]:
# Build the GPT from the model configuration, move it to the target device and
# wrap it with torch.compile in a single expression.
model = torch.compile(GPT(model_config).to(device))
print(model)
# Parameter counts are reported in millions.
print(f"Number of parameters: {model.num_params() / 1e6:.2f}M")
print(f"Number of active parameters: {model.num_active_params() / 1e6:.2f}M")

## Speedometer

# Training

In [None]:
# Hand the compiled model, both data loaders and the run settings to the
# project's Trainer, then start training.
trainer = Trainer(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    train_config=train_config,
    device=device,
    master_process=True,
)
trainer.train()

## Save the model

In [None]:
# Save model locally under checkpoints/<model_name>/<run_name>.
output_dir = f"checkpoints/{train_config.model_name}/{train_config.run_name}"
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(
    output_dir,
    safe_serialization=True
)
# Push to Hugging Face Hub as a private repository.
model.push_to_hub(
    repo_id=f"PathFinderKR/{train_config.model_name}-{train_config.run_name}",
    private=True,
    # `token` is the current keyword for the access token; `use_auth_token`
    # is deprecated in the Hugging Face libraries.
    token=os.environ.get("HUGGINGFACE_TOKEN")
)

In [None]:
# To load the model later, you can use:
# model = GPT(model_config)
# model = model.from_pretrained(output_dir).to(device)

# Inference

In [None]:
# Encode the prompt, add a leading dimension (presumably the batch axis) and
# move it to the target device.
user_prompt = "To be, or not to be, that is the question"
input_ids = char_tokenizer.encode(user_prompt).unsqueeze(0).to(device)

# Sampling settings all come from the generation configuration.
sampling_kwargs = {
    "max_new_tokens": generation_config.max_new_tokens,
    "temperature": generation_config.temperature,
    "top_k": generation_config.top_k,
}
output = model.generate(input_ids, **sampling_kwargs)

# Take the first returned sequence, bring it to host memory as a NumPy array
# and turn the ids back into text.
generated_ids = output[0].squeeze().cpu().numpy()
response = char_tokenizer.decode(generated_ids)

In [None]:
# Show the prompt and the generated continuation, one item per line, separated
# by 50-character rules.
print(
    "=" * 50,
    "User prompt: ",
    user_prompt,
    "-" * 50,
    "🤖 Model Response:",
    response,
    sep="\n",
)