# Importing libraries

In [1]:
import os
from dotenv import load_dotenv
from typing import Tuple
import torch
from torch.utils.data import Dataset, DataLoader
import wandb
from src.utils import set_seed, load_text, split_text, speedometer
from src.config import ModelConfig, TrainConfig, GenerationConfig
from src.tokenizer import CharTokenizer
from models.GPT import GPT
from src.train import Trainer



# Configuration

In [2]:
model_config = ModelConfig(
    vocab_size=-1,
    max_seq_len=128,
    d_embed=128,
    n_layers=4,
    n_heads=4,
    d_head=32,
    d_ff=512
)

train_config = TrainConfig(
    debug=False,
    wandb_project="nanoGPT",
    model_name="nanoGPT",
    per_device_train_batch_size=512,
    per_device_eval_batch_size=1024,
    gradient_accumulation_steps=512 // 512,
    num_train_epochs=1,
    learning_rate=5e-3,
    eval_steps=100,
    mixed_precision=True,
    matmul_precision="high",
)

generation_config = GenerationConfig(
    use_cache=True,
    max_new_tokens=1000,
    temperature=1.0,
    top_k=50
)

In [3]:
load_dotenv()
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pathfinder/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Utils

## Reproducibility

In [4]:
set_seed(train_config.seed)

Random seed set to 42


## Device

In [5]:
device = torch.device("cuda")
print(f"Device: {torch.cuda.get_device_name(device)}")
torch.set_float32_matmul_precision(train_config.matmul_precision)  # Tensor Cores

Device: NVIDIA GeForce RTX 4080 SUPER


# Dataset

In [6]:
shakespeare_text = load_text("datasets/Shakespeare/shakespeare.txt")

Loaded text data from datasets/Shakespeare/shakespeare.txt (length: 1115394 characters).


In [7]:
if train_config.debug:
    subset_shakespeare_text = shakespeare_text[:10000]
    print(subset_shakespeare_text)
    shakespeare_text = subset_shakespeare_text

# Tokenizer

In [8]:
char_tokenizer = CharTokenizer()
char_tokenizer.build_vocab(text=shakespeare_text)
char_tokenizer.save_vocab("char_vocab.json")
model_config.vocab_size = char_tokenizer.vocab_size

Vocabulary size: 69
Vocabulary saved to char_vocab.json.


In [9]:
if train_config.debug:
    print("Vocabulary:", char_tokenizer.char2idx)

# Preprocessing

In [10]:
train_text, val_text = split_text(shakespeare_text, val_size=0.1)
print(f"Training text length: {len(train_text)} characters")
print(f"Validation text length: {len(val_text)} characters")

Training text length: 1003854 characters
Validation text length: 111540 characters


In [11]:
class TextDataset(Dataset):
    def __init__(self, text: str, tokenizer: CharTokenizer, max_seq_len: int):
        self.encoded = tokenizer.encode(text)
        self.max_seq_len = max_seq_len

    def __len__(self) -> int:
        return len(self.encoded) - self.max_seq_len

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        input_ids = self.encoded[idx:idx + self.max_seq_len]
        target_ids = self.encoded[idx + 1:idx + self.max_seq_len + 1]
        return input_ids, target_ids

def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    target_ids = torch.stack([item[1] for item in batch])
    return {"input_ids": input_ids, "target_ids": target_ids}

train_dataset = TextDataset(train_text, char_tokenizer, model_config.max_seq_len)
val_dataset = TextDataset(val_text, char_tokenizer, model_config.max_seq_len)

train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    batch_size=train_config.per_device_eval_batch_size,
    shuffle=True,
    num_workers=4
)
val_loader = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    batch_size=train_config.per_device_eval_batch_size,
    shuffle=False,
    num_workers=4
)

In [12]:
if train_config.debug:
    sample_batch = next(iter(train_loader))
    print(f"Sample input IDs: {sample_batch['input_ids'][0]}")
    print(f"Sample target IDs: {sample_batch['target_ids'][0]}")

# Model

In [13]:
# Initialize the model
model = GPT(model_config).to(device)
model = torch.compile(model)
print(model)
print(f"Number of parameters: {model.num_params() / 1e6:.2f}M")

OptimizedModule(
  (_orig_mod): GPT(
    (token_embedding): Embedding(69, 128)
    (positional_encoding): Embedding(128, 128)
    (dropout): Dropout(p=0.1, inplace=False)
    (blocks): ModuleList(
      (0-3): 4 x Block(
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (attn): MultiHeadAttention(
          (qkv_proj): Linear(in_features=128, out_features=384, bias=False)
          (out_proj): Linear(in_features=128, out_features=128, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (mlp): FeedForward(
          (fc1): Linear(in_features=128, out_features=512, bias=False)
          (activation): GELU(approximate='none')
          (fc2): Linear(in_features=512, out_features=128, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (lm_head): Linear(in_fea

# Training

In [14]:
trainer = Trainer(
    model=model,
    train_config=train_config,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    master_process=True
)
trainer.train()

Training: 100%|██████████| 981/981 [00:58<00:00, 16.66it/s, epoch=1, grad_norm=0.1827, loss=1.2972, lr=0.000000]


0,1
Grad Norm,█▂▂▂▂▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Learning Rate,▁▂▂▄▄▇▇▇█████████▇▇▇▆▆▆▆▅▅▄▄▃▃▃▃▂▂▂▁▁▁▁▁
Train Loss,█▇▇▆▆▅▅▄▄▃▃▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Val Loss,█▄▂▂▁▁▁▁▁▁
Val Perplexity,█▃▂▁▁▁▁▁▁▁

0,1
Grad Norm,0.18272
Learning Rate,0.0
Train Loss,1.29722
Val Loss,1.47043
Val Perplexity,4.35112


## Save the model

In [15]:
# Save model locally
output_dir = f"checkpoints/{train_config.model_name}/{train_config.run_name}"
os.makedirs(output_dir, exist_ok=True)
try:
    model.save_pretrained(
        output_dir,
        safe_serialization=True
    )
    print("Model saved successfully.")
except Exception as e:
    print(f"Error saving model: {e}")
# Push to Hugging Face Hub
#model.push_to_hub(
#    repo_id=f"PathFinderKR/{train_config.model_name}-{train_config.run_name}",
#    private=True,
#    use_auth_token=os.environ.get("HUGGINGFACE_TOKEN")
#)
#print(f"Model pushed to Hugging Face Hub: PathFinderKR/{train_config.model_name}-{train_config.run_name}")

Model saved successfully.


In [16]:
# To load the model later, you can use:
# model = GPT(model_config)
# model = model.from_pretrained(output_dir).to(device)

# Inference

In [17]:
user_prompt = "To be, or not to be, that is the question"
input_ids = char_tokenizer.encode(user_prompt).unsqueeze(0).to(device)
output = model.generate(
    input_ids,
    use_cache=True,
    max_new_tokens=generation_config.max_new_tokens,
    temperature=generation_config.temperature,
    top_k=generation_config.top_k,
    tokenizer=char_tokenizer
)
response = char_tokenizer.decode(output[0].squeeze().cpu().numpy())

,
That you would have die: Spoke as I come.

First Citizen:
Nay, the father.

CORIOLANUSResetting KV cache
ICINIUS:
I had
You'll power, good Cabal,
I prove my son!'

CORIOLANUS:
Why, farewell:
Then, our loves as what will I would have Resetting KV cache
with thy railty at injured whens
he may die her inconcilent to lose.
If lose of Edward's father, a honour kind's but
Infess and Resetting KV cache
lie, I'll away to my wive.

SAMPSON:
To my knock pitch me speak, it will, by my young,
being given the purbrications and miniqueResetting KV cache
 Richard ktom, and yet some short resist
Upon your son that seems; he then sings you dear me.
But all these brothers condemning Resetting KV cache
be compass'd. He's grey that would I did.
A merry, my lord, I'll get on my wife,
And not to chave an enterchange to your determiResetting KV cache
m:
And both giving me life, I think thee,
But so have summon night.
I should stone in me; but this is the gates of the
night of Resetting KV cache
fair,
From 

In [18]:
print("=" * 50)
print("User prompt: ")
print(user_prompt)
print("-" * 50)
print("🤖 Model Response:")
print(response)

User prompt: 
To be, or not to be, that is the question
--------------------------------------------------
🤖 Model Response:
To be, or not to be, that is the question,
That you would have die: Spoke as I come.

First Citizen:
Nay, the father.

CORIOLANUSICINIUS:
I had
You'll power, good Cabal,
I prove my son!'

CORIOLANUS:
Why, farewell:
Then, our loves as what will I would have with thy railty at injured whens
he may die her inconcilent to lose.
If lose of Edward's father, a honour kind's but
Infess and lie, I'll away to my wive.

SAMPSON:
To my knock pitch me speak, it will, by my young,
being given the purbrications and minique Richard ktom, and yet some short resist
Upon your son that seems; he then sings you dear me.
But all these brothers condemning be compass'd. He's grey that would I did.
A merry, my lord, I'll get on my wife,
And not to chave an enterchange to your determim:
And both giving me life, I think thee,
But so have summon night.
I should stone in me; but this is the 

# Speedometer

In [34]:
speedometer(
    model=model,
    input_ids=char_tokenizer.encode("a").unsqueeze(0).to(device),
    use_cache=True,
    warmup_tokens=100,
    timing_tokens=100,
    num_runs=5
)

Average total time: 153.98 ms
Time per token: 1.54 ms
Tokens per second: 649.45


In [32]:
speedometer(
    model=model,
    input_ids=char_tokenizer.encode("a").unsqueeze(0).to(device),
    use_cache=False,
    warmup_tokens=100,
    timing_tokens=100,
    num_runs=5
)

Mean time: 1.406719970703125 ms
