# Importing libraries

In [1]:
import os
import sys
from dotenv import load_dotenv
from typing import Tuple
import torch
from torch.utils.data import Dataset, DataLoader
from torch.profiler import profile, record_function, ProfilerActivity
import wandb
from src.utils import set_seed, load_text, split_text
from src.config import ModelConfig, TrainConfig, GenerationConfig
from src.train import Trainer
from tokenizer.tokenizer import CharTokenizer
from models.GPT import GPT

In [2]:
PROJECT_ROOT = os.path.abspath(os.getcwd() + "/..")
sys.path.append(PROJECT_ROOT)
print(f"PROJECT_ROOT: {PROJECT_ROOT}")

PROJECT_ROOT: /home/pathfinder/projects/PathFinder


# Configuration

In [3]:
model_config = ModelConfig(
    vocab_size=-1,
    max_seq_len=128,
    d_embed=256,
    n_layers=4,
    attn_type="MLA",
    n_heads=4,
    d_head=64,
    rank=32,
    attn_bias=False,
    d_ff=1024,
    mlp_bias=False,
    flash=True,
    flash_decode=False
)

train_config = TrainConfig(
    debug=False,
    wandb_project="nanoGPT",
    model_name="nanoGPT",
    per_device_train_batch_size=512,
    per_device_eval_batch_size=1024,
    gradient_accumulation_steps=512 // 512,
    num_train_epochs=1,
    learning_rate=5e-4,
    weight_decay=0.01,
    attn_decay=0.5,
    eval_steps=100,
    mixed_precision=True,
    matmul_precision="medium",
)

generation_config = GenerationConfig(
    use_cache=True,
    max_new_tokens=1000,
    temperature=1.0,
    top_k=50
)

In [4]:
load_dotenv()
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/pathfinder/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mpathfinderkr[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

# Utils

## Reproducibility

In [5]:
set_seed(train_config.seed)

Random seed set to 42


## Device

In [6]:
device = torch.device("cuda")
print(f"Device: {torch.cuda.get_device_name(device)}")
torch.set_float32_matmul_precision(train_config.matmul_precision)  # Tensor Cores
print(f"MatMul Precision: {train_config.matmul_precision}")

Device: NVIDIA GeForce RTX 4080 SUPER
MatMul Precision: medium


# Dataset

In [7]:
dataset_path = os.path.join(PROJECT_ROOT, "datasets/Shakespeare/shakespeare.txt")
shakespeare_text = load_text(dataset_path)

Loaded text data from /home/pathfinder/projects/PathFinder/datasets/Shakespeare/shakespeare.txt (length: 1115394 characters).


In [8]:
if train_config.debug:
    subset_shakespeare_text = shakespeare_text[:10000]
    print(subset_shakespeare_text)
    shakespeare_text = subset_shakespeare_text

# Tokenizer

In [9]:
char_tokenizer = CharTokenizer()
char_tokenizer.build_vocab(text=shakespeare_text)
vocab_path = os.path.join(PROJECT_ROOT, "datasets/Shakespeare/vocab.json")
char_tokenizer.save_vocab(vocab_path)
model_config.vocab_size = char_tokenizer.vocab_size

Vocabulary size: 69
Vocabulary saved to /home/pathfinder/projects/PathFinder/datasets/Shakespeare/vocab.json.


In [10]:
if train_config.debug:
    print("Vocabulary:", char_tokenizer.char2idx)

# Preprocessing

In [11]:
train_text, val_text = split_text(shakespeare_text, val_size=0.1)
print(f"Training text length: {len(train_text)} characters")
print(f"Validation text length: {len(val_text)} characters")

Training text length: 1003854 characters
Validation text length: 111540 characters


In [12]:
class TextDataset(Dataset):
    def __init__(self, text: str, tokenizer: CharTokenizer, max_seq_len: int):
        self.encoded = tokenizer.encode(text)
        self.max_seq_len = max_seq_len

    def __len__(self) -> int:
        return len(self.encoded) - self.max_seq_len

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        input_ids = self.encoded[idx:idx + self.max_seq_len]
        target_ids = self.encoded[idx + 1:idx + self.max_seq_len + 1]
        return input_ids, target_ids

def collate_fn(batch):
    input_ids = torch.stack([item[0] for item in batch])
    target_ids = torch.stack([item[1] for item in batch])
    return {
        "input_ids": input_ids,
        #"attention_mask": attention_mask,
        "target_ids": target_ids
    }

train_dataset = TextDataset(train_text, char_tokenizer, model_config.max_seq_len)
val_dataset = TextDataset(val_text, char_tokenizer, model_config.max_seq_len)

train_loader = DataLoader(
    train_dataset,
    collate_fn=collate_fn,
    batch_size=train_config.per_device_eval_batch_size,
    shuffle=True,
    num_workers=4
)
val_loader = DataLoader(
    val_dataset,
    collate_fn=collate_fn,
    batch_size=train_config.per_device_eval_batch_size,
    shuffle=False,
    num_workers=4
)

In [13]:
if train_config.debug:
    sample_batch = next(iter(train_loader))
    print(f"Sample input IDs: {sample_batch['input_ids'][0]}")
    print(f"Sample target IDs: {sample_batch['target_ids'][0]}")

# Model

In [14]:
# Initialize the model
model = GPT(model_config).to(device)
#model = torch.compile(model)
print(model)
print(f"Number of parameters: {model.get_num_params() / 1e6:.2f}M")

GPT(
  (token_embedding): Embedding(69, 256)
  (positional_encoding): Embedding(128, 256)
  (dropout): Dropout(p=0.01, inplace=False)
  (blocks): ModuleList(
    (0-3): 4 x Block(
      (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (attn): MultiHeadAttention(
        (Wq): Linear(in_features=256, out_features=256, bias=False)
        (Wkv_down): Linear(in_features=256, out_features=32, bias=False)
        (Wk_up): Linear(in_features=32, out_features=256, bias=False)
        (Wv_up): Linear(in_features=32, out_features=256, bias=False)
        (out_proj): Linear(in_features=256, out_features=256, bias=False)
        (dropout): Dropout(p=0.01, inplace=False)
      )
      (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
      (mlp): FeedForward(
        (fc1): Linear(in_features=256, out_features=1024, bias=False)
        (fc2): Linear(in_features=1024, out_features=256, bias=False)
        (activation): GELU(approximate='none')
        (dropout): Dro

# Training

In [15]:
trainer = Trainer(
    model=model,
    train_config=train_config,
    train_loader=train_loader,
    val_loader=val_loader,
    device=device,
    master_process=True
)
trainer.train()

Training: 100%|██████████| 981/981 [02:11<00:00,  7.47it/s, epoch=1, grad_norm=0.3991, loss=2.0191, lr=0.000000]


0,1
Grad Norm,▄▂▂▂▂▁▁▁▂▂▁▂▁▂█▂▂▂▅▂▂▂▂▂▃▂▂▂▁▂▁▁▂▁▂▁▁▁▁▁
Learning Rate,▁▁▂▃▆███▇▇▇▇▇▇▆▆▆▆▆▆▆▅▅▅▄▃▃▃▃▃▂▂▂▂▁▁▁▁▁▁
Train Loss,█▇▅▃▃▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
Val Loss,█▆▅▄▃▂▂▁▁▁
Val Perplexity,█▆▅▃▂▂▁▁▁▁

0,1
Grad Norm,0.39908
Learning Rate,0.0
Train Loss,2.01915
Val Loss,2.07878
Val Perplexity,7.99468


## Save the model

In [16]:
if not train_config.debug:
    pass
    #output_dir = os.path.join(PROJECT_ROOT, "checkpoints", train_config.model_name, train_config.run_name)
    #os.makedirs(output_dir, exist_ok=True)
    #try:
    #    model.save_pretrained(
    #        output_dir,
    #        safe_serialization=True
    #    )
    #    print("Model saved successfully")
    #xcept Exception as e:
    #    print(f"Error saving model: {e}")
    # Push to Hugging Face Hub
    #model.push_to_hub(
    #    repo_id=f"PathFinderKR/{train_config.model_name}-{train_config.run_name}",
    #    private=True,
    #    use_auth_token=os.environ.get("HUGGINGFACE_TOKEN")
    #)
    #print(f"Model pushed to Hugging Face Hub: PathFinderKR/{train_config.model_name}-{train_config.run_name}")

In [17]:
# To load the model later, you can use:
# model = GPT.from_pretrained(output_dir).to(device)

# Inference

In [18]:
user_prompt = "To be, or not to be, that is the question"
input_ids = char_tokenizer.encode(user_prompt).unsqueeze(0).to(device)
output = model.generate(
    input_ids,
    use_cache=True,
    max_new_tokens=generation_config.max_new_tokens,
    temperature=generation_config.temperature,
    top_k=generation_config.top_k,
    tokenizer=char_tokenizer
)
response = char_tokenizer.decode(output[0].squeeze().cpu().numpy())

g of
cockss, you by wim, excoow you that throd.

DUKE VING Pas! Ro.

CARICETIUS ICKE, I [91mReset KV cache[0m
thoth iner soad ie chind of we northe.

CLONIUS:
Iby, Mower borcrtict.

NICILA:
Thoh noms, the's deaver our flerok!
O, say, thim[91mReset KV cache[0m
 oules chice viss bet,
As yould in nol Dedoung mough to abean show.

AUCININIUS:
Nloman, and kidy knose abery mowsge to I inmor
[91mReset KV cache[0m

RENGHARET:
I this; nol you, kirse purke hemecer.

Acomenow'd lor thas I was not Ro pensusy and by shane
Whim do bucl beal and n[91mReset KV cache[0m
tm, wil
Hef anvime bealp.
If why, faingh, to bumet as in rardvepath's hay.

MISth OMERK:
If to eact will. Bay, lood Due
Thath an[91mReset KV cache[0m

We pace,
That ook deer. Wil arvoow, was amyine I ware:
I dove the'll, a the for us tobl, and tis and nom'd ouid.

LONGERE VTRT:[91mReset KV cache[0m

thy if putrcaked, who willl fir ves,
Fillong blom, Ogor deary bert thate fee,
truny.

DUCOLUS:
I wes well pacunfy prusio,
My I 

AttributeError: 'NoneType' object has no attribute '_log'

In [None]:
print("=" * 50)
print("User prompt: ")
print(user_prompt)
print("-" * 50)
print("🤖 Model Response:")
print(response)

# Profiling

In [None]:
if train_config.debug:
    input_ids = torch.randint(0, model_config.vocab_size, (1, model_config.max_seq_len), device=device)
    with profile(activities=[ProfilerActivity.CUDA], record_shapes=True) as prof:
        with record_function("model_inference"):
            model(input_ids)
    print(prof.key_averages(group_by_input_shape=True).table(sort_by="cuda_time_total", row_limit=20))