# Create Folders

In [1]:
import os
# Create the project's working directories; existing ones are left untouched.
for folder in ('configs', 'train', 'inference', 'data', 'rlhf'):
    os.makedirs(folder, exist_ok=True)

In [2]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q wandb



In [3]:
import os
from kaggle_secrets import UserSecretsClient

# Export the Weights & Biases settings through the process environment.
# The API key is read from Kaggle Secrets rather than written into the notebook.
for env_name, env_value in (
    ("WANDB_API_KEY", UserSecretsClient().get_secret("WANDB_API_KEY")),
    ("WANDB_PROJECT", "llm-from-scratch"),
    ("WANDB_ENTITY", "rahulkrish28-california-state-university-fullerton"),
):
    os.environ[env_name] = env_value

In [12]:
%%writefile configs/train_base.yaml

# Model and dataset identifiers (passed to from_pretrained / load_dataset)
model_name: gpt2
dataset_name: wikitext
dataset_config: wikitext-2-raw-v1

# Tokenization / batching
max_length: 512
batch_size: 2
num_epochs: 1

# Optimization
# Written with a decimal point on purpose: PyYAML's safe_load follows YAML 1.1
# and returns the *string* "5e-5" for the dot-less spelling.
learning_rate: 5.0e-5
weight_decay: 0.01
warmup_ratio: 0.1
max_grad_norm: 1.0

# One output directory per training script
output_dir: outputs/train_lm                # train/train_lm.py
output_dir2: outputs/train_lm_wandb         # train/train_lm_wandb.py
output_dir3: outputs/train_lm_ddp           # train/train_lm_ddp.py
output_dir4: outputs/train_lm_amp_ckpt      # train/train_lm_amp_ckpt.py
output_dir5: outputs/train_lm_lora          # train/train_lm_lora.py
seed: 42

Overwriting configs/train_base.yaml


In [5]:
%%writefile configs/inference.yaml
# =========================
# Inference Configuration
# =========================

# -------------------------
# Model
# -------------------------
model_dir: "outputs/train_lm"   # same as cfg["output_dir"] during training
device: "auto"                  # auto | cuda | cpu

# -------------------------
# Tokenization
# -------------------------
max_input_length: 512           # truncate prompt if longer than this

# -------------------------
# Generation Parameters
# -------------------------
max_new_tokens: 150             # number of tokens to generate
do_sample: true                 # sampling vs greedy decoding

temperature: 0.8                # randomness (1.0 = neutral)
top_p: 0.9                      # nucleus sampling
top_k: 50                       # optional, can be null
repetition_penalty: 1.0         # >1.0 reduces repetition

# -------------------------
# Special Tokens
# -------------------------
pad_token: "eos"                # use EOS as PAD
skip_special_tokens: true

# -------------------------
# Runtime
# -------------------------
use_fp16: false                 # set true if GPU supports fp16
batch_size: 1                   # for batch inference

Writing configs/inference.yaml


In [6]:
%%writefile train/train_lm.py

import math
import yaml
import torch
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)
from datasets import load_dataset
from tqdm import tqdm

# -------------------------
# Load config
# -------------------------
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

torch.manual_seed(cfg["seed"])

device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(cfg["model_name"])
model.to(device)

# -------------------------
# Dataset
# -------------------------
dataset = load_dataset(
    cfg["dataset_name"],
    cfg["dataset_config"]
)

def tokenize_fn(examples):
    """Tokenize a batch of raw text rows and drop rows that produce no tokens.

    Args:
        examples: Batch handed over by ``dataset.map(..., batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        dict: ``{"input_ids": [...]}`` with the non-empty token-id lists, each
        truncated to ``cfg["max_length"]``. No other tokenizer output is kept.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=cfg["max_length"],
        padding=False,
    )

    # Rows that tokenize to nothing are dropped here, before batching.
    non_empty = [seq for seq in encoded["input_ids"] if len(seq) > 0]
    return {"input_ids": non_empty}

tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

train_loader = DataLoader(
    tokenized["train"],
    batch_size=cfg["batch_size"],
    shuffle=True,
    collate_fn=data_collator
)

# -------------------------
# Optimizer
# -------------------------
# ---- Defensive casting ----
cfg["learning_rate"] = float(cfg["learning_rate"])
cfg["weight_decay"] = float(cfg["weight_decay"])

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg["learning_rate"],
    weight_decay=cfg["weight_decay"]
)

num_training_steps = cfg["num_epochs"] * len(train_loader)
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

def sanity_check(batch):
    """Assert that ``batch["input_ids"]`` is a 2-D tensor with a non-empty second axis.

    Raises:
        AssertionError: if either condition does not hold.
    """
    ids = batch["input_ids"]
    assert ids.dim() == 2
    assert ids.size(1) > 0
    
# -------------------------
# Training Loop
# -------------------------
model.train()
progress = tqdm(range(num_training_steps))

# Token-weighted negative log-likelihood accumulators. They are reset at the
# start of every epoch so the figure reported below covers the last full epoch
# instead of a single mini-batch.
total_nll = 0.0
total_tokens = 0

for epoch in range(cfg["num_epochs"]):
    total_nll = 0.0
    total_tokens = 0

    for batch in train_loader:
        sanity_check(batch)
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        loss_value = loss.item()

        # The causal-LM loss is a mean over the predicted (shifted) positions
        # whose label is not the ignore index -100, which the collator uses for
        # padding. Weight each batch by that count so the epoch mean is per token.
        n_tokens = int((batch["labels"][:, 1:] != -100).sum())
        if n_tokens > 0 and math.isfinite(loss_value):
            total_nll += loss_value * n_tokens
            total_tokens += n_tokens

        progress.update(1)
        progress.set_postfix(loss=loss_value)

# -------------------------
# Perplexity
# -------------------------
# Training-set perplexity averaged over the final epoch (the weights were still
# being updated while it was measured, so this is not a held-out evaluation).
if total_tokens > 0:
    ppl = math.exp(total_nll / total_tokens)
    print(f"Final Perplexity: {ppl:.2f}")
else:
    print("Final Perplexity: n/a (no training batches were processed)")

model.save_pretrained(cfg["output_dir"])
tokenizer.save_pretrained(cfg["output_dir"])

Writing train/train_lm.py


In [7]:
!python train/train_lm.py

2026-01-01 08:51:14.657984: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767257474.854291     106 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767257474.909789     106 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767257475.373935     106 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767257475.373979     106 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767257475.373983     106 computation_placer.cc:177] computation placer alr

In [8]:
%%writefile inference/inference.py

import yaml
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------------------------
# Load Inference Config
# -------------------------
with open("configs/inference.yaml") as f:
    cfg = yaml.safe_load(f)

# -------------------------
# Device
# -------------------------
if cfg["device"] == "auto":
    device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    device = cfg["device"]

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(cfg["model_dir"])

# Pad token handling
if cfg["pad_token"] == "eos":
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(cfg["model_dir"])

if cfg.get("use_fp16", False) and device == "cuda":
    model = model.half()

model.to(device)
model.eval()

# -------------------------
# Generation Function
# -------------------------
@torch.no_grad()
def generate(prompt: str):
    """Run the loaded model on ``prompt`` and return the decoded first sequence.

    All decoding settings are read from the module-level ``cfg``.

    Args:
        prompt: Text to condition on; truncated to ``cfg["max_input_length"]`` tokens.

    Returns:
        The text decoded from ``outputs[0]``.
    """
    encoded_prompt = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=cfg["max_input_length"],
    ).to(device)

    gen_args = dict(
        max_new_tokens=cfg["max_new_tokens"],
        do_sample=cfg["do_sample"],
        temperature=cfg["temperature"],
        top_p=cfg["top_p"],
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=cfg["repetition_penalty"],
    )

    # top_k is optional: a missing or null entry leaves the argument out entirely.
    top_k = cfg.get("top_k")
    if top_k is not None:
        gen_args["top_k"] = top_k

    generated = model.generate(**encoded_prompt, **gen_args)
    return tokenizer.decode(
        generated[0],
        skip_special_tokens=cfg["skip_special_tokens"],
    )

# -------------------------
# Example Run
# -------------------------
if __name__ == "__main__":
    prompt = "Once upon a time in a futuristic city,"
    text = generate(prompt)
    print(text)

Writing inference/inference.py


In [9]:
!python inference/inference.py

2026-01-01 09:19:13.208031: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767259153.224757     216 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767259153.229777     216 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767259153.245738     216 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767259153.245765     216 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767259153.245769     216 computation_placer.cc:177] computation placer alr

In [6]:
%%writefile train/train_lm_wandb.py

import math
import yaml
import torch
import wandb
from torch.utils.data import DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)
from datasets import load_dataset
from tqdm import tqdm

# -------------------------
# Load config
# -------------------------
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

wandb.init(
    project="llm-from-scratch",
    config=cfg
)

torch.manual_seed(cfg["seed"])
device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(cfg["model_name"])
model.to(device)

# -------------------------
# Dataset
# -------------------------
dataset = load_dataset(
    cfg["dataset_name"],
    cfg["dataset_config"]
)

def tokenize_fn(examples):
    """Turn a batch of text rows into token-id lists, skipping empty results.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``; only
            ``examples["text"]`` is used.

    Returns:
        dict: ``{"input_ids": [...]}`` containing every non-empty sequence,
        truncated to ``cfg["max_length"]`` tokens and left unpadded.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=cfg["max_length"],
        padding=False,
    )

    # Keep only rows that produced at least one token.
    return {"input_ids": [seq for seq in encoded["input_ids"] if len(seq) > 0]}

tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

train_loader = DataLoader(
    tokenized["train"],
    batch_size=cfg["batch_size"],
    shuffle=True,
    collate_fn=data_collator
)

# -------------------------
# Optimizer & Scheduler
# -------------------------
cfg["learning_rate"] = float(cfg["learning_rate"])
cfg["weight_decay"] = float(cfg["weight_decay"])
cfg["warmup_ratio"] = float(cfg.get("warmup_ratio", 0.0))
cfg["max_grad_norm"] = float(cfg.get("max_grad_norm", 1.0))

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg["learning_rate"],
    weight_decay=cfg["weight_decay"]
)

num_training_steps = cfg["num_epochs"] * len(train_loader)
num_warmup_steps = int(cfg["warmup_ratio"] * num_training_steps)

lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

def sanity_check(batch):
    """Check the shape of ``batch["input_ids"]``: two dimensions, second one non-empty.

    Raises:
        AssertionError: when the tensor is not 2-D or its second axis has length 0.
    """
    input_ids = batch["input_ids"]
    assert input_ids.dim() == 2
    assert input_ids.size(1) > 0
    
# -------------------------
# Training Loop
# -------------------------
model.train()
global_step = 0

progress = tqdm(range(num_training_steps))

for epoch in range(cfg["num_epochs"]):
    for batch in train_loader:
        sanity_check(batch)
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()

        # clip_grad_norm_ rescales the gradients in place and returns their
        # total norm as a tensor.
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(),
            cfg["max_grad_norm"]
        )

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Read once: .item() copies the value to the host.
        loss_value = loss.item()
        # Queried after lr_scheduler.step(), i.e. the rate set for the next update.
        lr = lr_scheduler.get_last_lr()[0]

        wandb.log({
            "train/loss": loss_value,
            "train/perplexity": math.exp(loss_value),
            "train/lr": lr,
            # Logged as a plain float rather than a tensor.
            "train/grad_norm": float(grad_norm),
            "train/step": global_step
        })

        global_step += 1
        progress.update(1)
        progress.set_postfix(loss=loss_value, lr=lr)

# -------------------------
# Save
# -------------------------
model.save_pretrained(cfg["output_dir2"])
tokenizer.save_pretrained(cfg["output_dir2"])

wandb.finish()

# Report the peak GPU memory; as a bare expression in a script the value was
# computed and then thrown away.
if torch.cuda.is_available():
    peak_mb = torch.cuda.max_memory_allocated() / 1024**2
    print(f"Peak GPU memory allocated: {peak_mb:.1f} MB")

Writing train/train_lm_wandb.py


In [None]:
!python train/train_lm_wandb.py

In [7]:
%%writefile train/train_lm_ddp.py

import os
import math
import yaml
import torch
import torch.distributed as dist
import wandb

from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)
from datasets import load_dataset
from tqdm import tqdm

# -------------------------
# DDP setup
# -------------------------
def setup_ddp():
    """Initialise the default process group with the gloo backend.

    Returns:
        tuple: ``(rank, world_size)`` as reported by ``torch.distributed``
        for the calling process.
    """
    dist.init_process_group(backend="gloo")
    return dist.get_rank(), dist.get_world_size()

def cleanup_ddp():
    """Destroy the process group that ``setup_ddp`` initialised."""
    dist.destroy_process_group()

rank, world_size = setup_ddp()

# -------------------------
# Load config
# -------------------------
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

torch.manual_seed(cfg["seed"])

device = torch.device("cpu")  # CPU-safe DDP

# -------------------------
# W&B init (ONLY rank 0)
# -------------------------
if rank == 0:
    wandb.init(
        project="llm-from-scratch",
        name=cfg.get("wandb_run_name", "ddp-cpu-debug"),
        config=cfg
    )

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(cfg["model_name"])
model.to(device)

model = DDP(model)

# -------------------------
# Dataset
# -------------------------
dataset = load_dataset(
    cfg["dataset_name"],
    cfg["dataset_config"]
)

def tokenize_fn(examples):
    """Tokenize ``examples["text"]`` and return only the non-empty token-id lists.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``.

    Returns:
        dict: ``{"input_ids": [...]}``; sequences are truncated to
        ``cfg["max_length"]`` and not padded.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=cfg["max_length"],
        padding=False,
    )

    # Discard rows whose tokenization came back empty.
    kept = list(filter(lambda seq: len(seq) > 0, encoded["input_ids"]))
    return {"input_ids": kept}

tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

sampler = DistributedSampler(
    tokenized["train"],
    num_replicas=world_size,
    rank=rank,
    shuffle=True
)

train_loader = DataLoader(
    tokenized["train"],
    batch_size=cfg["batch_size"],
    sampler=sampler,
    collate_fn=data_collator
)

# -------------------------
# Optimizer & Scheduler
# -------------------------
cfg["learning_rate"] = float(cfg["learning_rate"])
cfg["weight_decay"] = float(cfg["weight_decay"])

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=cfg["learning_rate"],
    weight_decay=cfg["weight_decay"]
)

num_training_steps = cfg["num_epochs"] * len(train_loader)

lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# -------------------------
# Training Loop
# -------------------------
def sanity_check(batch):
    """Fail fast unless ``batch["input_ids"]`` is 2-D with a non-empty second axis.

    Raises:
        AssertionError: if the tensor has another rank or a zero-length second axis.
    """
    token_ids = batch["input_ids"]
    assert token_ids.dim() == 2
    assert token_ids.size(1) > 0

# Per-process training loop: every rank runs it on the batches its
# DistributedSampler hands out; logging and saving are restricted to rank 0.
model.train()
global_step = 0

for epoch in range(cfg["num_epochs"]):
    # Pass the epoch number to the sampler before iterating
    # (the sampler was built with shuffle=True).
    sampler.set_epoch(epoch)

    # The progress bar is disabled on every rank except 0.
    for batch in tqdm(train_loader, disable=(rank != 0)):
        sanity_check(batch)
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Logged from rank 0 only, so "train/loss" is this rank's loss on its
        # own batch; nothing here averages it across processes.
        if rank == 0:
            wandb.log({
                "train/loss": loss.item(),
                "train/lr": lr_scheduler.get_last_lr()[0],
                "epoch": epoch,
                "step": global_step
            })

        global_step += 1

    # `loss` is whatever the last batch of the epoch produced on rank 0,
    # not an epoch average.
    if rank == 0:
        print(f"Epoch {epoch} Loss: {loss.item():.4f}")

# -------------------------
# Save only on rank 0
# -------------------------
if rank == 0:
    # model is the DDP wrapper; .module is the wrapped model that gets saved.
    model.module.save_pretrained(cfg["output_dir3"])
    tokenizer.save_pretrained(cfg["output_dir3"])
    wandb.finish()

# Every rank tears down the process group.
# NOTE(review): this is not reached if an exception is raised above — consider try/finally.
cleanup_ddp()

Writing train/train_lm_ddp.py


In [None]:
!torchrun --nproc_per_node=2 train/train_lm_ddp.py

W1221 21:17:17.453000 101 torch/distributed/run.py:774] 
W1221 21:17:17.453000 101 torch/distributed/run.py:774] *****************************************
W1221 21:17:17.453000 101 torch/distributed/run.py:774] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1221 21:17:17.453000 101 torch/distributed/run.py:774] *****************************************
2025-12-21 21:17:39.102115: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-12-21 21:17:39.102139: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766351859.391025     106 cuda_dnn.cc:8579] Unabl

In [14]:
%%writefile train/train_lm_amp_ckpt.py

import math
import yaml
import torch
import wandb

from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)
from datasets import load_dataset
from tqdm import tqdm

# -------------------------
# Load config
# -------------------------
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

torch.manual_seed(cfg["seed"])
device = torch.device("cuda")

# -------------------------
# Initialize W&B
# -------------------------
wandb.init(
    project="llm-from-scratch",
    config=cfg,
    name="train_amp_ckpt"
)

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(cfg["model_name"])
model.gradient_checkpointing_enable()  # Gradient checkpointing
model.to(device)

# -------------------------
# Dataset
# -------------------------
dataset = load_dataset(
    cfg["dataset_name"],
    cfg["dataset_config"]
)

def tokenize_fn(examples):
    """Tokenize a batch of text rows, keeping only rows with at least one token.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``; only
            its ``"text"`` entry is read.

    Returns:
        dict: ``{"input_ids": [...]}`` of unpadded sequences truncated to
        ``cfg["max_length"]``.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=cfg["max_length"],
        padding=False,
    )

    kept = []
    for seq in encoded["input_ids"]:
        if len(seq) > 0:
            kept.append(seq)
    return {"input_ids": kept}

tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

train_loader = DataLoader(
    tokenized["train"],
    batch_size=cfg["batch_size"],
    shuffle=True,
    collate_fn=data_collator
)

# -------------------------
# Optimizer & Scheduler
# -------------------------
# float(...) is a defensive cast in case the YAML value was loaded as a string.
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=float(cfg["learning_rate"]),
    weight_decay=float(cfg["weight_decay"])
)

# One scheduler step is taken per batch, so the schedule spans epochs * batches.
num_training_steps = cfg["num_epochs"] * len(train_loader)
# Cosine schedule with no warmup; cfg["warmup_ratio"] is not consulted here.
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

scaler = GradScaler()  # AMP scaler

# -------------------------
# Training Loop
# -------------------------
def sanity_check(batch):
    """Verify that ``batch["input_ids"]`` is a 2-D tensor whose second axis is non-empty.

    Raises:
        AssertionError: if either check fails.
    """
    ids = batch["input_ids"]
    assert ids.dim() == 2
    assert ids.size(1) > 0

model.train()
global_step = 0

for epoch in range(cfg["num_epochs"]):
    # Token-weighted loss accumulators so the epoch-level metrics describe the
    # whole epoch rather than its last mini-batch.
    epoch_nll = 0.0
    epoch_tokens = 0

    for batch in tqdm(train_loader):
        sanity_check(batch)
        batch = {k: v.to(device) for k, v in batch.items()}

        with autocast():  # AMP: mixed-precision forward pass
            outputs = model(**batch)
            loss = outputs.loss

        # Backprop with AMP
        scaler.scale(loss).backward()
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # When the scaler finds inf/NaN gradients it skips optimizer.step() and
        # lowers its scale in update(). Only advance the LR schedule for steps
        # that were actually applied.
        if scaler.get_scale() >= scale_before:
            lr_scheduler.step()

        loss_value = loss.item()

        # The causal-LM loss is a mean over predicted (shifted) positions whose
        # label is not the ignore index -100 used by the collator for padding.
        n_tokens = int((batch["labels"][:, 1:] != -100).sum())
        if n_tokens > 0 and math.isfinite(loss_value):
            epoch_nll += loss_value * n_tokens
            epoch_tokens += n_tokens

        # Log metrics to W&B
        wandb.log({
            "train/loss": loss_value,
            "train/perplexity": math.exp(loss_value),
            "train/lr": lr_scheduler.get_last_lr()[0],
            "train/global_step": global_step
        })
        global_step += 1

    # Epoch-level logging
    if epoch_tokens > 0:
        epoch_loss = epoch_nll / epoch_tokens
        epoch_ppl = math.exp(epoch_loss)
        print(f"Epoch {epoch} Loss: {epoch_loss:.4f} | Perplexity: {epoch_ppl:.2f}")
        wandb.log({
            "epoch/loss": epoch_loss,
            "epoch/perplexity": epoch_ppl,
            "epoch": epoch
        })
    else:
        print(f"Epoch {epoch}: no finite training loss was recorded")

# -------------------------
# Save
# -------------------------
model.save_pretrained(cfg["output_dir4"])
tokenizer.save_pretrained(cfg["output_dir4"])

wandb.finish()

# Report the peak GPU memory; as a bare expression in a script the value was
# computed and then thrown away.
peak_mb = torch.cuda.max_memory_allocated() / 1024**2
print(f"Peak GPU memory allocated: {peak_mb:.1f} MB")

Overwriting train/train_lm_amp_ckpt.py


In [15]:
!python train/train_lm_amp_ckpt.py

2026-01-01 09:45:07.146887: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767260707.169873     337 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767260707.176751     337 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767260707.196049     337 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767260707.196078     337 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767260707.196083     337 computation_placer.cc:177] computation placer alr

In [16]:
%%writefile inference/inference_amp_ckpt.py

import yaml
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------------------------
# Load Inference Config
# -------------------------
with open("configs/inference.yaml") as f:
    cfg = yaml.safe_load(f)

# -------------------------
# Device
# -------------------------
if cfg["device"] == "auto":
    device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    device = cfg["device"]

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm_amp_ckpt")

# Pad token handling
if cfg["pad_token"] == "eos":
    tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained("outputs/train_lm_amp_ckpt")

if cfg.get("use_fp16", False) and device == "cuda":
    model = model.half()

model.to(device)
model.eval()

# -------------------------
# Generation Function
# -------------------------
@torch.no_grad()
def generate(prompt: str):
    """Generate text for ``prompt`` using the settings in the module-level ``cfg``.

    Args:
        prompt: Input text; tokenized and truncated to ``cfg["max_input_length"]``.

    Returns:
        The decoded text of ``outputs[0]``.
    """
    model_inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=cfg["max_input_length"],
    ).to(device)

    decode_opts = dict(
        max_new_tokens=cfg["max_new_tokens"],
        do_sample=cfg["do_sample"],
        temperature=cfg["temperature"],
        top_p=cfg["top_p"],
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=cfg["repetition_penalty"],
    )

    # Forward top_k only when the config provides a non-null value.
    configured_top_k = cfg.get("top_k")
    if configured_top_k is not None:
        decode_opts["top_k"] = configured_top_k

    sequences = model.generate(**model_inputs, **decode_opts)
    return tokenizer.decode(
        sequences[0],
        skip_special_tokens=cfg["skip_special_tokens"],
    )

# -------------------------
# Example Run
# -------------------------
if __name__ == "__main__":
    prompt = "Once upon a time in a futuristic city,"
    text = generate(prompt)
    print(text)

Writing inference/inference_amp_ckpt.py


In [17]:
!python inference/inference_amp_ckpt.py

2026-01-01 10:08:35.502188: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767262115.518883     396 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767262115.525291     396 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767262115.541475     396 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767262115.541503     396 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767262115.541507     396 computation_placer.cc:177] computation placer alr

In [22]:
%%writefile train/train_lm_lora.py

import time
import math
import yaml
import torch
import wandb
from torch.utils.data import DataLoader
from torch.cuda.amp import autocast, GradScaler

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    get_scheduler
)

from datasets import load_dataset
from tqdm import tqdm

# PEFT LoRA
from peft import LoraConfig, get_peft_model, TaskType

# -------------------------
# Load config
# -------------------------
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

torch.manual_seed(cfg["seed"])
device = torch.device("cuda")

# -------------------------
# W&B init
# -------------------------
wandb.init(project="llm-from-scratch", config=cfg, name="LoRA-Training")

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(cfg["model_name"])
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(cfg["model_name"])
model.gradient_checkpointing_enable()  # gradient checkpointing
model.to(device)

for name, module in model.named_modules():
    if "attn" in name:
        print(name)

# -------------------------
# Freeze base model & Add LoRA
# -------------------------
for param in model.parameters():
    param.requires_grad = False

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()  # check trainable params

# -------------------------
# Dataset
# -------------------------
dataset = load_dataset(
    cfg["dataset_name"],
    cfg["dataset_config"]
)

def tokenize_fn(examples):
    """Tokenize ``examples["text"]`` and filter out rows that yield no tokens.

    Args:
        examples: Batch supplied by ``dataset.map(..., batched=True)``.

    Returns:
        dict: ``{"input_ids": [...]}`` with unpadded sequences truncated to
        ``cfg["max_length"]``.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        max_length=cfg["max_length"],
        padding=False,
    )

    non_empty = []
    for token_ids in encoded["input_ids"]:
        if len(token_ids) > 0:
            non_empty.append(token_ids)
    return {"input_ids": non_empty}

tokenized = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=["text"]
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

train_loader = DataLoader(
    tokenized["train"],
    batch_size=cfg["batch_size"],
    shuffle=True,
    collate_fn=data_collator
)

# -------------------------
# Optimizer & Scheduler
# -------------------------
# The base model was frozen before LoRA was attached, so only parameters with
# requires_grad=True can ever be updated. Hand just those to the optimizer
# instead of every frozen weight as well.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(
    trainable_params,
    lr=float(cfg["learning_rate"]),      # float(): defensive cast of YAML values
    weight_decay=float(cfg["weight_decay"])
)

num_training_steps = cfg["num_epochs"] * len(train_loader)
# Warmup share is read from the config; 0.1 (the value that used to be
# hard-coded here) remains the fallback when the key is absent.
warmup_ratio = float(cfg.get("warmup_ratio", 0.1))
lr_scheduler = get_scheduler(
    name="cosine",
    optimizer=optimizer,
    num_warmup_steps=int(warmup_ratio * num_training_steps),
    num_training_steps=num_training_steps
)

scaler = GradScaler()  # AMP

# -------------------------
# Training Loop + W&B logging
# -------------------------
def sanity_check(batch):
    """Assert the expected layout of ``batch["input_ids"]``: 2-D, second axis non-empty.

    Raises:
        AssertionError: if the tensor is not 2-D or has a zero-length second axis.
    """
    input_ids = batch["input_ids"]
    assert input_ids.dim() == 2
    assert input_ids.size(1) > 0
    
model.train()
global_step = 0
step_times = []

for epoch in range(cfg["num_epochs"]):
    for batch in tqdm(train_loader):
        sanity_check(batch)
        batch = {k: v.to(device) for k, v in batch.items()}

        # CUDA work is queued asynchronously. Synchronize on both sides of the
        # timed region so the measurement covers the GPU work of this step and
        # not just the time needed to launch it.
        torch.cuda.synchronize()
        start = time.perf_counter()
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

        # When the scaler finds inf/NaN gradients it skips optimizer.step() and
        # lowers its scale in update(). Only advance the LR schedule for steps
        # that were actually applied.
        if scaler.get_scale() >= scale_before:
            lr_scheduler.step()
        torch.cuda.synchronize()
        end = time.perf_counter()

        step_time = end - start
        step_times.append(step_time)

        # Log GPU memory
        max_mem = torch.cuda.max_memory_allocated() / 1024**2  # MB
        lr = lr_scheduler.get_last_lr()[0]
        loss_value = loss.item()

        # W&B logging
        wandb.log({
            "train/loss": loss_value,
            "train/perplexity": math.exp(loss_value),
            "train/lr": lr,
            "train/max_memory_MB": max_mem,
            "train/step_time_s": step_time,
            "train/global_step": global_step
        })

        global_step += 1

# Summarize the collected step timings (previously gathered but never used).
if step_times:
    mean_step_time = sum(step_times) / len(step_times)
    print(f"Mean step time: {mean_step_time:.4f}s over {len(step_times)} steps")

# -------------------------
# Save LoRA adapters only
# -------------------------
model.save_pretrained(cfg["output_dir5"])
tokenizer.save_pretrained(cfg["output_dir5"])

wandb.finish()

Overwriting train/train_lm_lora.py


In [23]:
!python train/train_lm_lora.py

2026-01-01 10:14:41.947568: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767262481.969290     545 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767262481.975624     545 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767262481.992215     545 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767262481.992244     545 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767262481.992247     545 computation_placer.cc:177] computation placer alr

In [24]:
%%writefile inference/inference_lora.py

import yaml
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------------------------
# Load Inference Config
# -------------------------
with open("configs/inference.yaml") as f:
    cfg = yaml.safe_load(f)

# -------------------------
# Device
# -------------------------
if cfg["device"] == "auto":
    device = "cuda" if torch.cuda.is_available() else "cpu"
else:
    device = cfg["device"]

# -------------------------
# Tokenizer & Model
# -------------------------
# The checkpoint path is hard-coded here; cfg["model_dir"] is not used by this script.
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm_lora")

# Pad token handling
if cfg["pad_token"] == "eos":
    tokenizer.pad_token = tokenizer.eos_token

# NOTE(review): train/train_lm_lora.py saves "LoRA adapters only" into this
# directory. Loading it through AutoModelForCausalLM presumably relies on
# transformers' PEFT integration (peft installed, base model resolvable) — confirm.
model = AutoModelForCausalLM.from_pretrained("outputs/train_lm_lora")

# Half precision is applied only when requested in the config and running on CUDA.
if cfg.get("use_fp16", False) and device == "cuda":
    model = model.half()

model.to(device)
model.eval()

# -------------------------
# Generation Function
# -------------------------
@torch.no_grad()
def generate(prompt: str) -> str:
    """Sample a continuation of ``prompt`` using the settings in ``cfg``.

    Args:
        prompt: Text to condition on. It is truncated to
            ``cfg["max_input_length"]`` tokens before generation.

    Returns:
        The decoded output sequence (prompt tokens followed by the generated
        tokens), as returned by ``tokenizer.decode``.
    """
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=cfg["max_input_length"],
    ).to(device)

    generation_kwargs = {
        "max_new_tokens": cfg["max_new_tokens"],
        "do_sample": cfg["do_sample"],
        "temperature": cfg["temperature"],
        "top_p": cfg["top_p"],
        # EOS doubles as the padding id during generation.
        "pad_token_id": tokenizer.eos_token_id,
        "repetition_penalty": cfg["repetition_penalty"],
    }

    # Optional top-k: only forwarded when the config provides a value.
    if cfg.get("top_k") is not None:
        generation_kwargs["top_k"] = cfg["top_k"]

    outputs = model.generate(**inputs, **generation_kwargs)

    # `skip_special_tokens` is treated as optional like the other optional
    # keys (`use_fp16`, `top_k`); a config without it no longer raises KeyError.
    return tokenizer.decode(
        outputs[0],
        skip_special_tokens=cfg.get("skip_special_tokens", True)
    )

# -------------------------
# Example Run
# -------------------------
if __name__ == "__main__":
    # Smoke test: sample one continuation for a fixed prompt and print it.
    print(generate("Once upon a time in a futuristic city,"))

Writing inference/inference_lora.py


In [25]:
!python inference/inference_lora.py

2026-01-01 10:32:10.331035: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767263530.347789     629 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767263530.353894     629 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767263530.370485     629 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767263530.370512     629 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767263530.370516     629 computation_placer.cc:177] computation placer alr

In [26]:
%%writefile data/prompts.json

[
  "Explain transformers in simple terms.",
  "What is gradient descent?",
  "Why is attention better than RNNs?",
  "Explain LoRA fine-tuning.",
  "What is RLHF?"
]

Writing data/prompts.json


In [27]:
%%writefile data/generate_preferences.py

import json
import torch
import random
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------------------------
# Config
# -------------------------
# Model id passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "gpt2"
NUM_SAMPLES = 3        # responses per prompt
MAX_NEW_TOKENS = 100   # upper bound on newly generated tokens per response
# One sampling temperature per candidate; the generation loop below uses the
# first NUM_SAMPLES entries.
TEMPERATURES = [0.7, 1.0, 1.3]
# Destination of the generated (prompt, chosen, rejected) records.
OUTPUT_FILE = "data/preferences.json"

# Prefer the GPU when one is visible, otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------
# Load model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()  # inference only

# -------------------------
# Load prompts
# -------------------------
# The loop below iterates over the parsed JSON and uses each element as a
# prompt string.
with open("data/prompts.json") as f:
    prompts = json.load(f)

# -------------------------
# Generation helper
# -------------------------
@torch.no_grad()
def generate(prompt, temperature):
    """Sample one completion for ``prompt`` and return only the new text.

    Args:
        prompt: Prompt string to condition on.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The generated continuation (prompt excluded), stripped of leading and
        trailing whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=temperature,
        top_p=0.95,
        # Explicit pad id: the tokenizer's pad token is EOS.
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the prompt by token count rather than by character count: the
    # decoded text is not guaranteed to reproduce the prompt string exactly,
    # so slicing `text[len(prompt):]` can cut at the wrong character.
    prompt_len = inputs["input_ids"].shape[1]
    completion_ids = output[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# -------------------------
# Simple heuristic scoring
# -------------------------
def heuristic_score(text):
    """Score a response by its length in whitespace-separated words.

    The score is the word count, capped at 200, so longer responses rank
    higher up to that cap and empty text scores 0.
    """
    word_count = len(text.split())
    if word_count > 200:
        return 200
    return word_count

# -------------------------
# Generate preferences
# -------------------------
preference_data = []

for prompt in tqdm(prompts):
    # One sampled response per temperature, each paired with its heuristic score.
    candidates = []
    for temperature in TEMPERATURES[:NUM_SAMPLES]:
        response = generate(prompt, temperature)
        candidates.append((response, heuristic_score(response)))

    # Highest score first; the sort is stable, so ties keep generation order.
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)
    best_text, worst_text = ranked[0][0], ranked[-1][0]

    record = {
        "prompt": prompt,
        "chosen": best_text,
        "rejected": worst_text
    }
    preference_data.append(record)

# -------------------------
# Save
# -------------------------
with open(OUTPUT_FILE, "w") as out_file:
    json.dump(preference_data, out_file, indent=2)

print(f"Saved {len(preference_data)} preference pairs to {OUTPUT_FILE}")

Writing data/generate_preferences.py


In [28]:
!python data/generate_preferences.py

2026-01-01 10:34:38.418987: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767263678.436588     661 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767263678.441614     661 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767263678.458245     661 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767263678.458275     661 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767263678.458279     661 computation_placer.cc:177] computation placer alr

In [29]:
%%writefile data/generate_preferences_multi.py

import json
import torch
import random
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM

# -------------------------
# Config
# -------------------------
# Model id passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "gpt2"
NUM_CANDIDATES = 6        # total generations per prompt
NUM_REJECTIONS = 3        # how many rejected to keep
MAX_NEW_TOKENS = 120      # upper bound on newly generated tokens per response
# Pool of sampling temperatures; the generation loop below draws one at random
# for every candidate.
TEMPERATURES = [0.7, 0.9, 1.1, 1.3]
TOP_P = 0.95              # nucleus-sampling threshold passed to generate()

# Input prompts and output location of the multi-rejection records.
PROMPTS_FILE = "data/prompts.json"
OUTPUT_FILE = "data/preferences_multi.json"

# Prefer the GPU when one is visible, otherwise run on CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# -------------------------
# Load model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse EOS as the padding token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.to(device)
model.eval()  # inference only

# -------------------------
# Load prompts
# -------------------------
# The loop below iterates over the parsed JSON and uses each element as a
# prompt string.
with open(PROMPTS_FILE) as f:
    prompts = json.load(f)

# -------------------------
# Generation helper
# -------------------------
@torch.no_grad()
def generate(prompt, temperature):
    """Sample one completion for ``prompt`` and return only the new text.

    Args:
        prompt: Prompt string to condition on.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The generated continuation (prompt excluded), stripped of leading and
        trailing whitespace.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        do_sample=True,
        temperature=temperature,
        top_p=TOP_P,
        # Explicit pad id: the tokenizer's pad token is EOS.
        pad_token_id=tokenizer.eos_token_id,
    )
    # Drop the prompt by token count rather than by character count: the
    # decoded text is not guaranteed to reproduce the prompt string exactly,
    # so slicing `text[len(prompt):]` can cut at the wrong character.
    prompt_len = inputs["input_ids"].shape[1]
    completion_ids = outputs[0][prompt_len:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()

# -------------------------
# Heuristic scoring
# -------------------------
def score_response(text):
    """Heuristic quality score based on the whitespace-separated word count.

    - fewer than 10 words: flat penalty of -10
    - 10 to 200 words: the word count itself
    - more than 200 words: ``200 - count`` (negative, growing with the excess)
    """
    word_count = len(text.split())
    if word_count < 10:
        return -10
    return word_count if word_count <= 200 else 200 - word_count

# -------------------------
# Generate multi-rejection preferences
# -------------------------
preference_data = []

for prompt in tqdm(prompts):
    candidates = []

    # Sample NUM_CANDIDATES responses, each at a randomly drawn temperature,
    # and pair every response with its heuristic score.
    for _ in range(NUM_CANDIDATES):
        temp = random.choice(TEMPERATURES)
        response = generate(prompt, temp)
        candidates.append((response, score_response(response)))

    # Sort best -> worst (stable, so ties keep generation order).
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    chosen = candidates[0][0]
    # Take the worst NUM_REJECTIONS from everything after the winner, so the
    # chosen response can never also appear in the rejected list (which would
    # happen with a plain tail slice when NUM_REJECTIONS >= NUM_CANDIDATES).
    rejected = [text for text, _ in candidates[1:][-NUM_REJECTIONS:]]

    preference_data.append({
        "prompt": prompt,
        "chosen": chosen,
        "rejected": rejected
    })

# -------------------------
# Save
# -------------------------
with open(OUTPUT_FILE, "w") as f:
    json.dump(preference_data, f, indent=2)

# ASCII arrow: the previous non-ASCII arrow showed up as garbled bytes.
print(f"Saved {len(preference_data)} multi-rejection examples -> {OUTPUT_FILE}")

Writing data/generate_preferences_multi.py


In [30]:
!python data/generate_preferences_multi.py

2026-01-01 10:35:24.636037: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767263724.654334     685 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767263724.659315     685 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767263724.675689     685 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767263724.675717     685 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767263724.675721     685 computation_placer.cc:177] computation placer alr

In [37]:
%%writefile rlhf/__init__.py



Writing rlhf/__init__.py


In [38]:
%%writefile rlhf/dataset.py

from torch.utils.data import Dataset
"""Dataset for preference-based learning from human feedback.
Preference Dataset Format:
{
  "prompt": "Explain transformers",
  "chosen": "Transformers use attention...",
  "rejected": "Transformers are like RNNs..."
}
"""
class PreferenceDataset(Dataset):
    """Pairwise preference dataset: one chosen and one rejected response per prompt.

    Each record of ``data`` must provide the string fields ``"prompt"``,
    ``"chosen"`` and ``"rejected"``. Items are returned as fixed-length
    (``max_length``) token id / attention mask tensors for both
    ``prompt + chosen`` and ``prompt + rejected``.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        # `padding="max_length"` needs a pad token. Fall back to EOS when the
        # tokenizer has none, which is what the other scripts do by hand.
        if getattr(tokenizer, "pad_token", None) is None:
            tokenizer.pad_token = tokenizer.eos_token

    def encode(self, text):
        """Tokenize ``text``, truncating/padding to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        # NOTE(review): prompt and response are concatenated with no separator;
        # confirm this matches how the responses were produced.
        chosen = self.encode(item["prompt"] + item["chosen"])
        rejected = self.encode(item["prompt"] + item["rejected"])

        # squeeze(0) drops the leading batch dimension added by return_tensors="pt".
        return {
            "chosen_input_ids": chosen["input_ids"].squeeze(0),
            "chosen_attention_mask": chosen["attention_mask"].squeeze(0),
            "rejected_input_ids": rejected["input_ids"].squeeze(0),
            "rejected_attention_mask": rejected["attention_mask"].squeeze(0),
        }

class MultiPreferenceDataset(Dataset):
    """Preference dataset with one chosen and several rejected responses per prompt.

    Each record of ``data`` must provide ``"prompt"`` and ``"chosen"`` strings
    and a ``"rejected"`` list of strings. Items are returned as nested dicts of
    fixed-length (``max_length``) token id / attention mask tensors.
    """

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        # `padding="max_length"` needs a pad token. Fall back to EOS when the
        # tokenizer has none, which is what the other scripts do by hand.
        if getattr(tokenizer, "pad_token", None) is None:
            tokenizer.pad_token = tokenizer.eos_token

    def encode(self, text):
        """Tokenize ``text``, truncating/padding to exactly ``max_length`` tokens."""
        return self.tokenizer(
            text,
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _unbatch(encoded):
        """Drop the leading batch dimension added by ``return_tensors="pt"``."""
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
        }

    def __getitem__(self, idx):
        item = self.data[idx]

        # NOTE(review): prompt and response are concatenated with no separator;
        # confirm this matches how the responses were produced.
        chosen = self.encode(item["prompt"] + item["chosen"])
        rejected = [self.encode(item["prompt"] + r) for r in item["rejected"]]

        return {
            "chosen": self._unbatch(chosen),
            "rejected": [self._unbatch(r) for r in rejected],
        }

Overwriting rlhf/dataset.py


In [43]:
%%writefile rlhf/reward_model.py

import torch
import torch.nn as nn
from transformers import AutoModel

class RewardModel(nn.Module):
    """Scalar reward model: a transformer backbone plus a linear head.

    The reward of a sequence is a linear projection of the backbone's hidden
    state at the last attended (non-padding) token.
    """

    def __init__(self, base_model_name):
        """Load the backbone from ``base_model_name`` and attach a fresh 1-unit head."""
        super().__init__()
        self.backbone = AutoModel.from_pretrained(base_model_name)
        hidden_size = self.backbone.config.hidden_size
        self.reward_head = nn.Linear(hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one reward per sequence, shaped ``[batch, 1]``.

        Args:
            input_ids: Token ids, ``[batch, seq_len]``.
            attention_mask: 1 for real tokens and 0 for padding,
                ``[batch, seq_len]``; ``None`` falls back to the final position.
        """
        outputs = self.backbone(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        hidden = outputs.last_hidden_state

        if attention_mask is None:
            pooled = hidden[:, -1]
        else:
            # Pool at the last attended token rather than at index -1: for
            # inputs padded to a fixed length the final position is padding.
            # Weighting positions by the mask and taking the argmax finds the
            # last real token for both left- and right-padded batches.
            positions = torch.arange(hidden.size(1), device=hidden.device)
            last_real = (attention_mask.to(positions.dtype) * positions).argmax(dim=1)
            batch_idx = torch.arange(hidden.size(0), device=hidden.device)
            pooled = hidden[batch_idx, last_real]

        reward = self.reward_head(pooled)
        return reward

    def save_pretrained(self, save_directory):
        """Save the backbone via its own ``save_pretrained`` and the head as ``reward_head.pt``."""
        self.backbone.save_pretrained(save_directory)
        torch.save(
            self.reward_head.state_dict(),
            f"{save_directory}/reward_head.pt"
        )

Overwriting rlhf/reward_model.py


In [44]:
%%writefile rlhf/train_reward_model.py

import yaml
import torch
import wandb
import torch.nn.functional as F

from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
from tqdm import tqdm

from rlhf.reward_model import RewardModel
from rlhf.dataset import PreferenceDataset

# -------------------------
# Config
# -------------------------
# Shared training config; this script reads `max_length` and `batch_size` from it.
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

# Fall back to CPU instead of failing when no GPU is visible.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
wandb.init(project="llm-from-scratch", name="reward-model")

# -------------------------
# Data
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm")
# The dataset pads every example to max_length, which requires a pad token.
# The other RLHF scripts set EOS as pad; do the same here when none is defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

dataset = load_dataset("json", data_files="data/preferences.json")["train"]

pref_dataset = PreferenceDataset(
    dataset,
    tokenizer,
    cfg["max_length"]
)

loader = DataLoader(pref_dataset, batch_size=cfg["batch_size"], shuffle=True)

# -------------------------
# Model
# -------------------------
# Backbone initialised from the trained LM checkpoint; both the backbone and
# the reward head are optimised (all parameters are handed to AdamW).
model = RewardModel("outputs/train_lm").to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

# -------------------------
# Pairwise Ranking Loss
# -------------------------
def reward_loss(r_chosen, r_rejected):
    """Pairwise ranking loss: mean of ``-log sigmoid(r_chosen - r_rejected)``."""
    margin = r_chosen - r_rejected
    return F.logsigmoid(margin).mean().neg()

# -------------------------
# Training Loop
# -------------------------
model.train()
for epoch in range(3):
    for batch in tqdm(loader):
        # Move every tensor of the collated batch onto the training device.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # Score both halves of each preference pair with the same model.
        chosen_scores = model(batch["chosen_input_ids"], batch["chosen_attention_mask"])
        rejected_scores = model(batch["rejected_input_ids"], batch["rejected_attention_mask"])
        loss = reward_loss(chosen_scores, rejected_scores)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        wandb.log({"reward_loss": loss.item()})

# Persist the backbone and the reward head, then close the W&B run.
model.save_pretrained("models/reward_model")
wandb.finish()

Overwriting rlhf/train_reward_model.py


In [45]:
!python -m rlhf.train_reward_model

[34m[1mwandb[0m: Currently logged in as: [33mrahulkrish28[0m ([33mrahulkrish28-california-state-university-fullerton[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: [38;5;178m‚¢ø[0m Waiting for wandb.init()...
[34m[1mwandb[0m: [38;5;178m‚£ª[0m Waiting for wandb.init()...
[34m[1mwandb[0m: Tracking run with wandb version 0.22.2
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20260101_104531-1q66xlq1[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mreward-model[0m
[34m[1mwandb[0m: ‚≠êÔ∏è View project at [34m[4mhttps://wandb.ai/rahulkrish28-california-state-university-fullerton/llm-from-scratch[0m
[34m[1mwandb[0m: üöÄ View run at [34m[4mhttps://wandb.ai/rahulkrish28-california-state-university-fullerton/llm-from-scratch/runs/1q66xlq1[0m
2026-01-01 10:45:33.502534: E external/local_xla/xla/stream_execut

In [46]:
%%writefile rlhf/utils.py

import torch

def sequence_logprob(model, input_ids, attention_mask):
    """Sum of next-token log-probabilities for each sequence in the batch.

    Logits at position ``t`` are scored against the token at ``t + 1``;
    positions whose target token has ``attention_mask == 0`` contribute nothing.

    Returns:
        Tensor of shape ``[batch]``.
    """
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Shift by one: drop the last logit and the first token.
    next_token_logits = logits[:, :-1]
    targets = input_ids[:, 1:]
    target_mask = attention_mask[:, 1:]

    all_logprobs = torch.log_softmax(next_token_logits, dim=-1)
    picked = torch.gather(all_logprobs, 2, targets.unsqueeze(-1)).squeeze(-1)

    return (picked * target_mask).sum(dim=1)

Writing rlhf/utils.py


In [60]:
%%writefile rlhf/ppo_train.py

import yaml
import torch
import wandb
import torch.nn.functional as F

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

from rlhf.reward_model import RewardModel
from rlhf.dataset import PreferenceDataset
from rlhf.utils import sequence_logprob

# -------------------------
# Config
# -------------------------
# Shared training config; this script reads `max_length` and `batch_size` from it.
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

# Fall back to CPU instead of failing when no GPU is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"
wandb.init(project="llm-from-scratch", name="PPO-LoRA")

# -------------------------
# Models
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm")
tokenizer.pad_token = tokenizer.eos_token

# Trainable policy (wrapped with LoRA adapters below).
policy = AutoModelForCausalLM.from_pretrained("outputs/train_lm")

# Frozen reference copy of the same checkpoint; used for the probability
# ratio and the KL term in the training loop.
ref = AutoModelForCausalLM.from_pretrained("outputs/train_lm").to(device)
ref.eval()
for p in ref.parameters():
    p.requires_grad = False

# Load the reward model backbone from the directory written by
# `RewardModel.save_pretrained` during reward training. Reward training
# optimises the backbone together with the head, so pairing the trained head
# with the original LM backbone would give it features it was not trained on.
reward_model = RewardModel("models/reward_model").to(device)
reward_model.reward_head.load_state_dict(
    torch.load("models/reward_model/reward_head.pt", map_location=device)
)
reward_model.eval()

lora_cfg = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

policy = get_peft_model(policy, lora_cfg).to(device)
optimizer = torch.optim.AdamW(policy.parameters(), lr=1e-5)

# -------------------------
# Data
# -------------------------
dataset = load_dataset("json", data_files="data/preferences.json")["train"]
loader = DataLoader(
    PreferenceDataset(dataset, tokenizer, cfg["max_length"]),
    batch_size=cfg["batch_size"],
    shuffle=True
)

# -------------------------
# PPO Training.        L = min(r_t * A_t, clip(r_t, 1 - eps, 1 + eps) * A_t) - beta * KL(pi || pi_ref)
# -------------------------
clip_eps = 0.2   # clipping range for the probability ratio
kl_beta = 0.1    # weight of the KL-style penalty

for epoch in range(3):
    for batch in tqdm(loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # Sequence log-probability of the chosen response under the policy.
        logp_new = sequence_logprob(
            policy,
            batch["chosen_input_ids"],
            batch["chosen_attention_mask"]
        )

        with torch.no_grad():
            # The frozen reference model plays the role of the "old" policy.
            logp_old = sequence_logprob(
                ref,
                batch["chosen_input_ids"],
                batch["chosen_attention_mask"]
            )
            # Squeeze only the trailing singleton: a bare .squeeze() would
            # also drop the batch dimension when the batch holds one example.
            reward = reward_model(
                batch["chosen_input_ids"],
                batch["chosen_attention_mask"]
            ).squeeze(-1)

        ratio = torch.exp(logp_new - logp_old)
        # Batch-mean baseline.
        # NOTE(review): with a single-example batch the advantage is exactly
        # zero, so only the KL term contributes a gradient for that step.
        advantage = reward - reward.mean()

        unclipped = ratio * advantage
        clipped = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantage

        policy_loss = -torch.min(unclipped, clipped).mean()
        # Mean log-ratio between policy and reference, used as the KL penalty.
        kl = (logp_new - logp_old).mean()

        loss = policy_loss + kl_beta * kl

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        wandb.log({
            "ppo/loss": loss.item(),
            "ppo/reward": reward.mean().item(),
            "ppo/kl": kl.item()
        })

# Saves the PEFT-wrapped policy to disk, then closes the W&B run.
policy.save_pretrained("models/ppo_lora")
wandb.finish()

Overwriting rlhf/ppo_train.py


In [61]:
# `-m` takes a dotted module name, not a file name, so no `.py` suffix.
!python -m rlhf.ppo_train

2026-01-01 11:09:12.989092: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767265753.010385    1120 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767265753.016771    1120 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767265753.033203    1120 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767265753.033231    1120 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767265753.033234    1120 computation_placer.cc:177] computation placer alr

In [62]:
%%writefile inference/ppo_inference.py

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from rlhf.reward_model import RewardModel
from rlhf.utils import sequence_logprob

# Fall back to CPU instead of failing when no GPU is visible.
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm")
tokenizer.pad_token = tokenizer.eos_token

# Untouched reference model used for the "base" side of every comparison.
base = AutoModelForCausalLM.from_pretrained("outputs/train_lm").to(device)
base.eval()

# Load a separate backbone for the policy. Wrapping `base` itself would inject
# the LoRA layers into it in place, so "base" and "policy" would be the same
# adapted network and every comparison below would be meaningless.
policy_backbone = AutoModelForCausalLM.from_pretrained("outputs/train_lm")
policy = PeftModel.from_pretrained(policy_backbone, "models/ppo_lora").to(device)
policy.eval()

@torch.no_grad()
def generate(model, prompt, max_new_tokens=100):
    """Sample a continuation of ``prompt`` from ``model`` and return the decoded output sequence."""
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    # Fixed sampling settings: temperature 0.7 with nucleus sampling at 0.9.
    sampling = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    sequences = model.generate(**encoded, **sampling)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

prompt = "Explain reinforcement learning from human feedback."

# Side-by-side samples from the two models for the same prompt.
print("=== Base Model ===")
print(generate(base, prompt))

print("\n=== PPO-RLHF Model ===")
print(generate(policy, prompt))

# Load the reward model backbone from the directory written by
# `RewardModel.save_pretrained` during reward training, so the trained head is
# paired with the backbone it was trained with rather than the original LM.
reward_model = RewardModel("models/reward_model").to(device)
reward_model.reward_head.load_state_dict(
    torch.load("models/reward_model/reward_head.pt", map_location=device)
)
reward_model.eval()

@torch.no_grad()
def score(text):
    """Return the reward model's scalar score for ``text`` (truncated to 512 tokens)."""
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512
    ).to(device)
    reward = reward_model(encoded["input_ids"], encoded["attention_mask"])
    return reward.item()

# Draw a fresh sample from each model for scoring. Generation is stochastic
# (do_sample=True), so these are not the same texts that were printed above.
base_out = generate(base, prompt)
ppo_out = generate(policy, prompt)

# Scalar reward assigned by the reward model to each full decoded text.
print("Reward(Base):", score(base_out))
print("Reward(PPO): ", score(ppo_out))


@torch.no_grad()
def kl_div(prompt):
    """Log-probability gap between policy and base on the tokens of ``prompt``.

    Returns ``log p_policy(prompt) - log p_base(prompt)`` as a float, i.e. a
    log-ratio on this one text, not an expectation over samples.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    ids, mask = encoded["input_ids"], encoded["attention_mask"]
    policy_logp = sequence_logprob(policy, ids, mask)
    reference_logp = sequence_logprob(base, ids, mask)
    gap = policy_logp - reference_logp
    return gap.item()

print("KL divergence:", kl_div(prompt))


Overwriting inference/ppo_inference.py


In [63]:
!python -m inference.ppo_inference

2026-01-01 11:09:35.145737: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767265775.168221    1173 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767265775.174556    1173 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767265775.190874    1173 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767265775.190903    1173 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767265775.190908    1173 computation_placer.cc:177] computation placer alr

In [64]:
%%writefile rlhf/dpo_train.py

import yaml
import torch
import wandb
import torch.nn.functional as F

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import get_peft_model, LoraConfig, TaskType
from torch.utils.data import DataLoader
from tqdm import tqdm

from rlhf.dataset import PreferenceDataset

# -------------------------
# Config
# -------------------------
# Shared training config; this script reads `max_length` and `batch_size` from it.
with open("configs/train_base.yaml") as config_file:
    cfg = yaml.safe_load(config_file)

device = torch.device("cuda")
wandb.init(project="llm-from-scratch", name="DPO-LoRA")

# -------------------------
# Tokenizer & Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm")
tokenizer.pad_token = tokenizer.eos_token

# Reference model: kept in eval mode and only queried under no_grad below.
base_model = AutoModelForCausalLM.from_pretrained("outputs/train_lm")
base_model.to(device)
base_model.eval()

# Policy: a second copy of the same checkpoint wrapped with LoRA adapters.
policy_model = AutoModelForCausalLM.from_pretrained("outputs/train_lm")
lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["c_attn", "c_proj"],
    bias="none",
)
policy_model = get_peft_model(policy_model, lora_cfg)
policy_model = policy_model.to(device)

# -------------------------
# Data
# -------------------------
dataset = load_dataset("json", data_files="data/preferences.json")["train"]
pref_dataset = PreferenceDataset(dataset, tokenizer, cfg["max_length"])
loader = DataLoader(
    pref_dataset,
    batch_size=cfg["batch_size"],
    shuffle=True,
)

optimizer = torch.optim.AdamW(policy_model.parameters(), lr=1e-5)

# -------------------------
# DPO Loss.      L = -log sigmoid(beta * [(log pi_c - log pi_r) - (log pi_ref_c - log pi_ref_r)])
# -------------------------
def dpo_loss(policy_chosen, policy_rejected,
             ref_chosen, ref_rejected,
             beta=0.1):
    """DPO objective: mean of ``-log sigmoid(beta * (policy margin - reference margin))``.

    Each margin is the chosen-minus-rejected log-probability under the
    corresponding model.
    """
    policy_margin = policy_chosen - policy_rejected
    reference_margin = ref_chosen - ref_rejected
    scaled_gap = beta * (policy_margin - reference_margin)
    return F.logsigmoid(scaled_gap).mean().neg()

# -------------------------
# Training Loop
# -------------------------
# Shared helper (also used by the PPO and GRPO scripts): masked sum of the
# per-token log-probabilities of a sequence, one value per batch row.
from rlhf.utils import sequence_logprob

policy_model.train()
base_model.eval()

for epoch in range(3):
    for batch in tqdm(loader):
        batch = {k: v.to(device) for k, v in batch.items()}

        # DPO compares the log-probability of each whole sequence under the
        # policy and the reference. The previous helper averaged the
        # log-softmax over the vocabulary at the final position, which is
        # unrelated to the likelihood of the actual tokens and also collapsed
        # the batch to a single scalar.
        p_c = sequence_logprob(policy_model,
                               batch["chosen_input_ids"],
                               batch["chosen_attention_mask"])
        p_r = sequence_logprob(policy_model,
                               batch["rejected_input_ids"],
                               batch["rejected_attention_mask"])

        # The reference model is never updated, so skip gradient tracking.
        with torch.no_grad():
            r_c = sequence_logprob(base_model,
                                   batch["chosen_input_ids"],
                                   batch["chosen_attention_mask"])
            r_r = sequence_logprob(base_model,
                                   batch["rejected_input_ids"],
                                   batch["rejected_attention_mask"])

        loss = dpo_loss(p_c, p_r, r_c, r_r)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        wandb.log({"dpo_loss": loss.item()})

# Saves the PEFT-wrapped policy to disk, then closes the W&B run.
policy_model.save_pretrained("models/dpo_lora")
wandb.finish()

Writing rlhf/dpo_train.py


In [65]:
# `-m` takes a dotted module name, not a file name, so no `.py` suffix.
!python -m rlhf.dpo_train

2026-01-01 11:11:45.622735: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767265905.644600    1196 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767265905.651093    1196 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767265905.668494    1196 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767265905.668524    1196 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767265905.668529    1196 computation_placer.cc:177] computation placer alr

In [66]:
%%writefile inference/dpo_inference.py

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from tqdm import tqdm

# Prefer the GPU when one is visible, otherwise run on CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

# -------------------------
# Load tokenizer
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm")
tokenizer.pad_token = tokenizer.eos_token

# -------------------------
# Load base (reference) model
# -------------------------
base_model = AutoModelForCausalLM.from_pretrained("outputs/train_lm")
base_model = base_model.to(device)
base_model.eval()

# -------------------------
# Load DPO policy (base + LoRA)
# -------------------------
# A second copy of the checkpoint is loaded, so the adapter is attached to a
# different model object than `base_model`.
policy_model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained("outputs/train_lm"),
    "models/dpo_lora",
).to(device)
policy_model.eval()

# -------------------------
# Helper: sequence log-prob
# -------------------------
@torch.no_grad()
def sequence_logprob(model, input_ids, attention_mask):
    """Sum of next-token log-probabilities per sequence, computed without gradients.

    Logits at position ``t`` are scored against the token at ``t + 1``;
    positions whose target token has ``attention_mask == 0`` contribute nothing.

    Returns:
        Tensor of shape ``[B]``.
    """
    logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

    # Shift by one: drop the last logit and the first token.
    shifted_logits = logits[:, :-1]
    targets = input_ids[:, 1:]
    target_mask = attention_mask[:, 1:]

    per_position = F.log_softmax(shifted_logits, dim=-1)
    picked = torch.gather(per_position, -1, targets.unsqueeze(-1)).squeeze(-1)

    masked = picked * target_mask
    return masked.sum(dim=1)  # [B]

# -------------------------
# Generation
# -------------------------
def generate(model, prompt, max_new_tokens=100):
    """Sample a continuation of ``prompt`` from ``model`` and return the decoded output sequence."""
    encoded = tokenizer(prompt, return_tensors="pt").to(device)
    # Fixed sampling settings: temperature 0.7 with nucleus sampling at 0.9.
    sampling = dict(
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    sequences = model.generate(**encoded, **sampling)
    return tokenizer.decode(sequences[0], skip_special_tokens=True)

# -------------------------
# Evaluation example
# -------------------------
# Single fixed prompt used for the qualitative comparison below.
prompt = "Explain why reinforcement learning is useful for LLM alignment."

# One stochastic sample from each model (generate() uses do_sample=True).
base_text = generate(base_model, prompt)
policy_text = generate(policy_model, prompt)

print("\n=== PROMPT ===")
print(prompt)

print("\n=== BASE MODEL ===")
print(base_text)

print("\n=== DPO MODEL ===")
print(policy_text)

# -------------------------
# DPO preference score
# -------------------------
# Re-tokenize each model's own decoded output (prompt included, since
# generate() decodes the full output sequence).
inputs_base = tokenizer(base_text, return_tensors="pt", truncation=True).to(device)
inputs_policy = tokenizer(policy_text, return_tensors="pt", truncation=True).to(device)

# Log-probability of the base sample under the base model.
logp_base = sequence_logprob(
    base_model,
    inputs_base["input_ids"],
    inputs_base["attention_mask"]
)

# Log-probability of the policy sample under the policy model.
logp_policy = sequence_logprob(
    policy_model,
    inputs_policy["input_ids"],
    inputs_policy["attention_mask"]
)

# NOTE(review): the two log-probabilities are computed on different texts
# (each model scores its own sample), so the difference also reflects the
# differing length and content of the samples — confirm this is the intended
# metric rather than scoring one shared text under both models.
print("\n=== DPO Preference Score ===")
print(f"Policy ‚àí Base logp: {(logp_policy - logp_base).item():.4f}")

Writing inference/dpo_inference.py


In [67]:
# `-m` takes a dotted module name, not a file name, so no `.py` suffix.
!python -m inference.dpo_inference

2026-01-01 11:13:59.901845: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767266039.923128    1250 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767266039.929631    1250 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767266039.946861    1250 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767266039.946891    1250 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767266039.946896    1250 computation_placer.cc:177] computation placer alr

In [68]:
%%writefile rlhf/grpo_train.py

import yaml
import torch
import wandb

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm import tqdm

from rlhf.dataset import MultiPreferenceDataset
from rlhf.utils import sequence_logprob

# -------------------------
# Config
# -------------------------
with open("configs/train_base.yaml") as f:
    cfg = yaml.safe_load(f)

# Use the GPU when there is one, otherwise fall back to CPU instead of
# failing on the first `.to(device)`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Seed torch before LoRA initialisation and DataLoader shuffling so that
# repeated runs are comparable.
torch.manual_seed(cfg.get("seed", 42))

# Optional gradient-norm cap from the shared config; None disables clipping.
max_grad_norm = cfg.get("max_grad_norm")

wandb.init(project="llm-from-scratch", name="GRPO-LoRA-Multi")

# -------------------------
# Model
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm")
tokenizer.pad_token = tokenizer.eos_token

policy = AutoModelForCausalLM.from_pretrained("outputs/train_lm")
policy = get_peft_model(
    policy,
    LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["c_attn", "c_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type=TaskType.CAUSAL_LM
    )
).to(device)

# Only parameters that require gradients are handed to the optimizer, so it
# does not have to track the frozen ones.
trainable_params = [p for p in policy.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=1e-5)

# -------------------------
# Data
# -------------------------
dataset = load_dataset(
    "json",
    data_files="data/preferences_multi.json"
)["train"]

loader = DataLoader(
    MultiPreferenceDataset(dataset, tokenizer, cfg["max_length"]),
    batch_size=cfg["batch_size"],
    shuffle=True
)

# -------------------------
# GRPO Training (TRUE GROUP VERSION)
# -------------------------
policy.train()

for epoch in range(3):
    for batch in tqdm(loader):

        # ----- chosen -----
        chosen_ids = batch["chosen"]["input_ids"].to(device)
        chosen_mask = batch["chosen"]["attention_mask"].to(device)

        lp_chosen = sequence_logprob(
            policy,
            chosen_ids,
            chosen_mask
        )

        # ----- rejected group -----
        # One log-prob tensor per rejected candidate in the batch.
        rejected_logps = []

        for r in batch["rejected"]:
            r_ids = r["input_ids"].to(device)
            r_mask = r["attention_mask"].to(device)

            lp_r = sequence_logprob(
                policy,
                r_ids,
                r_mask
            )
            rejected_logps.append(lp_r)

        # Shape: [B, K]
        rejected_logps = torch.stack(rejected_logps, dim=1)

        # ----- group normalization -----
        # Column 0 is the chosen response, columns 1..K are the rejected ones.
        all_scores = torch.cat(
            [lp_chosen.unsqueeze(1), rejected_logps],
            dim=1
        )  # [B, 1 + K]

        # Centre every score on the mean of its own group.
        advantages = all_scores - all_scores.mean(dim=1, keepdim=True)

        # ----- GRPO loss -----
        # Maximise the chosen response's advantage; computed once and reused
        # for logging below.
        chosen_advantage = advantages[:, 0].mean()
        loss = -chosen_advantage

        optimizer.zero_grad()
        loss.backward()
        if max_grad_norm is not None:
            # The objective is unbounded, so cap the update size.
            torch.nn.utils.clip_grad_norm_(trainable_params, max_grad_norm)
        optimizer.step()

        wandb.log({
            "grpo/loss": loss.item(),
            "grpo/advantage_chosen": chosen_advantage.item(),
            "grpo/num_rejected": rejected_logps.size(1)
        })

# Directory that inference/grpo_inference.py loads the adapter from.
policy.save_pretrained("models/grpo_lora")
wandb.finish()

Writing rlhf/grpo_train.py


In [69]:
# `-m` takes a dotted module name, not a file name, so there is no `.py` suffix.
!python -m rlhf.grpo_train

2026-01-01 11:15:57.474707: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767266157.495787    1273 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767266157.502296    1273 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767266157.519005    1273 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767266157.519037    1273 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767266157.519042    1273 computation_placer.cc:177] computation placer alr

In [70]:
%%writefile inference/grpo_inference.py

import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
from datasets import load_dataset
from tqdm import tqdm

from rlhf.utils import sequence_logprob

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# -------------------------
# Tokenizer (EOS doubles as the padding token)
# -------------------------
tokenizer = AutoTokenizer.from_pretrained("outputs/train_lm")
tokenizer.pad_token = tokenizer.eos_token

# -------------------------
# Base (SFT) model, used as the comparison baseline
# -------------------------
base_model = AutoModelForCausalLM.from_pretrained("outputs/train_lm")
base_model = base_model.to(device)
base_model.eval()

# -------------------------
# GRPO policy: a second copy of the base weights with the LoRA adapter attached
# -------------------------
policy_model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained("outputs/train_lm"),
    "models/grpo_lora"
).to(device)
policy_model.eval()

# -------------------------
# Helper: generate
# -------------------------
def generate(model, prompt, max_new_tokens=100):
    """Sample one continuation of `prompt` from `model` and return it as text.

    Args:
        model: object exposing a `generate(**inputs, ...)` method.
        prompt: text to condition on.
        max_new_tokens: upper bound on the number of generated tokens.

    Returns:
        The decoded first output sequence with special tokens stripped.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        # Pass the padding id explicitly (the tokenizer's pad token is set to
        # EOS at load time) rather than leaving generate() to fall back to it.
        pad_token_id=tokenizer.pad_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# -------------------------
# GRPO score (group advantage)
# -------------------------
def grpo_advantage(model, chosen_text, rejected_texts):
    """Return the chosen text's score minus the mean score of its group.

    The group is the chosen text plus every entry of `rejected_texts`; each
    member is scored with `sequence_logprob` under `model`.

    Args:
        model: model passed through to `sequence_logprob`.
        chosen_text: the preferred response.
        rejected_texts: iterable of alternative responses. It may be empty,
            in which case the group is the chosen text alone and the result
            is 0.0.

    Returns:
        A Python float: the group-centred score of `chosen_text`.
    """
    def _score(text):
        # Tokenize a single text and score it under `model`.
        # assumes sequence_logprob returns one value per batch row -- TODO confirm
        enc = tokenizer(text, return_tensors="pt", truncation=True).to(device)
        return sequence_logprob(model, enc["input_ids"], enc["attention_mask"])

    # Evaluation only: no autograd graph is needed for any of the forward passes.
    with torch.no_grad():
        scores = [_score(chosen_text)]
        scores.extend(_score(txt) for txt in rejected_texts)

        # Column 0 is the chosen text, columns 1..K the rejected ones.
        all_scores = torch.stack(scores, dim=1)
        advantages = all_scores - all_scores.mean(dim=1, keepdim=True)

    return advantages[:, 0].item()  # chosen advantage

# -------------------------
# Demo: compare base and GRPO completions for one prompt
# -------------------------
prompt = "Explain why reinforcement learning is useful for aligning large language models."

# Base model is sampled first, then the policy (same call order as before).
base_text = generate(base_model, prompt)
policy_text = generate(policy_model, prompt)

print("\n=== PROMPT ===", prompt, sep="\n")
print("\n=== BASE MODEL ===", base_text, sep="\n")
print("\n=== GRPO MODEL ===", policy_text, sep="\n")

# -------------------------
# Score the policy's answer against a small comparison group
# -------------------------
# The group is the base model's answer plus two fixed low-effort replies.
rejected_examples = [base_text, "I don't know.", "Reinforcement learning is not important."]

adv = grpo_advantage(policy_model, chosen_text=policy_text, rejected_texts=rejected_examples)

print("\n=== GRPO ADVANTAGE ===", f"Chosen advantage over group: {adv:.4f}", sep="\n")

Writing inference/grpo_inference.py


In [71]:
# `-m` takes a dotted module name, not a file name, so there is no `.py` suffix.
!python -m inference.grpo_inference

2026-01-01 11:18:20.454421: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1767266300.475946    1328 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1767266300.482599    1328 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1767266300.499325    1328 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767266300.499353    1328 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1767266300.499356    1328 computation_placer.cc:177] computation placer alr