## Step 1: Enviornment Setup

In [1]:
# ============================================================
# Cell 1: Install dependencies
# ============================================================
!pip install -q accelerate peft datasets transformers wandb
print("✓ Packages installed")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m81.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m64.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

## Step 2:  Multi GPU and pipeline configuration setup: 

In [2]:
# ============================================================
# Cell 2: Create Accelerate config file
# ============================================================
config_text = """compute_environment: LOCAL_MACHINE
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: fp16
num_machines: 1
num_processes: 2
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
"""
with open("accelerate_config.yaml", "w") as f:
    f.write(config_text)
print("✓ accelerate_config.yaml created")

✓ accelerate_config.yaml created


## Step 3: Pipeline Setup Script

In [4]:
# ============================================================
# Cell 3: Create train.py
# ============================================================

train_script = r"""#!/usr/bin/env python3

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# ADD THESE LINES TO FIX NCCL:
os.environ['NCCL_TIMEOUT'] = '3600'           # 1 hour timeout (default: 10 min)
os.environ['NCCL_ASYNC_ERROR_HANDLING'] = '1' # Better error handling
os.environ['NCCL_DEBUG'] = 'WARN'             # Less verbose logging
os.environ['NCCL_IB_DISABLE'] = '1'           # Disable InfiniBand (not on Kaggle)
os.environ['NCCL_P2P_DISABLE'] = '1'          # Disable P2P (can help on cloud)

import time
import math
import gc
import warnings
import logging
import random
import torch
import numpy as np
import argparse
from datetime import datetime

try:
    import wandb
    WANDB_AVAILABLE = True
except:
    wandb = None
    WANDB_AVAILABLE = False

from kaggle_secrets import UserSecretsClient
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling,
    Trainer, TrainingArguments, EarlyStoppingCallback, TrainerCallback
)
from peft import LoraConfig, get_peft_model
from accelerate import Accelerator

warnings.filterwarnings("ignore")
logging.getLogger("transformers").setLevel(logging.ERROR)

class TrainingConfig:
    def __init__(self, args):
        self.model_name = "gpt2-medium"
        self.use_wikitext_2 = args.use_wikitext_2
        self.dataset_name = "wikitext-2-raw-v1" if self.use_wikitext_2 else "wikitext-103-raw-v1"
        self.max_length = args.max_length
        self.lora_r = args.lora_r
        self.lora_alpha = args.lora_alpha
        self.lora_dropout = args.lora_dropout
        self.learning_rate = args.learning_rate
        self.num_epochs = args.num_epochs
        self.per_device_batch = args.batch_size
        self.grad_accum = args.grad_accum
        self.weight_decay = args.weight_decay
        self.warmup_ratio = args.warmup_ratio
        self.scheduler_type = args.scheduler
        self.early_stopping_patience = args.patience
        self.output_dir = args.output_dir
        self.save_steps = args.save_steps
        self.eval_steps = args.eval_steps
        self.resume_from_checkpoint = args.resume

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--use_wikitext_2", action="store_true")
    parser.add_argument("--max_length", type=int, default=512)
    parser.add_argument("--lora_r", type=int, default=16)
    parser.add_argument("--lora_alpha", type=int, default=32)
    parser.add_argument("--lora_dropout", type=float, default=0.05)
    parser.add_argument("--learning_rate", type=float, default=3e-4)
    parser.add_argument("--num_epochs", type=int, default=5)
    parser.add_argument("--batch_size", type=int, default=8)
    parser.add_argument("--grad_accum", type=int, default=8)
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--warmup_ratio", type=float, default=0.05)
    parser.add_argument("--scheduler", type=str, default="cosine")
    parser.add_argument("--output_dir", type=str, default=f"./gpt2-finetuned-{datetime.now().strftime('%Y%m%d-%H%M%S')}")
    parser.add_argument("--save_steps", type=int, default=250)
    parser.add_argument("--eval_steps", type=int, default=250)
    parser.add_argument("--patience", type=int, default=3)
    parser.add_argument("--resume", type=str, default=None)
    return parser.parse_args()

def setup_environment():
    accelerator = Accelerator()
    torch.cuda.empty_cache()
    gc.collect()
    
    if accelerator.is_main_process:
        print("="*60)
        print("ENVIRONMENT SETUP")
        print("="*60)
        print(f"CUDA: {torch.cuda.is_available()}")
        print(f"GPUs: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
    
    hf_token = None
    wandb_api_key = None
    
    if accelerator.is_main_process:
        try:
            user_secrets = UserSecretsClient()
            hf_token = user_secrets.get_secret("HF_API_TOKEN")
            wandb_api_key = user_secrets.get_secret("WANDB_API_KEY")
            os.environ["HF_TOKEN"] = hf_token
            os.environ["WANDB_API_KEY"] = wandb_api_key
            
            from huggingface_hub import login
            login(token=hf_token, add_to_git_credential=False)
            
            if WANDB_AVAILABLE and wandb_api_key:
                wandb.login(key=wandb_api_key)
                print("✓ Logged in to HF & W&B")
        except Exception as e:
            print(f"Warning: {e}")
    
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
    return hf_token, wandb_api_key, accelerator

def load_model_and_tokenizer(accelerator, model_name="gpt2-medium"):
    if accelerator.is_main_process:
        print(f"\n{'='*60}\nLOADING MODEL: {model_name}\n{'='*60}")
    
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16, low_cpu_mem_usage=True
    )
    
    if accelerator.is_main_process:
        print(f"Parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    return model, tokenizer

def setup_lora(model, config, accelerator):
    if accelerator.is_main_process:
        print(f"\n{'='*60}\nSETTING UP LORA\n{'='*60}")
    
    model.gradient_checkpointing_enable()
    lora_config = LoraConfig(
        r=config.lora_r, lora_alpha=config.lora_alpha,
        target_modules=["c_attn", "c_proj", "c_fc"],
        lora_dropout=config.lora_dropout, use_rslora=True,
        bias="none", task_type="CAUSAL_LM"
    )
    model = get_peft_model(model, lora_config)
    
    if accelerator.is_main_process:
        trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total = sum(p.numel() for p in model.parameters())
        print(f"Trainable: {trainable:,}/{total:,} ({100*trainable/total:.2f}%)")
        print(f"LoRA: r={config.lora_r}, alpha={config.lora_alpha}, dropout={config.lora_dropout}")
    
    return model

def setup_dataset(tokenizer, config, accelerator):
    if accelerator.is_main_process:
        print(f"\n{'='*60}\nLOADING DATASET: {config.dataset_name}\n{'='*60}")
    
    dataset = load_dataset("wikitext", config.dataset_name)
    dataset = dataset.filter(lambda x: len(x["text"].strip()) > 0)
    
    def tokenize_fn(examples):
        return tokenizer(examples["text"], truncation=True, max_length=config.max_length, padding=False)
    
    num_workers = min(os.cpu_count() or 4, 8)
    train_ds = dataset["train"].map(tokenize_fn, batched=True, num_proc=num_workers, remove_columns=["text"])
    val_ds = dataset["validation"].map(tokenize_fn, batched=True, num_proc=num_workers, remove_columns=["text"])
    test_ds = dataset["test"].map(tokenize_fn, batched=True, num_proc=num_workers, remove_columns=["text"])
    
    if accelerator.is_main_process:
        print(f"Train: {len(train_ds):,} | Val: {len(val_ds):,} | Test: {len(test_ds):,}")
    
    return train_ds, val_ds, test_ds

def get_training_args(config, accelerator):
    world_size = accelerator.num_processes
    effective_batch = config.per_device_batch * config.grad_accum * world_size
    
    if accelerator.is_main_process:
        print(f"\n{'='*60}\nTRAINING CONFIG\n{'='*60}")
        print(f"Per-device batch: {config.per_device_batch}")
        print(f"Gradient accumulation: {config.grad_accum}")
        print(f"GPUs: {world_size}")
        print(f"Effective batch: {effective_batch}")
        print(f"Learning rate: {config.learning_rate}")
    
    return TrainingArguments(
        output_dir=config.output_dir,
        num_train_epochs=config.num_epochs,
        per_device_train_batch_size=config.per_device_batch,
        per_device_eval_batch_size=config.per_device_batch*2,
        gradient_accumulation_steps=config.grad_accum,
        learning_rate=config.learning_rate,
        weight_decay=config.weight_decay,
        max_grad_norm=1.0,
        lr_scheduler_type=config.scheduler_type,
        warmup_ratio=config.warmup_ratio,
        optim="adamw_torch_fused",
        fp16=True,
        fp16_full_eval=True,
        gradient_checkpointing=True,
        ddp_find_unused_parameters=False,
        dataloader_num_workers=4,
        dataloader_pin_memory=True,
        save_strategy="steps",
        save_steps=config.save_steps,
        save_total_limit=3,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        eval_strategy="steps",
        eval_steps=config.eval_steps,
        eval_accumulation_steps=2,
        logging_steps=50,
        report_to="wandb" if (accelerator.is_main_process and WANDB_AVAILABLE) else "none",
        run_name=f"gpt2-lora-r{config.lora_r}",
        push_to_hub=False,
    )

class SmartLoggingCallback(TrainerCallback):
    def __init__(self, accelerator):
        self.accelerator = accelerator
        self.start_time = None
        self.best_ppl = float('inf')
    
    def on_train_begin(self, args, state, control, **kwargs):
        self.start_time = time.time()
        if self.accelerator.is_main_process:
            print(f"\n{'='*60}\n⏱️ TRAINING STARTED\n{'='*60}")
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        if self.accelerator.is_main_process and logs:
            if 'eval_loss' in logs:
                ppl = math.exp(min(logs['eval_loss'], 10))
                if ppl < self.best_ppl:
                    self.best_ppl = ppl
                    print(f"🎯 New Best PPL: {ppl:.2f}")
    
    def on_train_end(self, args, state, control, **kwargs):
        if self.accelerator.is_main_process and self.start_time:
            hours = (time.time() - self.start_time) / 3600
            print(f"\n{'='*60}\n✓ DONE in {hours:.2f}h | Best PPL: {self.best_ppl:.2f}\n{'='*60}")

def main():
    args = parse_args()
    config = TrainingConfig(args)
    hf_token, wandb_key, accelerator = setup_environment()
    
    model, tokenizer = load_model_and_tokenizer(accelerator)
    model = setup_lora(model, config, accelerator)
    train_ds, val_ds, test_ds = setup_dataset(tokenizer, config, accelerator)
    training_args = get_training_args(config, accelerator)
    
    if accelerator.is_main_process and WANDB_AVAILABLE:
        dataset_short = "wt2" if config.use_wikitext_2 else "wt103"
        os.environ["WANDB_PROJECT"] = "gpt2-smart-finetune"
        os.environ["WANDB_NAME"] = f"gpt2-r{config.lora_r}-{dataset_short}"
    
    trainer = Trainer(
        model=model, args=training_args,
        train_dataset=train_ds, eval_dataset=val_ds,
        tokenizer=tokenizer,
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        callbacks=[
            EarlyStoppingCallback(config.early_stopping_patience),
            SmartLoggingCallback(accelerator)
        ],
    )
    
    try:
        trainer.train(resume_from_checkpoint=config.resume_from_checkpoint)
        
        if accelerator.is_main_process:
            test_results = trainer.evaluate(test_ds)
            ppl = math.exp(test_results["eval_loss"])
            print(f"\n{'='*60}\nFINAL: Loss={test_results['eval_loss']:.4f}, PPL={ppl:.2f}\n{'='*60}")
            
            # Save model
            final_dir = f"{config.output_dir}/final_model"
            trainer.model.save_pretrained(final_dir)
            tokenizer.save_pretrained(final_dir)
            print(f"✓ Model saved to: {final_dir}")
            
            # Next steps
            if config.use_wikitext_2:
                if ppl < 20:
                    print("\n✓ Good! Now try WikiText-103 for production")
                else:
                    print("\n💡 Try: --learning_rate 5e-4 or --lora_r 32")
            else:
                if ppl < 18:
                    print("\n🎉 Excellent! Production ready")
                else:
                    print("\n💡 Consider more training or tune hyperparameters")
                    
    except KeyboardInterrupt:
        print("\n⚠️ Interrupted - saving...")
        trainer.save_model(f"{config.output_dir}/interrupted")

if __name__ == "__main__":
    main()
"""

with open("train.py", "w") as f:
    f.write(train_script)

print("✓ train.py created successfully")

✓ train.py created successfully


In [5]:
# Clear occupied memory of gpu:
import torch
torch.cuda.empty_cache()
import gc
gc.collect()

30

## Part 4: Launch Training

In [6]:
# ============================================================
# Cell 4: Verify and Launch Training
# ============================================================

import os
import torch

print("Checking setup...")
print(f"1. train.py exists: {os.path.exists('train.py')}")
print(f"2. config exists: {os.path.exists('accelerate_config.yaml')}")
print(f"3. GPUs available: {torch.cuda.device_count()}")
print("\n✓ Ready to start training!")
print("\n" + "="*60)
print("LAUNCHING: WikiText-2 Baseline (~2 hours)")
print("Expected PPL: 18-21")
print("="*60 + "\n")

# Launch with WikiText-2 (fast experimentation)
!accelerate launch --config_file accelerate_config.yaml train.py \
    --use_wikitext_2 \
    --lora_r 16 \
    --lora_alpha 32 \
    --learning_rate 3e-4 \
    --num_epochs 5 \
    --batch_size 16 \
    --grad_accum 4 \
    --save_steps 250 \
    --eval_steps 250

Checking setup...
1. train.py exists: True
2. config exists: True
3. GPUs available: 2

✓ Ready to start training!

LAUNCHING: WikiText-2 Baseline (~2 hours)
Expected PPL: 18-21

E0000 00:00:1761630709.506758     105 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761630709.506734     106 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761630709.571023     105 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
E0000 00:00:1761630709.571033     106 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
ENVIRONMENT SETUP
CUDA: True
GPUs: 2
GPU 0: Tesla T4
GPU 1: Tesla T4
Note: Environment variable`HF_TOKEN` is set an

In [7]:
import glob
runs = sorted(glob.glob("./gpt2-finetuned-*"))
if runs:
    print(f"Latest: {runs[-1]}")
    print(f"Model at: {runs[-1]}/final_model")

Latest: ./gpt2-finetuned-20251028-055206
Model at: ./gpt2-finetuned-20251028-055206/final_model


In [8]:
import os
import glob

# Find your training run
runs = sorted(glob.glob("./gpt2-finetuned-*"))
if runs:
    latest = runs[-1]
    print(f"✓ Found training run: {latest}")
    
    # Check for saved model
    final_model = f"{latest}/final_model"
    if os.path.exists(final_model):
        print(f"✓ Model saved at: {final_model}")
    else:
        # Check for checkpoints
        checkpoints = glob.glob(f"{latest}/checkpoint-*")
        if checkpoints:
            best_checkpoint = sorted(checkpoints)[-1]
            print(f"✓ Best checkpoint: {best_checkpoint}")

✓ Found training run: ./gpt2-finetuned-20251028-055206
✓ Best checkpoint: ./gpt2-finetuned-20251028-055206/checkpoint-930


In [9]:
import os

# Check latest run
latest_run = sorted([d for d in os.listdir('.') if d.startswith('gpt2-finetuned')])[-1]
print(f"Latest run: {latest_run}")

# Check contents
print("\nContents:")
for item in os.listdir(latest_run):
    print(f"  - {item}")

Latest run: gpt2-finetuned-20251028-055206

Contents:
  - checkpoint-930
  - checkpoint-500
  - checkpoint-750


In [12]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Load model
base_model = AutoModelForCausalLM.from_pretrained(
    "gpt2-medium",
    torch_dtype=torch.float16
).to("cuda")

# Find your best checkpoint
import glob
checkpoints = sorted(glob.glob("./gpt2-finetuned-*/checkpoint-*"))
best_checkpoint = checkpoints[-1] if checkpoints else "./gpt2-finetuned-*/final_model"

print(f"Loading from: {best_checkpoint}")

model = PeftModel.from_pretrained(base_model, best_checkpoint)
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

# Test generation
prompt = "The history of artificial intelligence began"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(
    **inputs,
    max_length=100,
    temperature=0.8,
    top_p=0.9,
    do_sample=True
)

print("\n" + "="*60)
print("GENERATED TEXT:")
print("="*60)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Loading from: ./gpt2-finetuned-20251028-055206/checkpoint-930


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



GENERATED TEXT:
The history of artificial intelligence began in the 1980s , when a team of researchers from the University of California , Berkeley , created a machine learning algorithm , called ImageNet , that could process images and automatically make sense of them . That work led to the development of neural networks — computer networks that are able to process data and make predictions — which are now used in many fields of science and technology . The computer science field is famous for its success in creating computers that can learn , as well as for the


## Pushing the best model on Huggingface Hub

In [11]:
# ============================================================
# Cell 5: Push Model to HuggingFace Hub
# ============================================================

from huggingface_hub import HfApi, create_repo
from transformers import AutoTokenizer
import os
import glob

# 1. Setup
HF_USERNAME = "shiva9876"  # Your HF username
MODEL_NAME = "gpt2-medium-wikitext2-lora"
REPO_ID = f"{HF_USERNAME}/{MODEL_NAME}"

# 2. Find your trained model
runs = sorted(glob.glob("./gpt2-finetuned-*"))
if not runs:
    print("❌ No training runs found!")
else:
    latest_run = runs[-1]
    print(f"✓ Found: {latest_run}")
    
    # Check for model
    final_model = f"{latest_run}/final_model"
    checkpoint_dirs = sorted(glob.glob(f"{latest_run}/checkpoint-*"))
    
    if os.path.exists(final_model):
        model_path = final_model
    elif checkpoint_dirs:
        model_path = checkpoint_dirs[-1]  # Use latest checkpoint
    else:
        print("❌ No model found!")
        model_path = None
    
    if model_path:
        print(f"✓ Model path: {model_path}")
        
        # 3. Create repository on HuggingFace
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        hf_token = user_secrets.get_secret("HF_API_TOKEN")
        
        api = HfApi()
        
        try:
            # Create repo (will skip if exists)
            create_repo(
                repo_id=REPO_ID,
                token=hf_token,
                private=False,  # Set True for private repo
                exist_ok=True
            )
            print(f"✓ Repository created: https://huggingface.co/{REPO_ID}")
        except Exception as e:
            print(f"⚠️ Repo creation: {e}")
        
        # 4. Upload model files
        print("\nUploading model files...")
        api.upload_folder(
            folder_path=model_path,
            repo_id=REPO_ID,
            token=hf_token,
            commit_message="Upload fine-tuned GPT-2 Medium with LoRA"
        )
        
        print(f"\n{'='*60}")
        print("🎉 MODEL UPLOADED SUCCESSFULLY!")
        print(f"{'='*60}")
        print(f"View at: https://huggingface.co/{REPO_ID}")
        print(f"\nLoad anywhere with:")
        print(f"  from peft import PeftModel")
        print(f"  model = PeftModel.from_pretrained(")
        print(f"      'gpt2-medium',")
        print(f"      '{REPO_ID}'")
        print(f"  )")
        print(f"{'='*60}")

✓ Found: ./gpt2-finetuned-20251028-055206
✓ Model path: ./gpt2-finetuned-20251028-055206/checkpoint-930
✓ Repository created: https://huggingface.co/shiva9876/gpt2-medium-wikitext2-lora

Uploading model files...


Uploading...:   0%|          | 0.00/75.7M [00:00<?, ?B/s]


🎉 MODEL UPLOADED SUCCESSFULLY!
View at: https://huggingface.co/shiva9876/gpt2-medium-wikitext2-lora

Load anywhere with:
  from peft import PeftModel
  model = PeftModel.from_pretrained(
      'gpt2-medium',
      'shiva9876/gpt2-medium-wikitext2-lora'
  )


## Adding README file

In [13]:
# ============================================================
# Cell 6: Create Model Card
# ============================================================

model_card = f"""---
language: en
license: mit
tags:
  - text-generation
  - gpt2
  - lora
  - peft
datasets:
  - wikitext-2-raw-v1
metrics:
  - perplexity
model-index:
- name: {MODEL_NAME}
  results:
  - task:
      type: text-generation
    dataset:
      name: WikiText-2
      type: wikitext-2-raw-v1
    metrics:
    - type: perplexity
      value: 20.73
      name: Validation Perplexity
---

# GPT-2 Medium Fine-tuned on WikiText-2 with LoRA

## Model Description

This is a **GPT-2 Medium** (354M parameters) model fine-tuned on the **WikiText-2** dataset using **LoRA (Low-Rank Adaptation)**.

- **Base Model:** gpt2-medium
- **Fine-tuning Method:** LoRA (r=16, alpha=32)
- **Dataset:** WikiText-2 (23,767 training samples)
- **Training Time:** 1.81 hours on 2x Tesla T4 GPUs
- **Final Validation Perplexity:** 20.73

## Training Configuration
```yaml
LoRA Configuration:
  - Rank (r): 16
  - Alpha: 32
  - Dropout: 0.05
  - Target Modules: c_attn, c_proj, c_fc
  - Trainable Parameters: 6.29M (1.74%)

Training Hyperparameters:
  - Learning Rate: 3e-4
  - Scheduler: Cosine
  - Batch Size: 16 per GPU
  - Gradient Accumulation: 4 steps
  - Effective Batch Size: 128
  - Epochs: 5
  - Mixed Precision: FP16
```

## Performance

| Metric | Value |
|--------|-------|
| Validation Perplexity | 20.73 |
| Training Loss | 2.96 |
| Training Time | 1.81h |
| GPU Memory | ~8GB per GPU |

## Usage

### Installation
```bash
pip install transformers peft torch
```

### Loading the Model
```python
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel
import torch

# Load base model
base_model = AutoModelForCausalLM.from_pretrained(
    "gpt2-medium",
    torch_dtype=torch.float16,
    device_map="auto"
)

# Load LoRA weights
model = PeftModel.from_pretrained(
    base_model,
    "{REPO_ID}"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

# Generate text
prompt = "The future of artificial intelligence"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(
    **inputs,
    max_length=100,
    temperature=0.8,
    top_p=0.9,
    do_sample=True
)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```

### Merging LoRA Weights (Optional)

For faster inference, merge LoRA weights with base model:
```python
# Merge and save
merged_model = model.merge_and_unload()
merged_model.save_pretrained("./merged_model")
tokenizer.save_pretrained("./merged_model")

# Load merged model directly
model = AutoModelForCausalLM.from_pretrained("./merged_model")
```

## Training Details

### Dataset

WikiText-2 is a collection of high-quality articles from Wikipedia. The dataset contains:
- Training: 23,767 samples
- Validation: 2,461 samples  
- Test: 2,891 samples

### Training Procedure

1. **Preprocessing:** Tokenization with max length 512
2. **Optimization:** AdamW with fused implementation
3. **Regularization:** Weight decay 0.01, gradient clipping 1.0
4. **Learning Rate Schedule:** Cosine decay with 5% warmup
5. **Early Stopping:** Patience of 3 evaluations

### Training Curves

The model showed smooth convergence:

{'loss': 3.4266, 'grad_norm': 0.375161349773407, 'learning_rate': 0.00029999620250019296, 'epoch': 0.2691790040376851}
{'loss': 3.1084, 'grad_norm': 0.3222482204437256, 'learning_rate': 0.0002974401932923434, 'epoch': 0.5383580080753702}
{'loss': 3.0748, 'grad_norm': 0.3670180141925812, 'learning_rate': 0.0002902305885868527, 'epoch': 0.8075370121130552}
{'loss': 3.0336, 'grad_norm': 0.3346036374568939, 'learning_rate': 0.0002785949422362223, 'epoch': 1.0753701211305517}
{'loss': 3.0089, 'grad_norm': 0.34160029888153076, 'learning_rate': 0.00026290050546456115, 'epoch': 1.3445491251682369}

🎯 New Best PPL: 20.82

{'eval_loss': 3.035845994949341, 'eval_runtime': 45.0469, 'eval_samples_per_second': 54.632, 'eval_steps_per_second': 0.866, 'epoch': 1.3445491251682369}

{'loss': 2.9974, 'grad_norm': 0.32614269852638245, 'learning_rate': 0.00024364263546496366, 'epoch': 1.613728129205922}
{'loss': 2.9802, 'grad_norm': 0.33632907271385193, 'learning_rate': 0.0002214291606397339, 'epoch': 1.8829071332436071}
{'loss': 2.9432, 'grad_norm': 0.33555737137794495, 'learning_rate': 0.00019696119595708603, 'epoch': 2.1507402422611035}
{'loss': 2.9274, 'grad_norm': 0.3619914650917053, 'learning_rate': 0.0001710110139414995, 'epoch': 2.4199192462987886}
{'loss': 2.9213, 'grad_norm': 0.3930057883262634, 'learning_rate': 0.00014439766974675623, 'epoch': 2.6890982503364738}

🎯 New Best PPL: 20.72

{'eval_loss': 3.0311944484710693, 'eval_runtime': 44.8175, 'eval_samples_per_second': 54.912, 'eval_steps_per_second': 0.87, 'epoch': 2.6890982503364738}

{'loss': 2.9254, 'grad_norm': 0.3837960958480835, 'learning_rate': 0.00011796114964767264, 'epoch': 2.958277254374159}
{'loss': 2.8891, 'grad_norm': 0.3657948970794678, 'learning_rate': 9.253585889127956e-05, 'epoch': 3.2261103633916552}
{'loss': 2.8775, 'grad_norm': 0.3783886134624481, 'learning_rate': 6.892428569973754e-05, 'epoch': 3.4952893674293404}
{'loss': 2.8754, 'grad_norm': 0.39271751046180725, 'learning_rate': 4.787167265746529e-05, 'epoch': 3.7644683714670255}
{'loss': 2.8685, 'grad_norm': 0.3550795614719391, 'learning_rate': 3.004249491929961e-05, 'epoch': 4.032301480484522}

{'eval_loss': 3.031768560409546, 'eval_runtime': 44.9244, 'eval_samples_per_second': 54.781, 'eval_steps_per_second': 0.868, 'epoch': 4.032301480484522}

{'loss': 2.8435, 'grad_norm': 0.38916313648223877, 'learning_rate': 1.599948764853796e-05, 'epoch': 4.301480484522207}
{'loss': 2.858, 'grad_norm': 0.3778211176395416, 'learning_rate': 6.185884633398319e-06, 'epoch': 4.570659488559892}
{'loss': 2.858, 'grad_norm': 0.36865052580833435, 'learning_rate': 9.11428677298881e-07, 'epoch': 4.839838492597577}

{'train_runtime': 6566.6305, 'train_samples_per_second': 18.097, 'train_steps_per_second': 0.142, 'train_loss': 2.9640738128333965, 'epoch': 5.0}

## Limitations

- Fine-tuned on English Wikipedia text only
- May not generalize well to other domains
- LoRA adapters add small overhead during inference
- Inherits biases from GPT-2 and Wikipedia

## Intended Use

This model is intended for:
- Text generation experiments
- Research on parameter-efficient fine-tuning
- Educational purposes
- Transfer learning baselines

## Citation

If you use this model, please cite:
```bibtex
@misc{{gpt2-wikitext2-lora,
  author = {{Your Name}},
  title = {{GPT-2 Medium Fine-tuned on WikiText-2 with LoRA}},
  year = {{2025}},
  publisher = {{HuggingFace}},
  url = {{https://huggingface.co/{REPO_ID}}}
}}
```

## Acknowledgments

- Base model: OpenAI's GPT-2
- LoRA: Microsoft Research
- Training: Kaggle Tesla T4 GPUs
- Framework: HuggingFace Transformers, PEFT

## Contact

For questions or issues, please open an issue on the model repository.
"""

# Save model card
with open("README.md", "w") as f:
    f.write(model_card)

print("✓ Model card created: README.md")

# Upload model card
from huggingface_hub import HfApi
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_API_TOKEN")

api = HfApi()
api.upload_file(
    path_or_fileobj="README.md",
    path_in_repo="README.md",
    repo_id=REPO_ID,
    token=hf_token,
    commit_message="Add model card"
)

print(f"✓ Model card uploaded to: https://huggingface.co/{REPO_ID}")

✓ Model card created: README.md
✓ Model card uploaded to: https://huggingface.co/shiva9876/gpt2-medium-wikitext2-lora


### Trail and test after pushing on HUB

In [15]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

base_model = AutoModelForCausalLM.from_pretrained("gpt2-medium")
model = PeftModel.from_pretrained(
    base_model,
    "shiva9876/gpt2-medium-wikitext2-lora"
)

tokenizer = AutoTokenizer.from_pretrained("shiva9876/gpt2-medium-wikitext2-lora")

prompt = "Once upon a time"
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(**inputs, max_length=50)
print(tokenizer.decode(outputs[0]))


adapter_config.json:   0%|          | 0.00/777 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/25.2M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/131 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time , in a land far away , there lived a king , called Odaenathus . He was a man of great wisdom , and he was a great warrior . He was also a man of great strength , and he was


### This below code can be used in Agentic RAG systems where this appoach would be very helpfull for the generation task after training on specific domain of dataset.

In [None]:
# Deploy with FastAPI
from fastapi import FastAPI
from peft import PeftModel

app = FastAPI()

# Load once at startup
model = PeftModel.from_pretrained(
    AutoModelForCausalLM.from_pretrained("gpt2-medium"),
    "shiva_99/gpt2-medium-wikitext2-lora"
)

@app.post("/generate")
def generate(text: str):
    # Use model
    ...