In [None]:
# GPT-OSS-120B Fine-Tuning on 2x H200

## Optimizations

This notebook is optimized for '2x NVIDIA H200 141GB GPUs' (282GB total GPU memory):

### Hardware Configuration
- 2x H200 141GB SXM GPUs
- 282GB total GPU memory
- BF16 mixed precision training
- Flash Attention 2 enabled

### Memory Optimizations
- 4-bit quantization with bitsandbytes
- 135GB per GPU allocation (6GB headroom)
- Minimal CPU offloading
- ZeRO-2 optimizer state sharding

### Training Optimizations
- Batch size: 3 per device
- Gradient accumulation: 4 steps
- **Effective batch size: 24** (2 GPUs × 3 × 4)
- BF16 for H200 (better than FP16)
- Fused AdamW optimizer
- More frequent checkpointing

### Expected Performance
- Model loading: 4-8 minutes
- Training (38 samples, 3 epochs): ~30-45 seconds
- Total time: ~8-12 minutes



In [None]:
!pip install -U "transformers>=4.35.0" accelerate deepspeed bitsandbytes peft sentence-transformers safetensors

In [None]:
import os, subprocess, time, getpass, sys
import torch, gc

user = getpass.getuser()
print("Current user:", user)
print("\nInitial nvidia-smi:")
print(subprocess.check_output(["nvidia-smi", "-L"]).decode())
print(subprocess.check_output(["nvidia-smi"]).decode())

# parse nvidia-smi compute apps to see PIDs using GPU
try:
    out = subprocess.check_output(["nvidia-smi", "--query-compute-apps=pid,process_name,used_memory,username", "--format=csv,noheader,nounits"])
    lines = [l.strip() for l in out.decode().splitlines() if l.strip()]
    if not lines:
        print("\nNo compute apps reported by nvidia-smi.")
    else:
        print("\nProcesses reported by nvidia-smi (pid, name, used_MB, username):")
        for l in lines:
            print(" ", l)
except Exception as e:
    print("Could not query compute apps:", e)

# Attempt to kill python processes owned by current user that show in ps and likely are stale.
# WARNING: this kills user python processes. Only proceed if you started them or are sure.
print("\nListing python processes for this user (you):")
try:
    ps_out = subprocess.check_output(["ps", "-u", user, "-o", "pid,comm,args"]).decode()
    print(ps_out)
except Exception as e:
    print("ps listing failed:", e)

# Identify PIDs from nvidia-smi compute app output and try to kill them if they belong to current user
killed_any = False
try:
    for l in lines:
        parts = [p.strip() for p in l.split(",")]
        if len(parts) >= 4:
            pid_str, name, used_mb, owner = parts[0], parts[1], parts[2], parts[3]
            try:
                pid = int(pid_str)
            except:
                continue
            if owner == user:
                print(f"\nKilling PID {pid} (owner==current user) -> {name} using {used_mb} MB")
                try:
                    subprocess.run(["kill", "-TERM", str(pid)], check=False)
                    time.sleep(1)
                    subprocess.run(["kill", "-9", str(pid)], check=False)
                    killed_any = True
                except Exception as e:
                    print("Kill failed:", e)
            else:
                print(f"\nSkipping PID {pid} (owner {owner} != current user).")
except Exception as e:
    print("Parsing or kill attempt failed:", e)

# Wait and clear python-level cuda caches
print("\nWaiting 2s and clearing torch cache...")
time.sleep(2)
gc.collect()
torch.cuda.empty_cache()
time.sleep(1)

print("\nPost-kill nvidia-smi:")
print(subprocess.check_output(["nvidia-smi"]).decode())

# If still memory used but no processes listed by nvidia-smi, we may need GPU reset or a runtime restart.
# Try a GPU reset (requires sudo) - wrap in try/except since it might fail on managed providers.
try:
    # only attempt reset if there's leftover memory and command is available
    nvsmi = subprocess.check_output(["nvidia-smi", "--query-gpu=memory.total,memory.used --format=csv,noheader,nounits"]).decode().strip()
    gpus = [line.strip() for line in nvsmi.splitlines() if line.strip()]
    leftover = any(int(line.split(",")[1].strip()) > 5000 for line in gpus)  # >5GB used heuristic
    if leftover:
        print("\nAttempting per-GPU reset (requires sudo) to clear driver-held allocations. This may fail on managed hosts.")
        for i in range(len(gpus)):
            try:
                print("Resetting GPU", i)
                subprocess.run(["sudo", "nvidia-smi", "--gpu-reset", "-i", str(i)], check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                print("Reset succeeded for GPU", i)
            except subprocess.CalledProcessError as ex:
                print("GPU reset failed for GPU", i, ":", ex)
except Exception as e:
    print("GPU reset step skipped or failed (likely no sudo):", e)

print("\nFinal nvidia-smi (after cleanup attempts):")
print(subprocess.check_output(["nvidia-smi"]).decode())

if "No running processes found" not in subprocess.check_output(["nvidia-smi"]).decode():
    print("\nIf GPUs still show large allocated memory, the safe action is to restart the notebook/runtime via RunPod UI.")
else:
    print("\nGPUs appear free now. Proceed to model load cell.")


In [None]:
# Quick fix: Clear cache and set custom cache directory
import os
import subprocess
import shutil

# Set cache to /tmp which usually has more space
custom_cache = "/tmp/hf_cache"
os.makedirs(custom_cache, exist_ok=True)
os.environ["HF_HOME"] = custom_cache
os.environ["TRANSFORMERS_CACHE"] = os.path.join(custom_cache, "transformers")
os.environ["HF_DATASETS_CACHE"] = os.path.join(custom_cache, "datasets")

print(f"✅ Set Hugging Face cache to: {custom_cache}")

# Check space
try:
    result = subprocess.check_output(['df', '-h', custom_cache]).decode()
    print(f"Disk space at {custom_cache}:")
    print(result)
except:
    pass

# Clear old cache if needed
old_cache = os.path.expanduser("~/.cache/huggingface")
if os.path.exists(old_cache):
    try:
        # Only remove hub cache, keep other caches
        hub_cache = os.path.join(old_cache, "hub")
        if os.path.exists(hub_cache):
            for item in os.listdir(hub_cache):
                if "gpt-oss" not in item:  # Keep current model cache
                    item_path = os.path.join(hub_cache, item)
                    if os.path.isdir(item_path):
                        shutil.rmtree(item_path)
                        print(f"  Removed: {item}")
    except Exception as e:
        print(f"  Cache cleanup note: {e}")

In [2]:
# Quick fix: Ensure accelerate is properly installed and imported
import sys
import subprocess

try:
    import accelerate
    from accelerate import __version__
    print(f"✅ accelerate is installed: version {__version__}")
except ImportError:
    print("⚠️  accelerate not found. Installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "accelerate"])
    import accelerate
    from accelerate import __version__
    print(f"✅ accelerate installed: version {__version__}")

# Additional verification
try:
    from accelerate.utils import is_accelerate_available
    if is_accelerate_available():
        print("✅ accelerate is available and ready for device_map")
    else:
        print("⚠️  accelerate is installed but not available - may need restart")
except:
    print("⚠️  Could not verify accelerate availability")

✅ accelerate is installed: version 1.11.0
⚠️  Could not verify accelerate availability


In [3]:
pip install hf_transfer

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Cell 3 - Optimized GPU-only model loader for 2x H200
import os, gc, time, torch, sys
from transformers import AutoTokenizer, AutoModelForCausalLM
from collections import defaultdict, OrderedDict
import subprocess

# ===== CRITICAL: Ensure accelerate is installed and imported =====
try:
    import accelerate
    print(f"✅ accelerate version: {accelerate.__version__}")
except ImportError:
    print("⚠️  accelerate not found. Installing...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "accelerate"])
    import accelerate
    print(f"✅ accelerate installed: {accelerate.__version__}")

# Verify accelerate is working
try:
    from accelerate.utils import is_accelerate_available
    if is_accelerate_available():
        print("✅ accelerate is available for device_map support\n")
    else:
        print("⚠️  accelerate is installed but not available - may need restart\n")
except ImportError:
    # Older accelerate versions don't have is_accelerate_available
    print("✅ accelerate is available for device_map support\n")

# Force Transformers to recognise the newly installed accelerate package
try:
    import transformers.utils.import_utils as _transformers_import_utils
    import transformers.modeling_utils as _transformers_modeling_utils
    if hasattr(_transformers_import_utils, "accelerate_available"):
        _transformers_import_utils.accelerate_available = True
    if hasattr(_transformers_import_utils, "is_accelerate_available"):
        _transformers_import_utils.is_accelerate_available = lambda *args, **kwargs: True
    if hasattr(_transformers_modeling_utils, "is_accelerate_available"):
        _transformers_modeling_utils.is_accelerate_available = lambda: True
    print("✅ Patched transformers accelerate availability\n")
except Exception as patch_err:
    print(f"⚠️  Could not patch transformers accelerate check: {patch_err}\n")

MODEL_ID = "openai/gpt-oss-120b"   # <-- set your model id here if different
# Set Hugging Face token - hardcoded for authentication
HF_TOKEN = "Add Your HF token here"
# Also set as environment variable for other libraries
os.environ["HF_TOKEN"] = HF_TOKEN
os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN
OFFLOAD_FOLDER = "./offload"
os.makedirs(OFFLOAD_FOLDER, exist_ok=True)

# ===== FIX: Set custom cache directory with more space =====
# Check available disk space and set cache to location with most space
def get_disk_space(path):
    """Get free disk space in GB for a given path"""
    try:
        result = subprocess.check_output(['df', '-BG', path], stderr=subprocess.STDOUT).decode()
        lines = result.strip().split('\n')
        if len(lines) >= 2:
            parts = lines[1].split()
            if len(parts) >= 4:
                return float(parts[3].replace('G', ''))  # Available space in GB
    except:
        pass
    return 0.0

# Try different potential cache locations
potential_cache_dirs = [
    "/tmp/hf_cache",           # Usually has more space
    "/mnt/hf_cache",            # Common mount point
    "./hf_cache",               # Current directory
    "/workspace/hf_cache",      # Common workspace location
    os.path.expanduser("~/hf_cache"),  # Home directory alternative
]

best_cache_dir = None
best_space = 0.0

print("Checking disk space for cache locations...")
for cache_dir in potential_cache_dirs:
    try:
        os.makedirs(cache_dir, exist_ok=True)
        space = get_disk_space(cache_dir)
        print(f"  {cache_dir}: {space:.2f} GB available")
        if space > best_space:
            best_space = space
            best_cache_dir = cache_dir
    except Exception as e:
        print(f"  {cache_dir}: Cannot use ({e})")

if best_cache_dir and best_space > 10:  # Need at least 10GB
    os.environ["HF_HOME"] = best_cache_dir
    os.environ["TRANSFORMERS_CACHE"] = os.path.join(best_cache_dir, "transformers")
    os.environ["HF_DATASETS_CACHE"] = os.path.join(best_cache_dir, "datasets")
    print(f"\n✅ Using cache directory: {best_cache_dir} ({best_space:.2f} GB available)")
else:
    # Fallback: try to clear old cache and use default location
    print("\n⚠️  No suitable cache location found. Attempting to clear old cache...")
    try:
        default_cache = os.path.expanduser("~/.cache/huggingface")
        if os.path.exists(default_cache):
            # Remove old model caches (keep tokenizers)
            import shutil
            hub_cache = os.path.join(default_cache, "hub")
            if os.path.exists(hub_cache):
                for item in os.listdir(hub_cache):
                    item_path = os.path.join(hub_cache, item)
                    if os.path.isdir(item_path) and "gpt-oss" not in item:
                        try:
                            shutil.rmtree(item_path)
                            print(f"  Removed old cache: {item}")
                        except:
                            pass
        print("  Using default cache location")
    except Exception as e:
        print(f"  Cache cleanup failed: {e}")

# Optimized CUDA memory allocation for 2x H200 (282GB total)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:256,roundup_power2_divisions:8"
print("PYTORCH_CUDA_ALLOC_CONF:", os.environ["PYTORCH_CUDA_ALLOC_CONF"])

# Verify 2 GPUs available
num_gpus = torch.cuda.device_count()
print(f"\ntorch.cuda.device_count(): {num_gpus}")
if num_gpus < 2:
    raise RuntimeError(f"Expected 2 GPUs, but only {num_gpus} available. Please use a 2x H200 pod.")

for i in range(num_gpus):
    prop = torch.cuda.get_device_properties(i)
    print(f" GPU {i}: {prop.name} - {round(prop.total_memory/1024**3,2)} GiB")

# Optimized max_memory for 2x H200: use 135GB per GPU (leave 6GB headroom from 141GB)
max_memory = {}
for i in range(2):  # Explicitly configure for 2 GPUs
    max_memory[i] = "135GiB"
max_memory["cpu"] = "50GiB"  # Minimal CPU offload
print("max_memory (optimized for 2x H200):", max_memory)

token_kwargs = {"use_fast": False, "trust_remote_code": True}
if HF_TOKEN:
    token_kwargs["token"] = HF_TOKEN

print("\nLoading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, **token_kwargs)
print("Tokenizer loaded.")

# Load model - model is already quantized with Mxfp4Config, so don't pass quantization_config
print("\nLoading model (already quantized) across 2 GPUs...")
print("This will take 4-8 minutes. Please wait...\n")

# Load model with automatic device mapping across 2 GPUs
# Decide attention implementation: try FlashAttention2, fall back to eager if unavailable
attn_impl = "eager"  # default that is supported by GptOssForCausalLM
try:
    import flash_attn  # type: ignore
    attn_impl = "flash_attention_2"
    print("✅ flash_attn detected: using FlashAttention 2")
except ImportError:
    print("⚠️ flash_attn not installed: falling back to eager attention (attn_implementation='eager')")

# Model is already quantized, so we don't pass quantization_config
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    max_memory=max_memory,
    offload_folder=OFFLOAD_FOLDER,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
    dtype=torch.bfloat16,  # Use dtype instead of torch_dtype (deprecated)
    attn_implementation=attn_impl,
    token=HF_TOKEN if HF_TOKEN else None,
)

print("✅ Model loaded successfully across 2 GPUs with existing quantization!")

# freeze params (if training with LoRA later)
for p in model.parameters():
    p.requires_grad = False

# show hf_device_map summary
device_map_res = getattr(model, "hf_device_map", None)
if device_map_res:
    placement = defaultdict(list)
    for k,v in device_map_res.items():
        placement[v].append(k)
    print("\nResulting hf_device_map summary:")
    for dev,mods in placement.items():
        print(f"  {dev}: {len(mods)} modules (example: {mods[:6]})")
else:
    print("\nhf_device_map not present (maybe model loaded on CPU or loading deferred).")

# quick test tokenization + forward pass (use tiny input)
try:
    inputs = tokenizer("Hello", return_tensors="pt").to(next(model.parameters()).device)
    with torch.no_grad():
        out = model(**inputs, output_hidden_states=False, return_dict=True)
    print("\nQuick forward succeeded. Example output keys:", list(out.keys()))
except Exception as e_test:
    print("\nQuick forward failed (not fatal). Exception:")
    import traceback; traceback.print_exc()
    print("If forward failed due to OOM, model is loaded but memory insufficient for forward on GPU; consider lower batch/token length or CPU inference.")

✅ accelerate version: 1.11.0
✅ accelerate is available for device_map support

✅ Patched transformers accelerate availability

Checking disk space for cache locations...
  /tmp/hf_cache: 189.00 GB available
  /mnt/hf_cache: 189.00 GB available
  ./hf_cache: 84472.00 GB available
  /workspace/hf_cache: 84472.00 GB available
  /root/hf_cache: 189.00 GB available

✅ Using cache directory: ./hf_cache (84472.00 GB available)
PYTORCH_CUDA_ALLOC_CONF: expandable_segments:True,max_split_size_mb:256,roundup_power2_divisions:8

torch.cuda.device_count(): 2
 GPU 0: NVIDIA H200 - 139.81 GiB
 GPU 1: NVIDIA H200 - 139.81 GiB
max_memory (optimized for 2x H200): {0: '135GiB', 1: '135GiB', 'cpu': '50GiB'}

Loading tokenizer...


MXFP4 quantization requires Triton and kernels installed: CUDA requires Triton >= 3.4.0, XPU requires Triton >= 3.5.0, we will default to dequantizing the model to bf16


Tokenizer loaded.

Loading model (already quantized) across 2 GPUs...
This will take 4-8 minutes. Please wait...

⚠️ flash_attn not installed: falling back to eager attention (attn_implementation='eager')


Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

✅ Model loaded successfully across 2 GPUs with existing quantization!

Resulting hf_device_map summary:
  0: 19 modules (example: ['model.embed_tokens', 'model.layers.0', 'model.layers.1', 'model.layers.2', 'model.layers.3', 'model.layers.4'])
  1: 21 modules (example: ['model.layers.18', 'model.layers.19', 'model.layers.20', 'model.layers.21', 'model.layers.22', 'model.layers.23'])

Quick forward succeeded. Example output keys: ['logits', 'past_key_values']


In [5]:
!pip install -U "transformers>=4.35.0" accelerate deepspeed bitsandbytes peft datasets sentence-transformers safetensors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [6]:
# Cell 2 - attach LoRA (PEFT) to the loaded model (model already present in session)
from peft import LoraConfig, get_peft_model, TaskType
import torch

# LoRA hyperparams — tune r / alpha later
lora_r = 8
lora_alpha = 32
target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]  # common for many causal LMs; adjust if your model uses different names
lora_dropout = 0.05

lora_config = LoraConfig(
    r=lora_r,
    lora_alpha=lora_alpha,
    target_modules=target_modules,
    lora_dropout=lora_dropout,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

print("Attaching LoRA with config:", lora_config)
model = get_peft_model(model, lora_config)

# Print trainable params summary
def print_trainable(model):
    total, trainable = 0, 0
    for n, p in model.named_parameters():
        num = p.numel()
        total += num
        if p.requires_grad:
            trainable += num
    print(f"Total params: {total:,}, trainable params: {trainable:,} ({100 * trainable/total:.4f}%)")
print_trainable(model)

# Set dtype for LoRA params if desired
# For stability in mixed precision training, leave main model frozen and allow LoRA params to be in fp32 or fp16 by Trainer config.


Attaching LoRA with config: LoraConfig(task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, peft_version='0.18.0', base_model_name_or_path=None, revision=None, inference_mode=False, r=8, target_modules={'q_proj', 'k_proj', 'v_proj', 'o_proj'}, exclude_modules=None, lora_alpha=32, lora_dropout=0.05, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', trainable_token_indices=None, loftq_config={}, eva_config=None, corda_config=None, use_dora=False, alora_invocation_tokens=None, use_qalora=False, qalora_group_size=16, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False), lora_bias=False, target_parameters=None, arrow_config=None, ensure_weight_tying=False)
Total params: 116,835,128,640, trainable params: 5,971,968 (0.0051%)


In [7]:
# === Fixed: robust dataset loading + preprocessing + tokenization ===
# Paste this into one notebook cell and run.

from datasets import load_dataset
from transformers import AutoTokenizer
import os, glob, math, json

# ----- User-configurable -----
MODEL_ID = "openai/gpt-oss-120b"    # model id used to create tokenizer
# Accept either a single file, dict, or glob pattern(s).
# Example single file: {"train": "/mnt/data/6_mapped.json"}
# Example glob: {"train": "./data/*.json", "validation": "./val/*.json"}
data_files = {"train": "./data/*.json"}   # <-- change to your files or keep glob
# -----------------------------

# Validate data_files format
if not isinstance(data_files, dict):
    raise TypeError("data_files must be a dict like {'train': './data/*.json', 'validation': './val/*.json'}")

# Expand globs for visibility (not strictly required by load_dataset but helpful for debug)
expanded = {}
for split, pattern in data_files.items():
    if isinstance(pattern, (list, tuple)):
        matches = []
        for p in pattern:
            matches += glob.glob(p)
    else:
        matches = glob.glob(pattern)
    expanded[split] = matches
print("Resolved dataset files (per split):")
for s, files in expanded.items():
    print(f"  {s}: {len(files)} file(s) matched. Example: {files[:3]}")

# Basic sanity check
if not any(expanded.values()):
    raise FileNotFoundError(f"No files matched any pattern in data_files: {data_files}")

# Use the original dict with patterns for load_dataset (HF supports globs)
ds = load_dataset("json", data_files=data_files)
print("Loaded dataset splits:", list(ds.keys()))
print("Number of examples per split:")
for k in ds:
    print(f"  {k}: {len(ds[k])}")

# Reuse tokenizer if available in session, else create new
try:
    tokenizer
    print("Reusing existing tokenizer object.")
except NameError:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True)
    print("Tokenizer instantiated for", MODEL_ID)

# ---------- Prompt builder & label extractor ----------
def build_prompt_from_item(item):
    parts = []
    parts.append("You are an automated grader. Given the student's answers and rubric alignment, output a single numeric score (a number) for the whole submission).")
    # optional top-level context
    if item.get("feedback"):
        parts.append(f"Context/feedback: {item.get('feedback')}")
    # include optional top-level meta
    if item.get("max_score") is not None and item.get("score") is not None:
        parts.append(f"Ground-truth (for reference): {item.get('score')}/{item.get('max_score')}")
    elif item.get("score") is not None:
        parts.append(f"Ground-truth (for reference): {item.get('score')}")
    # include per-question rubric alignment if present
    if "rubric_alignment" in item and isinstance(item["rubric_alignment"], list):
        parts.append("Student per-question answers and alignment:")
        for i, qobj in enumerate(item["rubric_alignment"], start=1):
            q = qobj.get("question", "") or qobj.get("prompt", "")
            a = qobj.get("answer", "") or qobj.get("student_answer", "")
            # normalize whitespace
            q = " ".join(str(q).split())
            a = " ".join(str(a).split())
            # optionally include a per-question reference score if provided
            qscore = qobj.get("score")
            if qscore is not None:
                parts.append(f"Q{i}: {q}\nAnswer: {a}\n(ref_score: {qscore})")
            else:
                parts.append(f"Q{i}: {q}\nAnswer: {a}")
    else:
        # fallback keys
        if item.get("question") and item.get("student_answer"):
            parts.append(f"Question: {item.get('question')}\nStudent answer: {item.get('student_answer')}")
    parts.append("\nInstruction: Respond with a single numeric score (e.g., 3.5) and nothing else.\nScore: ")
    return "\n\n".join(parts)

def extract_numeric_label(item):
    # 1) explicit label
    if "label" in item and item["label"] is not None:
        return float(item["label"])
    # 2) top-level score
    if "score" in item and item["score"] is not None:
        return float(item["score"])
    # 3) normalized_score * max_score
    if "normalized_score" in item and item["normalized_score"] is not None:
        norm = float(item["normalized_score"])
        if item.get("max_score") is not None:
            return float(norm) * float(item.get("max_score"))
        return float(norm) * 100.0
    # 4) sum per-question scores in rubric_alignment
    if "rubric_alignment" in item and isinstance(item["rubric_alignment"], list):
        s = 0.0; found = False
        for q in item["rubric_alignment"]:
            if "score" in q and q["score"] is not None:
                try:
                    s += float(q["score"])
                    found = True
                except Exception:
                    continue
        if found:
            return s
    # 5) try alternative keys that may exist
    for alt in ("final_score", "total_score", "marks", "grade"):
        if alt in item and item[alt] is not None:
            return float(item[alt])
    # If none found, raise clear error with keys summary
    raise ValueError(f"No numeric score found. Available keys: {list(item.keys())}")

# ---------- Tokenization / assembly ----------
max_length = 512         # prompt + target truncation ceiling (tune down if OOM)
target_max_length = 16   # short numeric target length

def preprocess(example):
    # Build prompt and extract numeric label
    prompt = build_prompt_from_item(example)
    label_value = extract_numeric_label(example)   # may raise, which helps debug problematic example
    target_str = str(round(float(label_value), 2))
    # Tokenize prompt and target separately to allow masking
    enc_prompt = tokenizer(prompt, truncation=True, max_length=max_length, add_special_tokens=False)
    enc_target = tokenizer(target_str, truncation=True, max_length=target_max_length, add_special_tokens=False)
    input_ids = enc_prompt["input_ids"] + enc_target["input_ids"] + [tokenizer.eos_token_id]
    labels = [-100] * len(enc_prompt["input_ids"]) + enc_target["input_ids"] + [tokenizer.eos_token_id]
    # If too long, truncate from the left (keep the last max_length tokens)
    if len(input_ids) > max_length:
        input_ids = input_ids[-max_length:]
        labels = labels[-max_length:]
    return {"input_ids": input_ids, "labels": labels, "attention_mask": [1]*len(input_ids)}

# Map dataset (single-process for robust error tracebacks)
print("Tokenizing and preprocessing dataset (single-process)...")
for split in ds:
    print(f"  Preprocessing split: {split} with {len(ds[split])} examples")
    ds[split] = ds[split].map(preprocess, remove_columns=ds[split].column_names, num_proc=1)

# Set pytorch format for Trainer
for split in ds:
    ds[split].set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])

# Sample output summary
print("\nTokenization complete. Splits and sizes:")
for split in ds:
    print(f"  {split}: {len(ds[split])} examples")
print("\nExample tokenized item (first example of train split):")
print(ds["train"][0])


Resolved dataset files (per split):
  train: 46 file(s) matched. Example: ['./data/37_mapped.json', './data/48_mapped.json', './data/36_mapped.json']


Resolving data files:   0%|          | 0/46 [00:00<?, ?it/s]

Loaded dataset splits: ['train']
Number of examples per split:
  train: 46
Reusing existing tokenizer object.
Tokenizing and preprocessing dataset (single-process)...
  Preprocessing split: train with 46 examples

Tokenization complete. Splits and sizes:
  train: 46 examples

Example tokenized item (first example of train split):
{'input_ids': tensor([117864,     13,  29844,    290, 119115,  14716,    326, 191553,  30525,
            11,   4733,    261,   4590,  52077,   8429,    350,     64,   2086,
             8,    395,    290,   6062,  33837,   3991,   2522,     14,  61011,
            25,   1608,  28058,    261,   5985,  10335,    328,   4919,   4169,
         47018,  23753,     11,   1118,    382,    261,   1899,   1604,     13,
          5551,     11,    634,  14716,   4783,  11728,    290,   4857,   4878,
           326,  19536,  35428,   3759,    540,    495,   3211,     13,   2214,
          4934,     11,    481,  20323,  17800,    484, 136747,   8655,    402,
          4169

In [8]:
# === Cell 4 - Build Trainer for in-notebook fine-tuning (2x H200) ===
from datasets import DatasetDict
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
import json, os

assert 'ds' in globals(), "Dataset 'ds' not found. Run the data prep cell first."
assert 'model' in globals(), "Model not loaded. Run the model loader + LoRA cells first."

# Ensure DeepSpeed config still exists for the external script
os.makedirs("configs", exist_ok=True)
ds_config_path = "configs/ds_config.json"
ds_config = {
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "bf16": {"enabled": True},
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": True,
        "allgather_bucket_size": 5e8,
        "overlap_comm": True,
        "reduce_scatter": True,
        "reduce_bucket_size": 5e8,
        "contiguous_gradients": True,
        "round_robin_gradients": True
    },
    "gradient_clipping": 1.0,
    "steps_per_print": 50,
    "wall_clock_breakdown": False,
    "zero_allow_untested_optimizer": True
}
with open(ds_config_path, "w") as f:
    json.dump(ds_config, f, indent=2)

# Guarantee we have a validation split if evaluation is desired
if "validation" not in ds or len(ds["validation"]) == 0:
    print("No validation split found — creating a 10% validation holdout from train.")
    split = ds["train"].train_test_split(test_size=0.10, seed=42)
    ds = DatasetDict({"train": split["train"], "validation": split["test"]})
    print(f"Created validation split: train={len(ds['train'])}, validation={len(ds['validation'])}")
else:
    print(f"Existing dataset splits: train={len(ds['train'])}, validation={len(ds['validation'])}")

has_validation = "validation" in ds and len(ds["validation"]) > 0
output_dir = "./t2l_lora_checkpoints"

training_kwargs = dict(
    output_dir=output_dir,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="steps",
    save_steps=100,
    learning_rate=2e-4,
    warmup_steps=2,
    bf16=True,
    bf16_full_eval=has_validation,
    deepspeed=None,  # Keep in-notebook training single-process (device_map='auto' safe)
    remove_unused_columns=False,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    report_to="none",
    ddp_find_unused_parameters=False,
    gradient_checkpointing=False,
    optim="adamw_torch_fused",
)

if has_validation:
    training_kwargs.update({"eval_strategy": "steps", "eval_steps": 50})
    print("Validation split detected — eval_strategy='steps', eval_steps=50")
else:
    training_kwargs.update({"eval_strategy": "no"})
    print("No validation split available — eval_strategy='no'")

try:
    training_args = TrainingArguments(**training_kwargs)
except TypeError:
    if "eval_strategy" in training_kwargs:
        training_kwargs["evaluation_strategy"] = training_kwargs.pop("eval_strategy")
    training_args = TrainingArguments(**training_kwargs)

print("\nTrainingArguments created. Summary:")
for k in ["output_dir", "per_device_train_batch_size", "gradient_accumulation_steps", "num_train_epochs", "learning_rate", "bf16", "deepspeed"]:
    print(f"  {k}: {getattr(training_args, k, None)}")

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"] if has_validation else None,
    data_collator=data_collator,
)

print("\nTrainer object ready. Run the next cell when you're ready to fine-tune in-notebook.")


No validation split found — creating a 10% validation holdout from train.
Created validation split: train=41, validation=5
Validation split detected — eval_strategy='steps', eval_steps=50

TrainingArguments created. Summary:
  output_dir: ./t2l_lora_checkpoints
  per_device_train_batch_size: 3
  gradient_accumulation_steps: 4
  num_train_epochs: 3
  learning_rate: 0.0002
  bf16: True
  deepspeed: None


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status



Trainer object ready. Run the next cell when you're ready to fine-tune in-notebook.


In [None]:
# === Cell 5 - Run in-notebook LoRA fine-tuning ===
from transformers.trainer_utils import get_last_checkpoint

assert 'trainer' in globals(), "Trainer not built yet. Run the previous cell first."

# Set to a checkpoint path if you want to resume training
resume_from_checkpoint = None
if resume_from_checkpoint is None:
    last_checkpoint = get_last_checkpoint("./t2l_lora_checkpoints")
    if last_checkpoint:
        print(f"Checkpoint detected at {last_checkpoint}. Set `resume_from_checkpoint` above to resume.")

print("Starting trainer.train() ...")
train_result = trainer.train(resume_from_checkpoint=resume_from_checkpoint)
trainer.save_model("./t2l_lora_checkpoints")
trainer.save_state()
print("Training finished. Trainer metrics:", train_result.metrics)

if getattr(trainer, "eval_dataset", None) is not None:
    eval_metrics = trainer.evaluate()
    print("Validation metrics:", eval_metrics)
else:
    print("No validation dataset attached — skipping evaluation.")


Starting trainer.train() ...


Step,Training Loss,Validation Loss


In [None]:
# === Cell 8: write train_script.py and launch with deepspeed across 2 GPUs ===
import textwrap
import os
import subprocess
import sys
import json

TRAIN_SCRIPT = "train_script.py"
DS_CONFIG = "configs/ds_config.json"

# Ensure MODEL_ID and data_files are defined
MODEL_ID = "openai/gpt-oss-120b"  # Always use GPT-OSS-120B, no fallback model

if 'data_files' not in globals():
    print("ERROR: data_files not defined. Please define it before running this cell.")
    print("Example: data_files = {'train': 'train.json', 'validation': 'val.json'}")
    sys.exit(1)

# Check if deepspeed config exists before proceeding
if not os.path.exists(DS_CONFIG):
    print(f"ERROR: DeepSpeed config not found at {DS_CONFIG}")
    print("Please create the config file first.")
    sys.exit(1)

# Convert data_files to a string representation for the script
data_files_str = repr(data_files)

# Build the training script content
script = f'''import os
import math
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType
import torch

MODEL_ID = "{MODEL_ID}"
# Get Hugging Face token from environment variable or use placeholder
HF_TOKEN = os.environ.get("HF_TOKEN", "your_huggingface_token_here")
# Also set as environment variable
os.environ["HF_TOKEN"] = HF_TOKEN
os.environ["HUGGINGFACE_TOKEN"] = HF_TOKEN
data_files = {data_files_str}

# ---- tokenizer ----
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=False, trust_remote_code=True, token=HF_TOKEN)

# Set padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# ---- load json files (glob supported) ----
ds = load_dataset("json", data_files=data_files)

# ---- preprocessing utils (same as in-notebook) ----
def build_prompt_from_item(item):
    parts = []
    parts.append("You are an automated grader. Given the student's answers and rubric alignment, output a single numeric score (a number) for the whole submission.")
    if item.get("feedback"):
        parts.append(f"Context/feedback: {{item.get('feedback')}}")
    if item.get("max_score") is not None and item.get("score") is not None:
        parts.append(f"Ground-truth (for reference): {{item.get('score')}}/{{item.get('max_score')}}")
    elif item.get("score") is not None:
        parts.append(f"Ground-truth (for reference): {{item.get('score')}}")
    if "rubric_alignment" in item and isinstance(item["rubric_alignment"], list):
        parts.append("Student per-question answers and alignment:")
        for i, qobj in enumerate(item["rubric_alignment"], start=1):
            q = qobj.get("question", "") or qobj.get("prompt", "")
            a = qobj.get("answer", "") or qobj.get("student_answer", "")
            q = " ".join(str(q).split())
            a = " ".join(str(a).split())
            qscore = qobj.get("score")
            if qscore is not None:
                parts.append(f"Q{{i}}: {{q}}\\nAnswer: {{a}}\\n(ref_score: {{qscore}})")
            else:
                parts.append(f"Q{{i}}: {{q}}\\nAnswer: {{a}}")
    else:
        if item.get("question") and item.get("student_answer"):
            parts.append(f"Question: {{item.get('question')}}\\nStudent answer: {{item.get('student_answer')}}")
    parts.append("\\nInstruction: Respond with a single numeric score (e.g., 3.5) and nothing else.\\nScore: ")
    return "\\n\\n".join(parts)

def extract_numeric_label(item):
    if "label" in item and item["label"] is not None:
        return float(item["label"])
    if "score" in item and item["score"] is not None:
        return float(item["score"])
    if "normalized_score" in item and item["normalized_score"] is not None:
        norm = float(item["normalized_score"])
        if item.get("max_score") is not None:
            return float(norm) * float(item.get("max_score"))
        return float(norm) * 100.0
    if "rubric_alignment" in item and isinstance(item["rubric_alignment"], list):
        s = 0.0
        found = False
        for q in item["rubric_alignment"]:
            if "score" in q and q["score"] is not None:
                try:
                    s += float(q["score"])
                    found = True
                except:
                    pass
        if found:
            return s
    for alt in ("final_score", "total_score", "marks", "grade"):
        if alt in item and item[alt] is not None:
            return float(item[alt])
    raise ValueError("No numeric score found for item; keys: " + str(list(item.keys())))

# ---- tokenization/preprocessing ----
max_length = 512
target_max_length = 16

def preprocess(example):
    prompt = build_prompt_from_item(example)
    label_value = extract_numeric_label(example)
    target_str = str(round(float(label_value), 2))
    enc_prompt = tokenizer(prompt, truncation=True, max_length=max_length, add_special_tokens=False)
    enc_target = tokenizer(target_str, truncation=True, max_length=target_max_length, add_special_tokens=False)
    input_ids = enc_prompt["input_ids"] + enc_target["input_ids"] + [tokenizer.eos_token_id]
    labels = [-100]*len(enc_prompt["input_ids"]) + enc_target["input_ids"] + [tokenizer.eos_token_id]
    if len(input_ids) > max_length:
        input_ids = input_ids[-max_length:]
        labels = labels[-max_length:]
    return {{
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1]*len(input_ids)
    }}

print("Dataset splits before mapping:", ds)
for split in ds:
    print(f"Processing split: {{split}}, size: {{len(ds[split])}}")
    ds[split] = ds[split].map(preprocess, remove_columns=ds[split].column_names, num_proc=1)
    ds[split].set_format(type="torch", columns=["input_ids", "labels", "attention_mask"])

# ---- load base model ----
print("Loading base model (trust_remote_code=True)...")
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
    token=HF_TOKEN
)

# ---- attach LoRA adapters via PEFT ----
print("Attaching LoRA adapters...")
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)
model = get_peft_model(model, lora_config)

# ---- TrainingArguments & Trainer (optimized for 2x H200) ----
training_args = TrainingArguments(
    output_dir="./t2l_lora_checkpoints",
    per_device_train_batch_size=3,  # Optimized for 2x H200
    per_device_eval_batch_size=3,
    gradient_accumulation_steps=4,  # 2 GPUs × 3 × 4 = 24 effective
    num_train_epochs=3,
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    save_steps=100,
    learning_rate=2e-4,
    warmup_steps=2,
    bf16=True,  # BF16 for H100
    bf16_full_eval=True,
    deepspeed="{DS_CONFIG}",
    remove_unused_columns=False,
    dataloader_num_workers=4,
    dataloader_pin_memory=True,
    report_to="none",
    optim="adamw_torch_fused",
)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Check if validation split exists
eval_dataset = ds.get('validation') if 'validation' in ds else None
if eval_dataset is None:
    print("Warning: No validation split found. Training without evaluation.")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds['train'],
    eval_dataset=eval_dataset,
    data_collator=data_collator
)

print("Starting trainer.train() (this will be executed under DeepSpeed by the launcher).")
trainer.train()
trainer.save_model("./t2l_lora_saved_by_deepspeed")
print("Training complete. Saved adapters to ./t2l_lora_saved_by_deepspeed")
'''

# Write the script file
os.makedirs(os.path.dirname(TRAIN_SCRIPT) if os.path.dirname(TRAIN_SCRIPT) else ".", exist_ok=True)
with open(TRAIN_SCRIPT, "w") as f:
    f.write(script)
print(f"✓ Wrote {TRAIN_SCRIPT}")

# Launch deepspeed with 2 GPUs
cmd = ["deepspeed", "--num_gpus=2", TRAIN_SCRIPT]
print(f"Launching training with command: {' '.join(cmd)}")
print(f"Using 2x H200 GPUs for optimized training")
print("-" * 80)

# Stream output
proc = subprocess.Popen(
    cmd,
    stdout=subprocess.PIPE,
    stderr=subprocess.STDOUT,
    bufsize=1,
    universal_newlines=True
)

try:
    for line in proc.stdout:
        print(line, end="")
    proc.wait()
    print("-" * 80)
    if proc.returncode == 0:
        print(f"✓ Training completed successfully (exit code: {proc.returncode})")
    else:
        print(f"✗ Training failed with exit code: {proc.returncode}")
except KeyboardInterrupt:
    print("\n⚠ User interrupted; terminating deepspeed process...")
    proc.terminate()
    proc.wait()
    print(f"Terminated. Return code: {proc.returncode}")
except Exception as e:
    print(f"✗ Error during training: {e}")
    proc.terminate()
    proc.wait()