In [None]:
import json
from datasets import Dataset

# Read the raw climb records from the JSON export.
with open("all_climbs.json", "r") as f:
    climbs = json.load(f)


def _climb_to_example(climb):
    """Convert one climb record into a prompt/completion training pair.

    Reads the "grade", "angle" and "holds" fields of the record. When "holds"
    is a list, only its first element is serialized; any other value is
    serialized as-is.
    """
    grade = climb["grade"]
    angle = climb["angle"]
    holds = climb["holds"]
    if isinstance(holds, list):
        holds = holds[0]
    return {
        "prompt": f"Create a {grade.upper()} boulder problem at {angle} degrees.",
        "completion": json.dumps(holds, indent=2),
    }


data = [_climb_to_example(climb) for climb in climbs]

# Save as a Hugging Face Dataset
dataset = Dataset.from_list(data)
dataset.save_to_disk("climb_dataset")
print(f"✅ Saved dataset with {len(data)} samples.")

In [None]:
# Authenticate this session with the Hugging Face Hub.
# No token is passed in code, so nothing secret is stored in the notebook
# source; login() is expected to ask for the token interactively.
from huggingface_hub import login
login()

In [None]:
# INSTALL DEPENDENCIES
!pip install --upgrade pip
!pip install transformers datasets accelerate bitsandbytes
!pip install git+https://github.com/huggingface/trl.git

# Restart the runtime after installing if Colab asks
print("✅ All dependencies installed. Restart the runtime if prompted.")

# IMPORT LIBRARIES
import os
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer

# VERIFY GPU
# Purely informational: name the first CUDA device, or warn when none is visible.
if not torch.cuda.is_available():
    print("No GPU detected. Training will be very slow!")
else:
    print(f"GPU is available: {torch.cuda.get_device_name(0)}")

# OPTIONAL: Check versions
import transformers
import bitsandbytes
import trl

# A single print call emitting one line per library.
print(
    f"Transformers: {transformers.__version__}",
    f"BitsAndBytes: {bitsandbytes.__version__}",
    f"TRL: {trl.__version__}",
    sep="\n",
)

In [None]:
# UPLOAD YOUR JSONL DATASET
# `files` is Colab's upload helper. Import it here so this cell does not depend
# on a name that no other cell in the notebook defines.
from google.colab import files

print("🔹 Upload your 'all_climbs.jsonl' file")
uploaded = files.upload()  # Select the file from your computer

# `uploaded` is keyed by filename; an empty mapping means nothing was selected.
# Fail with a clear message instead of an opaque IndexError further down.
if not uploaded:
    raise RuntimeError("No file was uploaded; re-run this cell and select your JSONL dataset.")

INPUT_JSONL = next(iter(uploaded))  # get uploaded filename (first one if several)
print(f"Uploaded file: {INPUT_JSONL}")

# CONFIG
# Tunable settings for the fine-tuning run; each one is consumed further down
# in this cell.
MODEL_NAME = "google/gemma-2b-it"  # Ensure you have access
OUTPUT_DIR = "./gemma-climb-stream"  # Trainer output_dir; final model + tokenizer are saved here
MAX_LENGTH = 256  # max_length handed to the tokenizer (truncation and padding target)
EPOCHS = 2  # num_train_epochs
BATCH_SIZE = 1  # per_device_train_batch_size
ACCUM_STEPS = 4  # gradient_accumulation_steps
LR = 2e-4  # learning_rate
FP16 = True  # fp16 flag of TrainingArguments
OFFLOAD_DIR = "offload"  # offload_folder passed to from_pretrained

# LOAD STREAMING DATASET
dataset = load_dataset("json", data_files=INPUT_JSONL, split="train", streaming=True)

# Count the records with a lazy pass instead of len(list(dataset)):
# materialising every record in memory just to take its length defeats the
# purpose of streaming. The printed message is unchanged.
num_samples = sum(1 for _ in dataset)
print(f"Streaming dataset loaded with {num_samples} samples (will stream during training)")

# LOAD TOKENIZER & MODEL
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Quantization options, handed to from_pretrained as a plain dict:
# 4-bit loading, "nf4" quant type, double quantization, float16 compute dtype.
quantization_settings = {
    "load_in_4bit": True,
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": "float16",
}

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    offload_folder=OFFLOAD_DIR,
    quantization_config=quantization_settings,
)

# TOKENIZATION FUNCTION (on-the-fly)
def tokenize_fn(example):
    """Tokenize the ``"text"`` field of a record or batch of records.

    The module-level ``tokenizer`` is asked to truncate and to pad every
    sequence to ``MAX_LENGTH``.

    Args:
        example: Mapping with a ``"text"`` entry. When used through
            ``dataset.map(..., batched=True)`` this entry holds a batch of texts.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }
    return tokenizer(example["text"], **tokenizer_options)

# TRAINING ARGUMENTS
# `dataset` is streamed and therefore has no length, so the Trainer cannot
# translate num_train_epochs into a number of optimizer steps and refuses to
# start unless max_steps is given. Derive it from one lazy counting pass:
# (optimizer steps needed for one pass over the data) x EPOCHS.
# Assumes a single training process — TODO confirm if run on multiple devices.
train_sample_count = sum(1 for _ in dataset)
if train_sample_count == 0:
    raise ValueError("The training dataset is empty; nothing to fine-tune on.")
samples_per_step = BATCH_SIZE * ACCUM_STEPS
steps_per_epoch = -(-train_sample_count // samples_per_step)  # ceiling division
MAX_STEPS = steps_per_epoch * EPOCHS

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=ACCUM_STEPS,
    learning_rate=LR,
    num_train_epochs=EPOCHS,  # kept for reference; max_steps takes precedence
    max_steps=MAX_STEPS,
    fp16=FP16,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
)

# SFT TRAINER
# Attach tokenization to the dataset; batched=True hands tokenize_fn a batch
# of records per call.
tokenized_stream = dataset.map(tokenize_fn, batched=True)

# NOTE(review): `model` is loaded with 4-bit quantization and no adapter config
# is supplied (peft_config=None). Transformers normally refuses to fine-tune a
# purely quantized model without trainable adapters — confirm, and supply a
# PEFT/LoRA config if so.
trainer = SFTTrainer(
    args=training_args,
    model=model,
    peft_config=None,
    train_dataset=tokenized_stream,
)

# TRAIN
# Run the fine-tuning loop configured by `training_args`.
trainer.train()
# Persist both the trained model and the tokenizer into OUTPUT_DIR.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Fine-tuning complete! Model saved to {OUTPUT_DIR}")
