In [None]:
# ============================================================
# Complete StarCoder2-3B Training with MLflow + Retraining
# Copy this ENTIRE script into ONE cell in Colab
# ============================================================

# Clean start
import os, shutil
# Wipe MLflow directories left over from an earlier session so this run starts empty.
for stale_dir in ("./mlruns", "./mlartifacts"):
    if not os.path.exists(stale_dir):
        continue
    shutil.rmtree(stale_dir)
print("✓ Cleaned directories\n")

# Install packages
import subprocess, sys
packages = ["transformers", "datasets", "peft", "accelerate", "bitsandbytes", "mlflow"]
print("Installing packages...")
failed_packages = []
for pkg in packages:
    # pip stays quiet on success, but stderr is captured so that a failed
    # install is reported here instead of surfacing later as an ImportError.
    proc = subprocess.run(
        [sys.executable, "-m", "pip", "install", "-q", pkg],
        stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, text=True,
    )
    if proc.returncode != 0:
        failed_packages.append(pkg)
        print(f"⚠ Failed to install {pkg}:\n{proc.stderr.strip()}")

# Installation remains best-effort (a package may already be available), but
# success is only claimed when every pip invocation actually succeeded.
if failed_packages:
    print(f"⚠ Packages not installed: {', '.join(failed_packages)}\n")
else:
    print("✓ Packages installed\n")

# Import everything
import torch
import mlflow
from datetime import datetime
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer,
    DataCollatorForLanguageModeling, BitsAndBytesConfig, TrainerCallback
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, PeftModel

print("✓ Imports complete\n")

# Setup MLflow
from pathlib import Path

# The tracking store must be addressed with an absolute file URI. A URI of the
# form "file://./mlruns" is parsed with "." as the host and "/mlruns" as the
# path, which points at the filesystem root instead of the ./mlruns directory
# that this script cleans on start-up and advertises in its final
# "mlflow ui" hint.
tracking_dir = Path("./mlruns").resolve()
tracking_dir.mkdir(parents=True, exist_ok=True)
mlflow.set_tracking_uri(tracking_dir.as_uri())

# One experiment per script execution, named with a second-resolution timestamp.
# set_experiment() creates the experiment when it does not exist yet, so no
# separate create_experiment() call is needed.
exp_name = f"starcoder-{datetime.now().strftime('%Y%m%d%H%M%S')}"
mlflow.set_experiment(exp_name)
print(f"✓ MLflow experiment: {exp_name}\n")

# Config
# Default run configuration: data/model locations, LoRA adapter settings and
# optimizer hyperparameters. retrain() copies this dict and overrides entries.
CONFIG = dict(
    # Locations
    model_name="/content/drive/MyDrive//starcoder2-3b",
    output_dir="./starcoder-finetuned",
    dataset_path="/content/drive/MyDrive/code_dataset.json",
    # LoRA adapter
    lora_r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    # Batching
    batch_size=4,
    gradient_accumulation_steps=4,
    # Optimization schedule
    num_epochs=3,
    learning_rate=2e-4,
    # Tokenization / warmup
    max_length=512,
    warmup_steps=50,
)

# GPU check
# Single switch the rest of the script branches on: True when CUDA is usable.
USE_GPU = torch.cuda.is_available()
# The device name is only queried when a GPU is actually present.
print(f"✓ GPU: {torch.cuda.get_device_name(0)}\n" if USE_GPU else "⚠ Using CPU (slow)\n")

# MLflow callback
class MLflowCallback(TrainerCallback):
    """Trainer callback that forwards numeric training logs to MLflow."""

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log each numeric entry of ``logs`` as an MLflow metric.

        Metrics are recorded at ``state.global_step``. Non-numeric entries are
        skipped. Logging is best-effort: a failure to record one metric is
        reported and training continues.
        """
        if not logs:
            return
        for key, value in logs.items():
            if not isinstance(value, (int, float)):
                continue
            try:
                mlflow.log_metric(key, value, step=state.global_step)
            except Exception as exc:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt/SystemExit still stop the run, and make the
                # failure visible instead of dropping the metric silently.
                print(f"⚠ Could not log metric '{key}' to MLflow: {exc}")

# Load model function
def load_model(name):
    """Load the tokenizer and causal-LM weights located at ``name``.

    When ``USE_GPU`` is true the model is loaded with 4-bit NF4 quantization
    (double quantization, fp16 compute dtype), placed with ``device_map="auto"``
    and passed through ``prepare_model_for_kbit_training``. Otherwise it is
    loaded in float32 with ``low_cpu_mem_usage=True``.

    Args:
        name: Model identifier or local path accepted by ``from_pretrained``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print(f"Loading {name}...")
    tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    # The EOS token doubles as the padding token.
    tokenizer.pad_token = tokenizer.eos_token

    if not USE_GPU:
        model = AutoModelForCausalLM.from_pretrained(
            name,
            torch_dtype=torch.float32,
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
    else:
        quantization = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
        quantized_model = AutoModelForCausalLM.from_pretrained(
            name,
            quantization_config=quantization,
            device_map="auto",
            trust_remote_code=True,
        )
        model = prepare_model_for_kbit_training(quantized_model)

    # The generation cache is switched off on the model config for training.
    model.config.use_cache = False
    print("✓ Model loaded\n")
    return model, tokenizer

# Apply LoRA
def apply_lora(model, cfg):
    """Attach LoRA adapters to the attention projections of ``model``.

    Args:
        model: Base model to wrap.
        cfg: Configuration dict; reads ``lora_r``, ``lora_alpha`` and
            ``lora_dropout``.

    Returns:
        The PEFT-wrapped model. A trainable-parameter summary is printed
        before returning.
    """
    # Freeze every existing parameter before the adapters are added.
    for weight in model.parameters():
        weight.requires_grad = False

    adapter_settings = dict(
        r=cfg["lora_r"],
        lora_alpha=cfg["lora_alpha"],
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
        lora_dropout=cfg["lora_dropout"],
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, LoraConfig(**adapter_settings))

    # Only input gradients are enabled here; this function does not turn on
    # gradient checkpointing itself.
    model.enable_input_require_grads()

    print()
    model.print_trainable_parameters()
    print()
    return model

# Load dataset
def load_data(path, tokenizer, max_len):
    """Load a JSON dataset from ``path`` and tokenize it for causal-LM training.

    Each batch uses its ``text`` column when present; otherwise a prompt is
    built from the ``instruction`` and ``output`` columns. Every example is
    truncated/padded to ``max_len`` tokens and ``labels`` is set to a copy of
    ``input_ids``. The original columns are removed from the result.

    Args:
        path: Path of the JSON data file.
        tokenizer: Tokenizer used to encode the texts.
        max_len: Fixed sequence length for truncation and padding.

    Returns:
        The tokenized dataset.
    """
    print(f"Loading {path}...")
    raw_dataset = load_dataset("json", data_files=path, split="train")
    print(f"✓ {len(raw_dataset)} examples\n")

    def preprocess(batch):
        # Prefer ready-made text; fall back to the instruction/response template.
        if "text" in batch:
            texts = batch["text"]
        else:
            texts = [
                f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
                for instruction, response in zip(batch["instruction"], batch["output"])
            ]
        encoded = tokenizer(
            texts,
            truncation=True,
            max_length=max_len,
            padding="max_length",
            return_tensors=None,
        )
        encoded["labels"] = encoded["input_ids"].copy()
        return encoded

    print("Tokenizing...")
    tokenized = raw_dataset.map(
        preprocess,
        batched=True,
        remove_columns=raw_dataset.column_names,
    )
    print("✓ Tokenized\n")
    return tokenized

# Train
def train(cfg, run_name=None, tags=None):
    """Fine-tune the model described by ``cfg`` inside a new MLflow run.

    The run logs ``cfg`` as parameters, the dataset size, the per-step metrics
    (via ``MLflowCallback``), the final loss/runtime/throughput, and the
    contents of ``cfg["output_dir"]`` as artifacts under ``model``.

    Args:
        cfg: Configuration dict. This function reads ``model_name``,
            ``dataset_path``, ``max_length``, ``output_dir``, ``batch_size``,
            ``gradient_accumulation_steps``, ``num_epochs``, ``learning_rate``
            and ``warmup_steps``; it is also passed to ``apply_lora``.
        run_name: MLflow run name; defaults to ``run-<timestamp>``.
        tags: Optional dict of MLflow tags to set on the run.

    Returns:
        Tuple ``(run_id, model, tokenizer)``.
    """
    if run_name is None:
        stamp = datetime.now().strftime('%Y%m%d-%H%M%S')
        run_name = f"run-{stamp}"

    with mlflow.start_run(run_name=run_name) as active:
        print(f"MLflow Run: {active.info.run_id}\n")
        mlflow.log_params(cfg)

        # Optional tags (retrain() uses these to record lineage).
        if tags:
            for tag_name, tag_value in tags.items():
                mlflow.set_tag(tag_name, tag_value)

        model, tokenizer = load_model(cfg["model_name"])
        model = apply_lora(model, cfg)
        dataset = load_data(cfg["dataset_path"], tokenizer, cfg["max_length"])

        mlflow.log_param("dataset_size", len(dataset))

        collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

        output_dir = cfg["output_dir"]
        training_kwargs = dict(
            output_dir=output_dir,
            per_device_train_batch_size=cfg["batch_size"],
            gradient_accumulation_steps=cfg["gradient_accumulation_steps"],
            num_train_epochs=cfg["num_epochs"],
            learning_rate=cfg["learning_rate"],
            fp16=USE_GPU,
            logging_steps=5,
            save_strategy="epoch",
            save_total_limit=2,
            optim="paged_adamw_8bit" if USE_GPU else "adamw_torch",
            warmup_steps=cfg["warmup_steps"],
            lr_scheduler_type="cosine",
            gradient_checkpointing=False,  # Disable to avoid conflicts with PEFT
            report_to="none",  # metrics reach MLflow through MLflowCallback instead
            dataloader_pin_memory=USE_GPU,
            remove_unused_columns=False,  # Important for PEFT
        )
        args = TrainingArguments(**training_kwargs)

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=dataset,
            data_collator=collator,
            callbacks=[MLflowCallback()],
        )

        rule = "="*60
        print(rule)
        print("Training...")
        print(rule + "\n")

        result = trainer.train()

        summary_metrics = {
            "final_loss": result.training_loss,
            "train_runtime": result.metrics["train_runtime"],
            "train_samples_per_second": result.metrics.get("train_samples_per_second", 0),
        }
        mlflow.log_metrics(summary_metrics)

        print("\nSaving...")
        trainer.save_model(output_dir)
        tokenizer.save_pretrained(output_dir)
        mlflow.log_artifacts(output_dir, artifact_path="model")

        run_id = mlflow.active_run().info.run_id

        print("\n" + rule)
        print("✓ Training Complete!")
        print(rule)
        print(f"Model: {cfg['output_dir']}")
        print(f"Run ID: {run_id}")
        print(rule + "\n")

        return run_id, model, tokenizer

# Retrain function
def retrain(base_run_id=None, new_dataset_path=None, config_updates=None):
    """Run a fresh training pass with a modified copy of ``CONFIG``.

    Args:
        base_run_id: ID of an earlier MLflow run. Used only for the run name
            and the ``base_run_id`` tag, i.e. to record lineage.
        new_dataset_path: If given, replaces ``dataset_path`` in the config.
        config_updates: If given, dict of config entries to override.

    Returns:
        Tuple ``(run_id, model, tokenizer)`` as returned by ``train``.
    """
    rule = "="*60
    print("\n" + rule)
    print("RETRAINING MODEL")
    print(rule + "\n")

    # Work on a copy so the module-level defaults stay untouched.
    retrain_config = dict(CONFIG)

    if new_dataset_path:
        retrain_config["dataset_path"] = new_dataset_path
        print(f"→ Using new dataset: {new_dataset_path}")

    if config_updates:
        retrain_config.update(config_updates)
        print(f"→ Config updates: {config_updates}")

    print()

    # Run name: short prefix of the base run when known, full timestamp otherwise.
    run_name = (
        f"retrain-{base_run_id[:8]}-{datetime.now().strftime('%H%M%S')}"
        if base_run_id
        else f"retrain-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
    )

    # Tags that mark the run as a retrain and, when available, link it to its base run.
    tags = {"retrained": "true", "retrain_timestamp": datetime.now().isoformat()}
    if base_run_id:
        tags["base_run_id"] = base_run_id

    outcome = train(retrain_config, run_name=run_name, tags=tags)
    run_id, model, tokenizer = outcome

    print("✓ Retraining complete!\n")

    return run_id, model, tokenizer

# Compare runs function
def compare_runs(experiment_name=None):
    """Print a comparison table of all runs in an experiment.

    Args:
        experiment_name: Name of the experiment to inspect. Defaults to the
            experiment created by this script (``exp_name``).

    Returns:
        The DataFrame returned by ``mlflow.search_runs`` (newest run first),
        or ``None`` when the experiment does not exist.
    """
    if experiment_name is None:
        # Use the name directly: looking the experiment up just to read its
        # name back would fail on a missing experiment before the "not found"
        # branch below could handle it.
        experiment_name = exp_name

    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        print(f"Experiment '{experiment_name}' not found")
        return None

    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=["start_time DESC"]
    )

    print("\n" + "="*80)
    print("MODEL COMPARISON")
    print("="*80)

    # Columns of interest; a column only exists once some run has logged it.
    cols = ["run_id", "start_time", "status",
            "params.learning_rate", "params.num_epochs", "params.lora_r",
            "metrics.final_loss", "metrics.train_runtime"]

    if runs.empty:
        print("No runs found")
    else:
        available_cols = [col for col in cols if col in runs.columns]
        print(runs[available_cols].to_string(index=False))

    print("="*80 + "\n")

    return runs

# Get best model
def get_best_run(metric="final_loss", ascending=True):
    """Find the best run of this script's experiment according to ``metric``.

    Args:
        metric: Metric name (without the ``metrics.`` prefix) to rank by.
        ascending: ``True`` when smaller is better, ``False`` when larger is.

    Returns:
        The run ID of the best run, or ``None`` when the experiment does not
        exist, has no runs, or its top-ranked run never logged ``metric``.
    """
    import math

    experiment = mlflow.get_experiment_by_name(exp_name)
    if experiment is None:
        # Same handling as compare_runs() instead of an AttributeError below.
        print(f"Experiment '{exp_name}' not found")
        return None

    order = "ASC" if ascending else "DESC"
    runs = mlflow.search_runs(
        experiment_ids=[experiment.experiment_id],
        order_by=[f"metrics.{metric} {order}"],
        max_results=1
    )

    if runs.empty:
        print("No runs found")
        return None

    best_run = runs.iloc[0]

    # The metric column is absent (or NaN) when the selected run never logged
    # the metric, e.g. because training failed before the final metrics were
    # written. Such a run must not be reported as "best".
    metric_value = best_run.get(f"metrics.{metric}")
    if metric_value is None or (isinstance(metric_value, float) and math.isnan(metric_value)):
        print(f"No runs with metric '{metric}' found")
        return None

    print("\n" + "="*60)
    print(f"BEST MODEL (by {metric})")
    print("="*60)
    print(f"Run ID: {best_run['run_id']}")
    print(f"{metric}: {metric_value}")
    print(f"Learning Rate: {best_run.get('params.learning_rate', 'N/A')}")
    print(f"Epochs: {best_run.get('params.num_epochs', 'N/A')}")
    print("="*60 + "\n")

    return best_run['run_id']

# Test
def test(model, tokenizer, prompts=None):
    """Generate and print a sampled completion for each prompt.

    Args:
        model: Model to generate with.
        tokenizer: Tokenizer matching ``model``.
        prompts: Optional list of prompt strings; three code prompts are used
            by default.

    The model is switched to eval mode for generation, so dropout (including
    the LoRA dropout) is inactive, and its previous train/eval mode is
    restored afterwards.
    """
    if prompts is None:
        prompts = ["def fibonacci(n):", "function sum(arr) {", "class Model:"]

    print("\n" + "="*60)
    print("TESTING MODEL")
    print("="*60 + "\n")

    # Send inputs to wherever the model's weights actually live instead of
    # assuming "cuda" from a global flag.
    device = next(model.parameters()).device

    was_training = model.training
    model.eval()
    try:
        for prompt in prompts:
            print(f"Prompt: {prompt}")
            inputs = tokenizer(prompt, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model.generate(
                **inputs, max_length=150, temperature=0.7,
                do_sample=True, pad_token_id=tokenizer.eos_token_id
            )
            print(tokenizer.decode(outputs[0], skip_special_tokens=True))
            print("-"*60 + "\n")
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

# ============================================================
# MAIN EXECUTION
# ============================================================

# Banner for the first training pass.
print("\n" + "="*60, "TRAINING INITIAL MODEL", "="*60 + "\n", sep="\n")

# Step 1: train the initial model with the default configuration.
initial_result = train(CONFIG)
run_id_1, model_1, tokenizer_1 = initial_result

# Step 2 (optional): sample a few completions from the initial model.
#test(model_1, tokenizer_1)

# ============================================================
# RETRAINING EXAMPLES
# ============================================================

# Show copy-pasteable retraining recipes; nothing in this text is executed.
print("\n" + "="*60, "RETRAINING OPTIONS", "="*60, sep="\n")
print("""
# Example 1: Retrain with new dataset
run_id_2, model_2, tokenizer_2 = retrain(
    base_run_id=run_id_1,
    new_dataset_path="new_code_dataset.json"
)

# Example 2: Retrain with different hyperparameters
run_id_3, model_3, tokenizer_3 = retrain(
    base_run_id=run_id_1,
    config_updates={
        "num_epochs": 2,
        "learning_rate": 1e-4,
        "lora_r": 8
    }
)

# Example 3: Retrain with both new data and config
run_id_4, model_4, tokenizer_4 = retrain(
    base_run_id=run_id_1,
    new_dataset_path="new_data.json",
    config_updates={"num_epochs": 5}
)

# Compare all runs
compare_runs()

# Get best model
best_run_id = get_best_run(metric="final_loss", ascending=True)
""")

# Retrain with updated hyperparameters (this section runs as-is).
# The retrain writes to its own output directory: reusing CONFIG["output_dir"]
# would overwrite the initial model that the summary below reports.
RETRAIN_OUTPUT_DIR = "./starcoder-finetuned-retrained"

print("\n" + "="*60)
print("RETRAINING WITH NEW PARAMETERS")
print("="*60 + "\n")

run_id_2, model_2, tokenizer_2 = retrain(
    base_run_id=run_id_1,
    config_updates={
        "num_epochs": 2,
        "learning_rate": 1e-4,
        "output_dir": RETRAIN_OUTPUT_DIR,
    }
)

# Optional follow-ups (uncomment to run):
# test(model_2, tokenizer_2)
# compare_runs()

print("\n" + "="*60)
print("ALL DONE!")
print("="*60)
print(f"\nInitial model saved to: {CONFIG['output_dir']}")
print(f"Initial run ID: {run_id_1}")
print(f"Retrained model saved to: {RETRAIN_OUTPUT_DIR}")
print(f"Retrain run ID: {run_id_2}")
print("\nView results:")
print("  mlflow ui --backend-store-uri ./mlruns")
print("  http://localhost:5000")
print("="*60)

✓ Cleaned directories

Installing packages...
✓ Packages installed

✓ Imports complete

✓ MLflow experiment: starcoder-20251125042334

✓ GPU: Tesla T4


TRAINING INITIAL MODEL

MLflow Run: c1731b7e6b3f4c828b8a816f3d87f5be

Loading /content/drive/MyDrive//starcoder2-3b...
✓ Model loaded


trainable params: 9,093,120 || all params: 3,039,464,448 || trainable%: 0.2992

Loading /content/drive/MyDrive/code_dataset.json...


Generating train split: 0 examples [00:00, ? examples/s]

✓ 42 examples

Tokenizing...


Map:   0%|          | 0/42 [00:00<?, ? examples/s]

✓ Tokenized

Training...



  return fn(*args, **kwargs)


Step,Training Loss
5,2.4623


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)



Saving...

✓ Training Complete!
Model: ./starcoder-finetuned
Run ID: c1731b7e6b3f4c828b8a816f3d87f5be


RETRAINING OPTIONS

# Example 1: Retrain with new dataset
run_id_2, model_2, tokenizer_2 = retrain(
    base_run_id=run_id_1,
    new_dataset_path="new_code_dataset.json"
)

# Example 2: Retrain with different hyperparameters
run_id_3, model_3, tokenizer_3 = retrain(
    base_run_id=run_id_1,
    config_updates={
        "num_epochs": 2,
        "learning_rate": 1e-4,
        "lora_r": 8
    }
)

# Example 3: Retrain with both new data and config
run_id_4, model_4, tokenizer_4 = retrain(
    base_run_id=run_id_1,
    new_dataset_path="new_data.json",
    config_updates={"num_epochs": 5}
)

# Compare all runs
compare_runs()

# Get best model
best_run_id = get_best_run(metric="final_loss", ascending=True)


RETRAINING WITH NEW PARAMETERS


RETRAINING MODEL

→ Config updates: {'num_epochs': 2, 'learning_rate': 0.0001}

MLflow Run: 2c057b3a65b442c3ad175ef21e2ff19e

Loading /content/driv

  return fn(*args, **kwargs)


Step,Training Loss
5,2.4626


  return fn(*args, **kwargs)



Saving...

✓ Training Complete!
Model: ./starcoder-finetuned
Run ID: 2c057b3a65b442c3ad175ef21e2ff19e

✓ Retraining complete!


ALL DONE!

Initial model saved to: ./starcoder-finetuned
Initial run ID: c1731b7e6b3f4c828b8a816f3d87f5be

View results:
  mlflow ui --backend-store-uri ./mlruns
  http://localhost:5000
