# ReDSM5 LLM Fine-tuning on Google Colab

This notebook enables training decoder-only LLMs (Llama/Qwen) with TPU/GPU acceleration on Google Colab.

## Features
- ✅ Automatic TPU/GPU/CPU detection
- ✅ LoRA/QLoRA support for efficient fine-tuning
- ✅ Multi-label DSM-5 symptom classification
- ✅ Sliding window for long documents
- ✅ Threshold optimization and model export

## Before Starting
1. Go to **Runtime > Change runtime type**
2. Select **T4 GPU** or **TPU v2** as the hardware accelerator
3. Click **Save**

In [None]:
# Cell 1: Setup and Installation
!pip install -q transformers>=4.36.0 datasets>=2.16.0 accelerate>=0.25.0
!pip install -q peft>=0.7.0 bitsandbytes>=0.41.0 scipy scikit-learn
!pip install -q wandb optuna pyyaml pandas

# Clone repository
!git clone https://github.com/your-username/LLM_Agents_ReDSM5.git
%cd LLM_Agents_ReDSM5

print("✅ Installation complete!")

In [None]:
# Cell 2: Hardware Detection
# Picks the device used by the rest of the notebook, preferring a TPU
# (through torch_xla), then a CUDA GPU, then the CPU.
# Defines two names that later cells read: `device` and `USE_TPU`.
import torch
import sys

# Try TPU detection
try:
    import torch_xla
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
except (ImportError, RuntimeError):
    # ImportError: torch_xla is not installed on this runtime.
    # RuntimeError: torch_xla is importable but could not create an XLA
    # device (presumably no TPU attached -- the exact exception type may vary
    # by torch_xla version). Either way, fall back to GPU/CPU instead of
    # aborting the cell.
    USE_TPU = False
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    if torch.cuda.is_available():
        print(f"✅ GPU detected: {torch.cuda.get_device_name(0)}")
        print(f"   Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    else:
        print("⚠️  CPU only - training will be slow")
else:
    USE_TPU = True
    print(f"✅ TPU detected: {device}")

# Environment summary, useful when reporting issues.
print(f"\nDevice: {device}")
print(f"Python: {sys.version.split()[0]}")
print(f"PyTorch: {torch.__version__}")

In [None]:
# Cell 3: Mount Google Drive (Optional - for data/checkpoints)
# By default this cell only announces that local sample data will be used;
# nothing is mounted unless the two lines below are enabled.
import os
from google.colab import drive

# To keep data and checkpoints on Google Drive, enable both lines:
# drive.mount('/content/drive')
# DATA_DIR = '/content/drive/MyDrive/redsm5_data'

# Otherwise fall through to the sample data.
print("Using local sample data for demonstration")

In [None]:
# Cell 4: Generate Sample Data
# Writes a synthetic dataset under /content/sample_data using the project's
# test fixture and defines `DATA_DIR` for the cells that follow.
from pathlib import Path
from tests.fixtures.data import generate_synthetic_dataset

# Target directory for the generated files.
DATA_DIR = Path('/content/sample_data')
generate_synthetic_dataset(DATA_DIR, seed=42, num_samples=200)

# NOTE: the split sizes below are fixed text, not values read back from the
# generated files.
for summary_line in (
    f"✅ Generated sample data in {DATA_DIR}",
    "   Train samples: 140",
    "   Dev samples: 30",
    "   Test samples: 30",
):
    print(summary_line)

In [None]:
# Cell 5: Hugging Face Login (for gated models)
# Imports the interactive login helper; the login call itself is disabled
# by default, so running this cell unchanged only prints a message.
from huggingface_hub import notebook_login

# Uncomment if using gated models like Llama-2
# NOTE(review): the default `model_id` in Cell 6 is a Llama-2 checkpoint, so
# this login is presumably required for the default configuration -- confirm.
# notebook_login()

print("Skip HF login for open models")

In [None]:
# Cell 6: Configure Training
# Builds the training configuration, saves it as YAML at
# /content/colab_config.yaml (passed to `src.train` in Cell 7) and prints a
# short summary.
# Requires `USE_TPU` (Cell 2) and `DATA_DIR` (Cell 3 or Cell 4).
from pathlib import Path  # imported here so the cell also works when Cell 4 was skipped

import torch
import yaml

# Probe precision support once, and only query CUDA when a CUDA device is
# actually usable: on some PyTorch versions `torch.cuda.is_bf16_supported()`
# initialises CUDA and raises on a CPU-only runtime.
cuda_available = (not USE_TPU) and torch.cuda.is_available()
cuda_bf16 = bool(cuda_available and torch.cuda.is_bf16_supported())
# TF32 matmuls exist only on GPUs with compute capability >= 8.0 (Ampere and
# newer); the T4 suggested in the notebook header is 7.5.
cuda_tf32 = bool(cuda_available and torch.cuda.get_device_capability(0)[0] >= 8)

config = {
    # Model settings
    'model_id': 'meta-llama/Llama-2-7b-hf',  # or 'Qwen/Qwen2.5-7B'
    'method': 'lora',  # 'full_ft', 'lora', or 'qlora'

    # Training hyperparameters
    'num_train_epochs': 3,
    'per_device_train_batch_size': 8 if USE_TPU else 4,
    'per_device_eval_batch_size': 16 if USE_TPU else 8,
    'gradient_accumulation_steps': 2,
    'learning_rate': 2e-5,
    'warmup_ratio': 0.1,
    'weight_decay': 0.01,

    # LoRA settings (if method='lora' or 'qlora')
    'lora_r': 16,
    'lora_alpha': 32,
    'lora_dropout': 0.05,
    'lora_target_modules': ['q_proj', 'v_proj', 'k_proj', 'o_proj'],

    # Document processing
    'max_length': 2048 if USE_TPU else 1024,
    'doc_stride': 512,
    'truncation_strategy': 'window_pool',
    'pooler': 'mean',  # 'max', 'mean', or 'logit_sum'

    # Loss settings
    'loss_type': 'bce',  # or 'focal'
    'class_weighting': 'sqrt_inv',  # 'none', 'inv', or 'sqrt_inv'
    'label_smoothing': 0.0,

    # Optimization
    # bf16 on TPU or on a bf16-capable GPU; fp16 on any other GPU; neither on CPU.
    'bf16': USE_TPU or cuda_bf16,
    'fp16': cuda_available and not cuda_bf16,
    # Enabled only where the hardware supports it. Presumably `src.train`
    # forwards this to transformers' TrainingArguments, which rejects
    # tf32=True on unsupported devices -- confirm against src.train.
    'tf32': cuda_tf32,
    'gradient_checkpointing': True,

    # Evaluation
    'evaluation_strategy': 'epoch',
    'save_strategy': 'epoch',
    'load_best_model_at_end': True,
    'metric_for_best_model': 'macro_f1',

    # Data
    'data_dir': str(DATA_DIR),
    'train_split': 'train',
    'dev_split': 'dev',
    'test_split': 'test',
    'seed': 42,
}

# Save config
config_path = Path('/content/colab_config.yaml')
with open(config_path, 'w') as f:
    yaml.dump(config, f)

print("✅ Configuration saved")
print(f"\nKey settings:")
print(f"  Model: {config['model_id']}")
print(f"  Method: {config['method']}")
print(f"  Epochs: {config['num_train_epochs']}")
print(f"  Batch size: {config['per_device_train_batch_size']}")
print(f"  Max length: {config['max_length']}")

In [None]:
# Cell 7: Start Training
OUTPUT_DIR = Path('/content/outputs')
LABELS_PATH = Path('configs/labels.yaml')

!python -m src.train \
    --config {config_path} \
    --labels {LABELS_PATH} \
    --out_dir {OUTPUT_DIR} \
    --use_wandb false

print("\n✅ Training complete!")

In [None]:
# Cell 8: View Training Results
# Prints the dev-set summary metrics and the per-label report that are read
# from `OUTPUT_DIR` (defined in Cell 7).
import json
import pandas as pd

# Summary metrics (JSON)
metrics_path = OUTPUT_DIR / 'metrics_dev.json'
with metrics_path.open() as f:
    metrics = json.load(f)

print("Development Set Results:")
print(f"  Macro F1: {metrics['macro_f1']:.4f}")
print(f"  Micro F1: {metrics['micro_f1']:.4f}")
print(f"  Weighted F1: {metrics['weighted_f1']:.4f}")

# Per-label report (CSV), restricted to the four columns of interest
report_path = OUTPUT_DIR / 'label_report_dev.csv'
df = pd.read_csv(report_path)
print("\nPer-Label Performance:")
report_columns = ['label', 'f1', 'precision', 'recall']
print(df.loc[:, report_columns].to_string(index=False))

In [None]:
# Cell 9: Evaluate on Test Set
BEST_CKPT = OUTPUT_DIR / 'best'

!python -m src.eval \
    --ckpt {BEST_CKPT} \
    --labels {LABELS_PATH} \
    --data_dir {DATA_DIR} \
    --split test

print("\n✅ Test evaluation complete!")

# Load test metrics
test_metrics_path = BEST_CKPT / 'eval_test' / 'metrics.json'
with open(test_metrics_path) as f:
    test_metrics = json.load(f)

print("\nTest Set Results:")
print(f"  Macro F1: {test_metrics['macro_f1']:.4f}")
print(f"  Micro F1: {test_metrics['micro_f1']:.4f}")

In [None]:
# Cell 10: Visualize Predictions
# Draws a grouped bar chart comparing, for each symptom label, how many
# test rows carry the label in the ground truth versus in the predictions.
# Reads BEST_CKPT/eval_test/predictions.csv (`BEST_CKPT` comes from Cell 9)
# and defines `label_cols`, which Cell 12 reuses.
import matplotlib.pyplot as plt
import pandas as pd  # imported here so the cell does not depend on Cell 8 having been run
import seaborn as sns

# Load predictions
pred_path = BEST_CKPT / 'eval_test' / 'predictions.csv'
pred_df = pd.read_csv(pred_path)

# Get label columns
label_cols = [
    'depressed_mood', 'diminished_interest', 'weight_appetite_change',
    'sleep_disturbance', 'psychomotor', 'fatigue',
    'worthlessness_guilt', 'concentration_indecision', 'suicidality'
]

# Plot label distribution
true_cols = [f'{label}_true' for label in label_cols]
pred_cols = [f'{label}_pred' for label in label_cols]

# Both the `*_true` and the `*_pred` columns are indexed below, so both
# sets must be present; checking only the `*_true` columns would let a
# missing `*_pred` column surface as a KeyError.
if all(col in pred_df.columns for col in true_cols + pred_cols):
    fig, ax = plt.subplots(figsize=(12, 6))

    true_counts = pred_df[true_cols].sum()
    pred_counts = pred_df[pred_cols].sum()

    x = range(len(label_cols))
    width = 0.35

    # Two bars per label, centred on the tick position.
    ax.bar([i - width/2 for i in x], true_counts, width, label='True')
    ax.bar([i + width/2 for i in x], pred_counts, width, label='Predicted')

    ax.set_xlabel('Symptom Label')
    ax.set_ylabel('Count')
    ax.set_title('True vs Predicted Label Distribution')
    ax.set_xticks(x)
    ax.set_xticklabels([label.replace('_', ' ').title() for label in label_cols], rotation=45, ha='right')
    ax.legend()

    plt.tight_layout()
    plt.show()
else:
    print("Prediction columns not found in expected format")

In [None]:
# Cell 11: Export Model to Drive
from shutil import copytree

# Uncomment to save to Google Drive
# DRIVE_OUTPUT = '/content/drive/MyDrive/redsm5_models/best_model'
# copytree(BEST_CKPT, DRIVE_OUTPUT, dirs_exist_ok=True)
# print(f"✅ Model saved to {DRIVE_OUTPUT}")

# Download as ZIP
!cd {OUTPUT_DIR} && zip -r best_model.zip best/
print("\n✅ Model packaged as best_model.zip")
print(f"   Location: {OUTPUT_DIR}/best_model.zip")

from google.colab import files
# files.download(str(OUTPUT_DIR / 'best_model.zip'))

In [None]:
# Cell 12: Inference Example
# Loads the checkpoint at `BEST_CKPT` and the thresholds stored in
# OUTPUT_DIR/thresholds.json, scores one example text and prints every label
# whose sigmoid probability exceeds its threshold.
# Requires `device` (Cell 2), `OUTPUT_DIR` (Cell 7), `BEST_CKPT` (Cell 9) and
# `label_cols` (Cell 10).
import json  # imported here so the cell does not depend on Cell 8 having been run

import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

# Load model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(BEST_CKPT)
tokenizer = AutoTokenizer.from_pretrained(BEST_CKPT)
model.eval()
model.to(device)

# Load thresholds
thresholds_path = OUTPUT_DIR / 'thresholds.json'
with open(thresholds_path) as f:
    thresholds_data = json.load(f)
    thresholds = torch.tensor(thresholds_data['thresholds'])

# Example text
text = "I feel so sad and hopeless. I can't sleep and have no energy to do anything."

# Tokenize
# NOTE(review): max_length here is 512 while Cell 6 configures 1024/2048 for
# training -- confirm this shorter limit is intended for the demo.
inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    # Move probabilities to the CPU, where `thresholds` was created.
    probs = torch.sigmoid(outputs.logits).cpu()
    preds = (probs > thresholds).int()

# Collect (label, probability) pairs for the labels flagged on the single
# example in the batch.
detected = [
    (label, probs[0, i].item())
    for i, label in enumerate(label_cols)
    if preds[0, i] == 1
]

# Display results
print(f"Text: {text}\n")
print("Predicted Symptoms:")
if not detected:
    # Make the empty outcome explicit rather than printing a bare header.
    print("  (none above threshold)")
for label, prob in detected:
    print(f"  ✓ {label.replace('_', ' ').title()} (prob: {prob:.3f})")