# Language Model Training - Google Colab

This notebook demonstrates how to train Gemma-2B, Llama-3-8B, and LLaVA-1.6 models using your custom datasets.

## Setup Instructions:
1. Upload your datasets to Google Drive
2. Choose your model configuration
3. Run the training cells
4. Monitor training progress
5. Save your trained model to Google Drive

## 1. Environment Setup

In [None]:
# Check GPU availability
!nvidia-smi

# Install required packages
!pip install -q transformers peft datasets torch accelerate tensorboard PyYAML

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

print("Environment setup complete!")

## 2. Project Setup

In [None]:
# Clone your repository or upload files
# This cell places the project at /content/chrono-membench and makes it the
# working directory; later cells use relative paths (configs/, data/raw/).
import os
os.chdir('/content')

# Option 1: Clone from GitHub (recommended)
# !git clone https://github.com/yourusername/chrono-membench.git
# os.chdir('/content/chrono-membench')

# Option 2: Copy from Google Drive
# (requires Drive to be mounted at /content/drive by the environment setup cell)
!cp -r /content/drive/MyDrive/chrono-membench /content/
os.chdir('/content/chrono-membench')

# Verify structure
!ls -la

## 3. Dataset Preparation

In [None]:
# If using DVC, pull the datasets
# !dvc pull

# Or copy datasets from Google Drive
# Paths without a leading slash are relative to the current working directory,
# which the project setup cell set to /content/chrono-membench.
!mkdir -p data/raw
!cp /content/drive/MyDrive/chrono-membench/data/raw/*.jsonl data/raw/

# Verify datasets
# (lists the copied .jsonl files with human-readable sizes)
!ls -lah data/raw/*.jsonl

## 4. Model Configuration

In [None]:
# Notebook-wide selection of what to train and where the weights come from.
#   MODEL_TYPE   options: "gemma-2b", "llama-3-8b", "llava-1.6-7b"
#   MODEL_SOURCE options: "local", "huggingface"
MODEL_TYPE = "gemma-2b"
MODEL_SOURCE = "huggingface"

# Each model type has its own YAML file under configs/.
CONFIG_FILE = f"configs/{MODEL_TYPE}.yaml"

# Echo the selection so it is recorded in the cell output.
print(
    f"Training {MODEL_TYPE} using {MODEL_SOURCE} source",
    f"Configuration file: {CONFIG_FILE}",
    sep="\n",
)

# View configuration
# IPython substitutes {CONFIG_FILE} with the Python variable's value before
# handing the command to the shell.
!cat {CONFIG_FILE}

## 5. HuggingFace Authentication (if using HuggingFace models)

In [None]:
# Log in to the Hugging Face Hub when the model weights are fetched from it;
# locally stored models need no authentication.
if MODEL_SOURCE == "huggingface":
    from getpass import getpass

    from huggingface_hub import login

    # Read the token with getpass instead of input(): input() echoes what is
    # typed into the cell output, which is saved with the notebook and would
    # leak the token as soon as the notebook is shared or committed.
    token = getpass("Enter your HuggingFace token: ").strip()
    if not token:
        raise ValueError("No HuggingFace token entered; cannot authenticate.")
    login(token=token)

    print("HuggingFace authentication successful!")
else:
    print("Using local models - no authentication needed")

## 6. Training Configuration Update

In [None]:
# Update configuration for Colab environment.
# The selected config is loaded, Colab-specific overrides are applied, and the
# result is written to a separate *_colab.yaml file so the original config in
# configs/ is left untouched.
import yaml

with open(CONFIG_FILE, 'r') as f:
    # An empty YAML document loads as None; fall back to an empty mapping.
    config = yaml.safe_load(f) or {}

# Colab overrides, grouped by config section.
colab_overrides = {
    'model': {'source': MODEL_SOURCE},
    'environment': {
        'platform': 'colab',
        'mixed_precision': 'fp16',  # Good for Colab
    },
    'training': {
        'batch_size': 2,  # Conservative for Colab
        'gradient_accumulation_steps': 8,
    },
    'colab': {
        'mount_drive': True,
        'install_requirements': True,
    },
}

# Sections are created on demand: a base config is not guaranteed to define
# every section (e.g. 'colab'), and indexing a missing or null section
# directly would raise KeyError/TypeError.
for section, values in colab_overrides.items():
    section_cfg = config.get(section)
    if not isinstance(section_cfg, dict):
        section_cfg = {}
        config[section] = section_cfg
    section_cfg.update(values)

# Save updated config
colab_config_file = f"configs/{MODEL_TYPE}_colab.yaml"
with open(colab_config_file, 'w') as f:
    yaml.dump(config, f, default_flow_style=False)

print(f"Updated configuration saved to {colab_config_file}")

## 7. Start Training

In [None]:
# Start training as a child process and print its output once it has finished.
# Output is captured, so nothing is shown while training is running; use the
# interactive cell in section 8 instead if you want to watch progress.
import os
import subprocess
import sys

project_dir = '/content/chrono-membench'
src_dir = f'{project_dir}/src'

# Make the project's packages importable in this kernel. Guarded so that
# re-running the cell does not keep appending the same entry.
if src_dir not in sys.path:
    sys.path.append(src_dir)

# sys.path changes are NOT inherited by child processes, so the training
# script receives the same directory through PYTHONPATH.
child_env = os.environ.copy()
child_env['PYTHONPATH'] = os.pathsep.join(
    entry for entry in (src_dir, child_env.get('PYTHONPATH')) if entry
)

# Run training with the interpreter that runs this kernel (the one the
# packages were installed into). The config path is made absolute because it
# was written relative to the kernel's working directory.
cmd = [
    sys.executable, f'{src_dir}/chrono/train.py',
    '--config', os.path.abspath(colab_config_file),
    '--output_dir', f'/content/outputs/{MODEL_TYPE}',
    '--base_path', project_dir
]

print("Starting training...")
print(f"Command: {' '.join(cmd)}")

# Run from the project root, matching the interactive variant of this step.
result = subprocess.run(
    cmd,
    capture_output=True,
    text=True,
    env=child_env,
    cwd=project_dir,
)

print("STDOUT:")
print(result.stdout)

if result.stderr:
    print("\nSTDERR:")
    print(result.stderr)

# Do not report a crashed run as "completed".
if result.returncode == 0:
    print(f"\nTraining completed with exit code: {result.returncode}")
else:
    print(f"\nTraining FAILED with exit code: {result.returncode}")

## 8. Monitor Training (Alternative to Step 7 - Interactive Training)

In [None]:
# Alternative: Run training interactively to see progress
# Invokes the same train.py with the same --config/--output_dir/--base_path
# arguments as the subprocess cell in section 7, so run only one of the two
# cells; running both trains the model twice.
# {colab_config_file} and {MODEL_TYPE} are filled in by IPython from the
# Python variables defined in earlier cells.
!cd /content/chrono-membench && python src/chrono/train.py \
    --config {colab_config_file} \
    --output_dir /content/outputs/{MODEL_TYPE} \
    --base_path /content/chrono-membench

## 9. View Training Logs

In [None]:
# View training logs
# (lists everything the training run wrote to its output directory)
!ls -la /content/outputs/{MODEL_TYPE}/

# Check if TensorBoard logs exist
# NOTE(review): assumes train.py writes its logs to <output_dir>/logs - confirm.
!ls -la /content/outputs/{MODEL_TYPE}/logs/

# Load TensorBoard (optional)
# The extension is loaded first, then pointed at the same logs directory.
%load_ext tensorboard
%tensorboard --logdir /content/outputs/{MODEL_TYPE}/logs/

## 10. Test the Trained Model

In [None]:
# Test the trained model
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Load the trained model
# Tokenizer and weights are both read from the training output directory.
# NOTE(review): AutoModelForCausalLM targets text-only causal LMs; it may not
# be the right class for MODEL_TYPE "llava-1.6-7b" - confirm.
# NOTE(review): if train.py saves only a PEFT adapter rather than full
# weights, loading may need the base model as well - confirm.
model_path = f"/content/outputs/{MODEL_TYPE}"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # load the weights in half precision
    device_map="auto"  # let the library decide device placement
)

# Test generation
def generate_text(prompt, max_length=100):
    """Sample one continuation of ``prompt`` from the loaded model.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text the generation is conditioned on.
        max_length: Value passed to ``model.generate`` as ``max_length``; it
            bounds the total sequence length (prompt tokens plus newly
            generated tokens).

    Returns:
        The first (and only) returned sequence decoded to a string with
        special tokens removed.
    """
    # The tokenizer returns CPU tensors, while the model was placed by
    # device_map="auto"; move the inputs to the model's device so generate()
    # does not run with tensors on mismatched devices.
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            # Pass the mask explicitly: pad and EOS share a token id here, so
            # it cannot be inferred reliably from the input ids.
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            num_return_sequences=1,
            temperature=0.7,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True
        )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Prompts used for a quick qualitative look at the trained model's output.
test_prompts = [
    "The future of artificial intelligence is",
    "In mathematics, the concept of infinity",
    "Climate change is a global challenge that",
    "Question: What is 2+2? Answer:",
]

# Header for the report printed below.
print("Testing trained model:", "=" * 50, sep="\n")

# One generation per prompt, each followed by a divider line.
for prompt in test_prompts:
    generated = generate_text(prompt)
    print(
        f"Prompt: {prompt}",
        f"Generated: {generated}",
        "-" * 50,
        sep="\n",
    )

## 11. Save Model to Google Drive

In [None]:
# Save the trained model to Google Drive.
# The copies are done with shutil instead of shell `cp` so that a failed copy
# raises an error here. A shell failure would be ignored and the success
# message printed anyway, and the cleanup cell that follows deletes the only
# local copy of the outputs.
import os
import shutil

source_path = f"/content/outputs/{MODEL_TYPE}"
drive_model_path = f"/content/drive/MyDrive/trained_models/{MODEL_TYPE}"
logs_path = f"/content/drive/MyDrive/training_logs/{MODEL_TYPE}"

if not os.path.isdir(source_path):
    raise FileNotFoundError(
        f"Training output directory not found: {source_path}"
    )

# Copy model files. copytree creates the destination (and its parents) and,
# with dirs_exist_ok=True, merges into a directory left by a previous run.
# Unlike `cp -r src/*`, top-level dotfiles are copied as well.
shutil.copytree(source_path, drive_model_path, dirs_exist_ok=True)

print(f"Model saved to Google Drive: {drive_model_path}")

# Also save training logs to their own Drive folder, if the run produced any.
os.makedirs(logs_path, exist_ok=True)
source_logs_path = os.path.join(source_path, "logs")
if os.path.isdir(source_logs_path):
    shutil.copytree(source_logs_path, logs_path, dirs_exist_ok=True)
    print(f"Training logs saved to: {logs_path}")
else:
    print(f"No training logs found at {source_logs_path}; nothing copied to {logs_path}")

## 12. Cleanup

In [None]:
# Clean up temporary files to free space
!rm -rf /content/outputs
!rm -rf /content/chrono-membench

print("Cleanup completed!")