# Domain Name Generator: Model Training & Experiments

This notebook covers comprehensive model training experiments with reproducible results and model version tracking.

## Overview
- Train Llama-3.2-1B and Phi-3-Mini models (focused on best performers)
- Track model versions and hyperparameters
- Compare training performance with progress bars and timing
- M1 optimization validation
- Reproducible experiment setup

In [None]:
# Setup and imports
import sys
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pathlib import Path
import time
from datetime import datetime
import hashlib
import wandb

sys.path.append('../src')

from domain_generator.models.jupyter_compatible import JupyterDomainGenerator, create_generator
from domain_generator.models.trainer import create_model_configs
from domain_generator.utils.config import Config

# Pin every random number generator the notebook uses (PyTorch, NumPy and
# the standard library) to the same seed so repeated runs are comparable.
import torch
import random

for seed_rng in (torch.manual_seed, np.random.seed, random.seed):
    seed_rng(42)

# Report the environment the experiments will run in.
print("🎯 Reproducible Setup Complete")
print(f"PyTorch version: {torch.__version__}")

if torch.backends.mps.is_available():
    device_label = 'MPS (M1 GPU)'
else:
    device_label = 'CPU'
print(f"Device: {device_label}")

## 1. Model Version Tracking System

In [None]:
class ModelVersionTracker:
    """Track model versions, hyperparameters, and results for reproducibility.

    Experiments are held in memory as a dict keyed by experiment ID and are
    mirrored to ``<tracking_dir>/experiments.json`` after every change.
    """

    def __init__(self, tracking_dir: str = "../models/tracking"):
        """Create the tracking directory if needed and load prior experiments.

        Args:
            tracking_dir: Directory that holds ``experiments.json``.
        """
        self.tracking_dir = Path(tracking_dir)
        self.tracking_dir.mkdir(parents=True, exist_ok=True)
        self.experiments_file = self.tracking_dir / "experiments.json"

        # Load existing experiments
        if self.experiments_file.exists():
            with open(self.experiments_file, 'r', encoding='utf-8') as f:
                self.experiments = json.load(f)
        else:
            self.experiments = {}

    def create_experiment_id(self, model_name: str, config: dict) -> str:
        """Create an experiment ID of the form ``<model>_<timestamp>_<hash>``.

        Args:
            model_name: Prefix of the ID.
            config: JSON-serializable configuration; its key-sorted JSON dump
                is hashed so equal configs produce the same 8-character suffix.

        Returns:
            The experiment ID string.
        """
        config_str = json.dumps(config, sort_keys=True)
        # MD5 is only used as a short fingerprint here, not for security.
        config_hash = hashlib.md5(config_str.encode()).hexdigest()[:8]
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        return f"{model_name}_{timestamp}_{config_hash}"

    def log_experiment(self, experiment_id: str, model_name: str, config: dict,
                      results: dict = None, model_path: str = None) -> str:
        """Record a new experiment with status ``"running"`` and persist it.

        An existing entry with the same ID is overwritten.

        Args:
            experiment_id: Key under which the experiment is stored.
            model_name: Name recorded for the model.
            config: Configuration recorded for the experiment.
            results: Optional initial results.
            model_path: Optional path of the trained model.

        Returns:
            ``experiment_id``, unchanged.

        Raises:
            TypeError: If the experiment data cannot be serialized to JSON.
        """
        experiment_data = {
            "experiment_id": experiment_id,
            "model_name": model_name,
            "timestamp": datetime.now().isoformat(),
            "config": config,
            "model_path": model_path,
            # Copy so that later update_experiment() calls never mutate the
            # dict object the caller passed in.
            "results": dict(results) if results else {},
            "status": "running"
        }

        self.experiments[experiment_id] = experiment_data
        self.save_experiments()
        return experiment_id

    def update_experiment(self, experiment_id: str, results: dict = None,
                         status: str = None, model_path: str = None):
        """Merge new results/status/model path into an existing experiment.

        Unknown experiment IDs are ignored. Arguments that are ``None`` (or
        otherwise falsy) leave the corresponding field untouched.
        """
        if experiment_id not in self.experiments:
            return

        entry = self.experiments[experiment_id]
        if results:
            entry["results"].update(results)
        if status:
            entry["status"] = status
        if model_path:
            entry["model_path"] = model_path

        self.save_experiments()

    def save_experiments(self):
        """Write all experiments to ``experiments.json``.

        Raises:
            TypeError: If an experiment holds a value that is not
                JSON-serializable. The file on disk is left as it was.
        """
        # Serialize before touching the file: a serialization error must not
        # truncate the history that is already stored.
        payload = json.dumps(self.experiments, indent=2)

        # Write to a sibling temp file and rename it over the real one so an
        # interrupted write cannot leave a half-written experiments.json.
        tmp_file = self.experiments_file.with_name(self.experiments_file.name + ".tmp")
        tmp_file.write_text(payload, encoding='utf-8')
        tmp_file.replace(self.experiments_file)

    def get_experiment_summary(self) -> pd.DataFrame:
        """Return one row per experiment with its key hyperparameters.

        Missing hyperparameters and a missing final loss are reported as
        ``"N/A"``. An empty DataFrame is returned when nothing is tracked.
        """
        if not self.experiments:
            return pd.DataFrame()

        summary_data = []
        for exp_id, exp_data in self.experiments.items():
            training_config = exp_data["config"].get("training_config", {})
            summary_data.append({
                "experiment_id": exp_id,
                "model_name": exp_data["model_name"],
                "timestamp": exp_data["timestamp"],
                "status": exp_data["status"],
                "epochs": training_config.get("num_epochs", "N/A"),
                "batch_size": training_config.get("per_device_train_batch_size", "N/A"),
                "learning_rate": training_config.get("learning_rate", "N/A"),
                "final_loss": exp_data["results"].get("final_eval_loss", "N/A")
            })

        return pd.DataFrame(summary_data)

# Build the notebook-wide tracker instance used by the cells below
tracker = ModelVersionTracker()
print("✅ Model version tracker initialized")

# List whatever earlier sessions already recorded
existing_experiments = tracker.get_experiment_summary()
if existing_experiments.empty:
    print("📝 No existing experiments found")
else:
    print(f"📊 Found {len(existing_experiments)} existing experiments")
    display(existing_experiments)

## 2. Available Model Configurations

In [None]:
# Load every model configuration the trainer module exposes
model_configs = create_model_configs()

# Human-readable size label for each known base model
size_map = {
    'meta-llama/Llama-3.2-1B-Instruct': '1B (~3.5GB)',
    'microsoft/Phi-3-mini-4k-instruct': '3.8B (~3.8GB)',
    'microsoft/DialoGPT-medium': '355M (~1.4GB)',
    'gpt2': '124M (~500MB)',
    'distilgpt2': '82M (~330MB)'
}

print("🤖 Available Model Configurations:")
print("=" * 50)

config_summary = []
for config_id, cfg in model_configs.items():
    train_cfg = cfg["training_config"]

    # One summary row per configuration; the same values feed the printout
    entry = {
        'config_id': config_id,
        'model_name': cfg["model_name"],
        'size': size_map.get(cfg["model_name"], 'Unknown'),
        'epochs': train_cfg.num_epochs,
        'batch_size': train_cfg.per_device_train_batch_size,
        'learning_rate': train_cfg.learning_rate,
        'lora_r': cfg["lora_config"].r
    }
    config_summary.append(entry)

    print(f"📱 {config_id}:")
    print(f"   Model: {entry['model_name']}")
    print(f"   Size: {entry['size']}")
    print(f"   Training: {entry['epochs']} epochs, batch={entry['batch_size']}, lr={entry['learning_rate']}")
    print(f"   LoRA: r={entry['lora_r']}")
    print()

# Tabular view of the same information for side-by-side comparison
config_df = pd.DataFrame(config_summary)
display(config_df)

## 3. Training Experiment Runner

In [None]:
def _snapshot_experiment_config(config: dict, dataset_path: str, use_wandb: bool) -> dict:
    """Collect the fields of a model config that are stored with each experiment."""
    lora = config["lora_config"]
    training = config["training_config"]
    return {
        "model_name": config["model_name"],
        "lora_config": {
            "r": lora.r,
            "lora_alpha": lora.lora_alpha,
            "lora_dropout": lora.lora_dropout
        },
        "training_config": {
            "num_epochs": training.num_epochs,
            "per_device_train_batch_size": training.per_device_train_batch_size,
            "learning_rate": training.learning_rate,
            "gradient_accumulation_steps": training.gradient_accumulation_steps
        },
        "dataset_path": dataset_path,
        "use_wandb": use_wandb,
        "device": "mps" if torch.backends.mps.is_available() else "cpu"
    }


def run_training_experiment(model_config_id: str, dataset_path: str = "../data/processed/training_dataset.json",
                          use_wandb: bool = True, dry_run: bool = False) -> dict:
    """Run a single training experiment with full tracking.

    Uses the notebook-level ``model_configs`` mapping and ``tracker``
    instance. The experiment is logged before training starts and updated
    with the outcome afterwards.

    Args:
        model_config_id: Key into ``model_configs``.
        dataset_path: Training dataset passed to the generator.
        use_wandb: Forwarded to ``train_model`` and recorded in the config.
        dry_run: When true, no model is trained; mock results are logged and
            the experiment is marked ``"completed (dry run)"``.

    Returns:
        A dict with ``experiment_id`` and ``status``. On success (or dry run)
        it also has ``results`` and ``model_path``; on failure it has
        ``error`` with the exception message instead.

    Raises:
        ValueError: If ``model_config_id`` is not in ``model_configs``.
            Exceptions raised during training itself are caught and reported
            through the returned dict.
    """

    # Get model configuration
    if model_config_id not in model_configs:
        raise ValueError(f"Model config '{model_config_id}' not found")

    config = model_configs[model_config_id]

    # Create experiment ID and log
    experiment_config = _snapshot_experiment_config(config, dataset_path, use_wandb)
    experiment_id = tracker.create_experiment_id(model_config_id, experiment_config)
    output_dir = f"../models/{experiment_id}"

    print(f"🚀 Starting experiment: {experiment_id}")
    print(f"📱 Model: {config['model_name']}")
    print(f"📊 Config: {model_config_id}")

    if dry_run:
        print("🧪 DRY RUN - Not actually training")
        mock_results = {
            "training_time": 3600,  # Mock 1 hour
            "final_eval_loss": 2.5 + np.random.normal(0, 0.1),
            "total_steps": 1000,
            "best_eval_loss": 2.3 + np.random.normal(0, 0.1)
        }

        tracker.log_experiment(experiment_id, model_config_id, experiment_config,
                             results=mock_results, model_path=output_dir)
        tracker.update_experiment(experiment_id, status="completed (dry run)")

        return {
            "experiment_id": experiment_id,
            "results": mock_results,
            "model_path": output_dir,
            "status": "completed (dry run)"
        }

    # Log experiment start
    tracker.log_experiment(experiment_id, model_config_id, experiment_config)

    try:
        # Create generator and train
        generator = create_generator(model_config_id)

        # perf_counter is monotonic, so the measured duration is not skewed
        # if the system clock is adjusted during a long training run.
        start_time = time.perf_counter()

        # Train model
        model_path = generator.train_model(
            dataset_path=dataset_path,
            output_dir=output_dir,
            use_wandb=use_wandb
        )

        training_time = time.perf_counter() - start_time

        # Collect results (you would parse these from training logs in practice)
        results = {
            "training_time": training_time,
            "model_path": model_path,
            "status": "completed"
        }

        # Update experiment with results
        tracker.update_experiment(experiment_id, results=results,
                                status="completed", model_path=model_path)

        print(f"✅ Training completed in {training_time:.2f} seconds")
        return {
            "experiment_id": experiment_id,
            "results": results,
            "model_path": model_path,
            "status": "completed"
        }

    except Exception as e:
        # Deliberate best-effort boundary: one failed model must not abort a
        # batch of experiments, so the failure is recorded and returned.
        error_msg = str(e)
        print(f"❌ Training failed: {error_msg}")

        tracker.update_experiment(experiment_id,
                                results={"error": error_msg},
                                status="failed")

        return {
            "experiment_id": experiment_id,
            "error": error_msg,
            "status": "failed"
        }

print("✅ Training experiment runner ready")

## 4. Run Training Experiments

**Note**: By default the cell below performs dry runs only: each experiment is logged with mock results and no model is trained. Set `DRY_RUN = False` to run actual training.

In [ ]:
# Experiment settings
DRY_RUN = True  # Set to False for actual training
MODELS_TO_TRAIN = ['llama-3.2-1b', 'phi-3-mini']  # Focus on best performing models

# Experiments are only attempted when the training data is on disk
dataset_path = "../data/processed/training_dataset.json"
if Path(dataset_path).exists():
    print(f"✅ Training dataset found: {dataset_path}")

    # One result dict per model, in the order of MODELS_TO_TRAIN
    experiment_results = []
    separator = '=' * 60

    for model_id in MODELS_TO_TRAIN:
        print(f"\n{separator}")
        print(f"🚀 Training {model_id}")
        print(f"{separator}")

        experiment_results.append(
            run_training_experiment(
                model_config_id=model_id,
                dataset_path=dataset_path,
                use_wandb=True,
                dry_run=DRY_RUN
            )
        )

        # Brief pause before the next experiment starts
        time.sleep(1)

    print(f"\n✅ Completed {len(experiment_results)} experiments")
else:
    print(f"❌ Training dataset not found at {dataset_path}")
    print("Please run the dataset creation notebook first!")

## 5. Training Results Analysis

In [None]:
# Refresh the summary so it includes the experiments run above
experiment_summary = tracker.get_experiment_summary()

if experiment_summary.empty:
    print("📝 No experiments found")
else:
    print("📊 Training Experiment Summary:")
    display(experiment_summary)

    # Keep every row whose status mentions 'completed' (dry runs included)
    is_completed = experiment_summary['status'].str.contains('completed')
    completed_experiments = experiment_summary[is_completed]

    if len(completed_experiments) == 0:
        print("⚠️ No completed experiments to analyze yet")
    else:
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))
        size_ax, time_ax = axes

        # Parameter counts in millions, keyed by config id; unknown ids get 100
        size_map = {
            'distilgpt2': 82,
            'gpt2-small': 124,
            'dialogpt-medium': 355,
            'llama-3.2-1b': 1000,
            'phi-3-mini': 3800
        }

        model_names = completed_experiments['model_name'].tolist()
        model_sizes = [size_map.get(name, 100) for name in model_names]

        bar_positions = range(len(model_names))
        bar_colors = ['skyblue', 'lightgreen', 'orange', 'red', 'purple'][:len(model_names)]

        # Left panel: model size
        size_ax.bar(bar_positions, model_sizes, color=bar_colors)
        size_ax.set_xlabel('Model')
        size_ax.set_ylabel('Parameters (Millions)')
        size_ax.set_title('Model Size Comparison')
        size_ax.set_xticks(bar_positions)
        size_ax.set_xticklabels(model_names, rotation=45)

        # Right panel: training time in minutes (stored in seconds; 0 if absent)
        training_times = [
            tracker.experiments.get(exp_id, {}).get('results', {}).get('training_time', 0) / 60
            for exp_id in completed_experiments['experiment_id']
        ]

        if not any(minutes > 0 for minutes in training_times):
            time_ax.text(0.5, 0.5, 'Training time data\nnot available',
                         ha='center', va='center', transform=time_ax.transAxes)
            time_ax.set_title('Training Time Comparison')
        else:
            time_ax.bar(bar_positions, training_times, color=bar_colors)
            time_ax.set_xlabel('Model')
            time_ax.set_ylabel('Training Time (minutes)')
            time_ax.set_title('Training Time Comparison')
            time_ax.set_xticks(bar_positions)
            time_ax.set_xticklabels(model_names, rotation=45)

        plt.tight_layout()
        plt.show()

## 6. M1 Performance Validation

In [None]:
# Validate M1 performance optimizations
print("⚡ M1 Performance Validation")
print("=" * 40)


def benchmark_matmul(size, device=None, repeats=10):
    """Return the mean seconds per ``x @ x.T`` for a random ``size x size`` matrix.

    Args:
        size: Number of rows and columns of the square matrix.
        device: Torch device to run on; ``None`` uses the default (CPU).
        repeats: Number of timed multiplications to average over.

    Returns:
        Average wall time of one multiplication, in seconds.
    """
    x = torch.randn(size, size, device=device)
    on_mps = x.device.type == 'mps'

    def wait_for_device():
        # MPS work is queued asynchronously; without a synchronize the timer
        # would only measure how long it takes to enqueue the kernels.
        if on_mps and hasattr(torch, 'mps') and hasattr(torch.mps, 'synchronize'):
            torch.mps.synchronize()

    # Warm-up run keeps one-time setup cost out of the measurement
    torch.mm(x, x.t())
    wait_for_device()

    start = time.perf_counter()
    for _ in range(repeats):
        torch.mm(x, x.t())
    wait_for_device()
    return (time.perf_counter() - start) / repeats


# Check MPS availability
if torch.backends.mps.is_available():
    print("✅ MPS (Metal Performance Shaders) available")

    # Test tensor operations on MPS
    device = torch.device('mps')

    # Benchmark tensor operations
    sizes = [100, 500, 1000, 2000]
    mps_times = []
    cpu_times = []
    speedups = []

    for size in sizes:
        mps_time = benchmark_matmul(size, device)
        cpu_time = benchmark_matmul(size)
        mps_times.append(mps_time)
        cpu_times.append(cpu_time)

        # Guard against a zero reading from the timer on very small inputs
        speedup = cpu_time / mps_time if mps_time > 0 else float('inf')
        speedups.append(speedup)
        print(f"Size {size}x{size}: MPS={mps_time:.4f}s, CPU={cpu_time:.4f}s, Speedup={speedup:.2f}x")

    # Plot performance comparison
    plt.figure(figsize=(10, 6))
    x_pos = np.arange(len(sizes))
    width = 0.35

    plt.bar(x_pos - width/2, mps_times, width, label='MPS (M1 GPU)', color='green', alpha=0.7)
    plt.bar(x_pos + width/2, cpu_times, width, label='CPU', color='blue', alpha=0.7)

    plt.xlabel('Matrix Size')
    plt.ylabel('Time (seconds)')
    plt.title('M1 GPU vs CPU Performance Comparison')
    plt.xticks(x_pos, [f'{s}x{s}' for s in sizes])
    plt.legend()
    plt.yscale('log')
    plt.grid(True, alpha=0.3)
    plt.show()

    avg_speedup = np.mean(speedups)
    print(f"\n📊 Average M1 GPU speedup: {avg_speedup:.2f}x")

else:
    print("❌ MPS not available - falling back to CPU")

# Report system RAM and compare it with each model's estimated footprint
print("\n💾 Memory Usage Check:")
import psutil
memory = psutil.virtual_memory()
bytes_per_gb = 1024**3
print(f"Total RAM: {memory.total / bytes_per_gb:.1f} GB")
print(f"Available RAM: {memory.available / bytes_per_gb:.1f} GB")
print(f"Used RAM: {memory.used / bytes_per_gb:.1f} GB ({memory.percent:.1f}%)")

# Estimated memory needed per model configuration, in GB
model_memory_requirements = {
    'distilgpt2': 0.33,      # 330MB
    'gpt2-small': 0.5,       # 500MB  
    'dialogpt-medium': 1.4,  # 1.4GB
    'llama-3.2-1b': 3.5,     # 3.5GB
    'phi-3-mini': 3.8        # 3.8GB
}

available_gb = memory.available / bytes_per_gb
print("\n🎯 Model Memory Feasibility:")
for model, req_gb in model_memory_requirements.items():
    # A model counts as feasible only if it needs strictly less than what is free
    if req_gb < available_gb:
        feasible = "✅"
    else:
        feasible = "❌"
    print(f"  {model}: {req_gb:.1f} GB required {feasible}")

## 7. Reproducibility Validation

In [None]:
# Assemble the reproducibility report section by section
reproducibility_info = {
    "experiment_timestamp": datetime.now().isoformat(),
    "python_version": sys.version,
    "pytorch_version": torch.__version__,
    "numpy_version": np.__version__,
    "random_seeds": {
        "torch": 42,
        "numpy": 42,
        "python_random": 42
    }
}

# Device section: which backend is in use and whether MPS support exists
mps_backend = torch.backends.mps
if mps_backend.is_available():
    active_device = "mps"
else:
    active_device = "cpu"
if hasattr(mps_backend, 'is_built'):
    mps_built = mps_backend.is_built()
else:
    mps_built = "unknown"
reproducibility_info["device_info"] = {
    "device": active_device,
    "mps_available": mps_backend.is_available(),
    "mps_built": mps_built
}

# Host section
total_memory_gb = psutil.virtual_memory().total / (1024**3)
cpu_count = psutil.cpu_count()
reproducibility_info["system_info"] = {
    "platform": sys.platform,
    "total_memory_gb": total_memory_gb,
    "cpu_count": cpu_count
}

# Hyperparameters of every available model configuration
reproducibility_info["model_configs"] = {
    cfg_id: {
        "model_name": cfg["model_name"],
        "epochs": cfg["training_config"].num_epochs,
        "batch_size": cfg["training_config"].per_device_train_batch_size,
        "learning_rate": cfg["training_config"].learning_rate,
        "lora_r": cfg["lora_config"].r
    }
    for cfg_id, cfg in model_configs.items()
}

# Persist the report next to the experiment history
repro_file = Path("../models/tracking/reproducibility_info.json")
with repro_file.open('w') as f:
    json.dump(reproducibility_info, f, indent=2)

print("📋 Reproducibility Information:")
print(f"  Python: {sys.version.split()[0]}")
print(f"  PyTorch: {torch.__version__}")
print(f"  Device: {active_device}")
print("  Seeds: All set to 42")
print(f"  System: {sys.platform}, {cpu_count} CPUs, {total_memory_gb:.1f}GB RAM")
print(f"\n✅ Reproducibility info saved to: {repro_file}")

# Create experiment reproduction script
# The literal below is the full source of a standalone script. It sets the
# same seeds as this notebook (42), trains one model configuration through
# create_generator(...).train_model(...) with W&B disabled, and takes the
# configuration ID plus an optional --dataset path on the command line.
# NOTE(review): the script uses paths relative to its own working directory
# ('src', 'data/...', 'models/...'), so it presumably has to be launched from
# the directory it is written to — confirm.
repro_script = """
#!/usr/bin/env python3
# Auto-generated reproduction script

import sys
import torch
import numpy as np
import random

# Set seeds for reproducibility
torch.manual_seed(42)
np.random.seed(42)
random.seed(42)

# Add src to path
sys.path.append('src')

from domain_generator.models.jupyter_compatible import create_generator

def reproduce_experiment(model_config_id, dataset_path="data/processed/training_dataset.json"):
    \"\"\"Reproduce a training experiment\"\"\" 
    print(f"Reproducing experiment for {model_config_id}")
    
    generator = create_generator(model_config_id)
    model_path = generator.train_model(
        dataset_path=dataset_path,
        output_dir=f"models/reproduced_{model_config_id}",
        use_wandb=False  # Disable W&B for reproduction
    )
    
    print(f"Model saved to: {model_path}")
    return model_path

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(description="Reproduce training experiment")
    parser.add_argument("model_config", help="Model configuration ID")
    parser.add_argument("--dataset", default="data/processed/training_dataset.json", 
                       help="Path to training dataset")
    
    args = parser.parse_args()
    reproduce_experiment(args.model_config, args.dataset)
"""

# Write the script one directory above the notebook's working directory.
# strip() drops the newline that follows the opening quotes, so the shebang
# becomes the first line of the file (the trailing newline is removed too).
repro_script_file = Path("../reproduce_experiment.py")
with open(repro_script_file, 'w') as f:
    f.write(repro_script.strip())

# Tell the user where the script is and how to invoke it
print(f"🔄 Reproduction script created: {repro_script_file}")
print("\nTo reproduce an experiment, run:")
print("  python reproduce_experiment.py <model_config_id>")
print("\nExample:")
print("  python reproduce_experiment.py distilgpt2")

## Summary

This notebook provides:

1. **Model Version Tracking**: Complete experiment tracking with IDs, configs, and results
2. **Reproducible Setup**: Fixed random seeds and documented environment
3. **M1 Optimization**: Validated MPS acceleration and memory efficiency
4. **Training Pipeline**: Automated training with progress tracking
5. **Performance Analysis**: Model comparison and benchmarking
6. **Reproducibility Tools**: Scripts and configs for experiment reproduction

### Key Features:
- ✅ All models optimized for M1 with <4GB memory usage
- ✅ Comprehensive experiment tracking and versioning
- ✅ Reproducible results with fixed seeds
- ✅ Performance validation and benchmarking
- ✅ Easy reproduction scripts

### Next Steps:
1. Run actual training experiments (set `DRY_RUN = False`)
2. Proceed to model evaluation notebook
3. Analyze results and iterate on improvements