# Model Comparison: Test Different Model Sizes and Architectures

Models to test:

- Qwen 2.5 Instruct: 1.5B vs 7B vs 14B
- Llama 3 8B Instruct
- Mistral 7B Instruct v0.3

In [None]:
# Cell 1: Install Dependencies
!pip install unsloth transformers datasets trl google-generativeai sentence-transformers scikit-learn pandas matplotlib -q
print("‚úÖ Dependencies installed")

In [None]:
# Cell 2: Imports
import sys
from pathlib import Path
import time
import torch
import gc
import pandas as pd
import matplotlib.pyplot as plt

# Add the project root and its scripts/ directory to the import path.
# The project root is what makes the package-qualified import below
# (`scripts.utilities...`) resolvable; scripts/ itself stays on the path as
# before. Entries that are already present are skipped so that re-running this
# cell does not pile up duplicate sys.path entries.
project_root = Path.cwd()
scripts_dir = project_root / "scripts"
if scripts_dir.exists():
    for import_dir in (project_root, scripts_dir):
        import_dir_str = str(import_dir)
        if import_dir_str not in sys.path:
            sys.path.insert(0, import_dir_str)

from unsloth import FastLanguageModel
from scripts.utilities.data_loader import load_people_data

print("✅ Modules imported")

## Test Configuration

In [None]:
# Test Configuration
# One entry per model under comparison:
#   name - human-readable label used in printed output
#   path - model identifier (Hugging Face Hub repo id) used to load the model
#   rank - LoRA rank to use for that model
MODELS_TO_TEST = [
    {"name": "Qwen 1.5B", "path": "Qwen/Qwen2.5-1.5B-Instruct", "rank": 16},
    {"name": "Qwen 7B", "path": "Qwen/Qwen2.5-7B-Instruct", "rank": 16},
    {"name": "Qwen 14B", "path": "Qwen/Qwen2.5-14B-Instruct", "rank": 32},
    # The Llama 3 (non-3.1) instruct checkpoint is published under the
    # "Meta-Llama-3-..." repo name.
    # NOTE(review): this repo is gated - confirm the runtime has an
    # authorised Hugging Face token before running the comparison.
    {"name": "Llama 3 8B", "path": "meta-llama/Meta-Llama-3-8B-Instruct", "rank": 16},
    {"name": "Mistral 7B", "path": "mistralai/Mistral-7B-Instruct-v0.3", "rank": 16},
]

print(f"✅ Configured {len(MODELS_TO_TEST)} models to test")
# Loop variable is deliberately not called `model`, so it cannot be confused
# with (or overwrite) a loaded model object held in the notebook namespace.
for model_config in MODELS_TO_TEST:
    print(f"   • {model_config['name']}: {model_config['path']} (rank={model_config['rank']})")

## Run Model Comparison Tests

In [None]:
# Run Model Comparison Tests
# Accumulates one result dict per tested model; the Results Analysis cell
# builds its DataFrame from this list and skips itself while it is empty.
results_comparison = []

# Load people data
PEOPLE = load_people_data("configs/people_data.yaml")

# Load training data (would need to generate first)
# training_data = load_training_data("training_data.jsonl")

print("⚠️ Template code - fill in training and evaluation functions")
print("\nFor each model:")
print("  1. Load model with LoRA")
print("  2. Train on same dataset")
print("  3. Evaluate on all test types")
print("  4. Record metrics (scores, time, memory)")
print("  5. Free memory before next model")

# Example structure (kept commented out until the training and evaluation
# code is filled in):
# for model_config in MODELS_TO_TEST:
#     print(f"\n{'='*70}")
#     print(f"TESTING: {model_config['name']}")
#     print("="*70)
#
#     # Load model
#     model, tokenizer = FastLanguageModel.from_pretrained(
#         model_name=model_config['path'],
#         max_seq_length=2048,
#         dtype=None,
#         load_in_4bit=True,
#     )
#     # LoRA alpha is set to twice the configured rank.
#     model = FastLanguageModel.get_peft_model(
#         model, r=model_config['rank'], lora_alpha=model_config['rank']*2,
#         target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
#         lora_dropout=0, bias="none", use_gradient_checkpointing="unsloth",
#     )
#
#     # Train and evaluate
#     train_start = time.time()
#     # ... training code ...
#     train_time = time.time() - train_start
#
#     eval_start = time.time()
#     # ... evaluation code ...
#     # TODO: `scores` must be assigned by the evaluation code; the record
#     # below reads the keys "single_q_avg", "conv_avg", "correction_avg",
#     # "extended_avg" and "overall_avg" from it.
#     scores = {}
#     eval_time = time.time() - eval_start
#
#     # Record results
#     # NOTE(review): memory_allocated() reports what is allocated at this
#     # point, not the peak reached during training; consider
#     # torch.cuda.max_memory_allocated() - confirm which figure is wanted.
#     results_comparison.append({
#         "Model": model_config['name'],
#         "Parameters": model_config['path'].split('/')[-1],
#         "LoRA Rank": model_config['rank'],
#         "Train Time (s)": train_time,
#         "Eval Time (s)": eval_time,
#         "Single Q": scores.get("single_q_avg", 0),
#         "Conversation": scores.get("conv_avg", 0),
#         "Correction": scores.get("correction_avg", 0),
#         "Extended": scores.get("extended_avg", 0),
#         "Overall": scores.get("overall_avg", 0),
#         "Memory (GB)": torch.cuda.memory_allocated() / 1e9 if torch.cuda.is_available() else 0
#     })
#
#     # Free memory: drop the references, collect garbage first so the
#     # tensors are actually released, then return cached blocks to the GPU.
#     del model, tokenizer
#     gc.collect()
#     torch.cuda.empty_cache()

## Results Analysis

In [None]:
# Results Analysis
# Tabulates results_comparison, saves it to CSV, draws a 2x3 grid of charts
# and prints simple recommendations. Does nothing but print a hint while
# results_comparison is empty.
if results_comparison:
    df_comparison = pd.DataFrame(results_comparison)

    print("\n" + "="*70)
    print("📊 MODEL COMPARISON RESULTS")
    print("="*70)
    print(df_comparison.to_string(index=False))

    # Save to CSV
    df_comparison.to_csv("model_comparison_results.csv", index=False)
    print("\n✅ Results saved to model_comparison_results.csv")

    # Visualizations
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Plot 1: Overall scores, with the 75% target drawn as a reference line
    overall_ax = axes[0, 0]
    df_comparison.plot(x="Model", y="Overall", kind="bar", ax=overall_ax, title="Overall Score", color="steelblue")
    overall_ax.axhline(y=0.75, color='r', linestyle='--', label='Target (75%)')
    overall_ax.legend()
    overall_ax.set_ylabel("Score")

    # Plot 2: Test breakdown
    df_comparison.plot(x="Model", y=["Single Q", "Conversation", "Correction", "Extended"], kind="bar", ax=axes[0, 1], title="Score by Test Type")
    axes[0, 1].set_ylabel("Score")

    # Plots 3-5: one single-metric bar chart each
    # (axis, column, title, bar colour, y-axis label)
    single_metric_charts = [
        (axes[0, 2], "Train Time (s)", "Training Time", "orange", "Seconds"),
        (axes[1, 0], "Eval Time (s)", "Evaluation Time", "green", "Seconds"),
        (axes[1, 1], "Memory (GB)", "GPU Memory Usage", "purple", "GB"),
    ]
    for chart_ax, column, title, color, ylabel in single_metric_charts:
        df_comparison.plot(x="Model", y=column, kind="bar", ax=chart_ax, title=title, color=color)
        chart_ax.set_ylabel(ylabel)

    # Plot 6: Score vs Memory tradeoff
    tradeoff_ax = axes[1, 2]
    tradeoff_ax.scatter(df_comparison["Memory (GB)"], df_comparison["Overall"])
    # Iterate the columns in lockstep instead of indexing by integer label,
    # so the labels stay correct whatever index the DataFrame carries.
    for model_name, memory_gb, overall_score in zip(
        df_comparison["Model"], df_comparison["Memory (GB)"], df_comparison["Overall"]
    ):
        tradeoff_ax.annotate(model_name, (memory_gb, overall_score))
    tradeoff_ax.set_xlabel("Memory (GB)")
    tradeoff_ax.set_ylabel("Overall Score")
    tradeoff_ax.set_title("Score vs Memory Tradeoff")
    tradeoff_ax.grid(True)

    plt.tight_layout()
    plt.savefig("model_comparison_charts.png", dpi=150)
    print("✅ Charts saved to model_comparison_charts.png")

    # Recommendations
    print("\n" + "="*70)
    print("🏆 RECOMMENDATIONS")
    print("="*70)

    # Best overall
    best_overall = df_comparison.loc[df_comparison["Overall"].idxmax()]
    print("\n✅ Best overall performance:")
    print(f"   {best_overall['Model']}: {best_overall['Overall']:.1%}")

    # Best efficiency (score per GB)
    # Memory is recorded as 0 when no GPU is available; mask non-positive
    # values to NaN so the division cannot yield inf and a meaningless winner.
    measured_memory = df_comparison["Memory (GB)"]
    df_comparison["Efficiency"] = df_comparison["Overall"] / measured_memory.where(measured_memory > 0)
    if df_comparison["Efficiency"].notna().any():
        best_efficiency = df_comparison.loc[df_comparison["Efficiency"].idxmax()]
        print("\n✅ Best efficiency (score/GB):")
        print(f"   {best_efficiency['Model']}: {best_efficiency['Efficiency']:.3f}")
    else:
        print("\n⚠️ Efficiency skipped - no positive GPU memory measurements recorded")

    # Fastest
    fastest = df_comparison.loc[df_comparison["Train Time (s)"].idxmin()]
    print("\n✅ Fastest training:")
    print(f"   {fastest['Model']}: {fastest['Train Time (s)']:.0f}s")
else:
    print("⚠️ No results yet - run the model comparison tests above")

## Summary

In [None]:
# Summary
# Closing banner: restates what the notebook compares and reminds the reader
# that the training and evaluation code still has to be filled in.
print("\n" + "="*70)
print("🏁 MODEL COMPARISON COMPLETE")
print("="*70)

print("\n💡 This notebook compares different model architectures and sizes:")
print("   • Different model families (Qwen, Llama, Mistral)")
print("   • Different model sizes (1.5B, 7B, 8B, 14B)")
print("   • Performance vs efficiency tradeoffs")
print("   • Memory usage analysis")

print("\n📊 Fill in the training and evaluation code to run actual comparisons")
print("   Results will help identify the best model for your use case")