# ML Model Factory - Complete Pipeline

**All-in-one notebook for training ML models on OHLCV data**

## What This Notebook Does
1. **Setup** - Clone repo, install dependencies, mount Drive
2. **Load Data** - Verify and load processed datasets from Drive
3. **Train Models** - Train tabular models (XGBoost, LightGBM)
4. **Evaluate** - Test set evaluation with trading metrics
5. **Save** - Persist trained models and a JSON run summary to Google Drive

## Prerequisites
- Processed data in Google Drive at `ml_factory/data/processed/{SYMBOL}/`
- Expected files: `train_scaled.parquet`, `val_scaled.parquet`, `test_scaled.parquet`

## Expected Runtime
- Model Training: 5-20 minutes per model
- Total: ~30 minutes for 2 models

---
# Part 1: Environment Setup
---

In [None]:
# Clone repository
!git clone https://github.com/Snehpatel101/Research.git /content/research 2>/dev/null || echo "Repo already exists"
%cd /content/research

In [None]:
# Install dependencies
# Installs the packages listed in requirements.txt. The path is relative, so
# this must run after the cell that changes directory into /content/research.
# `-q` asks pip for quieter output.
!pip install -q -r requirements.txt

In [None]:
# Prepare the Colab runtime: make the cloned repo importable, then let the
# project's setup helper mount Drive and configure the GPU.
import sys
sys.path.insert(0, '/content/research')

from notebooks.colab_setup import setup_colab_environment

env_info = setup_colab_environment(use_gpu=True, mount_drive=True)

# Report what the helper says is available; a missing key is shown as False.
print("\nEnvironment ready!")
for label, key in (("GPU", "gpu_available"), ("Drive", "drive_mounted")):
    print(f"  {label}: {env_info.get(key, False)}")

In [None]:
# Run configuration - EDIT THESE VALUES
SYMBOL = "MES"      # Instrument symbol (MES, MGC, etc.)
HORIZON = 20        # Label horizon (5, 10, 15, 20)

# Locations (change these if your data lives somewhere else)
DATA_PATH = "/content/drive/MyDrive/ml_factory/data/processed/{}".format(SYMBOL)
OUTPUT_DIR = "/content/experiments"
MODELS_DIR = "/content/drive/MyDrive/ml_factory/models"

# Echo the active settings so the run is self-describing in the cell output.
print("Config: {} @ horizon {}".format(SYMBOL, HORIZON))
print("Data: {}".format(DATA_PATH))

---
# Part 2: Verify Data
---

In [None]:
import os
from pathlib import Path

# Confirm that every expected split file is present under DATA_PATH.
data_path = Path(DATA_PATH)
required_files = ["train_scaled.parquet", "val_scaled.parquet", "test_scaled.parquet"]

print("Checking data files...")
all_exist = True
for filename in required_files:
    found = (data_path / filename).exists()
    print(f"  {filename}: {'OK' if found else 'MISSING'}")
    # Stays False once any file is missing.
    all_exist = all_exist and found

if all_exist:
    print("\nAll data files found!")
else:
    # Only reports the problem; execution of later cells is not blocked here.
    print("\nERROR: Missing data files!")
    print(f"Please ensure processed data exists at: {DATA_PATH}")
    print("Run the data pipeline first or upload processed data to Drive.")

In [None]:
# Build the dataset container from the processed parquet directory and show it.
from src.phase1.stages.datasets.container import TimeSeriesDataContainer

container = TimeSeriesDataContainer.from_parquet_dir(
    DATA_PATH,
    exclude_invalid_labels=True,
    horizon=HORIZON,
)

# The container's own string form is printed as a quick sanity check.
print("Loaded:", container)

---
# Part 3: Train Models
---

In [None]:
# Names of the models the training loop will iterate over, in order.
MODELS_TO_TRAIN = [
    "xgboost",
    "lightgbm",
]

print("Will train: {}".format(MODELS_TO_TRAIN))

In [None]:
from notebooks.colab_setup import get_trainer_for_colab
from datetime import datetime
from pathlib import Path
import traceback

# Create output directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

trained_models = {}   # model name -> trainer, for models that trained successfully
results = {}          # model name -> training result, or {"error": message} on failure
save_errors = {}      # model name -> message, for models that trained but failed to save

for model_name in MODELS_TO_TRAIN:
    print(f"\n{'='*60}")
    print(f"Training: {model_name.upper()}")
    print(f"{'='*60}")

    # Training and saving are guarded separately. Previously one `try` covered
    # both, so a failed save to Drive replaced a successful training result with
    # {"error": ...} even though the trainer was already in `trained_models`.
    try:
        trainer, result = get_trainer_for_colab(
            model_name=model_name,
            horizon=HORIZON,
            data_path=DATA_PATH,
            output_dir=OUTPUT_DIR,
        )
    except Exception as e:
        # Keep going so one failing model does not block the others, but show
        # the full traceback so the failure can actually be diagnosed.
        print(f"ERROR: {e}")
        traceback.print_exc()
        results[model_name] = {"error": str(e)}
        continue

    trained_models[model_name] = trainer
    results[model_name] = result

    # Save to Drive under a timestamped name so repeated runs do not collide.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = Path(MODELS_DIR) / f"{model_name}_{SYMBOL}_h{HORIZON}_{timestamp}"
    try:
        trainer.model.save(model_path)
    except Exception as e:
        # The model trained successfully; record the save problem on its own
        # instead of discarding the training result.
        print(f"ERROR: failed to save {model_name} to {model_path}: {e}")
        traceback.print_exc()
        save_errors[model_name] = str(e)
    else:
        print(f"Saved to: {model_path}")

In [None]:
# Print a per-model recap of the training run.
separator = "=" * 60
print("\n" + separator)
print("TRAINING SUMMARY")
print(separator)

for model_name, result in results.items():
    # Failed runs carry an "error" entry instead of metrics.
    if "error" in result:
        print(f"\n{model_name}: FAILED - {result['error']}")
        continue

    # Metrics missing from the result are displayed as 0.
    metrics = result.get('evaluation_metrics', {})
    print(f"\n{model_name}:")
    print(f"  Val Accuracy: {metrics.get('accuracy', 0):.4f}")
    print(f"  Val F1: {metrics.get('macro_f1', 0):.4f}")

---
# Part 4: Test Set Evaluation (One-Shot)
---

**WARNING**: Test set evaluation is ONE-SHOT. Do not tune models, features, or hyperparameters based on these results; doing so leaks test information into model selection.

In [None]:
# Display the test metrics stored in each training result.
# Nothing is recomputed here; the values are read from `results`.

print("TEST SET RESULTS (ONE-SHOT - DO NOT ITERATE)")
print("=" * 60)

for model_name, result in results.items():
    # Skip models whose training failed.
    if "error" in result:
        continue

    # Skip models that have no (or empty) test metrics.
    test_metrics = result.get('test_metrics', {})
    if not test_metrics:
        continue

    print(f"\n{model_name.upper()}:")
    print(f"  Test Accuracy: {test_metrics.get('accuracy', 0):.4f}")
    print(f"  Test F1: {test_metrics.get('macro_f1', 0):.4f}")

---
# Part 5: Save Summary
---

In [None]:
import json
from datetime import datetime
from pathlib import Path

# Save summary to Drive.
# A single timestamp is captured so the value stored in the JSON and the one in
# the file name always refer to the same instant (two separate now() calls could
# straddle a second boundary).
run_time = datetime.now()

summary = {
    "symbol": SYMBOL,
    "horizon": HORIZON,
    "timestamp": run_time.isoformat(),
    "models": list(results.keys()),
    # Each result is stored as its str() form, not as structured JSON.
    "results": {k: str(v) for k, v in results.items()},
}

summary_path = Path(MODELS_DIR) / f"summary_{SYMBOL}_h{HORIZON}_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
# Explicit encoding keeps the file independent of the runtime's locale default.
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, default=str)

print(f"Summary saved to: {summary_path}")

In [None]:
# Closing banner: where the artifacts went and what to do next.
rule = "=" * 60
print("\n" + rule)
print("ALL DONE!")
print(rule)

print(f"\nModels saved to: {MODELS_DIR}")
print(f"Experiment outputs: {OUTPUT_DIR}")

print("\nNext steps:")
next_steps = (
    "Train neural models (lstm, tcn, transformer)",
    "Build ensemble with stacking",
    "Run walk-forward validation",
)
for number, step in enumerate(next_steps, start=1):
    print(f"  {number}. {step}")