# ML Model Factory - Complete Pipeline

**All-in-one notebook for training ML models on OHLCV data**

## Instructions
1. Run cells in order
2. Use the form fields to configure settings (click the fields on the right)
3. Trained models and a results summary will be saved to your Google Drive (the processed data files must already be uploaded there)

---
# Part 1: Setup
---

In [None]:
#@title Clone Repository
!git clone https://github.com/Snehpatel101/Research.git /content/research 2>/dev/null || echo "Repo already exists"
%cd /content/research
!pip install -q -r requirements.txt
print("Setup complete!")

In [None]:
#@title Mount Google Drive & Setup Environment
import sys

# Make the cloned repository importable and keep it first on the search path.
# Existing occurrences are dropped before inserting so that re-running this
# cell does not accumulate duplicate sys.path entries.
_repo_root = '/content/research'
sys.path[:] = [entry for entry in sys.path if entry != _repo_root]
sys.path.insert(0, _repo_root)

from notebooks.colab_setup import setup_colab_environment

# Project helper, called with Drive mounting and GPU use requested. The
# returned mapping is only read through .get() below, so a missing key is
# reported as False rather than raising.
env_info = setup_colab_environment(mount_drive=True, use_gpu=True)

print("\nEnvironment ready!")
print(f"  GPU: {env_info.get('gpu_available', False)}")
print(f"  Drive: {env_info.get('drive_mounted', False)}")

---
# Part 2: Configuration
---

**Click the form fields on the right to configure your settings!**

In [None]:
#@title Configure Training Settings { run: "auto" }

#@markdown ### Symbol & Data
SYMBOL = "MES" #@param ["MES", "MGC", "ES", "GC", "NQ", "CL"] {allow-input: true}
HORIZON = 20 #@param [5, 10, 15, 20] {type:"integer"}

#@markdown ### Models to Train
train_xgboost = True #@param {type:"boolean"}
train_lightgbm = True #@param {type:"boolean"}
train_catboost = False #@param {type:"boolean"}
train_random_forest = False #@param {type:"boolean"}
train_lstm = False #@param {type:"boolean"}
train_tcn = False #@param {type:"boolean"}

#@markdown ### Paths (usually don't need to change)
data_folder = "ml_factory/data/processed" #@param {type:"string"}
models_folder = "ml_factory/models" #@param {type:"string"}

from pathlib import Path

# Locations derived from the form values: per-symbol input data and the model
# directory live under the mounted Drive, experiment output stays on the
# Colab filesystem.
DATA_PATH = f"/content/drive/MyDrive/{data_folder}/{SYMBOL}"
MODELS_DIR = f"/content/drive/MyDrive/{models_folder}"
OUTPUT_DIR = "/content/experiments"

# Each model identifier paired with its checkbox; the tuple order fixes the
# order in which selected models appear in MODELS_TO_TRAIN.
_model_switches = (
    ("xgboost", train_xgboost),
    ("lightgbm", train_lightgbm),
    ("catboost", train_catboost),
    ("random_forest", train_random_forest),
    ("lstm", train_lstm),
    ("tcn", train_tcn),
)
MODELS_TO_TRAIN = [name for name, enabled in _model_switches if enabled]

# Echo the effective configuration so it is visible in the cell output.
_rule = "=" * 50
print(_rule)
print("CONFIGURATION")
print(_rule)
for _label, _value in (
    ("Symbol", SYMBOL),
    ("Horizon", HORIZON),
    ("Models", MODELS_TO_TRAIN),
    ("Data path", DATA_PATH),
    ("Models path", MODELS_DIR),
):
    print(f"{_label}: {_value}")

---
# Part 3: Verify Data
---

In [None]:
#@title Check Data Files Exist
import os
from pathlib import Path

data_path = Path(DATA_PATH)
required_files = ["train_scaled.parquet", "val_scaled.parquet", "test_scaled.parquet"]

print("Checking data files...")
print(f"Looking in: {DATA_PATH}")
print()

# Probe every expected file once, printing its status as it is checked and
# remembering the outcome for the overall verdict.
presence = {}
for filename in required_files:
    found = (data_path / filename).exists()
    presence[filename] = found
    label = "OK" if found else "MISSING"
    print(f"  {filename}: {label}")

all_exist = all(presence.values())

if all_exist:
    print("\nAll data files found!")
else:
    # Tell the user where the files are expected and list all required names.
    print("\n" + "!" * 50)
    print("ERROR: Missing data files!")
    print("!" * 50)
    print("\nPlease upload your processed data to:")
    print(f"  Google Drive > {data_folder}/{SYMBOL}/")
    print("\nRequired files:")
    for filename in required_files:
        print(f"  - {filename}")

In [None]:
#@title Load Data
from src.phase1.stages.datasets.container import TimeSeriesDataContainer

# Report the directory being read before loading from it.
print(f"Loading data from {DATA_PATH}...")

# Build the container from the parquet directory for the configured horizon,
# asking the loader to exclude invalid labels.
container = TimeSeriesDataContainer.from_parquet_dir(DATA_PATH, horizon=HORIZON, exclude_invalid_labels=True)

# Show the container's own string representation as a load confirmation.
print(f"\nLoaded: {container}")

---
# Part 4: Train Models
---

In [None]:
#@title Train Selected Models
from notebooks.colab_setup import get_trainer_for_colab
from datetime import datetime
from pathlib import Path

# Create output directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

# Bind the result containers before the selection check. The later cells
# iterate `results` unconditionally, so they must exist (and be empty rather
# than stale from a previous run) even when no model is selected.
trained_models = {}
results = {}

if not MODELS_TO_TRAIN:
    print("No models selected! Go back to Configuration and select at least one model.")
else:
    total = len(MODELS_TO_TRAIN)
    for position, model_name in enumerate(MODELS_TO_TRAIN, start=1):
        print(f"\n{'='*60}")
        print(f"[{position}/{total}] Training: {model_name.upper()}")
        print(f"{'='*60}")

        # Deliberately broad handler: one failing model must not abort the
        # remaining ones. The failure is recorded under the "error" key,
        # which the results cells use to mark the model as failed.
        try:
            trainer, result = get_trainer_for_colab(
                model_name=model_name,
                horizon=HORIZON,
                data_path=DATA_PATH,
                output_dir=OUTPUT_DIR,
            )

            trained_models[model_name] = trainer
            results[model_name] = result

            # Save to Drive under a name that encodes model, symbol, horizon
            # and the time of saving.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = Path(MODELS_DIR) / f"{model_name}_{SYMBOL}_h{HORIZON}_{timestamp}"
            trainer.model.save(model_path)
            print(f"Saved to: {model_path}")

        except Exception as e:
            print(f"ERROR: {e}")
            results[model_name] = {"error": str(e)}

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    print("="*60)

---
# Part 5: Results
---

In [None]:
#@title View Results Summary
def _show_metrics(heading, metrics):
    """Print accuracy and macro-F1 from ``metrics`` beneath ``heading``.

    Missing entries are shown as 0, formatted to four decimal places.
    """
    print(f"  {heading}:")
    print(f"    Accuracy: {metrics.get('accuracy', 0):.4f}")
    print(f"    F1 Score: {metrics.get('macro_f1', 0):.4f}")


_rule = "=" * 60
print(_rule)
print("RESULTS SUMMARY")
print(_rule)

for name, outcome in results.items():
    print(f"\n{name.upper()}:")

    # Entries carrying an "error" key are reported as failures.
    if "error" in outcome:
        print("  Status: FAILED")
        print(f"  Error: {outcome['error']}")
        continue

    validation = outcome.get('evaluation_metrics', {})
    held_out = outcome.get('test_metrics', {})

    print("  Status: SUCCESS")
    _show_metrics("Validation", validation)

    # Test metrics are only printed when the result actually contains some.
    if held_out:
        _show_metrics("Test (one-shot)", held_out)

print("\n" + _rule)
print(f"Models saved to: {MODELS_DIR}")
print(_rule)

In [None]:
#@title Save Summary to Drive
import json
from datetime import datetime
# Imported here as well so this cell does not rely on an earlier cell having
# brought Path into the notebook namespace.
from pathlib import Path

# Read the clock once so the "timestamp" field and the filename always agree.
saved_at = datetime.now()

summary = {
    "symbol": SYMBOL,
    "horizon": HORIZON,
    "timestamp": saved_at.isoformat(),
    "models": list(results.keys()),
}

# Add metrics for successful models (entries without an "error" key).
# A missing metric is stored as 0.
for model_name, result in results.items():
    if "error" not in result:
        summary[f"{model_name}_val_f1"] = result.get('evaluation_metrics', {}).get('macro_f1', 0)
        summary[f"{model_name}_test_f1"] = result.get('test_metrics', {}).get('macro_f1', 0)

# The summary is written next to the saved models, named by symbol, horizon
# and save time.
summary_path = Path(MODELS_DIR) / f"summary_{SYMBOL}_h{HORIZON}_{saved_at.strftime('%Y%m%d_%H%M%S')}.json"
with open(summary_path, 'w', encoding="utf-8") as f:
    json.dump(summary, f, indent=2)

print(f"Summary saved to: {summary_path}")
print("\nDone! Check your Google Drive for saved models.")