# ML Model Factory - Complete Pipeline

**All-in-one notebook for training ML models on OHLCV data**

## What This Notebook Does
1. **Setup** - Mount Drive, install dependencies, configure environment
2. **Data Pipeline** - Process raw OHLCV → features → labels → splits
3. **Train Models** - Train tabular models (XGBoost, LightGBM, CatBoost)
4. **Evaluate** - Test set evaluation with trading metrics
5. **Save** - Persist models to Google Drive

## Expected Runtime
- Data Pipeline: 30-60 minutes
- Model Training: 20-40 minutes per model
- Total: ~2 hours for full pipeline + 3 models

## Requirements
- Google Colab (free tier works, Pro recommended)
- Raw OHLCV data in Google Drive
- ~10GB free disk space

---
# Part 1: Environment Setup
---

In [None]:
# Clone repository and install dependencies
!git clone https://github.com/Snehpatel101/Research.git /content/research 2>/dev/null || echo "Repo already cloned"
!cd /content/research && pip install -q -r requirements.txt

In [None]:
# Prepare the Colab runtime: make the cloned repo importable, then call the
# project's setup helper with Drive mounting and GPU use requested.
import sys

sys.path.insert(0, '/content/research')

from notebooks.colab_setup import setup_colab_environment

env_info = setup_colab_environment(mount_drive=True, use_gpu=True)

# Summarise what the helper reported; keys absent from env_info print as False
print(f"\nüìä Environment Info:")
print(f"  Running in Colab: {env_info.get('is_colab', False)}")
print(f"  GPU available: {env_info.get('gpu_available', False)}")
print(f"  Drive mounted: {env_info.get('drive_mounted', False)}")

# Free space under /content, reported in whole GiB
import shutil

disk = shutil.disk_usage("/content")
total, used, free = disk
print(f"\nüíæ Disk Space: {free // (1024**3)} GB free")

In [None]:
# User configuration -- EDIT THESE VALUES before running the rest of the notebook
from pathlib import Path

# What to train on
SYMBOL = "MES"  # Symbol to train on (MES, MGC, etc.)
HORIZON = 20    # Label horizon (5, 10, 15, 20)

# Locations on Google Drive
DRIVE_BASE = Path("/content/drive/MyDrive/ml_factory")
DRIVE_DATA = DRIVE_BASE.joinpath("data")
DRIVE_MODELS = DRIVE_BASE.joinpath("models")
DRIVE_CHECKPOINTS = DRIVE_BASE.joinpath("checkpoints")

# Local working directories
LOCAL_DATA = Path("/content/data")
LOCAL_OUTPUT = Path("/content/output")

# Make sure every directory listed here exists (parents included)
required_dirs = (
    DRIVE_DATA / "raw",
    DRIVE_DATA / "processed",
    DRIVE_MODELS,
    DRIVE_CHECKPOINTS,
    LOCAL_DATA / "raw",
    LOCAL_OUTPUT,
)
for directory in required_dirs:
    directory.mkdir(parents=True, exist_ok=True)

# Echo the active settings
print(f"‚úÖ Configuration:")
print(f"   Symbol: {SYMBOL}")
print(f"   Horizon: {HORIZON}")
print(f"   Drive base: {DRIVE_BASE}")

---
# Part 2: Data Pipeline (Phases 1-5)
---

**Phases:**
1. Ingestion: Load raw 1-min OHLCV
2. MTF Upscaling: Resample to 8 intraday timeframes
3. Features: 180+ indicators (momentum, wavelets, microstructure)
4. Labeling: Triple-barrier with Optuna optimization
5. Adapters: Model-family data preparation (2D, 3D)

In [None]:
import shutil  # imported here too so this cell does not depend on an earlier cell's import

import pandas as pd

# Stage the raw 1-minute file on the local disk (faster I/O than reading from Drive)
raw_drive = DRIVE_DATA / "raw" / f"{SYMBOL}_1m.parquet"
raw_local = LOCAL_DATA / "raw" / f"{SYMBOL}_1m.parquet"

if not raw_local.exists():
    # Fail early with an actionable message when the source file is missing
    if not raw_drive.exists():
        raise FileNotFoundError(
            f"Raw data not found: {raw_drive}. "
            f"Upload {raw_drive.name} to {raw_drive.parent} and re-run this cell."
        )

    print("Copying data from Drive...")
    # Copy under a temporary name and rename afterwards: an interrupted copy must
    # not leave a truncated file that the exists() check above would accept next time.
    partial_copy = raw_local.with_name(raw_local.name + ".partial")
    shutil.copy(raw_drive, partial_copy)
    partial_copy.replace(raw_local)
    print(f"‚úÖ Copied to {raw_local}")

# Verify data: load the local copy and show its size, index range and first rows
df_raw = pd.read_parquet(raw_local)
print(f"\nüìà Raw data: {df_raw.shape[0]:,} rows, {df_raw.shape[1]} columns")
# NOTE(review): the range below is taken from the index; this presumes the
# parquet file is indexed by timestamp -- confirm against the data.
print(f"   Date range: {df_raw.index.min()} to {df_raw.index.max()}")
print(df_raw.head())

In [None]:
# Report whether a pipeline state file is already present in the output directory
import json

checkpoint_file = LOCAL_OUTPUT / "pipeline_state.json"
if not checkpoint_file.exists():
    print("üÜï No checkpoint found - will run full pipeline")
else:
    with open(checkpoint_file) as state_fh:
        checkpoint = json.load(state_fh)
    # Missing keys fall back to 'unknown' / an empty list
    print(f"‚úÖ Found checkpoint from: {checkpoint.get('timestamp', 'unknown')}")
    print(f"   Completed stages: {checkpoint.get('completed_stages', [])}")

In [None]:
# Execute the data pipeline for the configured symbol
from src.pipeline.runner import PipelineRunner
from src.pipeline.config import PipelineConfig

config = PipelineConfig(symbols=[SYMBOL], data_dir=LOCAL_DATA, output_dir=LOCAL_OUTPUT)

# NOTE(review): resume=True presumably continues from the saved pipeline state -- confirm in PipelineRunner
runner = PipelineRunner(config, resume=True)

try:
    print("\nüöÄ Running data pipeline...")
    success = runner.run()

    # runner.run() returns a truthy value on a clean run
    print(
        "\n‚úÖ Pipeline completed successfully!"
        if success
        else "\n‚ö†Ô∏è Pipeline completed with some issues"
    )

except Exception as e:
    # Show a short message, then re-raise so the cell still fails visibly
    print(f"\n‚ùå Pipeline failed: {e}")
    raise

In [None]:
# Mirror the locally processed splits onto Drive (permanent storage)
processed_drive = DRIVE_DATA / "processed" / SYMBOL
processed_local = LOCAL_DATA / "splits" / "scaled"

print("Copying processed data to Drive...")
# dirs_exist_ok=True lets a re-run copy into an already existing target folder
shutil.copytree(src=processed_local, dst=processed_drive, dirs_exist_ok=True)
print(f"‚úÖ Saved to {processed_drive}")

---
# Part 3: Load Processed Data
---

In [None]:
# Build the data container from the processed splits
from src.phase1.stages.datasets.container import TimeSeriesDataContainer

local_scaled = LOCAL_DATA / "splits" / "scaled"
if not local_scaled.exists():
    # Nothing on the local disk yet: pull the copy saved on Drive first
    shutil.copytree(DRIVE_DATA / "processed" / SYMBOL, local_scaled, dirs_exist_ok=True)

# Loading always happens from the local directory
data_dir = local_scaled

container = TimeSeriesDataContainer.load(data_dir, horizon=HORIZON)
print(f"\nüìä Loaded data container:")
print(f"   {container}")

---
# Part 4: Train Models
---

In [None]:
# Models the training cell will fit, in this order (append "catboost" if installed)
MODELS_TO_TRAIN = [
    "xgboost",
    "lightgbm",
]

print(f"üéØ Will train: {MODELS_TO_TRAIN}")

In [None]:
# NOTE(review): ModelRegistry is not referenced in this cell; the import is kept
# in case it registers the model classes as a side effect -- confirm.
from src.models import ModelRegistry
from src.models.trainer import Trainer
from datetime import datetime

# model name -> Trainer, for every model that trained successfully
trained_models = {}
# model name -> validation metrics, or {"error": ...} when training failed
results = {}

for model_name in MODELS_TO_TRAIN:
    print(f"\n{'='*60}")
    print(f"Training: {model_name.upper()}")
    print(f"{'='*60}")

    # Training failures are recorded and the loop moves on to the next model
    try:
        # Initialize trainer
        trainer = Trainer(
            model_name=model_name,
            horizon=HORIZON,
            output_dir=LOCAL_OUTPUT / "runs",
        )

        # Train
        metrics = trainer.train(container)

        model_metrics = {
            "val_accuracy": metrics.val_accuracy,
            "val_f1": metrics.val_f1,
            "train_time": metrics.training_time,
        }
    except Exception as e:
        print(f"‚ùå Failed to train {model_name}: {e}")
        results[model_name] = {"error": str(e)}
        continue

    # Store results
    trained_models[model_name] = trainer
    results[model_name] = model_metrics

    # Save to Drive immediately. This is handled separately from training so that
    # a failed save cannot replace the validation metrics with an "error" entry
    # while the trained model is still available for evaluation.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    model_path = DRIVE_MODELS / f"{model_name}_{SYMBOL}_h{HORIZON}_{timestamp}.pkl"
    try:
        trainer.model.save(model_path)
    except Exception as e:
        print(f"Failed to save {model_name} to {model_path}: {e}")
        results[model_name]["save_error"] = str(e)
    else:
        print(f"üíæ Saved to {model_path}")

In [None]:
# Print one summary entry per model: validation metrics, or the recorded error
divider = "="*60
print("\n" + divider)
print("TRAINING SUMMARY")
print(divider)

for model_name, metrics in results.items():
    # Entries carrying an "error" key belong to models that did not train
    if "error" not in metrics:
        print(f"\n{model_name}:")
        print(f"  Val Accuracy: {metrics['val_accuracy']:.4f}")
        print(f"  Val F1: {metrics['val_f1']:.4f}")
        print(f"  Train Time: {metrics['train_time']:.1f}s")
        continue

    print(f"\n{model_name}: ‚ùå FAILED - {metrics['error']}")

---
# Part 5: Test Set Evaluation
---

⚠️ **WARNING**: Test set evaluation is ONE-SHOT. Do not iterate on these results.

In [None]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Arrays for the "test" split: model inputs and the labels to score against
X_test, y_test = container.get_arrays("test")

print("‚ö†Ô∏è  TEST SET EVALUATION - ONE-SHOT GENERALIZATION ESTIMATE")
print("   Do NOT iterate on these results!")
print("="*60)

# model name -> {"accuracy": ..., "f1": ...} measured on the test split
test_results = {}

for model_name, trainer in trained_models.items():
    print(f"\n{model_name.upper()}:")

    # Class predictions of this model for the test inputs
    pred_output = trainer.model.predict(X_test)
    y_pred = pred_output.class_predictions

    # Accuracy and macro-averaged F1
    acc, f1 = (
        accuracy_score(y_test, y_pred),
        f1_score(y_test, y_pred, average='macro'),
    )
    test_results[model_name] = {"accuracy": acc, "f1": f1}

    print(f"  Test Accuracy: {acc:.4f}")
    print(f"  Test F1 (macro): {f1:.4f}")

In [None]:
# Detailed classification report for the model with the highest test macro-F1
from sklearn.metrics import classification_report
from sklearn.utils.multiclass import unique_labels

# Display names, applied positionally to the sorted label values
CLASS_NAMES = ['Short (-1)', 'Neutral (0)', 'Long (1)']

if test_results:
    best_model = max(test_results, key=lambda x: test_results[x]['f1'])
    print(f"\nüìä Detailed Report for Best Model: {best_model.upper()}")
    print("="*60)

    pred_output = trained_models[best_model].model.predict(X_test)
    y_pred = pred_output.class_predictions

    # classification_report raises ValueError when the number of target_names
    # differs from the number of labels found in y_test/y_pred, which happens
    # whenever a class is absent from both. Only pass the names when all
    # classes are present; otherwise fall back to the raw label values.
    present_labels = unique_labels(y_test, y_pred)
    if len(present_labels) == len(CLASS_NAMES):
        report = classification_report(y_test, y_pred, target_names=CLASS_NAMES)
    else:
        print(
            f"Note: {len(present_labels)} of {len(CLASS_NAMES)} classes present "
            f"({list(present_labels)}); reporting raw label values."
        )
        report = classification_report(y_test, y_pred)

    print(report)

---
# Part 6: Save Final Results
---

In [None]:
import json
from datetime import datetime


def _json_default(value):
    """Convert a value json cannot encode natively.

    Objects exposing ``tolist()`` (for example NumPy scalars and arrays) are
    converted to plain Python numbers/lists so metrics stay numeric in the
    file; anything else falls back to its string form.
    """
    to_list = getattr(value, "tolist", None)
    if callable(to_list):
        return to_list()
    return str(value)


# One timestamp for both the summary body and the file name, so they always agree
run_time = datetime.now()

# Save summary to Drive
summary = {
    "symbol": SYMBOL,
    "horizon": HORIZON,
    "timestamp": run_time.isoformat(),
    "validation_results": results,
    "test_results": test_results,
}

summary_path = DRIVE_BASE / f"run_summary_{SYMBOL}_h{HORIZON}_{run_time.strftime('%Y%m%d_%H%M%S')}.json"
with open(summary_path, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, default=_json_default)

print(f"‚úÖ Summary saved to {summary_path}")

In [None]:
# Closing banner: where the artifacts were written and what to do next
divider = "="*60
print("\n" + divider)
print("üéâ ALL DONE!")
print(divider)

# Output locations
print(f"\nüìÅ Models saved to: {DRIVE_MODELS}")
print(f"üìÅ Processed data: {DRIVE_DATA / 'processed' / SYMBOL}")
print(f"üìÅ Run summary: {summary_path}")

# Suggested follow-up work
next_steps = (
    "   1. Train neural models (LSTM, TCN, Transformer)",
    "   2. Build heterogeneous ensemble",
    "   3. Run walk-forward validation",
)
print("\nüöÄ Next steps:")
for step in next_steps:
    print(step)