# ML Model Factory - OHLCV Pipeline

This notebook runs the ensemble price prediction pipeline on Google Colab.

## Prerequisites
1. Upload your raw data files (`MES_1m.parquet`, `MGC_1m.parquet`) to Google Drive
2. Create the folder structure `My Drive/research/data/raw/` and place the files there (Colab mounts it as `/content/drive/MyDrive/research/data/raw/`)

## Pipeline Stages
1. **Data Ingestion** - Load and validate raw OHLCV data
2. **Data Cleaning** - Resample 1min → 5min, handle gaps
3. **Feature Engineering** - Generate 150+ technical indicators
4. **Labeling** - Triple-barrier label generation
5. **Optimization** - Optuna parameter tuning
6. **Splits** - Train/Val/Test with purge/embargo
7. **Scaling** - Train-only feature scaling
8. **Validation** - Data quality checks
9. **Reporting** - Generate summary report

## 1. Environment Setup

In [None]:
# Attach Google Drive to the Colab filesystem and record where the project lives.
import os
from google.colab import drive

drive.mount('/content/drive')

# Every later cell builds its paths from PROJECT_ROOT; the same value is also
# exported as an environment variable.
PROJECT_ROOT = '/content/drive/MyDrive/research'
os.environ.update({'PROJECT_ROOT': PROJECT_ROOT})

print(f"Project root: {PROJECT_ROOT}")

In [None]:
# Clone repository (if not already present)
import os
from pathlib import Path

repo_path = Path('/content/research')
if not repo_path.exists():
    !git clone https://github.com/Snehpatel101/research.git /content/research
else:
    print("Repository already cloned")
    # Pull latest changes
    !cd /content/research && git pull

In [None]:
# Install the package
!pip install -e /content/research --quiet

# Verify installation
import src
print(f"Package version: {src.__version__}")

In [None]:
# Make sure every folder the pipeline reads from or writes to exists under PROJECT_ROOT.
from pathlib import Path

# Paths relative to PROJECT_ROOT, in the order they are created.
relative_dirs = (
    "data/raw",
    "data/clean",
    "data/features",
    "data/labels",
    "data/final",
    "data/splits",
    "runs",
    "results",
    "config",
)
dirs = [f"{PROJECT_ROOT}/{relative}" for relative in relative_dirs]

for d in dirs:
    # exist_ok=True makes the cell safe to re-run
    Path(d).mkdir(parents=True, exist_ok=True)
    print(f"Created: {d}")

## 2. Verify Data Files

In [None]:
# Confirm that a 1-minute raw file (parquet or CSV) is present for every symbol.
from pathlib import Path
import pandas as pd

raw_dir = Path(f"{PROJECT_ROOT}/data/raw")

symbols = ['MES', 'MGC']
missing = []

for symbol in symbols:
    parquet_path = raw_dir / f"{symbol}_1m.parquet"
    csv_path = raw_dir / f"{symbol}_1m.csv"

    # Parquet takes precedence; the CSV is only consulted when no parquet exists.
    if parquet_path.exists():
        df = pd.read_parquet(parquet_path)
        print(f"{symbol}: {len(df):,} rows ({parquet_path.stat().st_size / 1e6:.1f} MB)")
        continue

    if csv_path.exists():
        df = pd.read_csv(csv_path)
        print(f"{symbol}: {len(df):,} rows (CSV)")
        continue

    # Neither format was found for this symbol
    missing.append(symbol)
    print(f"{symbol}: NOT FOUND")

if len(missing) > 0:
    print(f"\n*** Upload missing files: {missing} ***")
    print(f"Expected location: {raw_dir}/")

## 3. Configure Pipeline

In [None]:
from src.phase1.pipeline_config import PipelineConfig, create_default_config
from pathlib import Path

# Collect the settings first so they can be read (and tweaked) in one place,
# then build the configuration from them.
pipeline_settings = dict(
    symbols=['MES', 'MGC'],
    project_root=Path(PROJECT_ROOT),
    start_date=None,  # None for both dates = use all available data
    end_date=None,
    target_timeframe='5min',
    label_horizons=[5, 10, 15, 20],
    train_ratio=0.70,
    val_ratio=0.15,
    test_ratio=0.15,
)
config = create_default_config(**pipeline_settings)

# Echo the values held by the resulting config object
summary_rows = [
    ("Run ID", config.run_id),
    ("Symbols", config.symbols),
    ("Timeframe", config.target_timeframe),
    ("Horizons", config.label_horizons),
    ("Train/Val/Test", f"{config.train_ratio}/{config.val_ratio}/{config.test_ratio}"),
    ("Project Root", config.project_root),
]

print("Pipeline Configuration:")
for field_name, field_value in summary_rows:
    print(f"  {field_name}: {field_value}")

## 4. Run Full Pipeline

In [None]:
from src.pipeline.runner import PipelineRunner

# Build the runner from the configuration created in the previous section
runner = PipelineRunner(config)

separator = "=" * 50

print("Starting pipeline...")
print(separator)

# run() reports overall success as a truthy/falsy value
success = runner.run()

print(separator)
print(
    "Pipeline completed successfully!"
    if success
    else "Pipeline failed. Check logs above for errors."
)

## 5. View Results

In [None]:
# Check output files
from pathlib import Path
import os

def _format_size(size):
    """Return a short human-readable size string using decimal (SI) units."""
    for threshold, unit in ((1e9, "GB"), (1e6, "MB"), (1e3, "KB")):
        # `>=` so that exactly 1000 bytes reads "1.0KB" rather than "1000B"
        if size >= threshold:
            return f"{size / threshold:.1f}{unit}"
    return f"{size}B"


def show_directory_tree(path, prefix="", max_depth=3, current_depth=0):
    """Print the contents of ``path`` as an indented tree.

    Files are listed with their size; directories are listed with a trailing
    slash and are descended into recursively.

    Args:
        path: Directory to list (string or ``Path``).
        prefix: Text printed in front of every line; used by the recursion to
            draw the indentation of nested levels.
        max_depth: Number of directory levels to print. Directories at the
            deepest printed level are shown but not expanded.
        current_depth: Depth of ``path`` relative to the initial call; callers
            normally leave this at 0.

    Returns:
        None. The tree is written to standard output.
    """
    if current_depth >= max_depth:
        return

    path = Path(path)
    if not path.exists():
        print(f"{prefix}{path.name}/ (not found)")
        return
    if not path.is_dir():
        # A plain file was passed as the root: report it instead of failing in iterdir()
        print(f"{prefix}{path.name} ({_format_size(path.stat().st_size)})")
        return

    items = sorted(path.iterdir())
    last_index = len(items) - 1
    for i, item in enumerate(items):
        is_last = i == last_index
        connector = "└── " if is_last else "├── "

        if item.is_file():
            print(f"{prefix}{connector}{item.name} ({_format_size(item.stat().st_size)})")
        else:
            print(f"{prefix}{connector}{item.name}/")
            # Below the last entry nothing continues, so pad with blanks instead of a bar
            new_prefix = prefix + ("    " if is_last else "│   ")
            show_directory_tree(item, new_prefix, max_depth, current_depth + 1)

# Render the tree of the pipeline's data folder beneath a heading line
print("Data Directory:")
show_directory_tree(Path(PROJECT_ROOT) / "data")

In [None]:
# Load and display final labeled data sample
import pandas as pd
from pathlib import Path

final_dir = Path(f"{PROJECT_ROOT}/data/final")
# glob() yields entries in arbitrary filesystem order; sort so that the same
# file is picked on every run.
files = sorted(final_dir.glob("*_labeled.parquet"))

if files:
    df = pd.read_parquet(files[0])
    print(f"Loaded: {files[0].name}")
    print(f"Shape: {df.shape}")
    print(f"\nColumns ({len(df.columns)}):")
    print(df.columns.tolist()[:20], "...")
    print(f"\nSample:")
    display(df.head())
else:
    # Reset `df` so that a frame left over from an earlier cell is not mistaken
    # for labeled data by the cells that follow.
    df = None
    print("No final labeled data found. Run the pipeline first.")

In [None]:
# Display label distribution
import pandas as pd
import matplotlib.pyplot as plt

# Keep in sync with the `label_horizons` passed to the pipeline configuration.
horizons = [5, 10, 15, 20]
# Colours are looked up by label value rather than bar position, so a class
# that is absent from the data cannot shift the colours of the remaining bars.
label_colors = {-1: 'red', 0: 'gray', 1: 'green'}

# `df` may be undefined (fresh kernel), None, or a frame without label columns.
labeled = globals().get('df')
if labeled is None:
    label_cols = []
else:
    label_cols = [f'label_h{h}' for h in horizons if f'label_h{h}' in labeled.columns]

if not label_cols:
    print("No label columns found. Run the pipeline first.")
else:
    # squeeze=False keeps `axes` two-dimensional even for a single horizon
    fig, axes = plt.subplots(1, len(horizons), figsize=(4 * len(horizons), 4), squeeze=False)

    for ax, h in zip(axes[0], horizons):
        col = f'label_h{h}'
        if col not in labeled.columns:
            # Hide the panel instead of leaving an empty, unlabeled axis
            ax.set_visible(False)
            continue

        counts = labeled[col].value_counts().sort_index()
        colors = [label_colors.get(label, 'blue') for label in counts.index]
        ax.bar(counts.index, counts.values, color=colors)
        ax.set_title(f'H{h} Labels')
        ax.set_xlabel('Label')
        ax.set_ylabel('Count')
        ax.set_xticks([-1, 0, 1])
        ax.set_xticklabels(['Short', 'Neutral', 'Long'])

    plt.tight_layout()
    plt.show()

## 6. Run Individual Stages (Optional)

If you want to run specific stages instead of the full pipeline:

In [None]:
# Example: Run only data ingestion
from src.phase1.stages import DataIngestor
from pathlib import Path

# ingestor = DataIngestor(
#     raw_data_dir=Path(f"{PROJECT_ROOT}/data/raw"),
#     output_dir=Path(f"{PROJECT_ROOT}/data/raw/validated")
# )
# 
# df, metadata = ingestor.ingest_file(
#     file_path=Path(f"{PROJECT_ROOT}/data/raw/MES_1m.parquet"),
#     symbol='MES',
#     validate=True
# )
# print(f"Ingested {len(df)} rows")

In [None]:
# Example: Load pre-computed train/val/test splits
from pathlib import Path
import pandas as pd

splits_dir = Path(f"{PROJECT_ROOT}/data/splits/scaled")

# Check every expected file, not just the folder: an interrupted run can leave
# the directory behind with only some of the splits written.
split_files = {
    name: splits_dir / f"{name}_scaled.parquet" for name in ("train", "val", "test")
}
missing_splits = [p.name for p in split_files.values() if not p.exists()]

if not missing_splits:
    train_df = pd.read_parquet(split_files["train"])
    val_df = pd.read_parquet(split_files["val"])
    test_df = pd.read_parquet(split_files["test"])

    print(f"Train: {len(train_df):,} rows")
    print(f"Val:   {len(val_df):,} rows")
    print(f"Test:  {len(test_df):,} rows")
elif splits_dir.exists():
    print(f"Incomplete splits in {splits_dir}: missing {missing_splits}. Re-run the pipeline.")
else:
    print("Splits not found. Run pipeline first.")

## 7. Next Steps: Model Training (Phase 2)

After running the data pipeline, you can train models using the prepared datasets.

In [None]:
# Example: Prepare data for sklearn model
# from src.phase1.stages.datasets.container import TimeSeriesDataContainer, DataContainerConfig
#
# container_config = DataContainerConfig(
#     horizon=20,
#     feature_columns=[...],  # List your feature columns
#     label_column='label_h20',
#     weight_column='sample_weight_h20'
# )
#
# container = TimeSeriesDataContainer(container_config)
# container.load_splits(splits_dir)
#
# X_train, y_train, w_train = container.get_sklearn_arrays('train')
# X_val, y_val, w_val = container.get_sklearn_arrays('val')
#
# print(f"X_train shape: {X_train.shape}")
# print(f"y_train shape: {y_train.shape}")

In [None]:
# Closing message: completion notice, then where the outputs live.
print("Notebook complete!", f"\nResults saved to: {PROJECT_ROOT}", sep="\n")