# ML Model Factory - Complete Pipeline

**Train ML models on your OHLCV data**

## Instructions
1. Run cells in order
2. Configure your settings using the form fields on the right
3. Trained models are saved to your Google Drive under `ml_models/<SYMBOL>/`

---
# Part 1: Setup
---

In [None]:
#@title Install & Setup
# Clone the project into /content/research. stderr is discarded, and ANY non-zero
# exit from git (not only "directory already exists") just prints "Repo exists".
!git clone https://github.com/Snehpatel101/Research.git /content/research 2>/dev/null || echo "Repo exists"
# Work from the repo root so the relative requirements.txt path below resolves.
%cd /content/research
!pip install -q -r requirements.txt

import sys
# Put the repo root first on the import path so the `notebooks` package
# imported below is found in the cloned checkout.
sys.path.insert(0, '/content/research')

from notebooks.colab_setup import setup_colab_environment
# Project helper, called with Drive mounting and GPU use requested.
# NOTE(review): presumably returns a dict-like object (it is read with .get below) — confirm.
env_info = setup_colab_environment(mount_drive=True, use_gpu=True)

# Falls back to False when the helper did not report a 'gpu_available' entry.
print(f"\nReady! GPU: {env_info.get('gpu_available', False)}")

---
# Part 2: Configuration
---

In [None]:
#@title Configure Your Settings { run: "auto" }

#@markdown ### Your Data
SYMBOL = "SL" #@param {type:"string"}
raw_data_filename = "si_historical_2019_2024.parquet" #@param {type:"string"}
drive_data_folder = "" #@param {type:"string"}
#@markdown *Leave drive_data_folder empty if file is in root of My Drive*

#@markdown ### Training Settings
HORIZON = 20 #@param [5, 10, 15, 20] {type:"integer"}

#@markdown ### Models to Train
train_xgboost = True #@param {type:"boolean"}
train_lightgbm = True #@param {type:"boolean"}
train_catboost = False #@param {type:"boolean"}
train_random_forest = False #@param {type:"boolean"}
train_lstm = False #@param {type:"boolean"}
train_tcn = False #@param {type:"boolean"}

# Derived locations used by the cells that follow.
from pathlib import Path

# The raw file sits either directly in My Drive or inside the optional subfolder.
RAW_DATA_PATH = (
    f"/content/drive/MyDrive/{drive_data_folder}/{raw_data_filename}"
    if drive_data_folder
    else f"/content/drive/MyDrive/{raw_data_filename}"
)

PROCESSED_DIR = f"/content/processed/{SYMBOL}"
MODELS_DIR = f"/content/drive/MyDrive/ml_models/{SYMBOL}"
OUTPUT_DIR = "/content/experiments"

# Translate the checkbox flags into the ordered list of model names to train.
_model_switches = {
    "xgboost": train_xgboost,
    "lightgbm": train_lightgbm,
    "catboost": train_catboost,
    "random_forest": train_random_forest,
    "lstm": train_lstm,
    "tcn": train_tcn,
}
MODELS_TO_TRAIN = [_name for _name, _enabled in _model_switches.items() if _enabled]

# Echo the effective configuration so it can be checked before training.
_banner = "=" * 50
print(_banner, "YOUR CONFIGURATION", _banner, sep="\n")
for _label, _value in (
    ("Symbol", SYMBOL),
    ("Raw data", RAW_DATA_PATH),
    ("Horizon", HORIZON),
    ("Models", MODELS_TO_TRAIN),
    ("Output", MODELS_DIR),
):
    print(f"{_label}: {_value}")

---
# Part 3: Load & Check Your Data
---

In [None]:
#@title Load Your Raw Data
import pandas as pd
from pathlib import Path

# Load the raw parquet file named in the configuration cell into `df_raw`.
raw_path = Path(RAW_DATA_PATH)

if not raw_path.exists():
    print("!" * 50)
    print("ERROR: Data file not found!")
    print("!" * 50)
    print(f"\nLooking for: {RAW_DATA_PATH}")
    print("\nPlease check:")
    print("  1. The filename is correct")
    print("  2. The folder path is correct")
    print("  3. Google Drive is mounted")
    # Stop the notebook here. Only printing would let "Run all" continue, and
    # the next cell would then fail with NameError or, worse, silently reuse
    # a stale `df_raw` left over from an earlier run.
    raise FileNotFoundError(f"Data file not found: {RAW_DATA_PATH}")

print(f"Loading {raw_path.name}...")
df_raw = pd.read_parquet(raw_path)

print(f"\nLoaded: {len(df_raw):,} rows")
print(f"Columns: {list(df_raw.columns)}")
print("\nFirst few rows:")
display(df_raw.head())

# Only call it a date range when the index really holds timestamps;
# otherwise the min/max are just row labels.
if isinstance(df_raw.index, pd.DatetimeIndex):
    print(f"\nDate range: {df_raw.index.min()} to {df_raw.index.max()}")
else:
    print(f"\nIndex range: {df_raw.index.min()} to {df_raw.index.max()} (index is not a DatetimeIndex)")

In [None]:
#@title Prepare Data for Training
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.preprocessing import RobustScaler

# Turns `df_raw` (OHLCV bars) into scaled feature tables with a three-class
# label column and writes train/val/test parquet files to PROCESSED_DIR.
# Assumes the rows of `df_raw` are already in chronological order: labels use
# shift(-HORIZON) and the split is positional.

# Create output directory
Path(PROCESSED_DIR).mkdir(parents=True, exist_ok=True)

print("Preparing data for training...")
print("This is a simplified pipeline for quick training.")
print()

# Ensure we have the right columns
required_cols = ['open', 'high', 'low', 'close', 'volume']
df = df_raw.copy()

# Lowercase column names
df.columns = df.columns.str.lower()

# Check columns; stop immediately instead of failing later with a bare KeyError.
missing = [c for c in required_cols if c not in df.columns]
if missing:
    print(f"Missing columns: {missing}")
    print(f"Available: {list(df.columns)}")
    raise ValueError(f"Raw data is missing required OHLCV columns: {missing}")
print("Found OHLCV columns")

# Create basic features
print("Creating features...")
df['return_1'] = df['close'].pct_change()
df['return_5'] = df['close'].pct_change(5)
df['return_10'] = df['close'].pct_change(10)
df['return_20'] = df['close'].pct_change(20)

df['volatility_20'] = df['return_1'].rolling(20).std()
df['volatility_60'] = df['return_1'].rolling(60).std()

df['sma_20'] = df['close'].rolling(20).mean()
df['sma_60'] = df['close'].rolling(60).mean()
df['price_sma20_ratio'] = df['close'] / df['sma_20']
df['price_sma60_ratio'] = df['close'] / df['sma_60']

df['volume_sma20'] = df['volume'].rolling(20).mean()
df['volume_ratio'] = df['volume'] / df['volume_sma20']

df['high_low_range'] = (df['high'] - df['low']) / df['close']
df['close_open_range'] = (df['close'] - df['open']) / df['close']

# RSI (simple rolling-mean variant over 14 bars)
delta = df['close'].diff()
gain = delta.where(delta > 0, 0).rolling(14).mean()
loss = (-delta.where(delta < 0, 0)).rolling(14).mean()
rs = gain / loss
df['rsi_14'] = 100 - (100 / (1 + rs))

# Define feature columns
feature_cols = [
    'return_1', 'return_5', 'return_10', 'return_20',
    'volatility_20', 'volatility_60',
    'price_sma20_ratio', 'price_sma60_ratio',
    'volume_ratio', 'high_low_range', 'close_open_range', 'rsi_14'
]

# Forward return used for labelling (NaN for the last HORIZON rows and
# wherever a close price is missing).
print(f"Creating labels (horizon={HORIZON})...")
future_return = df['close'].shift(-HORIZON) / df['close'] - 1

# Divisions by zero produce +/-inf, which RobustScaler rejects; treat them as missing.
df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)

# Keep only rows with every feature present AND a known future return.
# Filtering on the future return (rather than slicing off the tail) also
# removes mid-series rows that would otherwise be silently labelled neutral.
# Only the columns actually used are checked, so NaNs in unrelated raw
# columns no longer discard rows.
features_ok = df[feature_cols].notna().all(axis=1).to_numpy()
target_ok = np.isfinite(future_return.to_numpy(dtype=float))
usable = features_ok & target_ok
df = df.loc[usable].copy()
future_return = future_return.loc[usable]

# Split data (70/15/15), purging HORIZON rows before each boundary: the label
# of a row looks HORIZON bars ahead, so the last HORIZON rows of train (val)
# would otherwise be labelled with prices from the val (test) period.
print("\nSplitting data...")
n = len(df)
train_end = int(n * 0.7)
val_end = int(n * 0.85)
train_stop = train_end - HORIZON
val_stop = val_end - HORIZON
if train_stop < 2 or val_stop <= train_end or val_end >= n:
    raise ValueError(
        f"Not enough usable rows ({n}) to build train/val/test splits "
        f"with horizon={HORIZON}"
    )

# Label threshold from the training rows only, so validation/test returns do
# not influence how classes are defined.
threshold = future_return.iloc[:train_stop].std() * 0.5
future_values = future_return.to_numpy()
df['label'] = np.select(
    [future_values > threshold, future_values < -threshold],
    [1, -1],      # long / short
    default=0,    # neutral
)

print(f"\nLabel distribution:")
print(df['label'].value_counts().sort_index())

train_df = df.iloc[:train_stop]
val_df = df.iloc[train_end:val_stop]
test_df = df.iloc[val_end:]

print(f"Purged {HORIZON} rows before each split boundary")
print(f"Train: {len(train_df):,} rows")
print(f"Val: {len(val_df):,} rows")
print(f"Test: {len(test_df):,} rows")

# Scale features (train-only statistics)
print("\nScaling features...")
scaler = RobustScaler()

train_features = scaler.fit_transform(train_df[feature_cols])
val_features = scaler.transform(val_df[feature_cols])
test_features = scaler.transform(test_df[feature_cols])

# Save as parquet: one file per split, scaled features plus the label column.
print("\nSaving processed data...")

label_col = f'label_h{HORIZON}'
scaled_splits = {}
for split_name, split_features, split_df in (
    ('train', train_features, train_df),
    ('val', val_features, val_df),
    ('test', test_features, test_df),
):
    scaled = pd.DataFrame(split_features, columns=feature_cols, index=split_df.index)
    scaled[label_col] = split_df['label'].values
    scaled.to_parquet(f"{PROCESSED_DIR}/{split_name}_scaled.parquet")
    scaled_splits[split_name] = scaled

# Keep the per-split names available for interactive inspection.
train_scaled = scaled_splits['train']
val_scaled = scaled_splits['val']
test_scaled = scaled_splits['test']

print(f"\nData saved to: {PROCESSED_DIR}")
print("Ready for training!")

---
# Part 4: Train Models
---

In [None]:
#@title Load Processed Data
from src.phase1.stages.datasets.container import TimeSeriesDataContainer

# Build the project's data container from the files in PROCESSED_DIR for the
# configured horizon. NOTE(review): presumably this reads the *_scaled.parquet
# files written by the preparation cell — confirm against the container code.
_container_options = {
    "horizon": HORIZON,
    "exclude_invalid_labels": True,
}

print("Loading processed data...")
container = TimeSeriesDataContainer.from_parquet_dir(PROCESSED_DIR, **_container_options)

print(f"Loaded: {container}")

In [None]:
#@title Train Models
import traceback
from datetime import datetime
from pathlib import Path

from notebooks.colab_setup import get_trainer_for_colab

# Trains every model listed in MODELS_TO_TRAIN, saves each one under MODELS_DIR,
# and collects the outcome per model in `results` (read by the Results cell).

Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
Path(MODELS_DIR).mkdir(parents=True, exist_ok=True)

# Bind these unconditionally: the Results cell iterates `results`, so when no
# model is selected it must see an empty dict rather than raise NameError or
# show stale results from a previous run.
trained_models = {}
results = {}
failed_models = []

if not MODELS_TO_TRAIN:
    print("No models selected! Go back and check at least one model.")
else:
    for i, model_name in enumerate(MODELS_TO_TRAIN):
        print(f"\n{'='*60}")
        print(f"[{i+1}/{len(MODELS_TO_TRAIN)}] Training: {model_name.upper()}")
        print(f"{'='*60}")

        try:
            trainer, result = get_trainer_for_colab(
                model_name=model_name,
                horizon=HORIZON,
                data_path=PROCESSED_DIR,
                output_dir=OUTPUT_DIR,
            )

            trained_models[model_name] = trainer
            results[model_name] = result

            # Save to Drive; the timestamp keeps earlier runs from being overwritten.
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            model_path = Path(MODELS_DIR) / f"{model_name}_{SYMBOL}_h{HORIZON}_{timestamp}"
            trainer.model.save(model_path)
            print(f"Saved: {model_path}")

        except Exception as e:
            # Deliberately broad: one failing model must not abort the remaining
            # ones. The failure is reported in full and recorded in `results`.
            print(f"ERROR: {e}")
            traceback.print_exc()
            results[model_name] = {"error": str(e)}
            failed_models.append(model_name)

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    if failed_models:
        # Make failures visible next to the completion banner.
        print(f"Failed models: {failed_models}")
    print("="*60)

---
# Part 5: Results
---

In [None]:
#@title View Results
# Print a per-model summary of the outcomes collected in `results`: the error
# text for failed models, otherwise accuracy and macro-F1 for the validation
# split and, when present, the test split.
_rule = "=" * 60
print(_rule, "RESULTS", _rule, sep="\n")

for model_name, result in results.items():
    print(f"\n{model_name.upper()}:")

    # Failed runs carry only an error message.
    if "error" in result:
        print(f"  FAILED: {result['error']}")
        continue

    val_metrics = result.get('evaluation_metrics', {})
    test_metrics = result.get('test_metrics', {})

    for _split, _metrics in (("Validation", val_metrics), ("Test", test_metrics)):
        # Validation lines are always printed; test lines only when test metrics exist.
        if _split == "Test" and not _metrics:
            continue
        print(f"  {_split} Accuracy: {_metrics.get('accuracy', 0):.4f}")
        print(f"  {_split} F1: {_metrics.get('macro_f1', 0):.4f}")

print("\n" + _rule)
print(f"Models saved to: {MODELS_DIR}")