# ML Model Factory

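Train ML classifiers (XGBoost, LightGBM) on OHLCV data to predict whether the return over a fixed forward horizon is up, flat, or down.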

In [None]:
# Setup
# Clone the project repository into /content/research. stderr is discarded
# and "Done" is echoed whenever the clone command fails (for example when the
# directory already exists), so a genuine clone failure is not visible here.
!git clone https://github.com/Snehpatel101/Research.git /content/research 2>/dev/null || echo "Done"
# Work from the repository root and quietly install the packages listed in
# its requirements.txt.
%cd /content/research
!pip install -q -r requirements.txt

# Mount Google Drive at /content/drive; later cells read the data file from
# it and write the trained models back to it.
from google.colab import drive
drive.mount('/content/drive')

# Put the cloned repository at the front of the import path.
import sys
sys.path.insert(0, '/content/research')
print("Ready!")

In [None]:
# Configuration
# Run-level settings; the later cells read these names as globals.

# Input parquet file on the mounted Drive.
DATA_FILE = "/content/drive/MyDrive/si_historical_2019_2024.parquet"

# Tag used for the output directory and the saved model filenames.
# NOTE(review): the data file name starts with "si" while the tag is "SL" —
# confirm this is intentional.
SYMBOL = "SL"

# Number of rows to look ahead when computing the forward return for labels.
HORIZON = 20

# Classifiers to train. The training cell recognises "xgboost", "lightgbm"
# and "random_forest".
MODELS = [
    "xgboost",
    "lightgbm",
]

In [None]:
# Load Data
# Read the parquet file named by DATA_FILE into `df`, report how many rows
# were loaded, and show the first rows as the cell output.
import pandas as pd

df = pd.read_parquet(path=DATA_FILE)
print(f"Loaded {len(df):,} rows")
df.head(5)

In [None]:
# Prepare Data
# Builds lagged features and 3-class labels (-1 / 0 / +1) from `df`, splits
# the rows 70/15/15 in their existing order, and scales the features with a
# RobustScaler fitted on the training rows only.
#
# NOTE: `df` is trimmed in place, so re-run the Load Data cell before
# re-running this one.
# NOTE(review): the positional split assumes rows are already in
# chronological order — confirm against the data file.
from sklearn.preprocessing import RobustScaler
import numpy as np

df.columns = df.columns.str.lower()

# Fail early with a clear message instead of a bare KeyError further down.
missing_cols = {'close', 'volume'} - set(df.columns)
if missing_cols:
    raise KeyError(f"Data is missing required columns: {sorted(missing_cols)}")

# Features
# Every feature is shifted by one row so that row t only uses values from
# rows up to t-1. `volatility` is computed from the already-lagged
# `return_1` and lagged again, so it trails by two rows.
df['return_1'] = df['close'].pct_change().shift(1)
df['return_5'] = df['close'].pct_change(5).shift(1)
df['return_20'] = df['close'].pct_change(20).shift(1)
df['volatility'] = df['return_1'].rolling(20).std().shift(1)
df['volume_ratio'] = (df['volume'] / df['volume'].rolling(20).mean()).shift(1)

features = ['return_1', 'return_5', 'return_20', 'volatility', 'volume_ratio']

# A zero close or a zero average volume yields +/-inf, which the scaler
# rejects; convert those to NaN so the affected rows are dropped below.
df[features] = df[features].replace([np.inf, -np.inf], np.nan)

# Labels
# Forward return over HORIZON rows; NaN where the future close is unknown.
future_ret = df['close'].shift(-HORIZON) / df['close'] - 1
future_ret = future_ret.replace([np.inf, -np.inf], np.nan)

# Keep only rows where every column AND the forward return are known. This
# replaces the positional `iloc[:-HORIZON]` trim: rows without a valid
# forward return are removed wherever they occur instead of being silently
# labelled 0.
keep = df.notna().all(axis=1).to_numpy() & future_ret.notna().to_numpy()
df = df[keep].copy()
future_ret = future_ret[keep]

# Split
n = len(df)
train_end, val_end = int(n * 0.7), int(n * 0.85)
# Purge the last HORIZON rows of train and val: their labels are computed
# from closes that fall inside the next split.
train_stop = max(train_end - HORIZON, 0)
val_stop = max(val_end - HORIZON, train_end)

# The label threshold is estimated from training rows only, so validation
# and test data do not influence how labels are defined.
threshold = future_ret.iloc[:train_stop].std() * 0.5
df['label'] = np.select(
    [(future_ret > threshold).to_numpy(), (future_ret < -threshold).to_numpy()],
    [1, -1],
    default=0,
)

train = df.iloc[:train_stop]
val = df.iloc[train_end:val_stop]
test = df.iloc[val_end:]

if min(len(train), len(val), len(test)) == 0:
    raise ValueError(
        f"Not enough usable rows to split (train={len(train)}, "
        f"val={len(val)}, test={len(test)}); check the data and HORIZON."
    )

# Fit the scaler on training rows only, then apply it to the other splits.
scaler = RobustScaler()
X_train = scaler.fit_transform(train[features])
X_val = scaler.transform(val[features])
X_test = scaler.transform(test[features])
y_train, y_val, y_test = train['label'].values, val['label'].values, test['label'].values

print(f"Train: {len(train)}, Val: {len(val)}, Test: {len(test)}")
print(f"Labels: {pd.Series(y_train).value_counts().to_dict()}")

In [None]:
# Train Models
# Fits each classifier named in MODELS on the scaled training features and
# stores accuracy / macro-F1 on the validation and test splits, together
# with the fitted model, in `results[model_name]`.
from sklearn.metrics import accuracy_score, f1_score


def _make_model(model_name):
    """Return an unfitted classifier for *model_name*.

    Raises:
        ValueError: if *model_name* is not a supported name. Without this
            check an unknown name would reuse the model left over from the
            previous loop iteration and store it under the wrong key.
    """
    # Imports stay inside the branches so only the requested libraries are
    # imported.
    if model_name == "xgboost":
        from xgboost import XGBClassifier
        return XGBClassifier(n_estimators=100, max_depth=6, random_state=42)
    if model_name == "lightgbm":
        from lightgbm import LGBMClassifier
        return LGBMClassifier(n_estimators=100, max_depth=6, random_state=42, verbose=-1)
    if model_name == "random_forest":
        from sklearn.ensemble import RandomForestClassifier
        return RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42)
    raise ValueError(
        f"Unknown model name {model_name!r}; "
        "expected one of 'xgboost', 'lightgbm', 'random_forest'"
    )


results = {}

for model_name in MODELS:
    print(f"\nTraining {model_name}...")

    model = _make_model(model_name)

    # Shift labels from -1/0/1 to 0/1/2 for training
    model.fit(X_train, y_train + 1)

    # Predict, shifting the classes back to -1/0/1
    val_pred = model.predict(X_val) - 1
    test_pred = model.predict(X_test) - 1

    results[model_name] = {
        'val_acc': accuracy_score(y_val, val_pred),
        'val_f1': f1_score(y_val, val_pred, average='macro'),
        'test_acc': accuracy_score(y_test, test_pred),
        'test_f1': f1_score(y_test, test_pred, average='macro'),
        'model': model
    }

    print(f"  Val Acc: {results[model_name]['val_acc']:.4f}, F1: {results[model_name]['val_f1']:.4f}")
    print(f"  Test Acc: {results[model_name]['test_acc']:.4f}, F1: {results[model_name]['test_f1']:.4f}")

In [None]:
# Save Models
# Writes every fitted model in `results` to the symbol's folder on Drive,
# plus one preprocessing bundle holding the fitted scaler and feature order.
import joblib
from pathlib import Path
from datetime import datetime

save_dir = Path(f"/content/drive/MyDrive/ml_models/{SYMBOL}")
save_dir.mkdir(parents=True, exist_ok=True)

# Take the timestamp once so every artifact from this run shares a suffix.
run_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')

for name, data in results.items():
    path = save_dir / f"{name}_{SYMBOL}_{run_stamp}.pkl"
    joblib.dump(data['model'], path)
    print(f"Saved: {path}")

# The models were fitted on scaler-transformed features, so they cannot be
# used for inference without the same fitted scaler and feature order.
preproc_path = save_dir / f"preprocessing_{SYMBOL}_{run_stamp}.pkl"
joblib.dump(
    {
        'scaler': scaler,
        'features': features,
        'horizon': HORIZON,
        'threshold': threshold,
    },
    preproc_path,
)
print(f"Saved: {preproc_path}")

print("\nDone!")