In [1]:
import pandas as pd
import glob
import os
from IPython.display import display

# 1. Point to your data folder
data_dir = "/Users/harit/algo_crypto/mymodules/data"

# 2. Find all CSV files
csv_paths = glob.glob(os.path.join(data_dir, "*.csv"))

# 3. Load into a dict of DataFrames, converting the millisecond timestamp
dfs = {}
for path in csv_paths:
    name = os.path.splitext(os.path.basename(path))[0]
    df = pd.read_csv(path)  
    # Convert the UNIX-ms timestamp to a true datetime
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')
    # (optional) drop the raw timestamp column
    df.drop(columns=['timestamp'], inplace=True)
    # Reorder so datetime is first
    cols = ['datetime', 'open', 'high', 'low', 'close', 'volume']
    df = df[cols]
    dfs[name] = df

# 4. Peek at one
sample_key = 'BTCUSDT_5m_5year'
print("Loaded keys:", list(dfs.keys()))
print(f"\nSample head for {sample_key}:")
display(dfs[sample_key].head())


Loaded keys: ['BTCUSDT_1h_5year', 'ETHUSDT_1m_5year', 'BTCUSDT_5m_5year', 'BTCUSDT_15m_5year', 'ETHUSDT_1h_5year', 'BTCUSDT_1m_5year', 'ETHUSDT_5m_5year', 'ETHUSDT_15m_5year']

Sample head for BTCUSDT_5m_5year:


Unnamed: 0,datetime,open,high,low,close,volume
0,2020-07-05 11:10:00,9042.4,9044.94,9036.61,9041.88,298.229
1,2020-07-05 11:15:00,9041.88,9041.88,9039.0,9039.99,68.817
2,2020-07-05 11:20:00,9039.99,9042.87,9039.88,9042.05,106.649
3,2020-07-05 11:25:00,9042.05,9046.52,9041.24,9045.7,385.077
4,2020-07-05 11:30:00,9045.69,9045.99,9040.0,9041.19,111.229


In [2]:
import pandas as pd
import glob, os

# Reload the raw CSVs into dfs
data_dir = "/Users/harit/algo_crypto/mymodules/data"
csv_paths = glob.glob(os.path.join(data_dir, "*.csv"))

dfs = {}
for path in csv_paths:
    name = os.path.splitext(os.path.basename(path))[0]
    df = pd.read_csv(path)
    df['datetime'] = pd.to_datetime(df['timestamp'], unit='ms')
    df.drop(columns=['timestamp'], inplace=True)
    df = df[['datetime','open','high','low','close','volume']]
    dfs[name] = df

print("Datasets loaded:", list(dfs.keys()))


Datasets loaded: ['BTCUSDT_1h_5year', 'ETHUSDT_1m_5year', 'BTCUSDT_5m_5year', 'BTCUSDT_15m_5year', 'ETHUSDT_1h_5year', 'BTCUSDT_1m_5year', 'ETHUSDT_5m_5year', 'ETHUSDT_15m_5year']


In [3]:
import pandas as pd
import numpy as np
from IPython.display import display

# 1) Extract & sort the BTCUSDT 5 m DataFrame
df5 = dfs['BTCUSDT_5m_5year'].sort_values('datetime').reset_index(drop=True)

# 2) Compute ATR & ATR ratio
ATR_WINDOW = 14
tr = pd.concat([
    df5['high'] - df5['low'],
    (df5['high'] - df5['close'].shift(1)).abs(),
    (df5['low']  - df5['close'].shift(1)).abs()
], axis=1).max(axis=1)
df5['atr']       = tr.rolling(ATR_WINDOW).mean()
df5['atr_slow']  = df5['atr'].rolling(ATR_WINDOW*2).mean()
df5['atr_ratio'] = df5['atr'] / df5['atr_slow']

# 3) Dynamic regime threshold (90th percentile)
threshold = df5['atr_ratio'].quantile(0.90)
df5['high_vol_regime'] = (df5['atr_ratio'] > threshold).astype(int)
print(f"Dynamic regime threshold (90th pct): {threshold:.3f}")

# 4) Split into “normal” vs “tail” subsets
normal_df = df5[df5['high_vol_regime']==0].copy()
tail_df   = df5[df5['high_vol_regime']==1].copy()

# 5) Inspect the separation
print("Normal regime size:", normal_df.shape)
display(normal_df[['datetime','atr_ratio','high_vol_regime']].head())

print("Tail regime size:", tail_df.shape)
display(tail_df[['datetime','atr_ratio','high_vol_regime']].head())


Dynamic regime threshold (90th pct): 1.380
Normal regime size: (473044, 10)


Unnamed: 0,datetime,atr_ratio,high_vol_regime
0,2020-07-05 11:10:00,,0
1,2020-07-05 11:15:00,,0
2,2020-07-05 11:20:00,,0
3,2020-07-05 11:25:00,,0
4,2020-07-05 11:30:00,,0


Tail regime size: (52556, 10)


Unnamed: 0,datetime,atr_ratio,high_vol_regime
118,2020-07-05 21:00:00,1.394751,1
119,2020-07-05 21:05:00,1.402807,1
121,2020-07-05 21:15:00,2.582428,1
122,2020-07-05 21:20:00,2.654296,1
123,2020-07-05 21:25:00,2.674493,1


## PREPROCESSING

In [4]:
# Use your specific keys here:
key_5m = 'BTCUSDT_5m_5year'
key_1h = 'BTCUSDT_1h_5year'


In [8]:
# ── Full Feature Engineering & Regime Separation ────────────────────────────
import pandas as pd
import numpy as np

# Parameters
LOOKBACK      = 20
ATR_WINDOW    = 14
EMA_LONG      = 50
ADX_WINDOW    = 14
VWAP_BAND_PCT = 0.001  # ±0.1%

# 1) Extract & sort your loaded DataFrames
df5  = dfs['BTCUSDT_5m_5year'].sort_values('datetime').reset_index(drop=True)
df1h = dfs['BTCUSDT_1h_5year'].sort_values('datetime').reset_index(drop=True)

# 2) Precompute 1h EMA
df1h['ema_hf'] = df1h['close'].ewm(span=EMA_LONG, adjust=False).mean()
df1h_hf = df1h[['datetime','ema_hf']]

# 3) Build 5m features
df = df5.copy()
df['prev_close'] = df['close'].shift(1)

# 3a) ATR & ATR ratio
tr = pd.concat([
    df['high'] - df['low'],
    (df['high'] - df['prev_close']).abs(),
    (df['low']  - df['prev_close']).abs()
], axis=1).max(axis=1)
df['atr']       = tr.rolling(ATR_WINDOW).mean()
df['atr_slow']  = df['atr'].rolling(ATR_WINDOW*2).mean()
df['atr_ratio'] = df['atr'] / df['atr_slow']

# 3b) RSI(14)
delta = df['close'].diff()
up, down = delta.clip(lower=0), -delta.clip(upper=0)
df['rsi'] = 100 - 100/(1 + up.rolling(ATR_WINDOW).mean()/down.rolling(ATR_WINDOW).mean())

# 3c) EMA long on 5m
df['ema_long'] = df['close'].ewm(span=EMA_LONG, adjust=False).mean()

# 3d) ADX(14)
up_m   = df['high'].diff()
down_m = -(df['low'].shift(1).diff())
plus   = np.where((up_m>down_m)&(up_m>0), up_m, 0.0)
minus  = np.where((down_m>up_m)&(down_m>0), down_m, 0.0)
sm_tr  = tr.ewm(alpha=1/ADX_WINDOW, adjust=False).mean()
sm_p   = pd.Series(plus).ewm(alpha=1/ADX_WINDOW, adjust=False).mean()
sm_m   = pd.Series(minus).ewm(alpha=1/ADX_WINDOW, adjust=False).mean()
df['adx'] = 100 * (sm_p - sm_m).abs()/(sm_p + sm_m)

# 3e) Volume MA
df['vol_ma'] = df['volume'].rolling(20).mean()

# 3f) VWAP bands
df['date']     = df['datetime'].dt.date
typ           = (df['high'] + df['low'] + df['close'])/3
df['cum_vp']   = typ.mul(df['volume']).groupby(df['date']).cumsum()
df['cum_vol']  = df['volume'].groupby(df['date']).cumsum()
df['vwap']     = df['cum_vp'] / df['cum_vol']
df['vwap_upper'] = df['vwap'] * (1 + VWAP_BAND_PCT)
df['vwap_lower'] = df['vwap'] * (1 - VWAP_BAND_PCT)

# 3g) Merge 1h EMA into 5m
df['hour'] = df['datetime'].dt.floor('h')
df = df.merge(df1h_hf.rename(columns={'datetime':'hour'}), on='hour', how='left')

# 4) Define the target
df['return_next'] = df['close'].shift(-1) / df['open'] - 1

# 5) Dynamic regime flag (90th percentile of atr_ratio)
threshold = df['atr_ratio'].quantile(0.90)
df['high_vol_regime'] = (df['atr_ratio'] > threshold).astype(int)
print(f"Dynamic regime threshold (90th pct of ATR ratio): {threshold:.2f}")

# 6) Clean up and build final df_feat
df_feat = (
    df.drop(columns=['prev_close','date','cum_vp','cum_vol','hour'])
      .dropna()
      .reset_index(drop=True)
)

# 7) Split into Normal vs. Tail regimes
normal_df = df_feat[df_feat['high_vol_regime']==0].copy()
tail_df   = df_feat[df_feat['high_vol_regime']==1].copy()

# 8) Inspect the separation
print("Normal regime size:", normal_df.shape)
print("Tail   regime size:", tail_df.shape)
display(normal_df.head(3), tail_df.head(3))


Dynamic regime threshold (90th pct of ATR ratio): 1.38
Normal regime size: (472528, 19)
Tail   regime size: (52527, 19)


Unnamed: 0,datetime,open,high,low,close,volume,atr,atr_slow,atr_ratio,rsi,ema_long,adx,vol_ma,vwap,vwap_upper,vwap_lower,ema_hf,return_next,high_vol_regime
0,2020-07-05 14:30:00,9025.28,9025.28,9018.0,9019.9,160.3,10.754286,11.424949,0.941298,48.813696,9030.389199,2.304033,437.42675,9026.93991,9035.96685,9017.91297,9015.688235,-0.000265,0
1,2020-07-05 14:35:00,9019.91,9025.0,9019.91,9022.89,177.25,10.142143,11.461786,0.884866,40.21414,9030.095113,10.453127,436.1,9026.896435,9035.923331,9017.869539,9015.688235,0.000839,0
2,2020-07-05 14:40:00,9022.89,9033.0,9022.89,9027.48,415.129,10.453571,11.508291,0.908351,44.380616,9029.99256,3.42699,423.125,9026.916919,9035.943836,9017.890002,9015.688235,0.000459,0


Unnamed: 0,datetime,open,high,low,close,volume,atr,atr_slow,atr_ratio,rsi,ema_long,adx,vol_ma,vwap,vwap_upper,vwap_lower,ema_hf,return_next,high_vol_regime
78,2020-07-05 21:00:00,9013.44,9019.0,9003.06,9019.0,1011.406,6.975,5.000893,1.394751,33.951987,9033.242773,52.970394,231.4766,9032.657122,9041.689779,9023.624464,9019.165771,-3.1e-05,1
79,2020-07-05 21:05:00,9019.0,9020.0,9012.18,9013.16,361.556,7.193571,5.127985,1.402807,32.484076,9032.455213,59.530042,245.6563,9032.491374,9041.523866,9023.458883,9019.165771,-0.001206,1
81,2020-07-05 21:15:00,9008.12,9008.74,8900.0,8959.13,10688.672,14.531429,5.627041,2.582428,16.508092,9028.662817,63.160872,765.41325,9015.757709,9024.773467,9006.741951,9019.165771,-0.005564,1


In [9]:
# ── 2) Preprocess, train experts, and fit regime classifier ───────────────────

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score

# Feature and target names
feature_cols = ['atr','atr_ratio','rsi','ema_long','ema_hf','adx','vol_ma','vwap_upper','vwap_lower']
target_col   = 'return_next'

# --- 2.1) Normal‐regime expert ---
Xn = normal_df[feature_cols]
yn = normal_df[target_col]
Xn_tr, Xn_te, yn_tr, yn_te = train_test_split(Xn, yn, test_size=0.2, shuffle=False)
print(f"Normal Expert → Train: {Xn_tr.shape}, Test: {Xn_te.shape}")

model_norm = HistGradientBoostingRegressor(random_state=42)
model_norm.fit(Xn_tr, yn_tr)

pred_n = model_norm.predict(Xn_te)
print(f"Normal Expert → MSE: {mean_squared_error(yn_te,pred_n):.6f}, R²: {r2_score(yn_te,pred_n):.4f}")

# --- 2.2) Tail‐regime expert ---
Xt = tail_df[feature_cols]
yt = tail_df[target_col]
Xt_tr, Xt_te, yt_tr, yt_te = train_test_split(Xt, yt, test_size=0.2, shuffle=False)
print(f"Tail Expert   → Train: {Xt_tr.shape}, Test: {Xt_te.shape}")

model_tail = HistGradientBoostingRegressor(random_state=42)
model_tail.fit(Xt_tr, yt_tr)

pred_t = model_tail.predict(Xt_te)
print(f"Tail Expert   → MSE: {mean_squared_error(yt_te,pred_t):.6f}, R²: {r2_score(yt_te,pred_t):.4f}")

# --- 2.3) Regime classifier ---
Xc = df_feat[feature_cols]
yc = df_feat['high_vol_regime']
Xc_tr, Xc_te, yc_tr, yc_te = train_test_split(Xc, yc, test_size=0.2, shuffle=False)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(Xc_tr, yc_tr)
print(f"Classifier Accuracy: {clf.score(Xc_te, yc_te):.4f}")

# --- 2.4) Mixture‐of‐Experts hold‐out evaluation ---
X_hold = pd.concat([Xn_te, Xt_te])
y_hold = pd.concat([yn_te, yt_te])
reg_pred = clf.predict(X_hold)

y_pred = np.where(
    reg_pred == 0,
    model_norm.predict(X_hold),
    model_tail.predict(X_hold)
)

print(f"Mixture‐of‐Experts → MSE: {mean_squared_error(y_hold,y_pred):.6f}, R²: {r2_score(y_hold,y_pred):.4f}")


Normal Expert → Train: (378022, 9), Test: (94506, 9)
Normal Expert → MSE: 0.000004, R²: 0.0357
Tail Expert   → Train: (42021, 9), Test: (10506, 9)
Tail Expert   → MSE: 0.000014, R²: -0.0092
Classifier Accuracy: 1.0000
Mixture‐of‐Experts → MSE: 0.000005, R²: 0.0229


In [10]:
from xgboost import XGBRegressor
tail_xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
tail_xgb.fit(Xt_tr, yt_tr)
pred_tx = tail_xgb.predict(Xt_te)
print("Tail XGB → MSE:", mean_squared_error(yt_te,pred_tx),
      "R²:", r2_score(yt_te,pred_tx))


Tail XGB → MSE: 1.8527237734442637e-05 R²: -0.3171141063423033


In [11]:
import pandas as pd
import numpy as np
from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor
from sklearn.metrics import make_scorer, mean_squared_error

# 1) Prepare tail subset
feature_cols = ['atr','atr_ratio','rsi','ema_long','ema_hf','adx','vol_ma','vwap_upper','vwap_lower']
Xt = tail_df[feature_cols]
yt = tail_df['return_next']

# 2) Train/test split (chronological)
split = int(len(Xt) * 0.8)
Xt_tr, Xt_te = Xt.iloc[:split], Xt.iloc[split:]
yt_tr, yt_te = yt.iloc[:split], yt.iloc[split:]

# 3) Define XGB param space
xgb_param_dist = {
    'n_estimators':    [100, 200, 300, 500],
    'max_depth':       [3, 6, 10, 15],
    'learning_rate':   [0.01, 0.03, 0.05, 0.1],
    'subsample':       [0.5, 0.7, 0.9, 1.0],
    'colsample_bytree':[0.5, 0.7, 0.9, 1.0],
    'reg_alpha':       [0, 0.1, 0.5, 1.0],
    'reg_lambda':      [1.0, 2.0, 5.0]
}

# 4) TimeSeriesSplit & scorer
tscv      = TimeSeriesSplit(n_splits=4)
mse_scorer= make_scorer(mean_squared_error, greater_is_better=False)

# 5) RandomizedSearchCV
search = RandomizedSearchCV(
    XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1),
    xgb_param_dist,
    n_iter=30,
    scoring=mse_scorer,
    cv=tscv,
    verbose=2,
    random_state=42,
    n_jobs=-1
)
search.fit(Xt_tr, yt_tr)

# 6) Best tail-expert params & CV MSE
print("Best Tail-Expert Params:", search.best_params_)
print("Best CV MSE:", -search.best_score_)

# 7) Evaluate on hold-out
best_tail = search.best_estimator_
pred_tail = best_tail.predict(Xt_te)
print("Tail-XGB Hold-out → MSE:",
      mean_squared_error(yt_te, pred_tail),
      "R²:", r2_score(yt_te, pred_tail))


Fitting 4 folds for each of 30 candidates, totalling 120 fits
[CV] END colsample_bytree=0.5, learning_rate=0.03, max_depth=3, n_estimators=200, reg_alpha=1.0, reg_lambda=5.0, subsample=0.5; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.03, max_depth=3, n_estimators=200, reg_alpha=1.0, reg_lambda=5.0, subsample=0.5; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=3, n_estimators=100, reg_alpha=0.1, reg_lambda=1.0, subsample=0.9; total time=   0.0s
[CV] END colsample_bytree=0.5, learning_rate=0.03, max_depth=3, n_estimators=200, reg_alpha=1.0, reg_lambda=5.0, subsample=0.5; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=3, n_estimators=100, reg_alpha=0.1, reg_lambda=1.0, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.7, learning_rate=0.1, max_depth=3, n_estimators=100, reg_alpha=0.1, reg_lambda=1.0, subsample=0.9; total time=   0.1s
[CV] END colsample_bytree=0.5, learning_rate=0.03, max_dept



[CV] END colsample_bytree=0.9, learning_rate=0.03, max_depth=3, n_estimators=100, reg_alpha=1.0, reg_lambda=1.0, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.05, max_depth=15, n_estimators=200, reg_alpha=0.1, reg_lambda=2.0, subsample=0.9; total time=   1.7s
[CV] END colsample_bytree=0.9, learning_rate=0.03, max_depth=3, n_estimators=100, reg_alpha=1.0, reg_lambda=1.0, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.03, max_depth=3, n_estimators=100, reg_alpha=1.0, reg_lambda=1.0, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=0.9, learning_rate=0.03, max_depth=3, n_estimators=100, reg_alpha=1.0, reg_lambda=1.0, subsample=0.7; total time=   0.1s
[CV] END colsample_bytree=1.0, learning_rate=0.03, max_depth=15, n_estimators=100, reg_alpha=0.1, reg_lambda=5.0, subsample=1.0; total time=   0.8s
[CV] END colsample_bytree=0.5, learning_rate=0.1, max_depth=3, n_estimators=300, reg_alpha=0.5, reg_lambda=5.0, subs

In [12]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score

# 3.1) Rename for clarity
tail_xgb = search.best_estimator_
norm_hgb = model_norm  # from your normal-expert cell
clf_rf   = clf         # your regime classifier

# 3.2) Build hold-out set
X_hold = pd.concat([Xn_te, Xt_te])
y_hold = pd.concat([yn_te, yt_te])

# 3.3) Classify regimes and predict
regime_pred = clf_rf.predict(X_hold)
pred_norm   = norm_hgb.predict(X_hold)
pred_tail   = tail_xgb.predict(X_hold)

# 3.4) Mixture (hard switch)
y_pred_mix  = np.where(regime_pred==0, pred_norm, pred_tail)

# 3.5) Evaluate
mse_mix = mean_squared_error(y_hold, y_pred_mix)
r2_mix  = r2_score(y_hold, y_pred_mix)
print(f"Updated Ensemble → MSE: {mse_mix:.6f}, R²: {r2_mix:.4f}")


Updated Ensemble → MSE: 0.000005, R²: 0.0368
