In [1]:
# ========================================
# STEP 1: CROSS-PLATFORM DEPENDENCY MANAGEMENT
# ========================================
print("🔧 Setting up dependencies...")

# Probe the packages the shared environment is expected to provide. Several of
# them (xgboost, lightgbm, matplotlib, seaborn) are not used by this notebook;
# they are probed only to keep the environment identical to the sibling
# notebooks. If any probe fails, the full package set is installed with pip.
try:
    import pandas, numpy, sklearn, xgboost, matplotlib, seaborn, joblib, tqdm
    import lightgbm as lgb  # not used here, keeps env parity
    print("✅ Core dependencies already available")
except ImportError as e:
    print(f"Installing missing dependencies: {e}")
    import importlib, sys, subprocess
    pkgs = ['pandas', 'numpy', 'scikit-learn', 'xgboost', 'lightgbm',
            'matplotlib', 'seaborn', 'joblib', 'tqdm', 'pyarrow']
    # Install into the interpreter that runs this kernel, not whatever `pip`
    # happens to be first on PATH.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + pkgs)
    # Packages installed while the interpreter is running may be invisible to
    # the import system's cached directory listings until they are refreshed.
    importlib.invalidate_caches()
    print("✅ Dependencies installed")

# Names the rest of the notebook relies on. Imported once, after the
# environment is known to be complete, so both paths above bind the same names.
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.ensemble import GradientBoostingClassifier
import joblib

# Detect the runtime: on Colab, mount Google Drive and keep artifacts there;
# anywhere else (google.colab is not importable) fall back to a local folder.
try:
    from google.colab import drive
    drive.mount('/content/drive')
except ImportError:
    IS_COLAB, BASE_DIR = False, './daygent_v1_models'
    print("✅ Local environment detected")
else:
    # same base folder as your LGBM 4h
    IS_COLAB, BASE_DIR = True, '/content/drive/MyDrive/daygent_v1_models'
    print("✅ Google Drive mounted (Colab environment)")

# Core imports
import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Input data and output artifacts both live under BASE_DIR; only the model
# directory is created here, the data directory is expected to exist already.
DATA_DIR, MODEL_DIR = (
    os.path.join(BASE_DIR, leaf) for leaf in ('spy_data_export', 'gb_1d')
)
os.makedirs(MODEL_DIR, exist_ok=True)

for label, location in (("Model", MODEL_DIR), ("Data", DATA_DIR)):
    print(f"✅ {label} directory: {location}")


🔧 Setting up dependencies...
✅ Core dependencies already available
Mounted at /content/drive
✅ Google Drive mounted (Colab environment)
✅ Model directory: /content/drive/MyDrive/daygent_v1_models/gb_1d
✅ Data directory: /content/drive/MyDrive/daygent_v1_models/spy_data_export


In [2]:
# ========================================
# STEP 2: LOAD 1D AND 4H DATA (FOR OVERLAP TEST PERIOD)
# ========================================
print("\n📊 Loading 1D and 4H timeframe data...")

# Order matters: it defines the position of the timeframe one-hot features.
TIMEFRAMES_ORDERED = ['1d', '4h']
raw_data = {}  # timeframe -> DataFrame sorted by ascending timestamp

for tf in TIMEFRAMES_ORDERED:
    csv_file = os.path.join(DATA_DIR, f'spy_{tf}.csv')
    if not os.path.exists(csv_file):
        raise FileNotFoundError(f"❌ {csv_file} not found!")
    df = pd.read_csv(csv_file)
    # utc=True forces a tz-aware UTC column regardless of how the CSV encodes
    # offsets. Later cells compare this column against tz-aware UTC bounds,
    # which raises for tz-naive data, and use `.dt`, which is unavailable on
    # the object column that mixed offsets would otherwise produce.
    df['timestamp'] = pd.to_datetime(df['timestamp'], utc=True)
    df = df.sort_values('timestamp').reset_index(drop=True)
    raw_data[tf] = df
    print(f"✅ Loaded {tf} data: {len(df):,} candles")
    print(f"📅 {tf} range: {df['timestamp'].min()} to {df['timestamp'].max()}")



📊 Loading 1D and 4H timeframe data...
✅ Loaded 1d data: 2,547 candles
📅 1d range: 2014-12-23 14:30:00+00:00 to 2025-02-07 14:30:00+00:00
✅ Loaded 4h data: 3,058 candles
📅 4h range: 2019-01-07 14:30:00+00:00 to 2025-02-10 14:30:00+00:00


In [3]:
# ========================================
# STEP 3: DEFINE TEST PERIOD (LAST 35 COMMON TRADING DAYS)
# ========================================
# The shared window is bounded by the later of the two start times and the
# earlier of the two end times.
latest_start = max(raw_data['1d']['timestamp'].min(), raw_data['4h']['timestamp'].min())
earliest_end  = min(raw_data['1d']['timestamp'].max(), raw_data['4h']['timestamp'].max())

# Calendar dates on which BOTH timeframes have at least one candle inside the
# shared window.
per_tf_dates = [
    set(ts[(ts >= latest_start) & (ts <= earliest_end)].dt.date)
    for ts in (raw_data['1d']['timestamp'], raw_data['4h']['timestamp'])
]
common_dates = set.intersection(*per_tf_dates)

all_days = sorted(common_dates)
if not all_days:
    # Without this guard the failure would be a bare IndexError on
    # selected_days[0] with no hint about the cause.
    raise ValueError("❌ No overlapping trading days between 1d and 4h data; cannot define a test period")

TEST_DAYS = min(35, len(all_days))
selected_days = all_days[-TEST_DAYS:]

# Test window spans from the first instant of the first selected day to the
# last instant of the last selected day, expressed in UTC.
test_start = pd.Timestamp.combine(selected_days[0],  pd.Timestamp.min.time()).tz_localize('UTC')
test_end   = pd.Timestamp.combine(selected_days[-1], pd.Timestamp.max.time()).tz_localize('UTC')

print(f"\n🎯 Test period: {test_start.date()} → {test_end.date()} ({TEST_DAYS} trading days)")



🎯 Test period: 2024-12-17 → 2025-02-07 (35 trading days)


In [4]:
# ========================================
# STEP 4: FEATURE EXTRACTION (16-FEATURE CONTRACT)
# ========================================
def parse_vector_column(vector_str):
    """Convert one stored vector cell into a float numpy array.

    Accepts either the stringified form found in the CSV (comma-separated
    numbers, optionally wrapped in brackets and/or quotes, with optional
    surrounding whitespace) or an already materialised sequence/array.

    Returns:
        A float ``np.ndarray``, or ``None`` when the cell is missing
        (``None`` / NaN / empty string) or cannot be parsed as numbers.
    """
    if vector_str is None:
        return None

    if isinstance(vector_str, str):
        # Strip whitespace together with the wrapper characters; stripping only
        # brackets/quotes fails as soon as the cell has a leading space or a
        # trailing newline.
        s = vector_str.strip('[]"\' \t\r\n')
        if not s:
            return None
        try:
            return np.array([float(x.strip()) for x in s.split(',')])
        except ValueError:
            return None

    # Non-string input. A truthiness test on pd.isna() is ambiguous for
    # lists/arrays (it returns an element-wise mask), so convert first and only
    # apply the missing-value check to scalars.
    try:
        arr = np.asarray(vector_str, dtype=float)
    except (TypeError, ValueError):
        return None
    if arr.ndim == 0 and np.isnan(arr):
        return None
    return arr

# Column names of the 16-feature contract, in the exact order
# build_feature_vector emits them: raw OHLCV, ISO OHLC, timeframe one-hot,
# then the engineered candle features.
FEATURE_NAMES = (
    [f'raw_{field}' for field in 'ohlcv']
    + [f'iso_{position}' for position in range(4)]
    + ['tf_1d', 'tf_4h']
    + ['hl_range', 'price_change', 'upper_shadow', 'lower_shadow', 'volume_m']
)

def build_feature_vector(raw_ohlcv, iso_ohlc, tf, tf_list):
    """Assemble the model input vector for a single candle.

    Layout: the 5 raw OHLCV values, the ISO OHLC values, one indicator per
    entry of ``tf_list`` (1 where it equals ``tf``), then five engineered
    features: high-low range, open-to-close change, upper and lower shadow
    (each relative to close or open), and volume in millions.

    Ratios whose denominator is zero are emitted as 0 instead of dividing.

    Returns:
        1-D float ``np.ndarray``.
    """
    open_, high, low, close, volume = raw_ohlcv

    def _ratio(numerator, denominator):
        # Guarded division: a falsy denominator yields 0.
        return numerator / denominator if denominator else 0

    timeframe_flags = [int(tf == candidate) for candidate in tf_list]
    engineered = [
        _ratio(high - low, close),      # hl_range
        _ratio(close - open_, open_),   # price_change
        _ratio(high - close, close),    # upper_shadow
        _ratio(close - low, close),     # lower_shadow
        volume / 1_000_000,             # volume_m
    ]
    return np.array([*raw_ohlcv, *iso_ohlc, *timeframe_flags, *engineered], dtype=float)

def extract_features_1d(row):
    """Turn one 1d candle row into a ``(feature_vector, label)`` pair.

    Reads the ``raw_ohlcv_vec``, ``iso_ohlc`` and ``future`` fields of ``row``.
    Returns ``(None, None)`` when either vector is missing or unparseable, the
    label is missing, or the vectors do not have exactly 5 and 4 elements.
    Otherwise returns the feature vector tagged as timeframe ``'1d'`` together
    with the label cast to ``int``.
    """
    ohlcv_vec = parse_vector_column(row.get('raw_ohlcv_vec'))
    iso_vec = parse_vector_column(row.get('iso_ohlc'))
    label = row.get('future')

    is_missing = ohlcv_vec is None or iso_vec is None or pd.isna(label)
    if is_missing or len(ohlcv_vec) != 5 or len(iso_vec) != 4:
        return None, None

    vector = build_feature_vector(ohlcv_vec, iso_vec, '1d', TIMEFRAMES_ORDERED)
    return vector, int(label)


In [5]:
# ========================================
# STEP 5: EXTRACT 1D TRAIN/TEST FEATURES
# ========================================
print("\n🔄 Extracting features from 1d data...")

# Everything strictly before the test window trains the model; the window
# itself (both bounds inclusive) is held out.
df_1d = raw_data['1d']
candle_time = df_1d['timestamp']
train_df = df_1d[candle_time < test_start].copy()
test_df  = df_1d[(candle_time >= test_start) & (candle_time <= test_end)].copy()

print(f"📊 Train samples: {len(train_df):,}")
print(f"📊 Test samples: {len(test_df):,}")

# Train features: rows that fail extraction come back as (None, None) and are
# dropped; the remaining rows keep their original order.
train_pairs = [
    extract_features_1d(row)
    for _, row in tqdm(train_df.iterrows(), total=len(train_df), desc="Extracting 1d train features")
]
usable_pairs = [(vector, target) for vector, target in train_pairs if vector is not None]

X_train = np.array([vector for vector, _ in usable_pairs])
y_train = np.array([target for _, target in usable_pairs])
print(f"\n✅ Training features extracted: {X_train.shape}")
if len(y_train):
    print(f"📊 Class distribution: {np.bincount(y_train)}")

# Test features, plus the raw per-row values that the detailed report prints
X_test, y_test, test_timestamps = [], [], []
test_rows_info = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Extracting 1d test features"):
    fv, lbl = extract_features_1d(row)
    if fv is None:
        continue
    X_test.append(fv)
    y_test.append(lbl)
    test_timestamps.append(row['timestamp'])
    test_rows_info.append(dict(
        timestamp=row['timestamp'],
        raw_ohlcv=parse_vector_column(row['raw_ohlcv_vec']),
        iso_ohlc=parse_vector_column(row['iso_ohlc']),
        future=int(row['future']),
        feature_vector=fv,
    ))

X_test = np.array(X_test)
y_test = np.array(y_test)
print(f"📊 Test features extracted: {X_test.shape}")



🔄 Extracting features from 1d data...
📊 Train samples: 2,512
📊 Test samples: 35


Extracting 1d train features: 100%|██████████| 2512/2512 [00:00<00:00, 16245.64it/s]



✅ Training features extracted: (2512, 16)
📊 Class distribution: [1146 1366]


Extracting 1d test features: 100%|██████████| 35/35 [00:00<00:00, 9168.16it/s]

📊 Test features extracted: (35, 16)





In [6]:
# ========================================
# STEP 6: SCALE AND SPLIT
# ========================================
# Positional 80/20 split of the in-sample rows. The scaler statistics come
# from the first 80% only and are then applied to every in-sample row.
split_idx = int(len(X_train) * 0.8)
print(f"\n🔧 Fitting scaler on first {split_idx:,} training samples...")
scaler = StandardScaler().fit(X_train[:split_idx])

X_train_scaled = scaler.transform(X_train)
X_tr, X_val = np.split(X_train_scaled, [split_idx])
y_tr, y_val = np.split(y_train, [split_idx])

for set_name, subset in (("Training", X_tr), ("Validation", X_val)):
    print(f"📊 {set_name} set: {subset.shape}")



🔧 Fitting scaler on first 2,009 training samples...
📊 Training set: (2009, 16)
📊 Validation set: (503, 16)


In [8]:
# ========================================
# STEP 7: TRAIN GRADIENT BOOSTING (EXACT BEST PARAMS) + THRESHOLD CALIBRATION
# ========================================
print("\n🚀 Training GradientBoosting (1d) with exact best parameters...")

gb_params = dict(
    n_estimators=180,
    max_depth=5,
    learning_rate=0.1625,
    subsample=0.8043,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,
)

# Use balanced sample weights (matches optimizer behavior)
sw_tr = compute_sample_weight('balanced', y_tr) if len(y_tr) else None
gb = GradientBoostingClassifier(**gb_params)
gb.fit(X_tr, y_tr, sample_weight=sw_tr)

# Validate at the default 0.5 cut-off; metrics are NaN when there is nothing
# to validate on, and AUC is NaN unless both classes are present.
if len(X_val):
    val_proba = gb.predict_proba(X_val)[:, 1]
    val_pred = (val_proba >= 0.5).astype(int)
    val_acc = accuracy_score(y_val, val_pred)
    val_auc = roc_auc_score(y_val, val_proba) if len(np.unique(y_val)) == 2 else float('nan')
else:
    val_proba, val_pred = np.array([]), np.array([])
    val_acc = val_auc = float('nan')

print(f"✅ Validation Accuracy (t=0.50): {val_acc:.4f}")
print(f"✅ Validation AUC: {val_auc:.4f}")

# Threshold calibration on validation: scan 0.30..0.70 in 41 steps and move
# away from 0.5 only if some threshold is strictly more accurate. np.argmax
# returns the first maximum, so ties resolve to the lowest threshold.
best_thr, best_val_acc = 0.5, val_acc
if len(val_proba):
    thr_grid = np.linspace(0.30, 0.70, 41)
    grid_acc = [accuracy_score(y_val, (val_proba >= cut).astype(int)) for cut in thr_grid]
    top = int(np.argmax(grid_acc))
    if grid_acc[top] > best_val_acc:
        best_val_acc, best_thr = grid_acc[top], float(thr_grid[top])

print(f"✅ Calibrated decision threshold on validation: {best_thr:.2f} (Acc={best_val_acc:.4f})")

# Refit on all in-sample (train + val) with the same hyper-parameters
X_full, y_full = X_train_scaled, y_train
sw_full = compute_sample_weight('balanced', y_full) if len(y_full) else None
gb_full = GradientBoostingClassifier(**gb_params)
gb_full.fit(X_full, y_full, sample_weight=sw_full)



🚀 Training GradientBoosting (1d) with exact best parameters...
✅ Validation Accuracy (t=0.50): 0.4811
✅ Validation AUC: 0.4855
✅ Calibrated decision threshold on validation: 0.57 (Acc=0.4911)


In [9]:
# ========================================
# STEP 8: TEST + DETAILED DAY-BY-DAY / PREDICTION-BY-PREDICTION ANALYSIS
# ========================================
print(f"\n🧪 Testing on isolated {len(selected_days)}-day period (1d)...")

# Scale test with SAME scaler
X_test_scaled = scaler.transform(X_test) if len(X_test) else np.empty((0, X_tr.shape[1]))

# Predictions: probability of class 1 from the refit model, cut at the
# threshold calibrated on validation.
test_pred_proba = gb_full.predict_proba(X_test_scaled)[:, 1] if len(X_test_scaled) else np.array([])
test_pred = (test_pred_proba >= best_thr).astype(int) if len(test_pred_proba) else np.array([])

# Metrics (NaN when there are no test rows; AUC also NaN for a single class)
test_acc = accuracy_score(y_test, test_pred) if len(test_pred) else float('nan')
test_auc = roc_auc_score(y_test, test_pred_proba) if (len(test_pred_proba) and len(np.unique(y_test))==2) else float('nan')

print(f"\n🎯 TEST RESULTS (1d):")
print(f"✅ Test Accuracy: {test_acc:.4f}")
print(f"✅ Test AUC: {test_auc:.4f}")
if len(test_pred):
    print(f"📊 Test predictions: {np.bincount(test_pred)}")
    print(f"📊 Actual labels: {np.bincount(y_test)}")

# Build detailed per-prediction table. test_rows_info was filled in the same
# order as X_test, so position i lines up with prediction i.
reported_features = ('tf_1d', 'tf_4h', 'hl_range', 'price_change',
                     'upper_shadow', 'lower_shadow', 'volume_m')
records = []
for i, info in enumerate(test_rows_info):
    ts   = info['timestamp']
    fv   = info['feature_vector']
    raw  = info['raw_ohlcv']
    iso  = info['iso_ohlc']
    true = info['future']

    proba = float(test_pred_proba[i])
    pred  = int(test_pred[i])
    correct = bool(pred == true)
    margin = proba - best_thr  # signed distance from the decision threshold

    rec = {
        'candle_index_in_test': i + 1,
        'timestamp_utc': ts,
        'date_utc': ts.date(),
        'pred_prob_up': proba,
        'pred_label': int(pred),      # 1=up, 0=down
        'true_label': int(true),
        'correct': correct,
        'threshold_used': best_thr,
        'decision_margin': margin,

        # Raw 1d OHLCV & ISO
        'raw_o': raw[0], 'raw_h': raw[1], 'raw_l': raw[2], 'raw_c': raw[3], 'raw_v': raw[4],
        'iso_0': iso[0], 'iso_1': iso[1], 'iso_2': iso[2], 'iso_3': iso[3],
    }
    # Engineered features pulled from the fv (by index in FEATURE_NAMES)
    rec.update({name: fv[FEATURE_NAMES.index(name)] for name in reported_features})
    records.append(rec)

pred_df = pd.DataFrame.from_records(records)
if not pred_df.empty:
    # With zero usable test rows the frame has no columns at all, so sorting
    # by name would raise KeyError even though everything above tolerates an
    # empty test set.
    pred_df = pred_df.sort_values(['date_utc','timestamp_utc']).reset_index(drop=True)

# Save machine-friendly CSV
pred_csv_path = os.path.join(MODEL_DIR, 'test_predictions_1d.csv')
pred_df.to_csv(pred_csv_path, index=False)

# Human-readable TXT report grouped by day
txt_lines = []
txt_lines.append("="*90)
txt_lines.append("GRADIENT BOOSTING 1D — DETAILED DAY-BY-DAY / PREDICTION-BY-PREDICTION REPORT")
txt_lines.append("="*90)
txt_lines.append(f"Test period: {test_start.date()} → {test_end.date()}")
txt_lines.append(f"Total test candles: {len(pred_df)}")
txt_lines.append(f"Calibrated threshold: {best_thr:.2f}")
txt_lines.append(f"Overall Test Accuracy: {test_acc:.4f}")
txt_lines.append(f"Overall Test AUC: {test_auc:.4f}")
txt_lines.append("")

report_days = pred_df['date_utc'].unique() if not pred_df.empty else []
for day in report_days:
    day_block = pred_df[pred_df['date_utc'] == day]
    correct_n = int(day_block['correct'].sum())
    total_n   = len(day_block)
    txt_lines.append("-"*90)
    txt_lines.append(f"{day}  —  Day accuracy: {correct_n}/{total_n}  ({correct_n/total_n:.3f})")
    txt_lines.append("-"*90)
    for _, r in day_block.iterrows():
        dir_word   = "UP" if r['pred_label'] == 1 else "DOWN"
        truth_word = "UP" if r['true_label'] == 1 else "DOWN"
        right_wrong = "✅ CORRECT" if r['correct'] else "❌ WRONG"
        txt_lines.append(
            f"[{int(r['candle_index_in_test']):02d}] {r['timestamp_utc']}  "
            f"pred={dir_word}  p_up={r['pred_prob_up']:.4f}  thr={r['threshold_used']:.2f}  "
            f"margin={r['decision_margin']:.4f}  truth={truth_word}  → {right_wrong}"
        )
        txt_lines.append(
            f"    OHLCV: O={r['raw_o']:.4f}, H={r['raw_h']:.4f}, L={r['raw_l']:.4f}, C={r['raw_c']:.4f}, V={r['raw_v']:.0f} | "
            f"ISO: [{r['iso_0']:.4f}, {r['iso_1']:.4f}, {r['iso_2']:.4f}, {r['iso_3']:.4f}] | "
            f"feats: hl={r['hl_range']:.4f}, dC={r['price_change']:.4f}, upSh={r['upper_shadow']:.4f}, "
            f"loSh={r['lower_shadow']:.4f}, vol_m={r['volume_m']:.4f}"
        )
    txt_lines.append("")

report_path = os.path.join(MODEL_DIR, 'gb_1d_day_by_day.txt')
# The report contains non-ASCII symbols (✅ ❌ → —). Without an explicit
# encoding, open() uses the locale default, which cannot encode them on
# e.g. Windows cp1252 and raises UnicodeEncodeError.
with open(report_path, 'w', encoding='utf-8') as f:
    f.write("\n".join(txt_lines))

print(f"\n📝 Saved detailed TXT report to: {report_path}")
print(f"🧾 Saved machine-readable predictions to: {pred_csv_path}")



🧪 Testing on isolated 35-day period (1d)...

🎯 TEST RESULTS (1d):
✅ Test Accuracy: 0.7143
✅ Test AUC: 0.7733
📊 Test predictions: [13 22]
📊 Actual labels: [15 20]

📝 Saved detailed TXT report to: /content/drive/MyDrive/daygent_v1_models/gb_1d/gb_1d_day_by_day.txt
🧾 Saved machine-readable predictions to: /content/drive/MyDrive/daygent_v1_models/gb_1d/test_predictions_1d.csv


In [10]:
# ========================================
# STEP 9: SAVE MODEL, SCALER, AND RESULTS
# ========================================
print("\n💾 Saving model and results...")

# Persist the refit model and the scaler it was trained behind, side by side
# in the model directory.
model_path, scaler_path = (
    os.path.join(MODEL_DIR, filename)
    for filename in ('gb_1d_final.joblib', 'scaler_1d.joblib')
)
for fitted_object, destination in ((gb_full, model_path), (scaler, scaler_path)):
    joblib.dump(fitted_object, destination)

def _to_py(v):
    try:
        if isinstance(v, (np.integer, np.int64, np.int32)):
            return int(v)
        if isinstance(v, (np.floating,)):
            return float(v)
        return v
    except Exception:
        return v

def _json_metric(value):
    """Return ``value`` as a float, or ``None`` when it is NaN or infinite.

    The metrics upstream are deliberately NaN when a split is empty or has a
    single class. ``json.dump`` would write those as a bare ``NaN`` token,
    which strict JSON parsers reject; ``None`` serialises to ``null``.
    """
    value = float(value)
    return value if np.isfinite(value) else None

# Summary of the run. Every metric goes through the same NaN guard so the file
# is always valid JSON (previously only validation_auc was guarded).
results = {
    'test_accuracy': _json_metric(test_acc),
    'test_auc': _json_metric(test_auc),
    # Accuracy at the calibrated threshold, i.e. the best value found on validation
    'validation_accuracy': _json_metric(best_val_acc),
    'validation_auc': _json_metric(val_auc),
    'chosen_threshold': float(best_thr),
    'train_samples': int(len(X_tr)),
    'val_samples': int(len(X_val)),
    'test_samples': int(len(X_test)),
    'feature_count': int(X_train.shape[1]) if X_train.ndim == 2 else 0,
    'model_params': {k: _to_py(v) for k, v in gb_params.items()},
    'feature_names': FEATURE_NAMES,
    'report_txt': os.path.basename(report_path),
    'predictions_csv': os.path.basename(pred_csv_path),
    'model_path': os.path.basename(model_path),
    'scaler_path': os.path.basename(scaler_path),
    'test_period': f"{test_start.date()} to {test_end.date()}"
}

import json
with open(os.path.join(MODEL_DIR, 'results_gb_1d.json'), 'w', encoding='utf-8') as f:
    json.dump(results, f, indent=2)

print(f"✅ Model saved to: {model_path}")
print(f"✅ Scaler saved to: {scaler_path}")
print("✅ Results JSON saved as: results_gb_1d.json")



💾 Saving model and results...
✅ Model saved to: /content/drive/MyDrive/daygent_v1_models/gb_1d/gb_1d_final.joblib
✅ Scaler saved to: /content/drive/MyDrive/daygent_v1_models/gb_1d/scaler_1d.joblib
✅ Results JSON saved as: results_gb_1d.json


In [11]:
# ========================================
# STEP 10: SAVE DEPLOYMENT ARTIFACTS (for your site)
# ========================================
import json
from textwrap import dedent

# Machine-readable description of what the serving side needs: which
# artifacts to load, the feature order, and the decision threshold.
deployment_config = dict(
    model_type="GradientBoostingClassifier",
    timeframe="1d",
    feature_contract_version="v1",
    feature_names=FEATURE_NAMES,
    calibrated_threshold=float(best_thr),
    artifact_paths=dict(
        model_joblib="gb_1d_final.joblib",
        scaler_joblib="scaler_1d.joblib",
    ),
    inference_notes=dict(
        scaling="StandardScaler fitted on first 80% of pre-test 1d training data",
        one_hot=dict(tf_1d=1, tf_4h=0),
        expected_columns_in_csv=["timestamp", "raw_ohlcv_vec", "iso_ohlc", "future"],
    ),
    gb_params={name: _to_py(value) for name, value in gb_params.items()},
)

config_path = os.path.join(MODEL_DIR, "deployment_config.json")
with open(config_path, "w") as config_file:
    config_file.write(json.dumps(deployment_config, indent=2))

# Description of the two stored vector columns, the engineered-feature
# formulas, and the fixed timeframe one-hot used by this model.
feature_schema = dict(
    raw_ohlcv_vec=dict(
        desc="Stringified list of [open, high, low, close, volume]",
        len=5,
        dtype="float",
    ),
    iso_ohlc=dict(
        desc="Stringified list of 4 ISO-normalized OHLC values",
        len=4,
        dtype="float",
    ),
    engineered=[
        "hl_range=(H-L)/C",
        "price_change=(C-O)/O",
        "upper_shadow=(H-C)/C",
        "lower_shadow=(C-L)/C",
        "volume_m=V/1e6",
    ],
    tf_one_hot=dict(tf_1d=1, tf_4h=0),
)

schema_path = os.path.join(MODEL_DIR, "feature_schema.json")
with open(schema_path, "w") as schema_file:
    schema_file.write(json.dumps(feature_schema, indent=2))

# Human-readable deployment notes. The f-string is filled in first and then
# dedented, so every interpolated value must stay on a single line.
readme_text = dedent(f"""
    ============================================
    GradientBoosting 1D Inference — Deployment Notes
    ============================================

    Artifacts:
    - Model:       {os.path.basename(model_path)}
    - Scaler:      {os.path.basename(scaler_path)}
    - Config:      {os.path.basename(config_path)}
    - Feature schema: {os.path.basename(schema_path)}
    - Threshold:   {best_thr:.2f}
    - Predictions: test_predictions_1d.csv
    - Report:      gb_1d_day_by_day.txt

    Feature order (must match EXACTLY):
    {FEATURE_NAMES}

    Inference pipeline for your site:
    1) Parse raw input row:
       - Parse 'raw_ohlcv_vec' -> [o,h,l,c,v]
       - Parse 'iso_ohlc'      -> [iso_0..iso_3]
       - Add one-hot: tf_1d=1, tf_4h=0
       - Compute engineered features as in feature_schema.json
       - Concatenate into a single 16-length vector in the listed order.

    2) Load scaler with joblib and call scaler.transform([vector]).
    3) Load model with joblib and call model.predict_proba(scaled)[0,1].
    4) If prob >= {best_thr:.2f} => predict UP (1); else DOWN (0).

    Notes:
    - Trained with balanced sample_weight via compute_sample_weight('balanced').
    - Scaler fit only on pre-test train split (first 80%).
    - Keep feature order and scaling identical for consistent results.
""").strip()

readme_path = os.path.join(MODEL_DIR, "README_DEPLOY_1D.txt")
# The text contains a non-ASCII dash; write it as UTF-8 explicitly instead of
# relying on the platform's locale-dependent default encoding.
with open(readme_path, "w", encoding="utf-8") as f:
    f.write(readme_text)

print("📦 Deployment artifacts saved:")
print(" -", config_path)
print(" -", schema_path)
print(" -", readme_path)


📦 Deployment artifacts saved:
 - /content/drive/MyDrive/daygent_v1_models/gb_1d/deployment_config.json
 - /content/drive/MyDrive/daygent_v1_models/gb_1d/feature_schema.json
 - /content/drive/MyDrive/daygent_v1_models/gb_1d/README_DEPLOY_1D.txt
