In [1]:
# ========================================
# STEP 1: CROSS-PLATFORM DEPENDENCY MANAGEMENT
# ========================================
# Announce the start of environment setup.
print("🔧 Setting up dependencies...")

# Cross-platform dependency installation.
# Probe for every package this notebook relies on. If any single import fails,
# the except branch pip-installs the full package list into the interpreter
# that is running this kernel and then retries the sklearn/joblib imports.
try:
    # Presence check only: these bare module names are re-imported under their
    # usual aliases (pd, np, ...) further down.
    import pandas, numpy, sklearn, xgboost, matplotlib, seaborn, joblib, tqdm
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import accuracy_score, roc_auc_score
    from sklearn.utils.class_weight import compute_sample_weight
    from sklearn.ensemble import GradientBoostingClassifier
    import joblib
    import lightgbm as lgb  # not used here, keeps env parity
    print("✅ Core dependencies already available")
except ImportError as e:
    print(f"Installing missing dependencies: {e}")
    import sys, subprocess
    # NOTE(review): no version specifiers are given, so pip resolves whatever
    # is current; the resulting environment may differ from the one that
    # produced the saved model.
    pkgs = ['pandas', 'numpy', 'scikit-learn', 'xgboost', 'lightgbm',
            'matplotlib', 'seaborn', 'joblib', 'tqdm', 'pyarrow']
    # sys.executable targets the kernel's own interpreter; check_call raises
    # CalledProcessError if pip exits non-zero.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install'] + pkgs)
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import accuracy_score, roc_auc_score
    from sklearn.utils.class_weight import compute_sample_weight
    from sklearn.ensemble import GradientBoostingClassifier
    import joblib
    print("✅ Dependencies installed")

# Try to mount Google Drive if available (Colab environment).
# The import of google.colab doubles as the environment probe: when it is
# importable, Drive is mounted and BASE_DIR points into it; on ImportError the
# notebook falls back to a directory relative to the current working directory.
# Only ImportError is handled, so a failure inside drive.mount() propagates.
try:
    from google.colab import drive
    drive.mount('/content/drive')
    IS_COLAB = True
    BASE_DIR = '/content/drive/MyDrive/daygent_v1_models'  # same base folder as your LGBM 4h
    print("✅ Google Drive mounted (Colab environment)")
except ImportError:
    IS_COLAB = False
    BASE_DIR = './daygent_v1_models'
    print("✅ Local environment detected")

# Core imports
import os
import pandas as pd
import numpy as np
import json
from datetime import datetime, timedelta
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Locations derived from BASE_DIR, in order: exported candle data, the
# original model artifacts (only read by this notebook), and the folder that
# receives everything this reverse test writes.
DATA_DIR, ORIGINAL_MODEL_DIR, REVERSE_MODEL_DIR = (
    os.path.join(BASE_DIR, subdir)
    for subdir in ('spy_data_export', 'gb_1d_versionlock', 'gb_1d_reverse_versionlock')
)
os.makedirs(REVERSE_MODEL_DIR, exist_ok=True)

# Run banner: purpose, the reference metrics to reproduce, and resolved paths.
print(
    "🔄 GB1D Model Reverse Test - Loading and Verifying Saved Model",
    "Target: Reproduce EXACT same results as original gb1d_iso.ipynb run",
    "Expected test accuracy: 0.7143, test AUC: 0.7733, threshold: 0.57",
    "=" * 80,
    f"✅ Original model directory: {ORIGINAL_MODEL_DIR}",
    f"✅ Reverse test output directory: {REVERSE_MODEL_DIR}",
    f"✅ Data directory: {DATA_DIR}",
    sep="\n",
)


🔧 Setting up dependencies...
✅ Core dependencies already available
Mounted at /content/drive
✅ Google Drive mounted (Colab environment)
🔄 GB1D Model Reverse Test - Loading and Verifying Saved Model
Target: Reproduce EXACT same results as original gb1d_iso.ipynb run
Expected test accuracy: 0.7143, test AUC: 0.7733, threshold: 0.57
✅ Original model directory: /content/drive/MyDrive/daygent_v1_models/gb_1d_versionlock
✅ Reverse test output directory: /content/drive/MyDrive/daygent_v1_models/gb_1d_reverse_versionlock
✅ Data directory: /content/drive/MyDrive/daygent_v1_models/spy_data_export


In [2]:
# ========================================
# VERSION EXTRACTION FOR ENVIRONMENT LOCKING
# ========================================
print("\n🔒 Extracting exact library versions for environment locking...")

import sys
import json
import pandas as pd
import numpy as np
import sklearn
import xgboost as xgb
import lightgbm as lgb
import joblib

# Interpreter version first, then each tracked library in reporting order.
versions = {'python': sys.version}
versions.update(
    (label, module.__version__)
    for label, module in (
        ('pandas', pd),
        ('numpy', np),
        ('scikit-learn', sklearn),
        ('xgboost', xgb),
        ('lightgbm', lgb),
        ('joblib', joblib),
    )
)

print("📦 Current library versions:")
print("\n".join(f"  {lib}: {ver}" for lib, ver in versions.items()))

# Persist the snapshot next to the other reverse-test outputs so an
# environment can be recreated from it later.
versions_file = os.path.join(REVERSE_MODEL_DIR, 'training_versions.json')
with open(versions_file, 'w') as f:
    json.dump(versions, f, indent=2)

print(f"✅ Versions saved to: {versions_file}")


🔒 Extracting exact library versions for environment locking...
📦 Current library versions:
  python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
  pandas: 2.2.2
  numpy: 2.0.2
  scikit-learn: 1.6.1
  xgboost: 3.0.4
  lightgbm: 4.6.0
  joblib: 1.5.1
✅ Versions saved to: /content/drive/MyDrive/daygent_v1_models/gb_1d_reverse_versionlock/training_versions.json


In [3]:
# ========================================
# STEP 2: LOAD SAVED MODEL ARTIFACTS
# ========================================
print("\n🔧 Loading saved model artifacts...")

# Load configuration files
# results_gb_1d.json: metrics/metadata recorded by the original run; later
# cells read its 'test_period', 'test_samples', 'test_accuracy', 'test_auc'
# and 'chosen_threshold' keys.
with open(os.path.join(ORIGINAL_MODEL_DIR, 'results_gb_1d.json'), 'r') as f:
    original_results = json.load(f)

# deployment_config.json: supplies the feature name list and the calibrated
# decision threshold used below.
with open(os.path.join(ORIGINAL_MODEL_DIR, 'deployment_config.json'), 'r') as f:
    config = json.load(f)

# Load model and scaler
model_path = os.path.join(ORIGINAL_MODEL_DIR, 'gb_1d_final.joblib')
scaler_path = os.path.join(ORIGINAL_MODEL_DIR, 'scaler_1d.joblib')

# NOTE(review): joblib.load unpickles arbitrary objects, so these files must
# come from a trusted location. Presumably they also need library versions
# compatible with the ones used when they were saved — confirm against the
# recorded versions.
gb_model = joblib.load(model_path)
scaler = joblib.load(scaler_path)

# Extract key parameters
FEATURE_NAMES = config['feature_names']      # column order of the feature vector
THRESHOLD = config['calibrated_threshold']   # probability cut-off for the positive label
TEST_PERIOD = original_results['test_period']

print(f"✅ Model loaded: {type(gb_model).__name__}")
print(f"✅ Scaler loaded: {type(scaler).__name__}")
print(f"📊 Feature count: {len(FEATURE_NAMES)}")
print(f"🎯 Calibrated threshold: {THRESHOLD}")
print(f"📅 Original test period: {TEST_PERIOD}")

# Load original predictions for comparison
# (compared against the regenerated predictions in the detailed comparison step)
original_preds = pd.read_csv(os.path.join(ORIGINAL_MODEL_DIR, 'test_predictions_1d.csv'))
print(f"📋 Original predictions loaded: {len(original_preds)} samples")



🔧 Loading saved model artifacts...
✅ Model loaded: GradientBoostingClassifier
✅ Scaler loaded: StandardScaler
📊 Feature count: 16
🎯 Calibrated threshold: 0.57
📅 Original test period: 2024-12-17 to 2025-02-07
📋 Original predictions loaded: 35 samples


In [4]:
# ========================================
# STEP 3: LOAD RAW DATA (EXACT SAME AS ORIGINAL)
# ========================================
print("\n📊 Loading raw data (matching original process)...")

# Timeframes in the order used for the one-hot timeframe features.
TIMEFRAMES_ORDERED = ['1d', '4h']
raw_data = {}

for tf in TIMEFRAMES_ORDERED:
    csv_file = os.path.join(DATA_DIR, f'spy_{tf}.csv')
    if not os.path.exists(csv_file):
        raise FileNotFoundError(f"❌ {csv_file} not found!")

    # Read, parse the timestamp column, and order candles chronologically
    # with a fresh 0..n-1 index.
    df = (
        pd.read_csv(csv_file)
        .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
        .sort_values('timestamp')
        .reset_index(drop=True)
    )
    raw_data[tf] = df

    candle_times = df['timestamp']
    print(f"✅ Loaded {tf} data: {len(df):,} candles")
    print(f"📅 {tf} range: {candle_times.min()} to {candle_times.max()}")



📊 Loading raw data (matching original process)...
✅ Loaded 1d data: 2,547 candles
📅 1d range: 2014-12-23 14:30:00+00:00 to 2025-02-07 14:30:00+00:00
✅ Loaded 4h data: 3,058 candles
📅 4h range: 2019-01-07 14:30:00+00:00 to 2025-02-10 14:30:00+00:00


In [5]:
# ========================================
# STEP 4: RECREATE EXACT TEST PERIOD (MATCHING ORIGINAL LOGIC)
# ========================================
print("\n🎯 Recreating exact test period determination...")

# Timestamp columns of the two timeframes whose coverage is intersected.
ts_1d = raw_data['1d']['timestamp']
ts_4h = raw_data['4h']['timestamp']

# Find common date range between 1d and 4h data
latest_start = max(ts_1d.min(), ts_4h.min())
earliest_end = min(ts_1d.max(), ts_4h.max())

# Find common trading days: calendar dates inside the shared window that have
# at least one candle in BOTH timeframes.
common_dates = set(ts_1d[(ts_1d >= latest_start) & (ts_1d <= earliest_end)].dt.date.unique())
common_dates &= set(ts_4h[(ts_4h >= latest_start) & (ts_4h <= earliest_end)].dt.date.unique())

# Select last 35 days (same as original)
all_days = sorted(common_dates)
if not all_days:
    # Without this guard TEST_DAYS would be 0, all_days[-0:] would be the empty
    # list, and selected_days[0] below would fail with a bare IndexError.
    raise ValueError("❌ No common trading days between 1d and 4h data; cannot determine test period")
TEST_DAYS = min(35, len(all_days))
selected_days = all_days[-TEST_DAYS:]

# Inclusive UTC window: midnight of the first day through the last instant of
# the final day.
test_start = pd.Timestamp.combine(selected_days[0],  pd.Timestamp.min.time()).tz_localize('UTC')
test_end   = pd.Timestamp.combine(selected_days[-1], pd.Timestamp.max.time()).tz_localize('UTC')

print(f"📅 Recreated test period: {test_start.date()} → {test_end.date()} ({TEST_DAYS} trading days)")
print(f"🔍 Original test period: {TEST_PERIOD}")

# Verify we got the exact same period
# NOTE(review): these literals duplicate the period stored in TEST_PERIOD;
# keep them in sync if the original artifacts are regenerated.
expected_start = "2024-12-17"
expected_end = "2025-02-07"
if str(test_start.date()) == expected_start and str(test_end.date()) == expected_end:
    print("✅ Test period matches original exactly!")
else:
    print(f"⚠️  Test period mismatch! Expected: {expected_start} to {expected_end}")



🎯 Recreating exact test period determination...
📅 Recreated test period: 2024-12-17 → 2025-02-07 (35 trading days)
🔍 Original test period: 2024-12-17 to 2025-02-07
✅ Test period matches original exactly!


In [6]:
# ========================================
# STEP 5: FEATURE EXTRACTION FUNCTIONS (EXACT SAME AS ORIGINAL)
# ========================================
# Section banner; the function definitions below do no work until called.
print("\n🔧 Setting up feature extraction (exact same as original)...")

def parse_vector_column(vector_str):
    """Convert a serialized vector cell into a numpy array.

    Args:
        vector_str: One of
            * a string such as ``'[1.0, 2.0, 3.0]'`` (surrounding brackets
              and double quotes are stripped before splitting on commas),
            * an already materialized sequence / array, or
            * a missing value (``None``, ``NaN``, ``pd.NA``).

    Returns:
        ``np.ndarray`` built from the values, or ``None`` when the input is
        missing or is a string whose elements cannot all be parsed as floats.
    """
    if vector_str is None:
        return None
    if isinstance(vector_str, str):
        s = vector_str.strip('[]"')
        try:
            return np.array([float(x.strip()) for x in s.split(',')])
        except ValueError:
            return None
    # pd.isna() is element-wise on list/array input and its result cannot be
    # used in a boolean context, so only apply the missing-value test to
    # scalars; sequences fall through to the array conversion below.
    if pd.api.types.is_scalar(vector_str) and pd.isna(vector_str):
        return None
    return np.array(vector_str)

def build_feature_vector(raw_ohlcv, iso_ohlc, tf, tf_list):
    """Assemble the model input vector for one candle.

    Layout: the 5 raw OHLCV values, the 4 ``iso_ohlc`` values, a one-hot
    encoding of ``tf`` over ``tf_list``, then 5 derived candle-shape features
    (hl_range, price_change, upper_shadow, lower_shadow, volume_m). Ratios
    whose denominator is falsy (zero) are reported as 0.

    Args:
        raw_ohlcv: Sequence of exactly 5 numbers (open, high, low, close, volume).
        iso_ohlc: Sequence of numbers appended after the raw values.
        tf: Timeframe label of this candle.
        tf_list: Ordered timeframe labels defining the one-hot positions.

    Returns:
        1-D float ``np.ndarray``.
    """
    open_, high, low, close, volume = raw_ohlcv

    timeframe_flags = [1 if tf == candidate else 0 for candidate in tf_list]

    hl_range = (high - low) / close if close else 0
    price_change = (close - open_) / open_ if open_ else 0
    upper_shadow = (high - close) / close if close else 0
    lower_shadow = (close - low) / close if close else 0
    volume_m = volume / 1_000_000

    return np.array(
        [
            *raw_ohlcv,
            *iso_ohlc,
            *timeframe_flags,
            hl_range,
            price_change,
            upper_shadow,
            lower_shadow,
            volume_m,
        ],
        dtype=float,
    )

def extract_features_1d(row):
    """Turn one daily-candle row into ``(feature_vector, label)``.

    Reads the ``raw_ohlcv_vec``, ``iso_ohlc`` and ``future`` fields via
    ``row.get``. Returns ``(None, None)`` when either vector is missing or
    unparsable, when the label is missing, or when the vectors do not have
    exactly 5 and 4 elements respectively. Otherwise the label is cast to int
    and the features are built for the '1d' timeframe.
    """
    ohlcv_vec = parse_vector_column(row.get('raw_ohlcv_vec'))
    iso_vec = parse_vector_column(row.get('iso_ohlc'))
    label = row.get('future')

    incomplete = ohlcv_vec is None or iso_vec is None or pd.isna(label)
    # Lengths are only inspected once both vectors are known to exist.
    if incomplete or len(ohlcv_vec) != 5 or len(iso_vec) != 4:
        return None, None

    features = build_feature_vector(ohlcv_vec, iso_vec, '1d', TIMEFRAMES_ORDERED)
    return features, int(label)

# Confirm the helpers are defined and echo the feature order loaded from the
# deployment config, for visual comparison with build_feature_vector's layout.
print(f"✅ Feature extraction functions ready")
print(f"📋 Expected feature names: {FEATURE_NAMES}")



🔧 Setting up feature extraction (exact same as original)...
✅ Feature extraction functions ready
📋 Expected feature names: ['raw_o', 'raw_h', 'raw_l', 'raw_c', 'raw_v', 'iso_0', 'iso_1', 'iso_2', 'iso_3', 'tf_1d', 'tf_4h', 'hl_range', 'price_change', 'upper_shadow', 'lower_shadow', 'volume_m']


In [7]:
# ========================================
# STEP 6: EXTRACT TEST FEATURES (EXACT SAME LOGIC)
# ========================================
print("\n🔄 Extracting test features (matching original process)...")

# Restrict the daily candles to the inclusive [test_start, test_end] window.
# .copy() gives an independent frame rather than a view of raw_data['1d'].
df_1d = raw_data['1d']
test_df = df_1d[(df_1d['timestamp'] >= test_start) & (df_1d['timestamp'] <= test_end)].copy()

print(f"📊 Test samples from data: {len(test_df)}")
print(f"📊 Expected test samples: {original_results['test_samples']}")

# Extract test features and store detailed info
# X_test / y_test / test_timestamps are parallel lists; test_rows_info keeps
# the per-row inputs in the same order for the later detailed comparison.
X_test, y_test, test_timestamps = [], [], []
test_rows_info = []

for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc="Extracting test features"):
    fv, lbl = extract_features_1d(row)
    # Rows for which extraction returned None are skipped entirely, so the
    # extracted count can be smaller than len(test_df).
    if fv is not None:
        X_test.append(fv)
        y_test.append(lbl)
        test_timestamps.append(row['timestamp'])
        test_rows_info.append({
            'timestamp': row['timestamp'],
            # The vectors are parsed again here (extract_features_1d already
            # parsed them once) to keep the raw inputs alongside the features.
            'raw_ohlcv': parse_vector_column(row['raw_ohlcv_vec']),
            'iso_ohlc':  parse_vector_column(row['iso_ohlc']),
            'future': int(row['future']),
            'feature_vector': fv
        })

# Stack into arrays; with zero extracted rows X_test is an empty 1-D array.
X_test = np.array(X_test)
y_test = np.array(y_test)

print(f"📊 Extracted test features: {X_test.shape}")
print(f"📊 Test labels: {len(y_test)}")

# Verify we got exactly the same number of samples
# (a mismatch is only reported here; it does not stop the notebook)
if len(X_test) == original_results['test_samples']:
    print("✅ Test sample count matches original exactly!")
else:
    print(f"⚠️  Sample count mismatch! Got {len(X_test)}, expected {original_results['test_samples']}")



🔄 Extracting test features (matching original process)...
📊 Test samples from data: 35
📊 Expected test samples: 35


Extracting test features: 100%|██████████| 35/35 [00:00<00:00, 8139.31it/s]

📊 Extracted test features: (35, 16)
📊 Test labels: 35
✅ Test sample count matches original exactly!





In [8]:
# ========================================
# STEP 7: MAKE PREDICTIONS WITH LOADED MODEL
# ========================================
print("\n🧠 Making predictions with loaded model...")

if len(X_test):
    # Scale with the loaded scaler, score with the loaded model, then apply
    # the calibrated threshold to the positive-class probability.
    X_test_scaled = scaler.transform(X_test)
    test_pred_proba = gb_model.predict_proba(X_test_scaled)[:, 1]
    test_pred = (test_pred_proba >= THRESHOLD).astype(int)

    test_acc = accuracy_score(y_test, test_pred)
    # AUC is only computed when both classes occur in the labels.
    both_classes_present = len(np.unique(y_test)) == 2
    test_auc = roc_auc_score(y_test, test_pred_proba) if both_classes_present else float('nan')
else:
    # Nothing to score: empty placeholders and NaN metrics.
    X_test_scaled = np.empty((0, len(FEATURE_NAMES)))
    test_pred_proba = np.array([])
    test_pred = np.array([])
    test_acc = float('nan')
    test_auc = float('nan')

print(f"\n🎯 REVERSE TEST RESULTS:")
print(f"✅ Test Accuracy: {test_acc:.4f}")
print(f"✅ Test AUC: {test_auc:.4f}")
print(f"📊 Threshold used: {THRESHOLD}")
if len(test_pred):
    # Class counts: index 0 = label 0, index 1 = label 1.
    print(f"📊 Test predictions: {np.bincount(test_pred)}")
    print(f"📊 Actual labels: {np.bincount(y_test)}")

# Reference values recorded by the original run.
print(f"\n🔍 COMPARISON WITH ORIGINAL:")
print(f"Original Test Accuracy: {original_results['test_accuracy']:.4f}")
print(f"Original Test AUC: {original_results['test_auc']:.4f}")
print(f"Original Threshold: {original_results['chosen_threshold']}")



🧠 Making predictions with loaded model...

🎯 REVERSE TEST RESULTS:
✅ Test Accuracy: 0.7143
✅ Test AUC: 0.7733
📊 Threshold used: 0.57
📊 Test predictions: [13 22]
📊 Actual labels: [15 20]

🔍 COMPARISON WITH ORIGINAL:
Original Test Accuracy: 0.7143
Original Test AUC: 0.7733
Original Threshold: 0.57


In [9]:
# ========================================
# STEP 8: DETAILED PREDICTION-BY-PREDICTION COMPARISON
# ========================================
print("\n🔍 Detailed prediction-by-prediction comparison...")

# Build new predictions table: one record per extracted test row, combining
# the model output with the inputs that produced it.
new_records = []
for i, info in enumerate(test_rows_info):
    ts   = info['timestamp']
    fv   = info['feature_vector']
    raw  = info['raw_ohlcv']
    iso  = info['iso_ohlc']
    true = info['future']

    proba = float(test_pred_proba[i])
    pred  = int(test_pred[i])
    correct = bool(pred == true)
    margin = proba - THRESHOLD  # signed distance of the probability from the cut-off

    rec = {
        'candle_index_in_test': i + 1,
        'timestamp_utc': ts,
        'date_utc': ts.date(),
        'pred_prob_up': proba,
        'pred_label': int(pred),
        'true_label': int(true),
        'correct': correct,
        'threshold_used': THRESHOLD,
        'decision_margin': margin,
        'raw_o': raw[0], 'raw_h': raw[1], 'raw_l': raw[2], 'raw_c': raw[3], 'raw_v': raw[4],
        'iso_0': iso[0], 'iso_1': iso[1], 'iso_2': iso[2], 'iso_3': iso[3],
    }
    # Remaining columns are looked up in the feature vector by name so they
    # follow whatever order FEATURE_NAMES defines.
    for name in ('tf_1d', 'tf_4h', 'hl_range', 'price_change',
                 'upper_shadow', 'lower_shadow', 'volume_m'):
        rec[name] = fv[FEATURE_NAMES.index(name)]
    new_records.append(rec)

new_pred_df = pd.DataFrame.from_records(new_records).sort_values(['date_utc','timestamp_utc']).reset_index(drop=True)

print(f"📊 New predictions table: {len(new_pred_df)} rows")
print(f"📊 Original predictions table: {len(original_preds)} rows")

# Compare key columns
comparison_cols = ['pred_prob_up', 'pred_label', 'true_label', 'correct']
print("\n🔍 Comparing key prediction columns:")

all_match = True
if len(original_preds) != len(new_pred_df):
    # The comparison below is positional; with different row counts the array
    # arithmetic would raise instead of reporting a mismatch.
    print(f"  ⚠️  Row count differs ({len(original_preds)} original vs {len(new_pred_df)} new); "
          f"skipping element-wise comparison")
    all_match = False
else:
    for col in comparison_cols:
        if col in original_preds.columns and col in new_pred_df.columns:
            if col == 'pred_prob_up':
                # For probabilities, allow small floating point differences
                diff = np.abs(original_preds[col].values - new_pred_df[col].values)
                max_diff = np.max(diff) if len(diff) else 0.0  # np.max rejects empty input
                matches = np.allclose(original_preds[col].values, new_pred_df[col].values, rtol=1e-10, atol=1e-10)
                print(f"  {col}: {'✅ EXACT MATCH' if matches else '❌ MISMATCH'} (max diff: {max_diff:.2e})")
                if not matches:
                    all_match = False
            else:
                matches = (original_preds[col].values == new_pred_df[col].values).all()
                print(f"  {col}: {'✅ EXACT MATCH' if matches else '❌ MISMATCH'}")
                if not matches:
                    all_match = False
        else:
            print(f"  {col}: ⚠️  Column not found in one of the datasets")
            all_match = False

print(f"\n🎯 OVERALL COMPARISON: {'✅ ALL PREDICTIONS MATCH EXACTLY!' if all_match else '❌ SOME DIFFERENCES FOUND'}")



🔍 Detailed prediction-by-prediction comparison...
📊 New predictions table: 35 rows
📊 Original predictions table: 35 rows

🔍 Comparing key prediction columns:
  pred_prob_up: ✅ EXACT MATCH (max diff: 1.11e-16)
  pred_label: ✅ EXACT MATCH
  true_label: ✅ EXACT MATCH
  correct: ✅ EXACT MATCH

🎯 OVERALL COMPARISON: ✅ ALL PREDICTIONS MATCH EXACTLY!


In [10]:
# ========================================
# STEP 9: FINAL VALIDATION SUMMARY
# ========================================
print("\n" + "="*80)
print("FINAL VALIDATION SUMMARY")
print("="*80)

# Metrics comparison
# The metric values may be numpy scalars, in which case `<` yields numpy.bool_.
# Coerce to plain bool so the flags (and anything derived from them) stay
# JSON-serializable when the results are written out.
acc_match = bool(abs(test_acc - original_results['test_accuracy']) < 1e-10)
auc_match = bool(abs(test_auc - original_results['test_auc']) < 1e-10)
threshold_match = bool(abs(THRESHOLD - original_results['chosen_threshold']) < 1e-10)
sample_count_match = len(X_test) == original_results['test_samples']

print(f"📊 Test Accuracy Match: {'✅' if acc_match else '❌'} ({test_acc:.6f} vs {original_results['test_accuracy']:.6f})")
print(f"📊 Test AUC Match: {'✅' if auc_match else '❌'} ({test_auc:.6f} vs {original_results['test_auc']:.6f})")
print(f"📊 Threshold Match: {'✅' if threshold_match else '❌'} ({THRESHOLD} vs {original_results['chosen_threshold']})")
print(f"📊 Sample Count Match: {'✅' if sample_count_match else '❌'} ({len(X_test)} vs {original_results['test_samples']})")
print(f"📊 Prediction Details Match: {'✅' if all_match else '❌'}")

# Overall validation
# `a and b and ...` evaluates to the first falsy operand itself, not to a
# bool, so wrap the chain to guarantee a plain True/False.
perfect_match = bool(acc_match and auc_match and threshold_match and sample_count_match and all_match)

verdict = (
    '🎉 PERFECT MATCH! Model loaded and reproduced identical results.'
    if perfect_match
    else '⚠️  Some differences detected. Review above for details.'
)
print(f"\n🎯 REVERSE TEST RESULT: {verdict}")

if perfect_match:
    print("\n✅ The saved GB1D model has been successfully validated!")
    print("✅ All predictions, metrics, and results match the original training run exactly.")
    print("✅ The model can be confidently deployed for production use.")
else:
    print("\n⚠️  Validation failed. The loaded model does not reproduce identical results.")
    print("⚠️  This could indicate:")
    print("   - Data loading differences")
    print("   - Feature extraction differences")
    print("   - Model/scaler loading issues")
    print("   - Random seed or environment differences")

print("\n" + "="*80)



FINAL VALIDATION SUMMARY
📊 Test Accuracy Match: ✅ (0.714286 vs 0.714286)
📊 Test AUC Match: ✅ (0.773333 vs 0.773333)
📊 Threshold Match: ✅ (0.57 vs 0.57)
📊 Sample Count Match: ✅ (35 vs 35)
📊 Prediction Details Match: ✅

🎯 REVERSE TEST RESULT: 🎉 PERFECT MATCH! Model loaded and reproduced identical results.

✅ The saved GB1D model has been successfully validated!
✅ All predictions, metrics, and results match the original training run exactly.
✅ The model can be confidently deployed for production use.



In [11]:
# ========================================
# STEP 10: SAVE REVERSE TEST RESULTS
# ========================================
print("\n💾 Saving reverse test results...")

# Summary document for this run. Flags are passed through bool() because
# values derived from numpy comparisons can be numpy.bool_, which json.dump
# rejects — that would abort the save exactly when validation fails.
reverse_test_results = {
    'reverse_test_date': datetime.now().isoformat(),
    'perfect_match': bool(perfect_match),
    'loaded_model_results': {
        'test_accuracy': float(test_acc),
        'test_auc': float(test_auc),
        'threshold_used': float(THRESHOLD),
        'test_samples': int(len(X_test)),
        'predictions_match': bool(all_match)
    },
    # Copied verbatim from the original run's results file.
    'original_results': original_results,
    'differences': {
        'accuracy_diff': float(abs(test_acc - original_results['test_accuracy'])),
        'auc_diff': float(abs(test_auc - original_results['test_auc'])),
        'sample_count_match': bool(len(X_test) == original_results['test_samples'])
    }
}

# Save results to the reverse test directory
results_path = os.path.join(REVERSE_MODEL_DIR, 'gb1d_reverse_test_results.json')
with open(results_path, 'w') as f:
    json.dump(reverse_test_results, f, indent=2)

# Save the new predictions for manual inspection if needed
preds_path = os.path.join(REVERSE_MODEL_DIR, 'gb1d_reverse_test_predictions.csv')
new_pred_df.to_csv(preds_path, index=False)

print(f"✅ Reverse test results saved to: {results_path}")
print(f"✅ New predictions saved to: {preds_path}")
print("\n🏁 Reverse test complete!")



💾 Saving reverse test results...
✅ Reverse test results saved to: /content/drive/MyDrive/daygent_v1_models/gb_1d_reverse_versionlock/gb1d_reverse_test_results.json
✅ New predictions saved to: /content/drive/MyDrive/daygent_v1_models/gb_1d_reverse_versionlock/gb1d_reverse_test_predictions.csv

🏁 Reverse test complete!
