# In this notebook we will train the random forest model

In [5]:
import numpy as np, pandas as pd, time, sys, warnings
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             make_scorer, accuracy_score, roc_auc_score)
warnings.filterwarnings("ignore")
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COLUMN, TARGET_COL = "timestamp", "target"
START_DATE, TEST_FRAC = "2022-01-01", 0.20 
DROP_COLS = [ 
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'rsi_oversold', 'rsi_overbought', 'stoch_overbought',
    'stoch_oversold', 'cci_overbought', 'cci_oversold', 'near_upper_band',
    'near_lower_band', 'overbought_reversal', 'oversold_reversal',
    'ema_cross_up', 'ema_cross_down', 'macd_cross_up', 'macd_cross_down',
    'trending_market', 'trend_alignment', 'ema7_above_ema21', 'macd_rising',
    'bollinger_upper', 'bollinger_lower', 'bullish_scenario_1',
    'bullish_scenario_5', 'bearish_scenario_1'
]

# ──────────────────────────────────────────────────────────────
# LOAD DATA
# ──────────────────────────────────────────────────────────────
print("📊 Loading 4H Bitcoin data...")
if not CSV_FILE.exists():
    sys.exit(f"❌ File not found: {CSV_FILE}")

df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COLUMN]).set_index(TIME_COLUMN).sort_index()
df = df.loc[START_DATE:].copy()

# Verify target column exists
if TARGET_COL not in df.columns:
    sys.exit(f"❌ Target column '{TARGET_COL}' not found!")

X = df.drop(columns=[col for col in DROP_COLS if col in df.columns] + [TARGET_COL], errors="ignore")
y = df[TARGET_COL]

# Chronological split (IMPORTANT: maintains time order)
split = int(len(df) * (1 - TEST_FRAC))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"   📅 Date range: {df.index.min()} to {df.index.max()}")
print(f"   📊 Train: {X_train.shape[0]:,} samples | Test: {X_test.shape[0]:,} samples")
print(f"   🎯 Features: {X_train.shape[1]} | Target balance: {y.mean():.1%} bullish")

# ──────────────────────────────────────────────────────────────
# CUSTOM SCORER (Fβ WITH β = 0.5 for 2x precision weight)
# ──────────────────────────────────────────────────────────────
def precision_weighted_f1(y_true, y_pred):
    """F-beta score with beta=0.5 to weight precision 2x more than recall."""
    p = precision_score(y_true, y_pred, zero_division=0)
    r = recall_score(y_true, y_pred, zero_division=0)
    beta = 0.5
    if p + r == 0:
        return 0.0
    return (1 + beta**2) * p * r / (beta**2 * p + r)

scorer = make_scorer(precision_weighted_f1, greater_is_better=True)

# ──────────────────────────────────────────────────────────────
# HYPERPARAMETER SEARCH
# ──────────────────────────────────────────────────────────────
param_dist = {
    "n_estimators":       [100, 150, 200, 250, 300, 400, 500],
    "max_depth":          [8, 10, 12, 15, 18, 20, None],
    "min_samples_split":  [5, 10, 15, 20, 25],
    "min_samples_leaf":   [2, 4, 6, 8, 10],
    "max_leaf_nodes":     [None, 50, 100, 200, 500],
    "max_features":       ["sqrt", "log2", 0.3, 0.5, 0.7],
    "bootstrap":          [True, False],
    "max_samples":        [0.7, 0.8, 0.9, 1.0],
    "class_weight":       [None, "balanced", "balanced_subsample"],
}

# Time-series cross-validation (respects temporal order)
cv = TimeSeriesSplit(n_splits=4)
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring=scorer,
    n_iter=50,
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1  # Show progress
)

print("\n🔍 Running hyperparameter optimization...")
start = time.time()
search.fit(X_train, y_train)
search_time = time.time() - start

print(f"⏱️  Optimization completed in {search_time:.1f}s")
print(f"🎯 Best CV score: {search.best_score_:.4f}")

# ──────────────────────────────────────────────────────────────
# RESULTS & EVALUATION
# ──────────────────────────────────────────────────────────────
print("\n🌟 OPTIMAL PARAMETERS:")
print("-" * 40)
for k, v in search.best_params_.items():
    print(f"   {k:<20}: {v}")

# Test set evaluation
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("\n📊 TEST SET PERFORMANCE:")
print("-" * 40)
print(f"   Accuracy                : {accuracy_score(y_test, y_pred):.4f}")
print(f"   Precision               : {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"   Recall                  : {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"   F1 (standard)           : {f1_score(y_test, y_pred, zero_division=0):.4f}")
print(f"   F1 (precision-weighted) : {precision_weighted_f1(y_test, y_pred):.4f}")
print(f"   ROC-AUC                 : {roc_auc_score(y_test, y_prob):.4f}")

# Feature importance (top 10)
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n🌟 TOP 10 MOST IMPORTANT FEATURES:")
print("-" * 40)
for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"   {i:2d}. {row['feature']:<20}: {row['importance']:.4f}")

# Additional insights
print(f"\n📈 TRAINING INSIGHTS:")
print("-" * 40)
print(f"   Train period: {df.index[0]} to {df.index[split-1]}")
print(f"   Test period:  {df.index[split]} to {df.index[-1]}")
print(f"   CV folds:     {cv.n_splits}")
print(f"   Total params tested: {len(search.cv_results_['params'])}")

print(f"\n✅ Optimization complete! Use these parameters for your production Random Forest model.")

📊 Loading 4H Bitcoin data...
   📅 Date range: 2022-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 5,672 samples | Test: 1,419 samples
   🎯 Features: 37 | Target balance: 50.6% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 147.0s
🎯 Best CV score: 0.5588

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 300
   min_samples_split   : 15
   min_samples_leaf    : 4
   max_samples         : 0.9
   max_leaf_nodes      : 200
   max_features        : 0.3
   max_depth           : 10
   class_weight        : balanced
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5321
   Precision               : 0.5494
   Recall                  : 0.4890
   F1 (standard)           : 0.5174
   F1 (precision-weighted) : 0.5361
   ROC-AUC                 : 0.5344

🌟 TOP 10 MOST IMPORTANT FEATURES:
----

📊 Loading 4H Bitcoin data...
   📅 Date range: 2016-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 16,184 samples | Test: 4,046 samples
   🎯 Features: 37 | Target balance: 51.8% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 344.3s
🎯 Best CV score: 0.5087

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 100
   min_samples_split   : 5
   min_samples_leaf    : 8
   max_samples         : 0.9
   max_leaf_nodes      : 100
   max_features        : 0.7
   max_depth           : 20
   class_weight        : None
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5368
   Precision               : 0.5677
   Recall                  : 0.4115
   F1 (standard)           : 0.4771
   F1 (precision-weighted) : 0.5276
   ROC-AUC                 : 0.5510

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0810
    2. buying_pressure     : 0.0535
    3. atr_ratio           : 0.0462
    4. volume_mean_20      : 0.0445
    5. adx                 : 0.0430
    6. volume              : 0.0424
    7. fear_greed_score    : 0.0416
    8. roc_24h             : 0.0415
    9. OBV                 : 0.0398
   10. stoch_%K            : 0.0398

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2016-01-01 00:00:00 to 2023-05-23 16:00:00
   Test period:  2023-05-23 20:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

📊 Loading 4H Bitcoin data...
   📅 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 12,684 samples | Test: 3,171 samples
   🎯 Features: 37 | Target balance: 51.1% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 343.2s
🎯 Best CV score: 0.5165

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 400
   min_samples_split   : 5
   min_samples_leaf    : 10
   max_samples         : 0.9
   max_leaf_nodes      : 500
   max_features        : 0.5
   max_depth           : 8
   class_weight        : balanced_subsample
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5244
   Precision               : 0.5769
   Recall                  : 0.3351
   F1 (standard)           : 0.4240
   F1 (precision-weighted) : 0.5042
   ROC-AUC                 : 0.5501

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0778
    2. buying_pressure     : 0.0613
    3. roc_24h             : 0.0469
    4. bb_position         : 0.0459
    5. fear_greed_score    : 0.0449
    6. atr_ratio           : 0.0431
    7. adx                 : 0.0415
    8. volume_mean_20      : 0.0404
    9. stoch_%K            : 0.0404
   10. CCI                 : 0.0386

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

📊 Loading 4H Bitcoin data...
   📅 Date range: 2020-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 9,180 samples | Test: 2,296 samples
   🎯 Features: 37 | Target balance: 51.0% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 314.8s
🎯 Best CV score: 0.5496

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 300
   min_samples_split   : 15
   min_samples_leaf    : 2
   max_samples         : 0.9
   max_leaf_nodes      : 100
   max_features        : log2
   max_depth           : None
   class_weight        : None
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5366
   Precision               : 0.5727
   Recall                  : 0.4228
   F1 (standard)           : 0.4865
   F1 (precision-weighted) : 0.5348
   ROC-AUC                 : 0.5327

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0611
    2. buying_pressure     : 0.0475
    3. bb_position         : 0.0431
    4. stoch_%K            : 0.0418
    5. fear_greed_score    : 0.0417
    6. price_vs_vwap       : 0.0397
    7. CCI                 : 0.0384
    8. roc_24h             : 0.0371
    9. stoch_%D            : 0.0367
   10. OBV                 : 0.0363

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2020-01-01 00:00:00 to 2024-03-10 08:00:00
   Test period:  2024-03-10 12:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

📊 Loading 4H Bitcoin data...
   📅 Date range: 2022-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 5,672 samples | Test: 1,419 samples
   🎯 Features: 37 | Target balance: 50.6% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 147.0s
🎯 Best CV score: 0.5588

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 300
   min_samples_split   : 15
   min_samples_leaf    : 4
   max_samples         : 0.9
   max_leaf_nodes      : 200
   max_features        : 0.3
   max_depth           : 10
   class_weight        : balanced
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5321
   Precision               : 0.5494
   Recall                  : 0.4890
   F1 (standard)           : 0.5174
   F1 (precision-weighted) : 0.5361
   ROC-AUC                 : 0.5344

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0645
    2. buying_pressure     : 0.0505
    3. volume_ratio        : 0.0435
    4. stoch_%K            : 0.0427
    5. bb_position         : 0.0419
    6. CCI                 : 0.0400
    7. atr_ratio           : 0.0394
    8. volume              : 0.0392
    9. fear_greed_score    : 0.0378
   10. stoch_%D            : 0.0377

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2022-01-01 00:00:00 to 2024-08-03 12:00:00
   Test period:  2024-08-03 16:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

In [3]:
# =============================================================
#  RANDOM-FOREST  •  FINAL TRAINING WITH OPTIMAL PARAMS
# =============================================================
import numpy as np, pandas as pd, time, sys, warnings
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
warnings.filterwarnings("ignore")
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) CONFIGURATION
# ──────────────────────────────────────────────────────────────
CSV_FILE     = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
                    r"\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COLUMN  = "timestamp"
TARGET_COL   = "target"
START_DATE   = "2018-01-01"
TEST_FRAC    = 0.20

DROP_COLS = [
    'open','high','low','high_low','high_close','low_close','typical_price',
    'volume_breakout','volume_breakdown','break_upper_band','break_lower_band',
    'vol_spike_1_5x','rsi_oversold','rsi_overbought','stoch_overbought',
    'stoch_oversold','cci_overbought','cci_oversold','near_upper_band',
    'near_lower_band','overbought_reversal','oversold_reversal',
    'ema_cross_up','ema_cross_down','macd_cross_up','macd_cross_down',
    'trending_market','trend_alignment','ema7_above_ema21','macd_rising',
    'bollinger_upper','bollinger_lower','bullish_scenario_1',
    'bullish_scenario_5','bearish_scenario_1'
]

best_params = {
    "n_estimators":     500,
    "max_depth":        6,
    "min_samples_split": 25,
    "min_samples_leaf": 8,
    "max_leaf_nodes":   200,
    "max_features":    "sqrt",
    "bootstrap":        True,
    "max_samples":      0.8,
    "class_weight":     None,
    "random_state":     42,
    "n_jobs":           -1
}

# ──────────────────────────────────────────────────────────────
# 2) LOAD & PREP DATA
# ──────────────────────────────────────────────────────────────
print("📊 Loading 4H Bitcoin data for final Random Forest training...")

if not CSV_FILE.exists():
    sys.exit(f"❌ File not found: {CSV_FILE}")

df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COLUMN]).set_index(TIME_COLUMN).sort_index()
df = df.loc[START_DATE:].copy()

if TARGET_COL not in df.columns:
    sys.exit(f"❌ '{TARGET_COL}' column missing!")

X = df.drop(columns=[col for col in DROP_COLS if col in df.columns] + [TARGET_COL], errors="ignore")
y = df[TARGET_COL]

split = int(len(df) * (1 - TEST_FRAC))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"   📅 Date range: {df.index.min()} to {df.index.max()}")
print(f"   📊 Train: {X_train.shape[0]:,} | Test: {X_test.shape[0]:,}")
print(f"   🎯 Features: {X_train.shape[1]} | Target balance: {y.mean():.1%} bullish")
print(f"   ⏰ Train period: {df.index[0]} to {df.index[split-1]}")
print(f"   🧪 Test period:  {df.index[split]} to {df.index[-1]}")

# ──────────────────────────────────────────────────────────────
# 3) TRAIN FINAL MODEL
# ──────────────────────────────────────────────────────────────
print(f"\n🚀 Training final Random Forest with optimal parameters...")
print("   Parameters:")
for k, v in best_params.items():
    print(f"      {k:<18}: {v}")

t0 = time.time()
rf_final = RandomForestClassifier(**best_params)
rf_final.fit(X_train, y_train)
training_time = time.time() - t0

print(f"🟢 Model trained successfully in {training_time:.1f}s")

# ──────────────────────────────────────────────────────────────
# 4) EVALUATE MODEL
# ──────────────────────────────────────────────────────────────
print(f"\n📊 FINAL MODEL EVALUATION")
print("=" * 40)

y_pred = rf_final.predict(X_test)
y_prob = rf_final.predict_proba(X_test)[:, 1] if rf_final.n_classes_ == 2 else rf_final.predict_proba(X_test).max(axis=1)

def precision_weighted_f1(y_true, y_pred):
    p = precision_score(y_true, y_pred, zero_division=0)
    r = recall_score(y_true, y_pred, zero_division=0)
    beta = 0.5
    return (1 + beta**2) * p * r / (beta**2 * p + r) if (p + r) > 0 else 0.0

metrics = {
    "Accuracy":                 accuracy_score(y_test, y_pred),
    "Precision":                precision_score(y_test, y_pred, zero_division=0),
    "Recall":                   recall_score(y_test, y_pred, zero_division=0),
    "F1 (standard)":            f1_score(y_test, y_pred, zero_division=0),
    "F1 (precision-weighted)":  precision_weighted_f1(y_test, y_pred),
    "ROC-AUC":                  roc_auc_score(y_test, y_prob)
}

print("🎯 Test Set Performance:")
for k, v in metrics.items():
    print(f"   {k:<25}: {v:.4f}")

# ──────────────────────────────────────────────────────────────
# 5) FEATURE IMPORTANCE
# ──────────────────────────────────────────────────────────────
print(f"\n🌟 FEATURE IMPORTANCE ANALYSIS")
print("-" * 40)

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_final.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
for i, (_, row) in enumerate(feature_importance.head(15).iterrows(), 1):
    print(f"   {i:2d}. {row['feature']:<20}: {row['importance']:.4f}")

# ──────────────────────────────────────────────────────────────
# 6) SUMMARY
# ──────────────────────────────────────────────────────────────
print(f"\n🎉 TRAINING COMPLETE!")
print("=" * 50)
print(f"🎯 Model Performance Summary:")
print(f"   • Accuracy: {metrics['Accuracy']:.3f}")
print(f"   • Precision: {metrics['Precision']:.3f} (optimized metric)")
print(f"   • F1-weighted: {metrics['F1 (precision-weighted)']:.3f}")
print(f"   • Training time: {training_time:.1f}s")
print(f"   • Features used: {len(X_train.columns)}")
print(f"\n🚀 Ready for downstream use or ensemble integration!")


📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 12,684 | Test: 3,171
   🎯 Features: 37 | Target balance: 51.1% bullish
   ⏰ Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   🧪 Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 500
      max_depth         : 6
      min_samples_split : 25
      min_samples_leaf  : 8
      max_leaf_nodes    : 200
      max_features      : sqrt
      bootstrap         : True
      max_samples       : 0.8
      class_weight      : None
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 1.3s

📊 FINAL MODEL EVALUATION
🎯 Test Set Performance:
   Accuracy                 : 0.5282
   Precision                : 0.5813
   Recall                   : 0.3454
   F1 (standard)            : 0.4333
   F1 (precision-weighted)  : 0.5114

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2016-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 16,184 | Test: 4,046
   🎯 Features: 37 | Target balance: 51.8% bullish
   ⏰ Train period: 2016-01-01 00:00:00 to 2023-05-23 16:00:00
   🧪 Test period:  2023-05-23 20:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 300
      max_depth         : 15
      min_samples_split : 10
      min_samples_leaf  : 4
      max_leaf_nodes    : 200
      max_features      : sqrt
      bootstrap         : True
      max_samples       : 0.8
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 1.7s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5309
   Precision                : 0.5776
   Recall                   : 0.3224
   F1 (standard)            : 0.4138
   F1 (precision-weighted)  : 0.4987
   ROC-AUC                  : 0.5489

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0530
    2. buying_pressure     : 0.0427
    3. bb_position         : 0.0406
    4. stoch_%K            : 0.0395
    5. fear_greed_score    : 0.0393
    6. atr_ratio           : 0.0385
    7. roc_24h             : 0.0376
    8. volume_ratio        : 0.0374
    9. adx                 : 0.0373
   10. stoch_%D            : 0.0373
   11. volume              : 0.0369
   12. price_vs_vwap       : 0.0363
   13. CCI                 : 0.0363
   14. volume_mean_20      : 0.0360
   15. parkinson_vol       : 0.0351

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.531
   • Precision: 0.578 (optimized metric)
   • F1-weighted: 0.499
   • Training time: 1.7s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2016-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 16,184 | Test: 4,046
   🎯 Features: 37 | Target balance: 51.8% bullish
   ⏰ Train period: 2016-01-01 00:00:00 to 2023-05-23 16:00:00
   🧪 Test period:  2023-05-23 20:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 300
      max_depth         : 10
      min_samples_split : 15
      min_samples_leaf  : 4
      max_leaf_nodes    : 200
      max_features      : 0.3
      bootstrap         : True
      max_samples       : 0.9
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 2.7s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5314
   Precision                : 0.5750
   Recall                   : 0.3359
   F1 (standard)            : 0.4241
   F1 (precision-weighted)  : 0.5033
   ROC-AUC                  : 0.5537

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0628
    2. buying_pressure     : 0.0481
    3. stoch_%K            : 0.0426
    4. bb_position         : 0.0426
    5. fear_greed_score    : 0.0407
    6. stoch_%D            : 0.0400
    7. volume_mean_20      : 0.0397
    8. atr_ratio           : 0.0396
    9. volume              : 0.0391
   10. roc_24h             : 0.0388
   11. volume_ratio        : 0.0383
   12. adx                 : 0.0379
   13. price_vs_vwap       : 0.0373
   14. RSI                 : 0.0367
   15. CCI                 : 0.0364

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.531
   • Precision: 0.575 (optimized metric)
   • F1-weighted: 0.503
   • Training time: 2.7s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 12,684 | Test: 3,171
   🎯 Features: 37 | Target balance: 51.1% bullish
   ⏰ Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   🧪 Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 300
      max_depth         : 15
      min_samples_split : 10
      min_samples_leaf  : 4
      max_leaf_nodes    : 200
      max_features      : sqrt
      bootstrap         : True
      max_samples       : 0.8
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 1.4s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5251
   Precision                : 0.5908
   Recall                   : 0.2947
   F1 (standard)            : 0.3932
   F1 (precision-weighted)  : 0.4919
   ROC-AUC                  : 0.5537

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0526
    2. buying_pressure     : 0.0456
    3. fear_greed_score    : 0.0421
    4. bb_position         : 0.0415
    5. roc_24h             : 0.0403
    6. stoch_%K            : 0.0389
    7. CCI                 : 0.0382
    8. adx                 : 0.0374
    9. stoch_%D            : 0.0365
   10. volume_mean_20      : 0.0363
   11. volume              : 0.0356
   12. atr_ratio           : 0.0356
   13. price_vs_vwap       : 0.0355
   14. volume_ratio        : 0.0351
   15. MACD_histogram      : 0.0339

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.525
   • Precision: 0.591 (optimized metric)
   • F1-weighted: 0.492
   • Training time: 1.4s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 12,684 | Test: 3,171
   🎯 Features: 37 | Target balance: 51.1% bullish
   ⏰ Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   🧪 Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 400
      max_depth         : 8
      min_samples_split : 5
      min_samples_leaf  : 10
      max_leaf_nodes    : 500
      max_features      : 0.5
      bootstrap         : True
      max_samples       : 0.9
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 3.6s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5244
   Precision                : 0.5769
   Recall                   : 0.3351
   F1 (standard)            : 0.4240
   F1 (precision-weighted)  : 0.5042
   ROC-AUC                  : 0.5501

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0778
    2. buying_pressure     : 0.0613
    3. roc_24h             : 0.0469
    4. bb_position         : 0.0459
    5. fear_greed_score    : 0.0449
    6. atr_ratio           : 0.0431
    7. adx                 : 0.0415
    8. volume_mean_20      : 0.0404
    9. stoch_%K            : 0.0404
   10. CCI                 : 0.0386
   11. stoch_%D            : 0.0377
   12. price_vs_vwap       : 0.0369
   13. volume              : 0.0354
   14. bollinger_width     : 0.0354
   15. volume_ratio        : 0.0353

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.524
   • Precision: 0.577 (optimized metric)
   • F1-weighted: 0.504
   • Training time: 3.6s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

📊 Loading data and training model with your best parameters...
   📊 Train: 12,684 | Test: 3,171 | Features: 37
🚀 Training Random Forest...
✅ Model trained in 1.4s

🎯 THRESHOLD SENSITIVITY ANALYSIS
==========================================================================================
Threshold  Accuracy   Precision   Recall     F1         Predictions  % of Test  % Change  
------------------------------------------------------------------------------------------
0.3        0.523      0.523       0.992      0.685      3140         99.0      %    +0.0%
0.4        0.538      0.540       0.778      0.637      2386         75.2      %    +0.0%
0.5        0.525      0.591       0.295      0.393      826          26.0      %    +0.0%
0.6        0.479      0.556       0.015      0.029      45           1.4       %   -94.6%
0.7        0.478      0.000       0.000      0.000      0            0.0       %  -100.0%
0.8        0.478      0.000       0.000      0.000      0            0.0       %  -100.0%

🏆 BEST THRESHOLDS BY METRIC:
------------------------------------------------------------
   Best Accuracy:  0.4  (Acc: 0.538, Prec: 0.540, Rec: 0.778)
   Best Precision: 0.5  (Prec: 0.591, Rec: 0.295, F1: 0.393)
   Best Recall:    0.3  (Rec: 0.992, Prec: 0.523, F1: 0.685)
   Best F1:        0.3  (F1: 0.685, Prec: 0.523, Rec: 0.992)

💡 TRADING STRATEGY RECOMMENDATIONS:
------------------------------------------------------------
   🛡️  Conservative:  No threshold achieves 65%+ precision
   ⚖️  Balanced:     0.3  (Prec: 0.523, Rec: 0.992, 3140 signals)
   ⚡ Aggressive:   0.4  (Rec: 0.778, 2386 signals)

📊 SIGNAL VOLUME ANALYSIS:
------------------------------------------------------------
   Default (0.5):   826 signals (26.0% of test set)
   High Volume:     3,140 signals at 0.3 (+0% vs default)
   Selective:       0 signals at 0.8 (-100% vs default)

📈 PERFORMANCE RANGES ACROSS THRESHOLDS:
------------------------------------------------------------
   Accuracy:   0.478 - 0.538
   Precision:  0.000 - 0.591
   Recall:     0.000 - 0.992
   F1 Score:   0.000 - 0.685
   Signals:    0 - 3,140

🎯 FINAL RECOMMENDATION:
============================================================
   🏆 Use threshold: 0.3
   📊 Performance:   Accuracy=0.523, Precision=0.523, Recall=0.992, F1=0.685
   📈 Signals:       3,140 (99.0% of test set)
   💡 Reason:        F1 score improved +74.1%

✅ Threshold analysis complete! Use threshold 0.3 for optimal performance.

📊 Loading data and training model with your best parameters...
   📊 Train: 12,684 | Test: 3,171 | Features: 37
🚀 Training Random Forest...
✅ Model trained in 1.7s

🎯 THRESHOLD SENSITIVITY ANALYSIS
==========================================================================================
Threshold  Accuracy   Precision   Recall     F1         Predictions  % of Test  % Change  
------------------------------------------------------------------------------------------
0.3        0.522      0.522       1.000      0.686      3171         100.0     %    +0.0%
0.4        0.534      0.534       0.855      0.657      2653         83.7      %    +0.0%
0.5        0.524      0.580       0.321      0.413      915          28.9      %    +0.0%
0.6        0.479      0.750       0.004      0.007      8            0.3       %   -99.1%
0.7        0.478      0.000       0.000      0.000      0            0.0       %  -100.0%
0.8        0.478      0.000       0.000      0.000      0            0.0       %  -100.0%

🏆 BEST THRESHOLDS BY METRIC:
------------------------------------------------------------
   Best Accuracy:  0.4  (Acc: 0.534, Prec: 0.534, Rec: 0.855)
   Best Precision: 0.6  (Prec: 0.750, Rec: 0.004, F1: 0.007)
   Best Recall:    0.3  (Rec: 1.000, Prec: 0.522, F1: 0.686)
   Best F1:        0.3  (F1: 0.686, Prec: 0.522, Rec: 1.000)

💡 TRADING STRATEGY RECOMMENDATIONS:
------------------------------------------------------------
   🛡️  Conservative:  0.6  (Prec: 0.750, 8 signals)
   ⚖️  Balanced:     0.3  (Prec: 0.522, Rec: 1.000, 3171 signals)
   ⚡ Aggressive:   0.4  (Rec: 0.855, 2653 signals)

📊 SIGNAL VOLUME ANALYSIS:
------------------------------------------------------------
   Default (0.5):   915 signals (28.9% of test set)
   High Volume:     3,171 signals at 0.3 (+0% vs default)
   Selective:       0 signals at 0.8 (-100% vs default)

📈 PERFORMANCE RANGES ACROSS THRESHOLDS:
------------------------------------------------------------
   Accuracy:   0.478 - 0.534
   Precision:  0.000 - 0.750
   Recall:     0.000 - 1.000
   F1 Score:   0.000 - 0.686
   Signals:    0 - 3,171

🎯 FINAL RECOMMENDATION:
============================================================
   🏆 Use threshold: 0.3
   📊 Performance:   Accuracy=0.522, Precision=0.522, Recall=1.000, F1=0.686
   📈 Signals:       3,171 (100.0% of test set)
   💡 Reason:        F1 score improved +66.1%

✅ Threshold analysis complete! Use threshold 0.3 for optimal performance.

In [13]:
# =============================================================
#  THRESHOLD EVALUATION  •  TEST YOUR TRAINED MODEL
# =============================================================
import numpy as np, pandas as pd, time, sys, warnings
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
warnings.filterwarnings("ignore")
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) CONFIGURATION - PUT YOUR BEST PARAMETERS HERE
# ──────────────────────────────────────────────────────────────
CSV_FILE     = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
                    r"\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COLUMN  = "timestamp"
TARGET_COL   = "target"
START_DATE   = "2018-01-01"
TEST_FRAC    = 0.20

DROP_COLS = [
    'open','high','low','high_low','high_close','low_close','typical_price',
    'volume_breakout','volume_breakdown','break_upper_band','break_lower_band',
    'vol_spike_1_5x','rsi_oversold','rsi_overbought','stoch_overbought',
    'stoch_oversold','cci_overbought','cci_oversold','near_upper_band',
    'near_lower_band','overbought_reversal','oversold_reversal',
    'ema_cross_up','ema_cross_down','macd_cross_up','macd_cross_down',
    'trending_market','trend_alignment','ema7_above_ema21','macd_rising',
    'bollinger_upper','bollinger_lower','bullish_scenario_1',
    'bullish_scenario_5','bearish_scenario_1'
]

# 🎯 PUT YOUR BEST PARAMETERS HERE (from hyperparameter search)
BEST_PARAMS = {
    "n_estimators":     300,
    "max_depth":        15,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "max_leaf_nodes":   200,
    "max_features":    "sqrt",
    "bootstrap":        True,
    "max_samples":      0.8,
    "class_weight":     "balanced_subsample",
    "random_state":     42,
    "n_jobs":           -1
}

# ──────────────────────────────────────────────────────────────
# 2) LOAD DATA & TRAIN MODEL
# ──────────────────────────────────────────────────────────────
print("📊 Loading data and training model with your best parameters...")

if not CSV_FILE.exists():
    sys.exit(f"❌ File not found: {CSV_FILE}")

df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COLUMN]).set_index(TIME_COLUMN).sort_index()
df = df.loc[START_DATE:].copy()

if TARGET_COL not in df.columns:
    sys.exit(f"❌ '{TARGET_COL}' column missing!")

X = df.drop(columns=[col for col in DROP_COLS if col in df.columns] + [TARGET_COL], errors="ignore")
y = df[TARGET_COL]

split = int(len(df) * (1 - TEST_FRAC))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"   📊 Train: {X_train.shape[0]:,} | Test: {X_test.shape[0]:,} | Features: {X_train.shape[1]}")

# Train model
print("🚀 Training Random Forest...")
start_time = time.time()
model = RandomForestClassifier(**BEST_PARAMS)
model.fit(X_train, y_train)
train_time = time.time() - start_time

# Get predictions and probabilities
y_prob = model.predict_proba(X_test)[:, 1]
print(f"✅ Model trained in {train_time:.1f}s")

# ──────────────────────────────────────────────────────────────
# 3) THRESHOLD ANALYSIS
# ──────────────────────────────────────────────────────────────
print(f"\n🎯 THRESHOLD SENSITIVITY ANALYSIS")
print("=" * 90)
print(f"{'Threshold':<10} {'Accuracy':<10} {'Precision':<11} {'Recall':<10} {'F1':<10} {'Predictions':<12} {'% of Test':<10} {'% Change':<10}")
print("-" * 90)

thresholds = [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]
baseline_predictions = None
threshold_results = []

for i, threshold in enumerate(thresholds):
    # Apply threshold to probabilities
    y_pred_thresh = (y_prob >= threshold).astype(int)
    
    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred_thresh)
    precision = precision_score(y_test, y_pred_thresh, zero_division=0)
    recall = recall_score(y_test, y_pred_thresh, zero_division=0)
    f1 = f1_score(y_test, y_pred_thresh, zero_division=0)
    
    # Count predictions
    positive_predictions = sum(y_pred_thresh)
    pct_of_test = (positive_predictions / len(y_test)) * 100
    
    # Set baseline (0.5 threshold) for comparison
    if threshold == 0.5:
        baseline_predictions = positive_predictions
    
    # Calculate percentage change from baseline
    if baseline_predictions is not None:
        pct_change = ((positive_predictions - baseline_predictions) / baseline_predictions * 100) if baseline_predictions > 0 else 0
    else:
        pct_change = 0
    
    threshold_results.append({
        'threshold': threshold,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'predictions': positive_predictions,
        'pct_of_test': pct_of_test,
        'pct_change': pct_change
    })
    
    # Display row
    print(f"{threshold:<10.1f} {accuracy:<10.3f} {precision:<11.3f} {recall:<10.3f} {f1:<10.3f} "
          f"{positive_predictions:<12} {pct_of_test:<10.1f}% {pct_change:>+7.1f}%")

# ──────────────────────────────────────────────────────────────
# 4) ANALYSIS & RECOMMENDATIONS
# ──────────────────────────────────────────────────────────────

# Find best thresholds for different objectives
best_accuracy = max(threshold_results, key=lambda x: x['accuracy'])
best_precision = max(threshold_results, key=lambda x: x['precision'])
best_recall = max(threshold_results, key=lambda x: x['recall'])
best_f1 = max(threshold_results, key=lambda x: x['f1'])

print(f"\n🏆 BEST THRESHOLDS BY METRIC:")
print("-" * 60)
print(f"   Best Accuracy:  {best_accuracy['threshold']:.1f}  "
      f"(Acc: {best_accuracy['accuracy']:.3f}, Prec: {best_accuracy['precision']:.3f}, Rec: {best_accuracy['recall']:.3f})")
print(f"   Best Precision: {best_precision['threshold']:.1f}  "
      f"(Prec: {best_precision['precision']:.3f}, Rec: {best_precision['recall']:.3f}, F1: {best_precision['f1']:.3f})")
print(f"   Best Recall:    {best_recall['threshold']:.1f}  "
      f"(Rec: {best_recall['recall']:.3f}, Prec: {best_recall['precision']:.3f}, F1: {best_recall['f1']:.3f})")
print(f"   Best F1:        {best_f1['threshold']:.1f}  "
      f"(F1: {best_f1['f1']:.3f}, Prec: {best_f1['precision']:.3f}, Rec: {best_f1['recall']:.3f})")

# Find balanced options
print(f"\n💡 TRADING STRATEGY RECOMMENDATIONS:")
print("-" * 60)

# Conservative (high precision, low false positives)
conservative = [r for r in threshold_results if r['precision'] >= 0.65]
if conservative:
    best_conservative = max(conservative, key=lambda x: x['recall'])
    print(f"   🛡️  Conservative:  {best_conservative['threshold']:.1f}  "
          f"(Prec: {best_conservative['precision']:.3f}, {best_conservative['predictions']} signals)")
else:
    print(f"   🛡️  Conservative:  No threshold achieves 65%+ precision")

# Balanced (good precision AND recall)
balanced = [r for r in threshold_results if r['precision'] >= 0.50 and r['recall'] >= 0.40]
if balanced:
    best_balanced = max(balanced, key=lambda x: x['f1'])
    print(f"   ⚖️  Balanced:     {best_balanced['threshold']:.1f}  "
          f"(Prec: {best_balanced['precision']:.3f}, Rec: {best_balanced['recall']:.3f}, {best_balanced['predictions']} signals)")
else:
    print(f"   ⚖️  Balanced:     No threshold achieves 50%+ precision AND 40%+ recall")

# Aggressive (high recall, catch more opportunities)
aggressive = [r for r in threshold_results if r['recall'] >= 0.50]
if aggressive:
    best_aggressive = max(aggressive, key=lambda x: x['precision'])
    print(f"   ⚡ Aggressive:   {best_aggressive['threshold']:.1f}  "
          f"(Rec: {best_aggressive['recall']:.3f}, {best_aggressive['predictions']} signals)")
else:
    print(f"   ⚡ Aggressive:   No threshold achieves 50%+ recall")

# Volume analysis
print(f"\n📊 SIGNAL VOLUME ANALYSIS:")
print("-" * 60)
baseline_result = next(r for r in threshold_results if r['threshold'] == 0.5)
print(f"   Default (0.5):   {baseline_result['predictions']:,} signals ({baseline_result['pct_of_test']:.1f}% of test set)")

high_volume = [r for r in threshold_results if r['predictions'] >= baseline_result['predictions'] * 1.5]
if high_volume:
    best_volume = min(high_volume, key=lambda x: x['threshold'])  # Lowest threshold with high volume
    print(f"   High Volume:     {best_volume['predictions']:,} signals at {best_volume['threshold']:.1f} "
          f"({best_volume['pct_change']:+.0f}% vs default)")

low_volume = [r for r in threshold_results if r['predictions'] <= baseline_result['predictions'] * 0.6]
if low_volume:
    best_selective = max(low_volume, key=lambda x: x['threshold'])  # Highest threshold with low volume
    print(f"   Selective:       {best_selective['predictions']:,} signals at {best_selective['threshold']:.1f} "
          f"({best_selective['pct_change']:+.0f}% vs default)")

# Performance ranges
print(f"\n📈 PERFORMANCE RANGES ACROSS THRESHOLDS:")
print("-" * 60)
print(f"   Accuracy:   {min(r['accuracy'] for r in threshold_results):.3f} - {max(r['accuracy'] for r in threshold_results):.3f}")
print(f"   Precision:  {min(r['precision'] for r in threshold_results):.3f} - {max(r['precision'] for r in threshold_results):.3f}")
print(f"   Recall:     {min(r['recall'] for r in threshold_results):.3f} - {max(r['recall'] for r in threshold_results):.3f}")
print(f"   F1 Score:   {min(r['f1'] for r in threshold_results):.3f} - {max(r['f1'] for r in threshold_results):.3f}")
print(f"   Signals:    {min(r['predictions'] for r in threshold_results):,} - {max(r['predictions'] for r in threshold_results):,}")

# ──────────────────────────────────────────────────────────────
# 5) DATE-BY-DATE PREDICTIONS OUTPUT
# ──────────────────────────────────────────────────────────────
print(f"\n📅 GENERATING DATE-BY-DATE PREDICTIONS...")

# Use default 0.5 threshold for predictions
y_pred_final = (y_prob >= 0.5).astype(int)

# Create detailed predictions DataFrame
predictions_df = pd.DataFrame({
    'timestamp': X_test.index,
    'actual': y_test.values,
    'probability': y_prob,
    'predicted': y_pred_final
})

# Add prediction confidence categories
predictions_df['confidence'] = pd.cut(
    predictions_df['probability'], 
    bins=[0, 0.3, 0.4, 0.6, 0.7, 1.0],
    labels=['Very_Low', 'Low', 'Medium', 'High', 'Very_High']
)

# Add correctness
predictions_df['correct'] = (predictions_df['actual'] == predictions_df['predicted'])

print(f"📊 SAMPLE PREDICTIONS (First 20 rows):")
print("=" * 90)
print(f"{'Date':<20} {'Actual':<7} {'Predicted':<10} {'Probability':<12} {'Confidence':<12} {'Correct':<8}")
print("-" * 90)

for i, (_, row) in enumerate(predictions_df.head(20).iterrows()):
    date_str = row['timestamp'].strftime('%Y-%m-%d %H:%M')
    actual_str = "🟢 Bull" if row['actual'] == 1 else "🔴 Bear"
    pred_str = "🟢 Bull" if row['predicted'] == 1 else "🔴 Bear"
    prob_str = f"{row['probability']:.4f}"
    conf_str = str(row['confidence'])
    correct_str = "✅" if row['correct'] else "❌"
    
    print(f"{date_str:<20} {actual_str:<7} {pred_str:<10} {prob_str:<12} {conf_str:<12} {correct_str:<8}")

# Show statistics by confidence level
print(f"\n📈 ACCURACY BY CONFIDENCE LEVEL:")
print("-" * 50)
confidence_stats = predictions_df.groupby('confidence').agg({
    'correct': ['count', 'sum', 'mean'],
    'probability': ['mean', 'std']
}).round(4)

for conf_level in predictions_df['confidence'].cat.categories:
    if conf_level in confidence_stats.index:
        stats = confidence_stats.loc[conf_level]
        count = int(stats[('correct', 'count')])
        accuracy = stats[('correct', 'mean')]
        avg_prob = stats[('probability', 'mean')]
        
        print(f"   {conf_level:<12}: {count:>4} predictions, {accuracy:.1%} accuracy, avg prob: {avg_prob:.3f}")

# Save predictions to CSV for ensemble analysis
output_file = CSV_FILE.parent / f"rf_predictions_{START_DATE.replace('-', '')}.csv"

# Also save to Desktop for easy access
desktop_path = Path.home() / "Desktop"
desktop_file = desktop_path / f"bitcoin_rf_predictions_{START_DATE.replace('-', '')}.csv"

try:
    predictions_df.to_csv(desktop_file, index=False)
    desktop_saved = True
except:
    desktop_saved = False

print(f"\n💾 PREDICTIONS SAVED:")
print(f"   Primary file: {output_file}")
if desktop_saved:
    print(f"   Desktop copy: {desktop_file}")
    print(f"   ✅ Ready to download from Desktop!")
else:
    print(f"   ❌ Could not save to Desktop, check permissions")
print(f"   Rows: {len(predictions_df):,}")
print(f"   Columns: {list(predictions_df.columns)}")

# Create a summary file for quick reference
summary_data = {
    'Model': ['Random_Forest'],
    'Start_Date': [START_DATE],
    'Test_Samples': [len(predictions_df)],
    'Accuracy': [predictions_df['correct'].mean()],
    'Precision': [precision_score(predictions_df['actual'], predictions_df['predicted'])],
    'Recall': [recall_score(predictions_df['actual'], predictions_df['predicted'])],
    'F1_Score': [f1_score(predictions_df['actual'], predictions_df['predicted'])],
    'Avg_Probability': [predictions_df['probability'].mean()],
    'Bull_Signals': [sum(predictions_df['predicted'])],
    'Bull_Percentage': [sum(predictions_df['predicted'])/len(predictions_df)*100],
    'File_Path': [str(desktop_file if desktop_saved else output_file)]
}

summary_df = pd.DataFrame(summary_data)
summary_file = desktop_path / f"bitcoin_model_summary_{START_DATE.replace('-', '')}.csv" if desktop_saved else CSV_FILE.parent / f"model_summary_{START_DATE.replace('-', '')}.csv"

try:
    summary_df.to_csv(summary_file, index=False)
    print(f"   Summary file: {summary_file}")
except:
    print(f"   ❌ Could not save summary file")

# Summary for ensemble integration
print(f"\n🤖 ENSEMBLE INTEGRATION READY:")
print("-" * 50)
print(f"   Model Type:       Random Forest")
print(f"   Test Period:      {predictions_df['timestamp'].min()} to {predictions_df['timestamp'].max()}")
print(f"   Total Predictions: {len(predictions_df):,}")
print(f"   Bullish Signals:   {sum(predictions_df['predicted']):,} ({sum(predictions_df['predicted'])/len(predictions_df)*100:.1f}%)")
print(f"   Overall Accuracy:  {predictions_df['correct'].mean():.1%}")
print(f"   Avg Probability:   {predictions_df['probability'].mean():.3f}")

# Show high confidence predictions (for ensemble voting)
high_conf_mask = predictions_df['probability'] >= 0.7
low_conf_mask = predictions_df['probability'] <= 0.3

print(f"\n🎯 PROBABILITY DISTRIBUTION:")
print("-" * 50)
if high_conf_mask.any():
    high_conf_bull = high_conf_mask & (predictions_df['predicted'] == 1)
    print(f"   High confidence Bull: {sum(high_conf_bull):,} (prob ≥ 0.7)")
    if sum(high_conf_bull) > 0:
        print(f"   Sample dates: {', '.join(predictions_df[high_conf_bull]['timestamp'].dt.strftime('%Y-%m-%d').head(3).tolist())}")

if low_conf_mask.any():
    high_conf_bear = low_conf_mask & (predictions_df['predicted'] == 0)
    print(f"   High confidence Bear: {sum(high_conf_bear):,} (prob ≤ 0.3)")
    if sum(high_conf_bear) > 0:
        print(f"   Sample dates: {', '.join(predictions_df[high_conf_bear]['timestamp'].dt.strftime('%Y-%m-%d').head(3).tolist())}")

print(f"   💡 You can experiment with any threshold using the 'probability' column!")

# ──────────────────────────────────────────────────────────────
# 6) FINAL RECOMMENDATION
# ──────────────────────────────────────────────────────────────
print(f"\n🎯 FINAL RECOMMENDATION:")
print("=" * 60)

# Choose best overall threshold
if best_f1['f1'] > baseline_result['f1'] * 1.1:  # If F1 improved by 10%+
    recommended = best_f1
    reason = f"F1 score improved {((best_f1['f1'] / baseline_result['f1']) - 1) * 100:+.1f}%"
elif best_recall['recall'] > baseline_result['recall'] * 1.5:  # If recall improved significantly
    recommended = best_recall
    reason = f"Recall improved {((best_recall['recall'] / baseline_result['recall']) - 1) * 100:+.1f}%"
else:
    recommended = baseline_result
    reason = "Default threshold performs best"

print(f"   🏆 Use threshold: {recommended['threshold']:.1f}")
print(f"   📊 Performance:   Accuracy={recommended['accuracy']:.3f}, Precision={recommended['precision']:.3f}, "
      f"Recall={recommended['recall']:.3f}, F1={recommended['f1']:.3f}")
print(f"   📈 Signals:       {recommended['predictions']:,} ({recommended['pct_of_test']:.1f}% of test set)")
print(f"   💡 Reason:        {reason}")

print(f"\n✅ Threshold analysis complete! Use threshold {recommended['threshold']:.1f} for optimal performance.")

📊 Loading data and training model with your best parameters...
   📊 Train: 12,684 | Test: 3,171 | Features: 37
🚀 Training Random Forest...
✅ Model trained in 1.4s

🎯 THRESHOLD SENSITIVITY ANALYSIS
Threshold  Accuracy   Precision   Recall     F1         Predictions  % of Test  % Change  
------------------------------------------------------------------------------------------
0.3        0.523      0.523       0.992      0.685      3140         99.0      %    +0.0%
0.4        0.538      0.540       0.778      0.637      2386         75.2      %    +0.0%
0.5        0.525      0.591       0.295      0.393      826          26.0      %    +0.0%
0.6        0.479      0.556       0.015      0.029      45           1.4       %   -94.6%
0.7        0.478      0.000       0.000      0.000      0            0.0       %  -100.0%
0.8        0.478      0.000       0.000      0.000      0            0.0       %  -100.0%

🏆 BEST THRESHOLDS BY METRIC:
---------------------------------------------------