# In this notebook we will train the random forest model

In [5]:
import numpy as np, pandas as pd, time, sys, warnings
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             make_scorer, accuracy_score, roc_auc_score)
warnings.filterwarnings("ignore")
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COLUMN, TARGET_COL = "timestamp", "target"
START_DATE, TEST_FRAC = "2022-01-01", 0.20 
DROP_COLS = [ 
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'rsi_oversold', 'rsi_overbought', 'stoch_overbought',
    'stoch_oversold', 'cci_overbought', 'cci_oversold', 'near_upper_band',
    'near_lower_band', 'overbought_reversal', 'oversold_reversal',
    'ema_cross_up', 'ema_cross_down', 'macd_cross_up', 'macd_cross_down',
    'trending_market', 'trend_alignment', 'ema7_above_ema21', 'macd_rising',
    'bollinger_upper', 'bollinger_lower', 'bullish_scenario_1',
    'bullish_scenario_5', 'bearish_scenario_1'
]

# ──────────────────────────────────────────────────────────────
# LOAD DATA
# ──────────────────────────────────────────────────────────────
print("📊 Loading 4H Bitcoin data...")
if not CSV_FILE.exists():
    sys.exit(f"❌ File not found: {CSV_FILE}")

df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COLUMN]).set_index(TIME_COLUMN).sort_index()
df = df.loc[START_DATE:].copy()

# Verify target column exists
if TARGET_COL not in df.columns:
    sys.exit(f"❌ Target column '{TARGET_COL}' not found!")

X = df.drop(columns=[col for col in DROP_COLS if col in df.columns] + [TARGET_COL], errors="ignore")
y = df[TARGET_COL]

# Chronological split (IMPORTANT: maintains time order)
split = int(len(df) * (1 - TEST_FRAC))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"   📅 Date range: {df.index.min()} to {df.index.max()}")
print(f"   📊 Train: {X_train.shape[0]:,} samples | Test: {X_test.shape[0]:,} samples")
print(f"   🎯 Features: {X_train.shape[1]} | Target balance: {y.mean():.1%} bullish")

# ──────────────────────────────────────────────────────────────
# CUSTOM SCORER (Fβ WITH β = 0.5 for 2x precision weight)
# ──────────────────────────────────────────────────────────────
def precision_weighted_f1(y_true, y_pred):
    """F-beta score with beta=0.5 to weight precision 2x more than recall."""
    p = precision_score(y_true, y_pred, zero_division=0)
    r = recall_score(y_true, y_pred, zero_division=0)
    beta = 0.5
    if p + r == 0:
        return 0.0
    return (1 + beta**2) * p * r / (beta**2 * p + r)

scorer = make_scorer(precision_weighted_f1, greater_is_better=True)

# ──────────────────────────────────────────────────────────────
# HYPERPARAMETER SEARCH
# ──────────────────────────────────────────────────────────────
param_dist = {
    "n_estimators":       [100, 150, 200, 250, 300, 400, 500],
    "max_depth":          [8, 10, 12, 15, 18, 20, None],
    "min_samples_split":  [5, 10, 15, 20, 25],
    "min_samples_leaf":   [2, 4, 6, 8, 10],
    "max_leaf_nodes":     [None, 50, 100, 200, 500],
    "max_features":       ["sqrt", "log2", 0.3, 0.5, 0.7],
    "bootstrap":          [True, False],
    "max_samples":        [0.7, 0.8, 0.9, 1.0],
    "class_weight":       [None, "balanced", "balanced_subsample"],
}

# Time-series cross-validation (respects temporal order)
cv = TimeSeriesSplit(n_splits=4)
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring=scorer,
    n_iter=50,
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1  # Show progress
)

print("\n🔍 Running hyperparameter optimization...")
start = time.time()
search.fit(X_train, y_train)
search_time = time.time() - start

print(f"⏱️  Optimization completed in {search_time:.1f}s")
print(f"🎯 Best CV score: {search.best_score_:.4f}")

# ──────────────────────────────────────────────────────────────
# RESULTS & EVALUATION
# ──────────────────────────────────────────────────────────────
print("\n🌟 OPTIMAL PARAMETERS:")
print("-" * 40)
for k, v in search.best_params_.items():
    print(f"   {k:<20}: {v}")

# Test set evaluation
best_model = search.best_estimator_
y_pred = best_model.predict(X_test)
y_prob = best_model.predict_proba(X_test)[:, 1]

print("\n📊 TEST SET PERFORMANCE:")
print("-" * 40)
print(f"   Accuracy                : {accuracy_score(y_test, y_pred):.4f}")
print(f"   Precision               : {precision_score(y_test, y_pred, zero_division=0):.4f}")
print(f"   Recall                  : {recall_score(y_test, y_pred, zero_division=0):.4f}")
print(f"   F1 (standard)           : {f1_score(y_test, y_pred, zero_division=0):.4f}")
print(f"   F1 (precision-weighted) : {precision_weighted_f1(y_test, y_pred):.4f}")
print(f"   ROC-AUC                 : {roc_auc_score(y_test, y_prob):.4f}")

# Feature importance (top 10)
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

print(f"\n🌟 TOP 10 MOST IMPORTANT FEATURES:")
print("-" * 40)
for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"   {i:2d}. {row['feature']:<20}: {row['importance']:.4f}")

# Additional insights
print(f"\n📈 TRAINING INSIGHTS:")
print("-" * 40)
print(f"   Train period: {df.index[0]} to {df.index[split-1]}")
print(f"   Test period:  {df.index[split]} to {df.index[-1]}")
print(f"   CV folds:     {cv.n_splits}")
print(f"   Total params tested: {len(search.cv_results_['params'])}")

print(f"\n✅ Optimization complete! Use these parameters for your production Random Forest model.")

📊 Loading 4H Bitcoin data...
   📅 Date range: 2022-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 5,672 samples | Test: 1,419 samples
   🎯 Features: 37 | Target balance: 50.6% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 147.0s
🎯 Best CV score: 0.5588

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 300
   min_samples_split   : 15
   min_samples_leaf    : 4
   max_samples         : 0.9
   max_leaf_nodes      : 200
   max_features        : 0.3
   max_depth           : 10
   class_weight        : balanced
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5321
   Precision               : 0.5494
   Recall                  : 0.4890
   F1 (standard)           : 0.5174
   F1 (precision-weighted) : 0.5361
   ROC-AUC                 : 0.5344

🌟 TOP 10 MOST IMPORTANT FEATURES:
----

📊 Loading 4H Bitcoin data...
   📅 Date range: 2016-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 16,184 samples | Test: 4,046 samples
   🎯 Features: 37 | Target balance: 51.8% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 344.3s
🎯 Best CV score: 0.5087

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 100
   min_samples_split   : 5
   min_samples_leaf    : 8
   max_samples         : 0.9
   max_leaf_nodes      : 100
   max_features        : 0.7
   max_depth           : 20
   class_weight        : None
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5368
   Precision               : 0.5677
   Recall                  : 0.4115
   F1 (standard)           : 0.4771
   F1 (precision-weighted) : 0.5276
   ROC-AUC                 : 0.5510

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0810
    2. buying_pressure     : 0.0535
    3. atr_ratio           : 0.0462
    4. volume_mean_20      : 0.0445
    5. adx                 : 0.0430
    6. volume              : 0.0424
    7. fear_greed_score    : 0.0416
    8. roc_24h             : 0.0415
    9. OBV                 : 0.0398
   10. stoch_%K            : 0.0398

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2016-01-01 00:00:00 to 2023-05-23 16:00:00
   Test period:  2023-05-23 20:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

📊 Loading 4H Bitcoin data...
   📅 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 12,684 samples | Test: 3,171 samples
   🎯 Features: 37 | Target balance: 51.1% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 343.2s
🎯 Best CV score: 0.5165

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 400
   min_samples_split   : 5
   min_samples_leaf    : 10
   max_samples         : 0.9
   max_leaf_nodes      : 500
   max_features        : 0.5
   max_depth           : 8
   class_weight        : balanced_subsample
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5244
   Precision               : 0.5769
   Recall                  : 0.3351
   F1 (standard)           : 0.4240
   F1 (precision-weighted) : 0.5042
   ROC-AUC                 : 0.5501

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0778
    2. buying_pressure     : 0.0613
    3. roc_24h             : 0.0469
    4. bb_position         : 0.0459
    5. fear_greed_score    : 0.0449
    6. atr_ratio           : 0.0431
    7. adx                 : 0.0415
    8. volume_mean_20      : 0.0404
    9. stoch_%K            : 0.0404
   10. CCI                 : 0.0386

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

📊 Loading 4H Bitcoin data...
   📅 Date range: 2020-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 9,180 samples | Test: 2,296 samples
   🎯 Features: 37 | Target balance: 51.0% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 314.8s
🎯 Best CV score: 0.5496

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 300
   min_samples_split   : 15
   min_samples_leaf    : 2
   max_samples         : 0.9
   max_leaf_nodes      : 100
   max_features        : log2
   max_depth           : None
   class_weight        : None
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5366
   Precision               : 0.5727
   Recall                  : 0.4228
   F1 (standard)           : 0.4865
   F1 (precision-weighted) : 0.5348
   ROC-AUC                 : 0.5327

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0611
    2. buying_pressure     : 0.0475
    3. bb_position         : 0.0431
    4. stoch_%K            : 0.0418
    5. fear_greed_score    : 0.0417
    6. price_vs_vwap       : 0.0397
    7. CCI                 : 0.0384
    8. roc_24h             : 0.0371
    9. stoch_%D            : 0.0367
   10. OBV                 : 0.0363

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2020-01-01 00:00:00 to 2024-03-10 08:00:00
   Test period:  2024-03-10 12:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

📊 Loading 4H Bitcoin data...
   📅 Date range: 2022-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 5,672 samples | Test: 1,419 samples
   🎯 Features: 37 | Target balance: 50.6% bullish

🔍 Running hyperparameter optimization...
Fitting 4 folds for each of 50 candidates, totalling 200 fits
⏱️  Optimization completed in 147.0s
🎯 Best CV score: 0.5588

🌟 OPTIMAL PARAMETERS:
----------------------------------------
   n_estimators        : 300
   min_samples_split   : 15
   min_samples_leaf    : 4
   max_samples         : 0.9
   max_leaf_nodes      : 200
   max_features        : 0.3
   max_depth           : 10
   class_weight        : balanced
   bootstrap           : True

📊 TEST SET PERFORMANCE:
----------------------------------------
   Accuracy                : 0.5321
   Precision               : 0.5494
   Recall                  : 0.4890
   F1 (standard)           : 0.5174
   F1 (precision-weighted) : 0.5361
   ROC-AUC                 : 0.5344

🌟 TOP 10 MOST IMPORTANT FEATURES:
----------------------------------------
    1. roc_4h              : 0.0645
    2. buying_pressure     : 0.0505
    3. volume_ratio        : 0.0435
    4. stoch_%K            : 0.0427
    5. bb_position         : 0.0419
    6. CCI                 : 0.0400
    7. atr_ratio           : 0.0394
    8. volume              : 0.0392
    9. fear_greed_score    : 0.0378
   10. stoch_%D            : 0.0377

📈 TRAINING INSIGHTS:
----------------------------------------
   Train period: 2022-01-01 00:00:00 to 2024-08-03 12:00:00
   Test period:  2024-08-03 16:00:00 to 2025-03-28 00:00:00
   CV folds:     4
   Total params tested: 50

✅ Optimization complete! Use these parameters for your production Random Forest model.

In [12]:
# =============================================================
#  RANDOM-FOREST  •  FINAL TRAINING WITH OPTIMAL PARAMS
# =============================================================
import numpy as np, pandas as pd, time, sys, warnings
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score)
warnings.filterwarnings("ignore")
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# 1) CONFIGURATION
# ──────────────────────────────────────────────────────────────
CSV_FILE     = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
                    r"\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COLUMN  = "timestamp"
TARGET_COL   = "target"
START_DATE   = "2016-01-01"
TEST_FRAC    = 0.20

DROP_COLS = [
    'open','high','low','high_low','high_close','low_close','typical_price',
    'volume_breakout','volume_breakdown','break_upper_band','break_lower_band',
    'vol_spike_1_5x','rsi_oversold','rsi_overbought','stoch_overbought',
    'stoch_oversold','cci_overbought','cci_oversold','near_upper_band',
    'near_lower_band','overbought_reversal','oversold_reversal',
    'ema_cross_up','ema_cross_down','macd_cross_up','macd_cross_down',
    'trending_market','trend_alignment','ema7_above_ema21','macd_rising',
    'bollinger_upper','bollinger_lower','bullish_scenario_1',
    'bullish_scenario_5','bearish_scenario_1'
]

best_params = {
    "n_estimators":     300,
    "max_depth":        10,
    "min_samples_split": 15,
    "min_samples_leaf": 4,
    "max_leaf_nodes":   200,
    "max_features":    0.3,
    "bootstrap":        True,
    "max_samples":      0.9,
    "class_weight":     "balanced_subsample",
    "random_state":     42,
    "n_jobs":           -1
}

# ──────────────────────────────────────────────────────────────
# 2) LOAD & PREP DATA
# ──────────────────────────────────────────────────────────────
print("📊 Loading 4H Bitcoin data for final Random Forest training...")

if not CSV_FILE.exists():
    sys.exit(f"❌ File not found: {CSV_FILE}")

df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COLUMN]).set_index(TIME_COLUMN).sort_index()
df = df.loc[START_DATE:].copy()

if TARGET_COL not in df.columns:
    sys.exit(f"❌ '{TARGET_COL}' column missing!")

X = df.drop(columns=[col for col in DROP_COLS if col in df.columns] + [TARGET_COL], errors="ignore")
y = df[TARGET_COL]

split = int(len(df) * (1 - TEST_FRAC))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"   📅 Date range: {df.index.min()} to {df.index.max()}")
print(f"   📊 Train: {X_train.shape[0]:,} | Test: {X_test.shape[0]:,}")
print(f"   🎯 Features: {X_train.shape[1]} | Target balance: {y.mean():.1%} bullish")
print(f"   ⏰ Train period: {df.index[0]} to {df.index[split-1]}")
print(f"   🧪 Test period:  {df.index[split]} to {df.index[-1]}")

# ──────────────────────────────────────────────────────────────
# 3) TRAIN FINAL MODEL
# ──────────────────────────────────────────────────────────────
print(f"\n🚀 Training final Random Forest with optimal parameters...")
print("   Parameters:")
for k, v in best_params.items():
    print(f"      {k:<18}: {v}")

t0 = time.time()
rf_final = RandomForestClassifier(**best_params)
rf_final.fit(X_train, y_train)
training_time = time.time() - t0

print(f"🟢 Model trained successfully in {training_time:.1f}s")

# ──────────────────────────────────────────────────────────────
# 4) EVALUATE MODEL
# ──────────────────────────────────────────────────────────────
print(f"\n📊 FINAL MODEL EVALUATION")
print("=" * 40)

y_pred = rf_final.predict(X_test)
y_prob = rf_final.predict_proba(X_test)[:, 1] if rf_final.n_classes_ == 2 else rf_final.predict_proba(X_test).max(axis=1)

def precision_weighted_f1(y_true, y_pred):
    p = precision_score(y_true, y_pred, zero_division=0)
    r = recall_score(y_true, y_pred, zero_division=0)
    beta = 0.5
    return (1 + beta**2) * p * r / (beta**2 * p + r) if (p + r) > 0 else 0.0

metrics = {
    "Accuracy":                 accuracy_score(y_test, y_pred),
    "Precision":                precision_score(y_test, y_pred, zero_division=0),
    "Recall":                   recall_score(y_test, y_pred, zero_division=0),
    "F1 (standard)":            f1_score(y_test, y_pred, zero_division=0),
    "F1 (precision-weighted)":  precision_weighted_f1(y_test, y_pred),
    "ROC-AUC":                  roc_auc_score(y_test, y_prob)
}

print("🎯 Test Set Performance:")
for k, v in metrics.items():
    print(f"   {k:<25}: {v:.4f}")

# ──────────────────────────────────────────────────────────────
# 5) FEATURE IMPORTANCE
# ──────────────────────────────────────────────────────────────
print(f"\n🌟 FEATURE IMPORTANCE ANALYSIS")
print("-" * 40)

feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': rf_final.feature_importances_
}).sort_values('importance', ascending=False)

print("Top 15 Most Important Features:")
for i, (_, row) in enumerate(feature_importance.head(15).iterrows(), 1):
    print(f"   {i:2d}. {row['feature']:<20}: {row['importance']:.4f}")

# ──────────────────────────────────────────────────────────────
# 6) SUMMARY
# ──────────────────────────────────────────────────────────────
print(f"\n🎉 TRAINING COMPLETE!")
print("=" * 50)
print(f"🎯 Model Performance Summary:")
print(f"   • Accuracy: {metrics['Accuracy']:.3f}")
print(f"   • Precision: {metrics['Precision']:.3f} (optimized metric)")
print(f"   • F1-weighted: {metrics['F1 (precision-weighted)']:.3f}")
print(f"   • Training time: {training_time:.1f}s")
print(f"   • Features used: {len(X_train.columns)}")
print(f"\n🚀 Ready for downstream use or ensemble integration!")


📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2016-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 16,184 | Test: 4,046
   🎯 Features: 37 | Target balance: 51.8% bullish
   ⏰ Train period: 2016-01-01 00:00:00 to 2023-05-23 16:00:00
   🧪 Test period:  2023-05-23 20:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 300
      max_depth         : 10
      min_samples_split : 15
      min_samples_leaf  : 4
      max_leaf_nodes    : 200
      max_features      : 0.3
      bootstrap         : True
      max_samples       : 0.9
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 2.7s

📊 FINAL MODEL EVALUATION
🎯 Test Set Performance:
   Accuracy                 : 0.5314
   Precision                : 0.5750
   Recall                   : 0.3359
   F1 (standard)            : 0.4241
   F1 (precision-weigh

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2016-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 16,184 | Test: 4,046
   🎯 Features: 37 | Target balance: 51.8% bullish
   ⏰ Train period: 2016-01-01 00:00:00 to 2023-05-23 16:00:00
   🧪 Test period:  2023-05-23 20:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 300
      max_depth         : 15
      min_samples_split : 10
      min_samples_leaf  : 4
      max_leaf_nodes    : 200
      max_features      : sqrt
      bootstrap         : True
      max_samples       : 0.8
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 1.7s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5309
   Precision                : 0.5776
   Recall                   : 0.3224
   F1 (standard)            : 0.4138
   F1 (precision-weighted)  : 0.4987
   ROC-AUC                  : 0.5489

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0530
    2. buying_pressure     : 0.0427
    3. bb_position         : 0.0406
    4. stoch_%K            : 0.0395
    5. fear_greed_score    : 0.0393
    6. atr_ratio           : 0.0385
    7. roc_24h             : 0.0376
    8. volume_ratio        : 0.0374
    9. adx                 : 0.0373
   10. stoch_%D            : 0.0373
   11. volume              : 0.0369
   12. price_vs_vwap       : 0.0363
   13. CCI                 : 0.0363
   14. volume_mean_20      : 0.0360
   15. parkinson_vol       : 0.0351

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.531
   • Precision: 0.578 (optimized metric)
   • F1-weighted: 0.499
   • Training time: 1.7s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2016-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 16,184 | Test: 4,046
   🎯 Features: 37 | Target balance: 51.8% bullish
   ⏰ Train period: 2016-01-01 00:00:00 to 2023-05-23 16:00:00
   🧪 Test period:  2023-05-23 20:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 300
      max_depth         : 10
      min_samples_split : 15
      min_samples_leaf  : 4
      max_leaf_nodes    : 200
      max_features      : 0.3
      bootstrap         : True
      max_samples       : 0.9
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 2.7s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5314
   Precision                : 0.5750
   Recall                   : 0.3359
   F1 (standard)            : 0.4241
   F1 (precision-weighted)  : 0.5033
   ROC-AUC                  : 0.5537

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0628
    2. buying_pressure     : 0.0481
    3. stoch_%K            : 0.0426
    4. bb_position         : 0.0426
    5. fear_greed_score    : 0.0407
    6. stoch_%D            : 0.0400
    7. volume_mean_20      : 0.0397
    8. atr_ratio           : 0.0396
    9. volume              : 0.0391
   10. roc_24h             : 0.0388
   11. volume_ratio        : 0.0383
   12. adx                 : 0.0379
   13. price_vs_vwap       : 0.0373
   14. RSI                 : 0.0367
   15. CCI                 : 0.0364

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.531
   • Precision: 0.575 (optimized metric)
   • F1-weighted: 0.503
   • Training time: 2.7s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 12,684 | Test: 3,171
   🎯 Features: 37 | Target balance: 51.1% bullish
   ⏰ Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   🧪 Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 300
      max_depth         : 15
      min_samples_split : 10
      min_samples_leaf  : 4
      max_leaf_nodes    : 200
      max_features      : sqrt
      bootstrap         : True
      max_samples       : 0.8
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 1.4s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5251
   Precision                : 0.5908
   Recall                   : 0.2947
   F1 (standard)            : 0.3932
   F1 (precision-weighted)  : 0.4919
   ROC-AUC                  : 0.5537

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0526
    2. buying_pressure     : 0.0456
    3. fear_greed_score    : 0.0421
    4. bb_position         : 0.0415
    5. roc_24h             : 0.0403
    6. stoch_%K            : 0.0389
    7. CCI                 : 0.0382
    8. adx                 : 0.0374
    9. stoch_%D            : 0.0365
   10. volume_mean_20      : 0.0363
   11. volume              : 0.0356
   12. atr_ratio           : 0.0356
   13. price_vs_vwap       : 0.0355
   14. volume_ratio        : 0.0351
   15. MACD_histogram      : 0.0339

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.525
   • Precision: 0.591 (optimized metric)
   • F1-weighted: 0.492
   • Training time: 1.4s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

📊 Loading 4H Bitcoin data for final Random Forest training...
   📅 Date range: 2018-01-01 00:00:00 to 2025-03-28 00:00:00
   📊 Train: 12,684 | Test: 3,171
   🎯 Features: 37 | Target balance: 51.1% bullish
   ⏰ Train period: 2018-01-01 00:00:00 to 2023-10-16 12:00:00
   🧪 Test period:  2023-10-16 16:00:00 to 2025-03-28 00:00:00

🚀 Training final Random Forest with optimal parameters...
   Parameters:
      n_estimators      : 400
      max_depth         : 8
      min_samples_split : 5
      min_samples_leaf  : 10
      max_leaf_nodes    : 500
      max_features      : 0.5
      bootstrap         : True
      max_samples       : 0.9
      class_weight      : balanced_subsample
      random_state      : 42
      n_jobs            : -1
🟢 Model trained successfully in 3.6s

📊 FINAL MODEL EVALUATION
========================================
🎯 Test Set Performance:
   Accuracy                 : 0.5244
   Precision                : 0.5769
   Recall                   : 0.3351
   F1 (standard)            : 0.4240
   F1 (precision-weighted)  : 0.5042
   ROC-AUC                  : 0.5501

🌟 FEATURE IMPORTANCE ANALYSIS
----------------------------------------
Top 15 Most Important Features:
    1. roc_4h              : 0.0778
    2. buying_pressure     : 0.0613
    3. roc_24h             : 0.0469
    4. bb_position         : 0.0459
    5. fear_greed_score    : 0.0449
    6. atr_ratio           : 0.0431
    7. adx                 : 0.0415
    8. volume_mean_20      : 0.0404
    9. stoch_%K            : 0.0404
   10. CCI                 : 0.0386
   11. stoch_%D            : 0.0377
   12. price_vs_vwap       : 0.0369
   13. volume              : 0.0354
   14. bollinger_width     : 0.0354
   15. volume_ratio        : 0.0353

🎉 TRAINING COMPLETE!
==================================================
🎯 Model Performance Summary:
   • Accuracy: 0.524
   • Precision: 0.577 (optimized metric)
   • F1-weighted: 0.504
   • Training time: 3.6s
   • Features used: 37

🚀 Ready for downstream use or ensemble integration!

In [None]:
import numpy as np, pandas as pd, time, sys, warnings
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             make_scorer, accuracy_score, roc_auc_score)
warnings.filterwarnings("ignore")
np.random.seed(42)

# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
CSV_FILE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction\Stock-Market-Prediction\data\processed\gemini_btc_with_features_4h.csv")
TIME_COLUMN, TARGET_COL = "timestamp", "target"
START_DATE, TEST_FRAC = "2022-01-01", 0.20 
DROP_COLS = [ 
    'open', 'high', 'low', 'high_low', 'high_close', 'low_close', 'typical_price',
    'volume_breakout', 'volume_breakdown', 'break_upper_band', 'break_lower_band',
    'vol_spike_1_5x', 'rsi_oversold', 'rsi_overbought', 'stoch_overbought',
    'stoch_oversold', 'cci_overbought', 'cci_oversold', 'near_upper_band',
    'near_lower_band', 'overbought_reversal', 'oversold_reversal',
    'ema_cross_up', 'ema_cross_down', 'macd_cross_up', 'macd_cross_down',
    'trending_market', 'trend_alignment', 'ema7_above_ema21', 'macd_rising',
    'bollinger_upper', 'bollinger_lower', 'bullish_scenario_1',
    'bullish_scenario_5', 'bearish_scenario_1'
]

# ──────────────────────────────────────────────────────────────
# LOAD DATA
# ──────────────────────────────────────────────────────────────
print("📊 Loading 4H Bitcoin data...")
if not CSV_FILE.exists():
    sys.exit(f"❌ File not found: {CSV_FILE}")

df = pd.read_csv(CSV_FILE, parse_dates=[TIME_COLUMN]).set_index(TIME_COLUMN).sort_index()
df = df.loc[START_DATE:].copy()

if TARGET_COL not in df.columns:
    sys.exit(f"❌ Target column '{TARGET_COL}' not found!")

X = df.drop(columns=[col for col in DROP_COLS if col in df.columns] + [TARGET_COL], errors="ignore")
y = df[TARGET_COL]

# Chronological split
split = int(len(df) * (1 - TEST_FRAC))
X_train, X_test = X.iloc[:split], X.iloc[split:]
y_train, y_test = y.iloc[:split], y.iloc[split:]

print(f"   📅 Date range: {df.index.min()} to {df.index.max()}")
print(f"   📊 Train: {X_train.shape[0]:,} samples | Test: {X_test.shape[0]:,} samples")
print(f"   🎯 Features: {X_train.shape[1]} | Target balance: {y.mean():.1%} bullish")

# ──────────────────────────────────────────────────────────────
# IMPROVED PARAMETER SEARCH STRATEGY
# ──────────────────────────────────────────────────────────────

# Start with your known good baseline parameters
baseline_params = {
    "n_estimators": 300,
    "max_depth": 15,
    "min_samples_split": 10,
    "min_samples_leaf": 4,
    "max_leaf_nodes": 200,
    "max_features": "sqrt",
    "bootstrap": True,
    "max_samples": 0.8,
    "class_weight": "balanced_subsample",
    "random_state": 42,
    "n_jobs": -1
}

print(f"\n🎯 BASELINE MODEL PERFORMANCE:")
print("-" * 50)
baseline_rf = RandomForestClassifier(**baseline_params)
baseline_rf.fit(X_train, y_train)
baseline_pred = baseline_rf.predict(X_test)
baseline_prob = baseline_rf.predict_proba(X_test)[:, 1]

baseline_scores = {
    'accuracy': accuracy_score(y_test, baseline_pred),
    'precision': precision_score(y_test, baseline_pred, zero_division=0),
    'recall': recall_score(y_test, baseline_pred, zero_division=0),
    'f1': f1_score(y_test, baseline_pred, zero_division=0),
    'roc_auc': roc_auc_score(y_test, baseline_prob)
}

for metric, score in baseline_scores.items():
    print(f"   {metric.upper():<12}: {score:.4f}")

# SMART PARAMETER SEARCH: Narrow ranges around your good baseline
param_dist = {
    # Tree structure - explore around your good values
    "n_estimators":       [200, 250, 300, 350, 400, 450],
    "max_depth":          [12, 14, 15, 16, 18, 20],
    "min_samples_split":  [6, 8, 10, 12, 15],
    "min_samples_leaf":   [2, 3, 4, 5, 6],
    
    # Node constraints - fine-tune around your baseline
    "max_leaf_nodes":     [150, 200, 250, 300, None],
    "max_features":       ["sqrt", "log2", 0.4, 0.5, 0.6],
    
    # Sampling - explore variations
    "max_samples":        [0.7, 0.8, 0.85, 0.9, 0.95],
    "bootstrap":          [True],  # Keep True since it worked
    "class_weight":       ["balanced_subsample", "balanced", None],
}

# Use ROC-AUC as primary metric (more stable than F1 for imbalanced data)
print(f"\n🔍 SMART HYPERPARAMETER SEARCH:")
print("-" * 50)
print(f"   Target: Beat baseline ROC-AUC of {baseline_scores['roc_auc']:.4f}")
print(f"   Strategy: Focused search around proven good parameters")
print(f"   Search space: ~4,050 combinations")

# More robust cross-validation with gap to prevent leakage
cv = TimeSeriesSplit(n_splits=6, gap=12)  # 12 = 2 days gap for 4H data
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Use ROC-AUC for more stable optimization
search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    scoring='roc_auc',  # More stable than custom F1 scorer
    n_iter=100,  # More iterations for better exploration
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=1
)

print(f"\n🚀 Starting optimization (targeting <15min runtime)...")
start_time = time.time()
search.fit(X_train, y_train)
search_time = time.time() - start_time

print(f"\n⏱️  Search completed in {search_time/60:.1f} minutes")
print(f"🎯 Best CV ROC-AUC: {search.best_score_:.4f}")
print(f"📈 Improvement over baseline CV: {search.best_score_ - baseline_scores['roc_auc']:.4f}")

# ──────────────────────────────────────────────────────────────
# COMPREHENSIVE EVALUATION
# ──────────────────────────────────────────────────────────────
print(f"\n🌟 OPTIMIZED PARAMETERS:")
print("-" * 50)
for k, v in search.best_params_.items():
    baseline_val = baseline_params.get(k, "N/A")
    change_indicator = "📈" if str(v) != str(baseline_val) else "➡️"
    print(f"   {change_indicator} {k:<20}: {v} (was: {baseline_val})")

# Test both models side by side
best_model = search.best_estimator_
optimized_pred = best_model.predict(X_test)
optimized_prob = best_model.predict_proba(X_test)[:, 1]

optimized_scores = {
    'accuracy': accuracy_score(y_test, optimized_pred),
    'precision': precision_score(y_test, optimized_pred, zero_division=0),
    'recall': recall_score(y_test, optimized_pred, zero_division=0),
    'f1': f1_score(y_test, optimized_pred, zero_division=0),
    'roc_auc': roc_auc_score(y_test, optimized_prob)
}

print(f"\n📊 PERFORMANCE COMPARISON:")
print("-" * 60)
print(f"{'Metric':<12} {'Baseline':<10} {'Optimized':<10} {'Improvement':<12}")
print("-" * 60)

for metric in ['accuracy', 'precision', 'recall', 'f1', 'roc_auc']:
    baseline_val = baseline_scores[metric]
    optimized_val = optimized_scores[metric]
    improvement = optimized_val - baseline_val
    improvement_pct = (improvement / baseline_val) * 100 if baseline_val > 0 else 0
    
    status = "🟢" if improvement > 0.001 else "🔴" if improvement < -0.001 else "➡️"
    print(f"{metric.upper():<12} {baseline_val:<10.4f} {optimized_val:<10.4f} "
          f"{status} {improvement:+.4f} ({improvement_pct:+.1f}%)")

# Feature importance comparison
print(f"\n🌟 TOP 10 FEATURE IMPORTANCE (OPTIMIZED MODEL):")
print("-" * 50)
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': best_model.feature_importances_
}).sort_values('importance', ascending=False)

for i, (_, row) in enumerate(feature_importance.head(10).iterrows(), 1):
    print(f"   {i:2d}. {row['feature']:<20}: {row['importance']:.4f}")

# Final recommendation
print(f"\n🎯 FINAL RECOMMENDATION:")
print("-" * 50)
if optimized_scores['roc_auc'] > baseline_scores['roc_auc'] + 0.001:
    print("✅ USE OPTIMIZED PARAMETERS - Clear improvement found!")
    print(f"   ROC-AUC improvement: +{optimized_scores['roc_auc'] - baseline_scores['roc_auc']:.4f}")
elif abs(optimized_scores['roc_auc'] - baseline_scores['roc_auc']) <= 0.001:
    print("➡️  MODELS ARE EQUIVALENT - Stick with baseline for simplicity")
else:
    print("❌ BASELINE IS BETTER - Keep your original parameters")

print(f"\n📋 SEARCH SUMMARY:")
print("-" * 50)
print(f"   Parameters tested: {len(search.cv_results_['params'])}")
print(f"   CV folds: {cv.n_splits}")
print(f"   Search time: {search_time/60:.1f} minutes")
print(f"   Best score found: {search.best_score_:.4f}")

print(f"\n✅ Hyperparameter optimization complete!")