# in this notebook we will train the first combation method - majority voting

In [5]:
"""
Exhaustive Majority-Voting Grid Search (10 models, combos 2-10)
==============================================================

• Aligns on latest common start date
• Verifies all 'actual' columns match
• Evaluates 1 013 subsets with hard vote
• Writes:
    - majority_vote_results.csv
    - individual_model_metrics.csv
"""

import itertools, sys, time
from pathlib import Path
import numpy as np
import pandas as pd
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    accuracy_score, confusion_matrix
)
from tqdm.auto import tqdm

# ──────────────────────────────────────────────────────────────
# CONFIG
# ──────────────────────────────────────────────────────────────
BASE = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
            r"\Stock-Market-Prediction\Final_runs_csv")

FILES = {
    "catboost" : "catboost_trial57_predictions.csv",
    "cnn"      : "cnn_predictions.csv",
    "gru"      : "gru_trial28_f05_preds.csv",
    "lgbm"     : "lgbm_predictions_formatted_backup.csv",
    "logreg"   : "logisticreg_validation_predictions.csv",
    "lstm"     : "lstm_test_predictions.csv",
    "rf"       : "RandomForest_predictions_custom_high_precision.csv",
    "tcn"      : "TCN_Trial_36_predictions.csv",
    "xgb"      : "xgboost_predictions_fixed.csv",
    "cnn_lstm" : "cnn_lstm_val_preds_20250614_142329.csv",
}

THRESH = {k: 0.5 for k in FILES}          # per-model prob→label threshold

# ──────────────────────────────────────────────────────────────
# 1. LOAD & ALIGN
# ──────────────────────────────────────────────────────────────
earliest = {}

def load_one(path: Path, key: str) -> pd.DataFrame:
    df = pd.read_csv(path)
    if {"timestamp", "prob_up", "actual"} - set(df.columns):
        sys.exit(f"❌ required cols missing in {path.name}")

    ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
    if ts.isna().any():
        bad = ts.isna().sum()
        sys.exit(f"❌ {bad} unparsable timestamps in {path.name}")

    earliest[key] = ts.min()

    y_bin = (df["prob_up"].to_numpy() >= THRESH[key]).astype(np.uint8)
    return pd.DataFrame({
        "timestamp": ts,
        key: y_bin,
        f"{key}_actual": df["actual"]
    })

print("🔧 Loading prediction files …")
merged = None
for key, fname in FILES.items():
    p = BASE / fname
    if not p.exists():
        sys.exit(f"❌ file not found: {p}")
    part = load_one(p, key)
    merged = part if merged is None else merged.merge(part, on="timestamp", how="inner")

# ── trim to start at the latest model-specific start date
start_cut = max(earliest.values())
merged = merged[merged["timestamp"] >= start_cut].reset_index(drop=True)
print(f"Rows after inner-join & cut-to-latest-start: {len(merged):,}")

# ── verify 'actual' columns identical
act_cols = [c for c in merged.columns if c.endswith("_actual")]
if not merged[act_cols].eq(merged[act_cols[0]], axis=0).all().all():
    sys.exit("❌ 'actual' values differ across files – fix before continuing")

y_true = merged[act_cols[0]].to_numpy(dtype=np.uint8)
merged.drop(columns=act_cols, inplace=True)

pred_keys = list(FILES.keys())
X = merged[pred_keys].to_numpy(dtype=np.uint8)

# ──────────────────────────────────────────────────────────────
# 2. EXHAUSTIVE COMBINATION SEARCH (k = 2 … 10)
# ──────────────────────────────────────────────────────────────
def hard_vote(mat: np.ndarray) -> np.ndarray:
    return (mat.sum(1) > mat.shape[1] / 2).astype(np.uint8)

n = len(pred_keys)
total = (2 ** n) - 1 - n
results = []

t0 = time.time()
with tqdm(total=total, unit="combo") as bar:
    for k in range(2, n + 1):
        for idx in itertools.combinations(range(n), k):
            y_pred = hard_vote(X[:, idx])

            prec = precision_score(y_true, y_pred, zero_division=0)
            rec  = recall_score   (y_true, y_pred, zero_division=0)
            f1   = f1_score      (y_true, y_pred, zero_division=0)
            acc  = accuracy_score(y_true, y_pred)
            tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

            results.append({
                "models"   : "+".join(pred_keys[i] for i in idx),
                "n_models" : k,
                "precision": round(prec, 6),
                "recall"   : round(rec, 6),
                "f1"       : round(f1, 6),
                "accuracy" : round(acc, 6),
                "tp": tp, "fp": fp, "tn": tn, "fn": fn
            })
            bar.update(1)

print(f"🏁 combos done in {time.time()-t0:.1f} s")

# ──────────────────────────────────────────────────────────────
# 3. SINGLE-MODEL METRICS
# ──────────────────────────────────────────────────────────────
indiv = []
for i, name in enumerate(pred_keys):
    y_pred = X[:, i]
    indiv.append({
        "models": name, "n_models": 1,
        "precision": precision_score(y_true, y_pred, zero_division=0),
        "recall"   : recall_score  (y_true, y_pred, zero_division=0),
        "f1"       : f1_score      (y_true, y_pred, zero_division=0),
        "accuracy" : accuracy_score(y_true, y_pred)
    })
indiv_df = pd.DataFrame(indiv).sort_values("f1", ascending=False)

# ──────────────────────────────────────────────────────────────
# 4. SAVE OUTPUTS
# ──────────────────────────────────────────────────────────────
out_all = BASE / "majority_vote_results.csv"
out_ind = BASE / "individual_model_metrics.csv"

(pd.DataFrame(results)
   .sort_values(["precision", "recall", "f1"],
                ascending=[False, False, True])
   .to_csv(out_all, index=False))
indiv_df.to_csv(out_ind, index=False)

# ──────────────────────────────────────────────────────────────
# 5. CONSOLE SUMMARY
# ──────────────────────────────────────────────────────────────
print("\nTop-10 ensembles by F1:")
top10 = pd.read_csv(out_all).head(10)
print(top10[["models", "n_models", "precision", "recall", "f1"]].to_string(index=False))

best_single = indiv_df.iloc[0]
best_combo  = top10.iloc[0]
improv = best_combo["f1"] - best_single["f1"]

print("\nBest single model :",
      f"{best_single['models']}  F1={best_single['f1']:.4f}")
print("Best ensemble     :",
      f"{best_combo['models']}  F1={best_combo['f1']:.4f}")
print(f"Improvement       : {improv:+.4f} "
      f"({improv/best_single['f1']*100:+.2f} %)")
print(f"\n💾 Files written: {out_all.name}, {out_ind.name}")


  ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
  ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
  ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
  ts = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)


🔧 Loading prediction files …
Rows after inner-join & cut-to-latest-start: 3,105


100%|██████████| 1013/1013 [00:04<00:00, 213.17combo/s]

🏁 combos done in 4.8 s

Top-10 ensembles by F1:
                    models  n_models  precision   recall       f1
      gru+lstm+rf+cnn_lstm         4   0.615721 0.174074 0.271415
          cnn+gru+lstm+xgb         4   0.614391 0.205556 0.308048
catboost+gru+lgbm+cnn_lstm         4   0.614000 0.189506 0.289623
           cnn+gru+lstm+rf         4   0.613828 0.224691 0.328965
      gru+lgbm+rf+cnn_lstm         4   0.612981 0.157407 0.250491
          gru+lgbm+lstm+rf         4   0.611307 0.213580 0.316560
    catboost+gru+lgbm+lstm         4   0.611285 0.240741 0.345438
      catboost+gru+lgbm+rf         4   0.608215 0.237654 0.341767
catboost+gru+lstm+cnn_lstm         4   0.608084 0.213580 0.316126
  catboost+gru+rf+cnn_lstm         4   0.607018 0.213580 0.315982

Best single model : tcn  F1=0.6857
Best ensemble     : gru+lstm+rf+cnn_lstm  F1=0.2714
Improvement       : -0.4143 (-60.42 %)

💾 Files written: majority_vote_results.csv, individual_model_metrics.csv





In [4]:
import pandas as pd
from pathlib import Path

# Path to the CSV written by the ensemble script
BASE   = Path(r"C:\Users\ADMIN\Desktop\Coding_projects\stock_market_prediction"
              r"\Stock-Market-Prediction\Final_runs_csv")
CSV    = BASE / "majority_vote_results.csv"

# Read, sort, display
df = pd.read_csv(CSV)
top15 = (df.sort_values(["precision", "f1", "recall"],
                        ascending=[False, False, False])
           .head(15))

print("\nTop 15 combinations – ranked by precision")
print(top15[["models", "n_models", "precision", "recall", "f1"]]
      .to_string(index=False, formatters={
          "precision": "{:.4f}".format,
          "recall"   : "{:.4f}".format,
          "f1"       : "{:.4f}".format
      }))



Top 15 combinations – ranked by precision
                            models  n_models precision recall     f1
              gru+lstm+rf+cnn_lstm         4    0.6157 0.1741 0.2714
                  cnn+gru+lstm+xgb         4    0.6144 0.2056 0.3080
        catboost+gru+lgbm+cnn_lstm         4    0.6140 0.1895 0.2896
                   cnn+gru+lstm+rf         4    0.6138 0.2247 0.3290
              gru+lgbm+rf+cnn_lstm         4    0.6130 0.1574 0.2505
                  gru+lgbm+lstm+rf         4    0.6113 0.2136 0.3166
            catboost+gru+lgbm+lstm         4    0.6113 0.2407 0.3454
              catboost+gru+lgbm+rf         4    0.6082 0.2377 0.3418
        catboost+gru+lstm+cnn_lstm         4    0.6081 0.2136 0.3161
          catboost+gru+rf+cnn_lstm         4    0.6070 0.2136 0.3160
           gru+logreg+xgb+cnn_lstm         4    0.6069 0.1963 0.2966
             gru+lgbm+xgb+cnn_lstm         4    0.6066 0.1580 0.2507
catboost+cnn+gru+lstm+xgb+cnn_lstm         6    0.6066 0.307