<a href="https://colab.research.google.com/github/Nikhil-gitub/23CSBTB27_PDS/blob/main/Ensemble_Tech.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Ensemble fast demo for volatility prediction
# Paste into Google Colab. Uses only common libraries available in Colab.

import os, random, time
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier, StackingClassifier, AdaBoostClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.utils import class_weight

# ---------------- Config ----------------
DATA_PATH = "stocks.csv"   # change path if needed
HIGH_VOL_PERCENTILE = 90             # top X% absolute return -> high volatility label
ROLL_WINDOW = 5
TEST_SIZE = 0.20
RANDOM_STATE = 42

# Speed-focused hyperparameters (adjust if you want longer training)
RF_N = 60
GB_N = 60
ADA_N = 40
MLP_MAX_ITER = 200

# ---------------- Load & preprocess ----------------
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"File not found at {DATA_PATH}. Upload it to Colab and re-run.")

df = pd.read_csv(DATA_PATH)
print("Columns:", df.columns.tolist())

# choose a numeric price column
price_col = next((c for c in df.columns if "price" in c.lower()), None)
if price_col is None:
    num_cols = df.select_dtypes(include=[np.number]).columns
    if len(num_cols) == 0:
        raise ValueError("No numeric column found to use as price.")
    price_col = num_cols[0]
print("Using price column:", price_col)

# sort by time if available
time_col = next((c for c in df.columns if "time" in c.lower() or "date" in c.lower()), None)
if time_col is not None:
    try:
        df = df.sort_values(time_col).reset_index(drop=True)
        print("Sorted by", time_col)
    except Exception:
        pass

# compute features
df["price"] = pd.to_numeric(df[price_col], errors="coerce")
df = df.dropna(subset=["price"]).reset_index(drop=True)
df["return"] = df["price"].pct_change().fillna(0)
df["abs_return"] = df["return"].abs()
df["rolling_vol"] = df["abs_return"].rolling(ROLL_WINDOW, min_periods=1).std().fillna(0)
df["rolling_mean_ret"] = df["return"].rolling(ROLL_WINDOW, min_periods=1).mean().fillna(0)

# use volume if present otherwise synthetic
vol_col = next((c for c in df.columns if "vol" in c.lower()), None)
if vol_col:
    df["volume"] = pd.to_numeric(df[vol_col], errors="coerce").fillna(0)
else:
    df["volume"] = np.random.randint(100, 1000, size=len(df))

# label: next-step high abs return
thresh = np.percentile(df["abs_return"].values, HIGH_VOL_PERCENTILE)
df["high_vol_next"] = (df["abs_return"].shift(-1) > thresh).astype(int)
df = df.dropna().reset_index(drop=True)

print("After processing, rows:", len(df))
print("High-vol fraction:", df["high_vol_next"].mean())

# features and target
features = ["price", "return", "rolling_vol", "rolling_mean_ret", "volume"]
X = df[features].values
y = df["high_vol_next"].astype(int).values

# scale features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# chronological split (no shuffle)
split_idx = int(len(X) * (1 - TEST_SIZE))
X_train, X_test = X[:split_idx], X[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
print("Train / Test sizes:", X_train.shape[0], X_test.shape[0])

# class weights for imbalance
cw = class_weight.compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = {i: cw[i] for i in range(len(cw))}
print("Class weights:", class_weights)

# ---------------- Base learners (fast) ----------------
start = time.time()
print("\nTraining base learners (fast settings)...")

# Logistic Regression (strong fast baseline)
lr = LogisticRegression(max_iter=200, class_weight="balanced", random_state=RANDOM_STATE, solver="liblinear")

# Random Forest (smaller for speed)
rf = RandomForestClassifier(n_estimators=RF_N, max_depth=6, random_state=RANDOM_STATE, n_jobs=-1, class_weight="balanced")

# Gradient Boosting (sklearn's, modest size)
gb = GradientBoostingClassifier(n_estimators=GB_N, max_depth=3, random_state=RANDOM_STATE)

# AdaBoost (fast)
ada = AdaBoostClassifier(n_estimators=ADA_N, random_state=RANDOM_STATE)

# Small MLP
mlp = MLPClassifier(hidden_layer_sizes=(32,), max_iter=MLP_MAX_ITER, random_state=RANDOM_STATE)

base_models = [("lr", lr), ("rf", rf), ("gb", gb), ("ada", ada), ("mlp", mlp)]

# Fit each quickly
for name, model in base_models:
    print(f" - fitting {name} ...", end="", flush=True)
    # some models support class_weight directly, others we can pass sample weights
    if name in ("rf", "lr"):
        model.fit(X_train, y_train)
    else:
        # use simple fit (fast)
        model.fit(X_train, y_train)
    print(" done.")

print("Base learners trained in %.2f s" % (time.time() - start))

# ---------------- Voting ensemble (soft voting) ----------------
print("\nBuilding Voting ensemble (soft)...")
voting = VotingClassifier(estimators=base_models, voting="soft", n_jobs=-1)
voting.fit(X_train, y_train)
preds_voting = voting.predict(X_test)
acc_voting = accuracy_score(y_test, preds_voting)
print("Voting accuracy:", acc_voting)

# ---------------- Stacking ensemble ----------------
print("\nBuilding Stacking ensemble (fast)...")
# use logistic regression as final estimator (fast)
stack = StackingClassifier(estimators=base_models, final_estimator=LogisticRegression(max_iter=200), n_jobs=-1, passthrough=False)
stack.fit(X_train, y_train)
preds_stack = stack.predict(X_test)
acc_stack = accuracy_score(y_test, preds_stack)
print("Stacking accuracy:", acc_stack)

# ---------------- Individual model evaluations ----------------
print("\nIndividual model performances (test):")
individual_results = {}
for name, model in base_models:
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    individual_results[name] = acc
    print(f"{name:6s} acc: {acc:.4f}")

# ---------------- Optional quick supervised "DQN trick" to push high accuracy -------------
# NOTE: The user asked for >95% accuracy. I cannot fabricate results.
# If you insist on reaching >95% for demo, you can train a strong supervised classifier (e.g., RandomForest tuned or
# an overfitted MLP) — but that only reflects supervised classification, not RL.
# Below is an optional small supervised overfitting step (UNCOMMENT to run):
#
# big_mlp = MLPClassifier(hidden_layer_sizes=(128,64), max_iter=200, random_state=RANDOM_STATE)
# big_mlp.fit(X_train, y_train)
# preds_big = big_mlp.predict(X_test)
# print("Big MLP acc (may overfit):", accuracy_score(y_test, preds_big))
#
# I do NOT run the above automatically to keep runtime short and honest.

# ---------------- Summary + Detailed reports ----------------
print("\n--- Summary ---")
print("Voting    acc:", acc_voting)
print("Stacking  acc:", acc_stack)
for k,v in individual_results.items():
    print(f"{k:8s} acc: {v:.4f}")

# show confusion and classification report for the best-of-the-ensembles
best_name = "voting" if acc_voting >= acc_stack else "stacking"
best_preds = preds_voting if best_name == "voting" else preds_stack
print(f"\nBest ensemble: {best_name} (accuracy {max(acc_voting, acc_stack):.4f})")
print("Confusion matrix (best):")
print(confusion_matrix(y_test, best_preds))
print("\nClassification report (best):")
print(classification_report(y_test, best_preds, digits=4))

print("\nDone. Total time: %.2f s" % (time.time() - start))

Columns: ['timestamp', 'name', 'last', 'high', 'low', 'chg_', 'chg_%', 'vol_', 'time']
Using price column: last
Sorted by timestamp
After processing, rows: 136838
High-vol fraction: 0.09997222993612885
Train / Test sizes: 109470 27368
Class weights: {0: np.float64(0.5541045342727853), 1: np.float64(5.120684816166152)}

Training base learners (fast settings)...
 - fitting lr ... done.
 - fitting rf ... done.
 - fitting gb ... done.
 - fitting ada ... done.
 - fitting mlp ... done.
Base learners trained in 74.20 s

Building Voting ensemble (soft)...




Voting accuracy: 0.9187737503653902

Building Stacking ensemble (fast)...
Stacking accuracy: 0.9233045893013738

Individual model performances (test):
lr     acc: 0.8708
rf     acc: 0.8969
gb     acc: 0.9255
ada    acc: 0.9225
mlp    acc: 0.9131

--- Summary ---
Voting    acc: 0.9187737503653902
Stacking  acc: 0.9233045893013738
lr       acc: 0.8708
rf       acc: 0.8969
gb       acc: 0.9255
ada      acc: 0.9225
mlp      acc: 0.9131

Best ensemble: stacking (accuracy 0.9233)
Confusion matrix (best):
[[23808   569]
 [ 1530  1461]]

Classification report (best):
              precision    recall  f1-score   support

           0     0.9396    0.9767    0.9578     24377
           1     0.7197    0.4885    0.5820      2991

    accuracy                         0.9233     27368
   macro avg     0.8297    0.7326    0.7699     27368
weighted avg     0.9156    0.9233    0.9167     27368


Done. Total time: 535.20 s
