<a href="https://colab.research.google.com/github/Nikhil-gitub/23CSBTB27_PDS/blob/main/supervised.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ===============================
# Volatility Prediction Framework
# ===============================

# Install additional libraries
!pip install -q imbalanced-learn xgboost lightgbm catboost

# ---- Imports ----
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier, StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import warnings, random, pickle
warnings.filterwarnings("ignore")

# One seed for the whole notebook: it seeds the stdlib and NumPy global
# generators here and is handed to SMOTE further down.
RANDOM_STATE = 42
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)

# ---- Load dataset ----
# In Colab the file must be uploaded to the session or read from a mounted
# Drive; the path is relative to the working directory.
csv_path = "stocks.csv"
df = pd.read_csv(csv_path)

# Quick sanity check of what was loaded: dimensions, then the first rows.
preview = df.head()
print("Shape:", df.shape)
print(preview)

# ---- Detect columns ----
def _find_column(columns, keywords):
    """Return the first column whose lower-cased name contains any keyword.

    Columns are scanned in their original order; ``None`` is returned when no
    column matches.
    """
    return next(
        (c for c in columns if any(k in str(c).lower() for k in keywords)),
        None,
    )


timestamp_col = _find_column(df.columns, ("time", "date"))
price_col = _find_column(df.columns, ("last", "price"))
high_col = _find_column(df.columns, ("high",))
low_col = _find_column(df.columns, ("low",))
volume_col = _find_column(df.columns, ("vol",))  # optional: handled below when absent

print("Detected columns:", timestamp_col, price_col, high_col, low_col, volume_col)

# The steps below index df with these names unconditionally, so fail here
# with a readable message instead of a later `KeyError: None`.
_required = {
    "timestamp": timestamp_col,
    "price": price_col,
    "high": high_col,
    "low": low_col,
}
_missing = [role for role, col in _required.items() if col is None]
if _missing:
    raise ValueError(
        f"Could not detect required column(s) {_missing} among {list(df.columns)}"
    )

# ---- Preprocess ----
def _to_number(series):
    """Convert a column to floats, understanding "," separators and K/M/B/T suffixes.

    Numeric columns are returned unchanged. Text such as "6.32M" becomes
    6320000.0; values that cannot be parsed become NaN.
    """
    if pd.api.types.is_numeric_dtype(series):
        return series
    text = series.astype(str).str.replace(",", "", regex=False).str.strip()
    parts = text.str.extract(r"(?i)^([+-]?\d*\.?\d+(?:e[+-]?\d+)?)\s*([kmbt]?)$")
    scale = parts[1].str.upper().map(
        {"": 1.0, "K": 1e3, "M": 1e6, "B": 1e9, "T": 1e12}
    )
    return pd.to_numeric(parts[0], errors="coerce") * scale


def _finite(series):
    """Replace +/-inf (e.g. from a zero denominator) with NaN."""
    return series.replace([np.inf, -np.inf], np.nan)


# The file can hold several instruments interleaved at the same timestamp.
# Differences and rolling windows must then be computed per instrument,
# otherwise a "return" compares the price of one stock with another's.
# NOTE(review): the identifier column is guessed from its name — confirm.
asset_col = next(
    (c for c in df.columns if str(c).lower() in ("name", "symbol", "ticker")),
    None,
)


def _per_asset(series, func):
    """Apply ``func`` to ``series`` separately per instrument, index-aligned.

    Falls back to applying ``func`` to the whole series when no instrument
    column was detected.
    """
    if asset_col is None:
        return func(series)
    return series.groupby(df[asset_col], sort=False, dropna=False).transform(func)


df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors="coerce")
# Stable sort keeps the file order of rows that share a timestamp.
df = df.sort_values(timestamp_col, kind="mergesort").reset_index(drop=True)
df[price_col] = _per_asset(_to_number(df[price_col]), lambda s: s.ffill())

# ---- Feature engineering ----
df["return"] = _finite(
    _per_asset(df[price_col], lambda s: s.pct_change(fill_method=None))
).fillna(0)
df["log_return"] = _finite(
    _per_asset(np.log(df[price_col]), lambda s: s.diff())
).fillna(0)
df["hl_range"] = _finite(
    (_to_number(df[high_col]) - _to_number(df[low_col])) / df[price_col]
)


def _rolling_std(column, window):
    """Per-instrument rolling standard deviation; warm-up rows are set to 0."""
    return _per_asset(df[column], lambda s: s.rolling(window).std()).fillna(0)


df["rolling_ret_std_5"] = _rolling_std("return", 5)
df["rolling_ret_std_10"] = _rolling_std("return", 10)
df["rolling_vol_5"] = _rolling_std("log_return", 5)
df["rolling_vol_10"] = _rolling_std("log_return", 10)

if volume_col:
    # Volumes may arrive as text like "6.32M"; plain to_numeric would turn
    # every such value into NaN and the feature into a constant zero.
    df["volume"] = _to_number(df[volume_col]).fillna(0)
    df["vol_change"] = _finite(
        _per_asset(df["volume"], lambda s: s.pct_change(fill_method=None))
    ).fillna(0)
else:
    df["volume"] = 0
    df["vol_change"] = 0

df["hour"] = df[timestamp_col].dt.hour
df["minute"] = df[timestamp_col].dt.minute

# ---- Label: future volatility (binary) ----
LABEL_WINDOW = 5
# rolling(W).std() at row i covers rows i-W+1..i; shifting by -W moves the
# window to rows i+1..i+W, so the label uses only returns after row i and
# does not overlap the backward-looking rolling features.
future_vol = _per_asset(
    df["log_return"],
    lambda s: s.rolling(LABEL_WINDOW).std().shift(-LABEL_WINDOW),
)
# NOTE(review): the threshold is taken over the full sample, including rows
# that later end up in the test split — consider a train-only quantile.
thresh = future_vol.quantile(0.75)
df["vol_label"] = (future_vol > thresh).astype(int)

# Rows without a complete future window have no defined label. Filter on the
# NaN mask of future_vol: the integer label column itself never holds NaN.
df = df.loc[future_vol.notna()].copy()
print("Label distribution:", df["vol_label"].value_counts(normalize=True))

# ---- Features & Target ----
# Model inputs, in the column order the feature matrix will use.
features = [
    "return", "log_return", "hl_range",
    "rolling_ret_std_5", "rolling_ret_std_10",
    "rolling_vol_5", "rolling_vol_10",
    "volume", "vol_change",
    "hour", "minute",
]
y = df["vol_label"]
X = df[features].fillna(0)  # any remaining gaps are treated as zero

# ---- Time-based split ----
# Chronological hold-out: the first 70% of rows train, the rest test.
split = int(len(df) * 0.7)
train_rows = slice(None, split)
test_rows = slice(split, None)
X_train, y_train = X.iloc[train_rows], y.iloc[train_rows]
X_test, y_test = X.iloc[test_rows], y.iloc[test_rows]

# ---- Scale, then balance training data ----
# Fit the scaler on the real training rows only, so its statistics are not
# shaped by synthetic samples, and apply the same transform to the test set.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# SMOTE interpolates between nearest neighbours; running it on standardized
# features keeps large-magnitude columns from dominating the distances.
sm = SMOTE(random_state=RANDOM_STATE)
X_train_s, y_train_bal = sm.fit_resample(X_train_scaled, y_train)

# Balanced training features expressed back in original units, kept under
# the name the rest of the notebook may expect.
X_train_bal = pd.DataFrame(
    scaler.inverse_transform(X_train_s), columns=X_train.columns
)

# ---- Models ----
# Candidate classifiers keyed by display name. Every estimator that has a
# seed parameter receives RANDOM_STATE so a rerun reproduces the same scores
# regardless of how much global RNG state earlier cells consumed.
models = {
    "LogReg": LogisticRegression(
        max_iter=2000, class_weight="balanced", random_state=RANDOM_STATE
    ),
    "KNN": KNeighborsClassifier(),
    "LinearSVC": LinearSVC(
        max_iter=5000, class_weight="balanced", random_state=RANDOM_STATE
    ),
    "DecisionTree": DecisionTreeClassifier(
        class_weight="balanced", random_state=RANDOM_STATE
    ),
    "RandomForest": RandomForestClassifier(
        n_estimators=200, class_weight="balanced", random_state=RANDOM_STATE
    ),
    "ExtraTrees": ExtraTreesClassifier(
        n_estimators=200, class_weight="balanced", random_state=RANDOM_STATE
    ),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "AdaBoost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "XGBoost": xgb.XGBClassifier(
        use_label_encoder=False, eval_metric="logloss", random_state=RANDOM_STATE
    ),
    "LightGBM": lgb.LGBMClassifier(random_state=RANDOM_STATE),
    "CatBoost": CatBoostClassifier(verbose=0, random_seed=RANDOM_STATE),
    "GaussianNB": GaussianNB(),
    "MLP": MLPClassifier(max_iter=500, random_state=RANDOM_STATE),
}

# ---- Train & Evaluate ----
# Summary metrics for the positive class, in the column order of the table.
metric_fns = {
    "Accuracy": accuracy_score,
    "Precision": precision_score,
    "Recall": recall_score,
    "F1": f1_score,
}

results = []
for name, clf in models.items():
    # Fit on the balanced, scaled training set; score on the untouched test set.
    clf.fit(X_train_s, y_train_bal)
    y_pred = clf.predict(X_test_s)

    row = {"Model": name}
    for label, metric in metric_fns.items():
        row[label] = metric(y_test, y_pred)

    print(f"\n=== {name} ===")
    print(classification_report(y_test, y_pred, digits=4))
    results.append(row)

# ---- Results summary ----
# One row per model, best F1 first.
res_df = pd.DataFrame(results).sort_values(by="F1", ascending=False)
print("\nSummary:\n", res_df)

# ---- Stacking Ensemble (top 3) ----
# Take the three best models by F1 from the summary table as base learners
# and combine them with a logistic-regression meta-learner.
# NOTE(review): the ranking in res_df comes from test-set scores, so the
# choice of base learners has seen the test data.
top_models = res_df["Model"].head(3).tolist()
estimators = []
for model_name in top_models:
    estimators.append((model_name, models[model_name]))

meta_learner = LogisticRegression(max_iter=2000)
stack = StackingClassifier(estimators=estimators, final_estimator=meta_learner)
stack.fit(X_train_s, y_train_bal)

y_pred_stack = stack.predict(X_test_s)
print("\n=== Stacking Ensemble ===")
print(classification_report(y_test, y_pred_stack, digits=4))


Shape: (136838, 9)
             timestamp         name    last    high     low   chg_   chg_%  \
0  2025-09-19 02:00:02       Boeing  215.66  217.40  213.70   1.03  +0.48%   
1  2025-09-19 02:00:02      Chevron  158.84  160.29  158.09  -1.25  -0.78%   
2  2025-09-19 02:00:02    Citigroup  102.41  102.70  101.69   0.65  +0.64%   
3  2025-09-19 02:00:02  Caterpillar  466.96  467.71  448.87  16.30  +3.62%   
4  2025-09-19 02:00:02    Microsoft  508.27  513.07  507.66  -1.75  -0.34%   

     vol_      time  
0   6.32M  15:59:59  
1   3.88M  15:59:59  
2  11.52M  15:59:59  
3    4.3M  15:59:59  
4  13.75M  15:59:59  
Detected columns: timestamp last high low vol_
Label distribution: vol_label
0    0.750011
1    0.249989
Name: proportion, dtype: float64

=== LogReg ===
              precision    recall  f1-score   support

           0     0.9288    0.8111    0.8660     30657
           1     0.5945    0.8167    0.6881     10395

    accuracy                         0.8126     41052
   macro