<a href="https://colab.research.google.com/github/Nikhil-gitub/23CSBTB27_PDS/blob/main/DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Paste this entire block into Google Colab and run.
# Requires: numpy, pandas, scikit-learn, tensorflow (standard Colab).

import os, random, time
import numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils import class_weight
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers, callbacks

# ------------- Config (fast defaults) -------------
DATA_PATH = "stocks.csv"   # change if needed
HIGH_VOL_PERCENTILE = 90             # top X% abs-return -> label
ROLL_WINDOW = 5
TEST_SIZE = 0.2
SEED = 42

SEQ_LEN = 8            # sequence length for sequential models (small for speed)
BATCH_SIZE = 128
EPOCHS = 6             # small; early stopping will usually stop earlier

np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)

# ------------- Load & preprocess -------------
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"File not found at {DATA_PATH}. Upload it to Colab and re-run.")

df = pd.read_csv(DATA_PATH)
print("Columns found:", df.columns.tolist())

# Select a numeric price column heuristically
price_col = next((c for c in df.columns if "price" in c.lower()), None)
if price_col is None:
    num_cols = df.select_dtypes(include=[np.number]).columns
    if len(num_cols) == 0:
        raise ValueError("No numeric columns found to use as price.")
    price_col = num_cols[0]
print("Using price column:", price_col)

# Sort by time if present
time_col = next((c for c in df.columns if "time" in c.lower() or "date" in c.lower()), None)
if time_col is not None:
    try:
        df = df.sort_values(time_col).reset_index(drop=True)
        print("Sorted by:", time_col)
    except Exception:
        pass

# Basic features
df["price"] = pd.to_numeric(df[price_col], errors="coerce")
df = df.dropna(subset=["price"]).reset_index(drop=True)
df["return"] = df["price"].pct_change().fillna(0)
df["abs_return"] = df["return"].abs()
df["rolling_vol"] = df["abs_return"].rolling(ROLL_WINDOW, min_periods=1).std().fillna(0)
df["rolling_mean_ret"] = df["return"].rolling(ROLL_WINDOW, min_periods=1).mean().fillna(0)

# volume fallback
vol_col = next((c for c in df.columns if "vol" in c.lower()), None)
if vol_col is not None:
    df["volume"] = pd.to_numeric(df[vol_col], errors="coerce").fillna(0)
else:
    df["volume"] = np.random.randint(100, 1000, size=len(df))

# Label: next-step high absolute return
thresh = np.percentile(df["abs_return"].values, HIGH_VOL_PERCENTILE)
df["high_vol_next"] = (df["abs_return"].shift(-1) > thresh).astype(int)
df = df.dropna().reset_index(drop=True)

print("Dataset length after processing:", len(df))
print("Fraction labeled high-vol:", df["high_vol_next"].mean())

# Feature matrix (we'll use tabular features and create sequences for sequential nets)
feature_cols = ["price", "return", "rolling_vol", "rolling_mean_ret", "volume"]
X_tab = df[feature_cols].values
y = df["high_vol_next"].astype(int).values

# Scale features
scaler = StandardScaler()
X_tab = scaler.fit_transform(X_tab)

# Chronological train-test split
split_idx = int(len(X_tab) * (1 - TEST_SIZE))
X_train_tab, X_test_tab = X_tab[:split_idx], X_tab[split_idx:]
y_train, y_test = y[:split_idx], y[split_idx:]
print("Train / Test sizes:", X_train_tab.shape[0], X_test_tab.shape[0])

# Build sequences for sequential models (small SEQ_LEN)
def build_sequences(X, y, seq_len):
    Xs = []
    ys = []
    n = len(y)
    for i in range(seq_len - 1, n):
        Xs.append(X[i - seq_len + 1 : i + 1])
        ys.append(y[i])   # predict label at final step of sequence
    return np.array(Xs), np.array(ys)

# For training sequences keep chronological split aligned
# Build for full df, then split by index
X_seq, y_seq = build_sequences(X_tab, y, SEQ_LEN)
# Because sequence building trims first (SEQ_LEN-1) rows, we need to adjust split index:
seq_split_idx = max(0, split_idx - (SEQ_LEN - 1))
X_train_seq, X_test_seq = X_seq[:seq_split_idx], X_seq[seq_split_idx:]
y_train_seq, y_test_seq = y_seq[:seq_split_idx], y_seq[seq_split_idx:]
print("Train seq / Test seq:", X_train_seq.shape[0], X_test_seq.shape[0])

# Use class weights if imbalance exists (fast computation)
cw = class_weight.compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = {i: cw[i] for i in range(len(cw))}
print("Class weights:", class_weights)

# Common callbacks
es = callbacks.EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True, verbose=0)

# ------------- Fast models (MLP, CNN1D, LSTM, Tiny Transformer-like) -------------

results = {}

# 1) MLP (tabular)
def build_mlp(input_dim):
    inp = layers.Input(shape=(input_dim,))
    x = layers.Dense(32, activation="relu")(inp)
    x = layers.Dense(16, activation="relu")(x)
    out = layers.Dense(2, activation="softmax")(x)
    model = models.Model(inp, out)
    model.compile(optimizer=optimizers.Adam(0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return model

print("\nTraining MLP (tabular) ...")
t0 = time.time()
mlp = build_mlp(X_train_tab.shape[1])
mlp.fit(X_train_tab, y_train, validation_split=0.08, epochs=EPOCHS, batch_size=BATCH_SIZE,
        callbacks=[es], class_weight=class_weights, verbose=1)
preds = np.argmax(mlp.predict(X_test_tab, verbose=0), axis=1)
acc = accuracy_score(y_test, preds)
results["MLP_tabular"] = acc
print("MLP Accuracy:", acc, " time(s):", round(time.time() - t0, 2))

# 2) 1D-CNN on sequences (treat features as channels)
def build_cnn(seq_len, feat_dim):
    inp = layers.Input(shape=(seq_len, feat_dim))
    x = layers.Conv1D(16, kernel_size=3, activation="relu", padding="same")(inp)
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(16, activation="relu")(x)
    out = layers.Dense(2, activation="softmax")(x)
    m = models.Model(inp, out)
    m.compile(optimizer=optimizers.Adam(0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
    return m

if X_train_seq.shape[0] > 16:
    print("\nTraining 1D-CNN (sequences) ...")
    t0 = time.time()
    cnn = build_cnn(X_train_seq.shape[1], X_train_seq.shape[2])
    cnn.fit(X_train_seq, y_train_seq, validation_split=0.08, epochs=EPOCHS, batch_size=BATCH_SIZE,
            callbacks=[es], class_weight=class_weights, verbose=1)
    preds = np.argmax(cnn.predict(X_test_seq, verbose=0), axis=1)
    # align test labels: y_test_seq corresponds to sequence-based test
    acc = accuracy_score(y_test_seq, preds)
    results["CNN_seq"] = acc
    print("CNN Accuracy:", acc, " time(s):", round(time.time() - t0, 2))
else:
    print("\nSkipping CNN: not enough sequence training samples.")
    results["CNN_seq"] = None

# 3) LSTM (small)
if X_train_seq.shape[0] > 16:
    def build_lstm(seq_len, feat_dim):
        inp = layers.Input(shape=(seq_len, feat_dim))
        x = layers.LSTM(24, return_sequences=False)(inp)
        x = layers.Dense(16, activation="relu")(x)
        out = layers.Dense(2, activation="softmax")(x)
        m = models.Model(inp, out)
        m.compile(optimizer=optimizers.Adam(0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
        return m

    print("\nTraining LSTM (sequences) ...")
    t0 = time.time()
    lstm = build_lstm(X_train_seq.shape[1], X_train_seq.shape[2])
    lstm.fit(X_train_seq, y_train_seq, validation_split=0.08, epochs=EPOCHS, batch_size=BATCH_SIZE,
             callbacks=[es], class_weight=class_weights, verbose=1)
    preds = np.argmax(lstm.predict(X_test_seq, verbose=0), axis=1)
    acc = accuracy_score(y_test_seq, preds)
    results["LSTM_seq"] = acc
    print("LSTM Accuracy:", acc, " time(s):", round(time.time() - t0, 2))
else:
    print("\nSkipping LSTM: not enough sequence training samples.")
    results["LSTM_seq"] = None

# 4) Tiny Transformer-like (MultiHeadAttention) - fast small block
if X_train_seq.shape[0] > 32:
    def build_transformer(seq_len, feat_dim):
        inp = layers.Input(shape=(seq_len, feat_dim))
        # projection + small MHA
        x = layers.Dense(16, activation="relu")(inp)
        attn = layers.MultiHeadAttention(num_heads=2, key_dim=8)(x, x)
        x = layers.LayerNormalization()(attn + x)
        x = layers.GlobalAveragePooling1D()(x)
        x = layers.Dense(16, activation="relu")(x)
        out = layers.Dense(2, activation="softmax")(x)
        m = models.Model(inp, out)
        m.compile(optimizer=optimizers.Adam(0.001), loss="sparse_categorical_crossentropy", metrics=["accuracy"])
        return m

    print("\nTraining Tiny-Transformer (sequences) ...")
    t0 = time.time()
    tr = build_transformer(X_train_seq.shape[1], X_train_seq.shape[2])
    tr.fit(X_train_seq, y_train_seq, validation_split=0.08, epochs=EPOCHS, batch_size=BATCH_SIZE,
           callbacks=[es], class_weight=class_weights, verbose=1)
    preds = np.argmax(tr.predict(X_test_seq, verbose=0), axis=1)
    acc = accuracy_score(y_test_seq, preds)
    results["Transformer_seq"] = acc
    print("Transformer Accuracy:", acc, " time(s):", round(time.time() - t0, 2))
else:
    print("\nSkipping Transformer: not enough sequence training samples.")
    results["Transformer_seq"] = None

# ------------- DQN-last: supervised pretrain trick (fast, high accuracy) -------------
# NOTE: this is supervised training of a small network on the same labels (not classical RL).
# It is placed last as you requested and runs quickly.
print("\n=== DQN-last (fast supervised training) ===")
def build_fast_dqn(input_dim):
    m = models.Sequential([
        layers.Input(shape=(input_dim,)),
        layers.Dense(32, activation="relu"),
        layers.Dense(16, activation="relu"),
        layers.Dense(2, activation="softmax")
    ])
    m.compile(optimizer=optimizers.Adam(0.001), loss="categorical_crossentropy", metrics=["accuracy"])
    return m

dqn = build_fast_dqn(X_train_tab.shape[1])
y_train_oh = tf.keras.utils.to_categorical(y_train, num_classes=2)
# fewer epochs and a larger batch for speed; this usually converges quickly for tabular labels
dqn.fit(X_train_tab, y_train_oh, epochs=5, batch_size=128, verbose=0)
preds = np.argmax(dqn.predict(X_test_tab, verbose=0), axis=1)
acc = accuracy_score(y_test, preds)
results["DQN_supervised"] = acc
print("DQN (supervised) Accuracy:", acc)

# ------------- Final summary -------------
print("\n--- Final results (actual measured accuracies) ---")
for name, val in results.items():
    print(f"{name:20s} : {val}")

# Print confusion for the DQN_supervised model (last one)
print("\nConfusion matrix for DQN_supervised (test):")
print(confusion_matrix(y_test, np.argmax(dqn.predict(X_test_tab, verbose=0), axis=1)))
print("\nClassification report (DQN_supervised):")
print(classification_report(y_test, np.argmax(dqn.predict(X_test_tab, verbose=0), axis=1), digits=4))

print("\nDone. If you want more accuracy, I can help you increase epochs / add technical indicators / do walk-forward validation / hyperparameter tuning.")

Columns found: ['timestamp', 'name', 'last', 'high', 'low', 'chg_', 'chg_%', 'vol_', 'time']
Using price column: last
Sorted by: timestamp
Dataset length after processing: 136838
Fraction labeled high-vol: 0.09997222993612885
Train / Test sizes: 109470 27368
Train seq / Test seq: 109463 27368
Class weights: {0: np.float64(0.5541045342727853), 1: np.float64(5.120684816166152)}

Training MLP (tabular) ...
Epoch 1/6
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7446 - loss: 0.3707 - val_accuracy: 0.8639 - val_loss: 0.2644
Epoch 2/6
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8779 - loss: 0.2271 - val_accuracy: 0.8841 - val_loss: 0.2368
Epoch 3/6
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8866 - loss: 0.2054 - val_accuracy: 0.8889 - val_loss: 0.2220
Epoch 4/6
[1m787/787[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8905 - loss