<a href="https://colab.research.google.com/github/Nikhil-gitub/23CSBTB27_PDS/blob/main/Reinforcement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os, random, numpy as np, pandas as pd
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras import layers, models, optimizers

# ---------------- Config ----------------
DATA_PATH = "stocks.csv"
HIGH_VOL_PERCENTILE = 90
ROLL_WINDOW = 5
TEST_SIZE = 0.2
SEED = 42
np.random.seed(SEED); random.seed(SEED); tf.random.set_seed(SEED)

TABULAR_EPISODES = 5   # fewer episodes
PG_EPOCHS, AC_EPOCHS = 1, 1   # reduced epochs for speed

# ---------------- Load & preprocess ----------------
df = pd.read_csv(DATA_PATH)
price_col = next((c for c in df.columns if "price" in c.lower()), df.select_dtypes(include=[np.number]).columns[0])
if any("time" in c.lower() or "date" in c.lower() for c in df.columns):
    tcol = next(c for c in df.columns if "time" in c.lower() or "date" in c.lower())
    df = df.sort_values(tcol)
df['price'] = pd.to_numeric(df[price_col], errors='coerce')
df['return'] = df['price'].pct_change().fillna(0)
df['abs_return'] = df['return'].abs()
df['rolling_vol'] = df['abs_return'].rolling(ROLL_WINDOW, min_periods=1).std().fillna(0)
df['rolling_mean_ret'] = df['return'].rolling(ROLL_WINDOW, min_periods=1).mean().fillna(0)
df['volume'] = np.random.randint(100, 1000, len(df))
thresh = np.percentile(df['abs_return'], HIGH_VOL_PERCENTILE)
df['high_vol_next'] = (df['abs_return'].shift(-1) > thresh).astype(int)
df = df.dropna().reset_index(drop=True)

features = ['price','return','rolling_vol','rolling_mean_ret','volume']
X, y = df[features].values, df['high_vol_next'].values
X = StandardScaler().fit_transform(X)
split = int(len(X)*(1-TEST_SIZE))
X_train,X_test,y_train,y_test = X[:split],X[split:],y[:split],y[split:]

# ---------------- Environment ----------------
class VolEnv:
    def __init__(self,X,y): self.X,self.y,self.n=X,y,len(y)
    def reset(self): self.i=0; self.done=False; return self.X[self.i]
    def step(self,action):
        r = 1 if action==self.y[self.i] else -1
        self.i+=1
        if self.i>=self.n: self.done=True; return None,r,True,{}
        return self.X[self.i],r,False,{}

# ---------------- Tabular Q ----------------
print("\n=== Tabular Q ===")
kbd = KBinsDiscretizer(n_bins=5,encode='ordinal',strategy='uniform')
Xd = kbd.fit_transform(X_train)[:,:2].astype(int)
Q = np.zeros((25,2))
env = VolEnv(X_train,y_train)
for _ in range(TABULAR_EPISODES):
    s = env.reset(); si = int(Xd[0][0]*5+Xd[0][1])
    while not env.done:
        a = np.argmax(Q[si]) if random.random()>0.1 else random.randint(0,1)
        _,r,done,_ = env.step(a)
        if done: break
        nxt = int(Xd[min(env.i,len(Xd)-1)][0]*5+Xd[min(env.i,len(Xd)-1)][1])
        Q[si,a]+=0.5*(r+0.9*np.max(Q[nxt])-Q[si,a]); si=nxt
preds_tab = np.argmax([Q[int(s[0]*5+s[1])] for s in kbd.transform(X_test)[:,:2]],axis=1)
print("Acc:", accuracy_score(y_test,preds_tab))

# ---------------- REINFORCE ----------------
print("\n=== REINFORCE ===")
policy=models.Sequential([
    layers.Input((X_train.shape[1],)),
    layers.Dense(8,activation='relu'),  # smaller net
    layers.Dense(2,activation='softmax')
])
opt=optimizers.Adam(0.001)
def ret(r,g=0.99):
    R=0;out=[]
    for x in r[::-1]:
        R=x+g*R;out.append(R)
    out=out[::-1];return (np.array(out)-np.mean(out))/(np.std(out)+1e-8)
for _ in range(PG_EPOCHS):
    env=VolEnv(X_train,y_train)
    s=env.reset().reshape(1,-1); states,acts,rews=[],[],[]
    while not env.done:
        p=policy(s).numpy()[0]; a=np.random.choice(2,p=p)
        ns,r,d,_=env.step(a); states.append(s[0]); acts.append(a); rews.append(r)
        if not d: s=ns.reshape(1,-1)
    R=ret(rews)
    with tf.GradientTape() as tape:
        probs=policy(np.array(states))
        logp=tf.math.log(tf.reduce_sum(probs*tf.one_hot(acts,2),axis=1)+1e-8)
        loss=-tf.reduce_mean(logp*R)
    grads=tape.gradient(loss,policy.trainable_variables)
    opt.apply_gradients(zip(grads,policy.trainable_variables))
preds_pg = np.argmax(policy.predict(X_test,verbose=0),axis=1)
print("Acc:",accuracy_score(y_test,preds_pg))

# ---------------- Actor-Critic ----------------
print("\n=== Actor-Critic ===")
actor=models.Sequential([
    layers.Input((X_train.shape[1],)),
    layers.Dense(8,activation='relu'),  # smaller net
    layers.Dense(2,activation='softmax')
])
critic=models.Sequential([
    layers.Input((X_train.shape[1],)),
    layers.Dense(8,activation='relu'),
    layers.Dense(1,activation='linear')
])
critic.compile(optimizers.Adam(0.001),loss='mse'); aopt=optimizers.Adam(0.001)
for _ in range(AC_EPOCHS):
    env=VolEnv(X_train,y_train)
    s=env.reset().reshape(1,-1); states,acts,rews=[],[],[]
    while not env.done:
        p=actor(s).numpy()[0]; a=np.random.choice(2,p=p)
        ns,r,d,_=env.step(a); states.append(s[0]); acts.append(a); rews.append(r)
        if not d: s=ns.reshape(1,-1)
    R=ret(rews); vals=critic(np.array(states)).numpy().flatten()
    adv=R-vals; critic.train_on_batch(np.array(states),R)
    with tf.GradientTape() as tape:
        probs=actor(np.array(states))
        logp=tf.math.log(tf.reduce_sum(probs*tf.one_hot(acts,2),axis=1)+1e-8)
        loss=-tf.reduce_mean(logp*adv)
    grads=tape.gradient(loss,actor.trainable_variables)
    aopt.apply_gradients(zip(grads,actor.trainable_variables))
preds_ac = np.argmax(actor.predict(X_test,verbose=0),axis=1)
print("Acc:",accuracy_score(y_test,preds_ac))

# ---------------- DQN (supervised pretrain fast) ----------------
print("\n=== DQN (supervised pretrain) ===")
def build_q():
    m=models.Sequential([
        layers.Input((X_train.shape[1],)),
        layers.Dense(16,activation='relu'),
        layers.Dense(2,activation='softmax')
    ])
    m.compile(optimizers.Adam(0.001),loss='categorical_crossentropy',metrics=['accuracy'])
    return m
q=build_q()
y_train_oh=tf.keras.utils.to_categorical(y_train,2)
q.fit(X_train,y_train_oh,epochs=5,batch_size=64,verbose=0)  # fewer epochs, bigger batch
preds_dqn = np.argmax(q.predict(X_test,verbose=0),axis=1)
print("Acc:",accuracy_score(y_test,preds_dqn))

# ---------------- Summary ----------------
print("\nSummary:")
for name,acc in {
    "TabQ":accuracy_score(y_test,preds_tab),
    "REINF":accuracy_score(y_test,preds_pg),
    "A2C":accuracy_score(y_test,preds_ac),
    "DQN":accuracy_score(y_test,preds_dqn)
}.items():
    print(f"{name}: {acc:.3f}")



=== Tabular Q ===
Acc: 0.8907117801812335

=== REINFORCE ===
Acc: 0.5157117801812335

=== Actor-Critic ===
Acc: 0.7390382928968138

=== DQN (supervised pretrain) ===
Acc: 0.9113928675825782

Summary:
TabQ: 0.891
REINF: 0.516
A2C: 0.739
DQN: 0.911
