In [None]:
# === CONFIG: set your DB connection string and symbol ===
# DB_CONNECTION is the SQLAlchemy connection string used to talk to a database; when it is
# None the DB cell is skipped and the synthetic demo can be run instead.
# It is read from the DB_CONNECTION environment variable so that credentials do not have to be
# typed into (and saved with) the notebook. Example (Postgres), set before starting Jupyter:
#   export DB_CONNECTION='postgresql+psycopg2://user:password@host:5432/dbname'
import os

# An unset or empty variable is normalised to None ("no database configured").
DB_CONNECTION = os.environ.get('DB_CONNECTION') or None
SYMBOL = 'BANKNIFTY'  # change as needed

print('DB_CONNECTION set?' , DB_CONNECTION is not None)
print('SYMBOL:', SYMBOL)

In [None]:
# === LOAD LABELED MINUTE DATA (table `labeled_minute`; presumably built upstream from trades_minute & options_oi_minute) ===
# This cell fetches data from the DB if DB_CONNECTION is set. Otherwise, run the synthetic demo below.

import pandas as pd
import numpy as np
from sqlalchemy import create_engine

if DB_CONNECTION:
    # `text` wraps the statement so that a named bind parameter (:symbol) can be used.
    from sqlalchemy import text

    engine = create_engine(DB_CONNECTION)
    # SYMBOL is bound as a query parameter instead of being formatted into the SQL string,
    # so a symbol containing quotes cannot break or alter the statement.
    sql = text("SELECT * FROM labeled_minute WHERE symbol = :symbol ORDER BY minute_ts")
    df = pd.read_sql(sql, engine, params={'symbol': SYMBOL}, parse_dates=['minute_ts'])
    print('Loaded rows from DB:', len(df))
else:
    print('DB_CONNECTION not set — run the synthetic demo section or set DB_CONNECTION to your connection string')

# `df` can also be left over from an earlier run in the same kernel; only warn when it is absent.
if 'df' not in globals():
    print('No df in memory — run synthetic demo or set DB connection.')


In [None]:
# === Synthetic demo: data generation, features, detectors, model, and plots ===
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, precision_score, roc_curve, auc
from sklearn.model_selection import GroupKFold

# 1) Generate synthetic 7 trading days of minute data (assuming 390 minutes per day)
TRADING_MINUTES_PER_DAY = 390
DAYS = 7
SESSION_OPEN = pd.Timedelta(hours=9, minutes=15)  # first bar of every session is stamped 09:15

# Build one 390-minute session per business day. A single continuous minute range of
# 390*7 periods would run through the night and span only three calendar dates, which
# leaves too few per-date groups for the 4-fold grouped cross-validation further down and
# makes minute_of_day cover the whole clock instead of the trading session.
session_starts = pd.bdate_range(start='2025-07-01', periods=DAYS) + SESSION_OPEN
minute_offsets = pd.to_timedelta(np.arange(TRADING_MINUTES_PER_DAY), unit='min')
rng = pd.DatetimeIndex(
    np.repeat(session_starts.values, TRADING_MINUTES_PER_DAY)
    + np.tile(minute_offsets.values, DAYS)
)

# Seed the global NumPy generator so every draw below is reproducible.
np.random.seed(42)
price = 42000 + np.cumsum(np.random.normal(scale=1.0, size=len(rng)))  # synthetic underlying (random walk)
vwap = price + np.random.normal(scale=0.2, size=len(rng))
volume = np.random.poisson(lam=2000, size=len(rng))
# Roughly 0.5% of minutes carry a block trade of 100-999 units; all other minutes get 0.
block_qty = (np.random.rand(len(rng)) < 0.005).astype(int) * np.random.randint(100,1000,size=len(rng))

df_demo = pd.DataFrame({'minute_ts': rng, 'symbol': 'BANKNIFTY', 'price_avg': price, 'vwap': vwap, 'volume': volume, 'block_qty': block_qty})
# Minutes since midnight; used later as the key of the per-minute VWAP baseline.
df_demo['minute_of_day'] = df_demo['minute_ts'].dt.hour*60 + df_demo['minute_ts'].dt.minute

def insert_suspicious_windows(df, windows):
    """Return a copy of ``df`` with a binary ``label`` column marking suspicious windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is left unmodified.
    windows : iterable of (start, end) pairs
        Index labels bounding each window. The rows are selected through ``.loc``
        slicing, so both endpoints are included.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` whose ``label`` column is 1 inside any window and 0 elsewhere.
    """
    labelled = df.assign(label=0)
    for window in windows:
        first, last = window
        labelled.loc[slice(first, last), 'label'] = 1
    return labelled

# Mark two synthetic event windows; each tuple is an inclusive (first, last) pair of row labels.
windows = [
    (500, 520),
    (2000, 2030),
]
df_demo = insert_suspicious_windows(df_demo, windows=windows)

# 2) Features: price returns, rolling volume, VWAP deviation from a per-minute baseline, OBV
avg_price = df_demo['price_avg']
for horizon in (1, 5):
    # The first `horizon` rows have no reference price; their return is reported as 0.
    df_demo[f'price_ret_{horizon}m'] = avg_price.pct_change(horizon).fillna(0)
df_demo['vol_5m'] = df_demo['volume'].rolling(window=5, min_periods=1).sum()

# Baseline VWAP per minute_of_day: the mean over every row in the frame (no look-back restriction).
baseline = df_demo.groupby('minute_of_day')['vwap'].mean().rename('vwap_base_mean')
df_demo = df_demo.join(baseline, on='minute_of_day', how='left')

# VWAP deviation from the baseline, scaled by the 60-row rolling std of the baseline itself.
# A zero or undefined scale produces NaN, which is then reported as 0.
baseline_std = df_demo['vwap_base_mean'].rolling(window=60, min_periods=1).std()
vwap_gap = df_demo['vwap'] - df_demo['vwap_base_mean']
df_demo['vwap_dev'] = (vwap_gap / baseline_std.replace(0, np.nan)).fillna(0)

# Relative volume: 5-minute volume over its own 390-row rolling median (simple proxy).
vol_5m_typical = df_demo['vol_5m'].rolling(window=390, min_periods=1).median()
df_demo['rel_vol_5m'] = (df_demo['vol_5m'] / vol_5m_typical.replace(0, np.nan)).fillna(0)

# OBV-like cumulative signed volume; a flat minute (zero return) counts as an up-tick.
direction = np.sign(df_demo['price_ret_1m']).replace(0, 1)
df_demo['side_sign'] = direction
df_demo['obv'] = (df_demo['volume'] * direction).cumsum()
df_demo['obv_slope_5m'] = df_demo['obv'].diff(5).fillna(0)

# Synthetic open-interest change: small random noise drawn from the global NumPy generator.
df_demo['delta_oi'] = np.random.normal(scale=5, size=len(df_demo))

# Share of the minute's volume traded as blocks; minutes with zero volume get 0.
nonzero_volume = df_demo['volume'].replace(0, np.nan)
df_demo['block_share'] = (df_demo['block_qty'] / nonzero_volume).fillna(0)

# 3) Rule-based detector: alert when |vwap_dev| > 2.5 AND rel_vol_5m > 2
high_dev = df_demo['vwap_dev'].abs() > 2.5
high_vol = df_demo['rel_vol_5m'] > 2
z_alert = (high_dev & high_vol).astype(int)
df_demo['z_alert'] = z_alert

# 4) Simple CUSUM detector on price_avg

def cusum(series, threshold=0.5):
    """Flag samples where a two-sided CUSUM of first differences exceeds 1.0.

    Two running statistics are updated from each step ``x[i] - x[i-1]``:
    an upward sum ``pos`` (step minus ``threshold``, floored at 0) and a downward
    sum ``neg`` (step plus ``threshold``, capped at 0). A sample is flagged when
    ``pos > 1.0`` or ``abs(neg) > 1.0``. The statistics are not reset after a flag,
    so consecutive samples can stay flagged until the sum decays.

    Parameters
    ----------
    series : 1-D array-like of numbers
        Values to monitor (a pandas Series, NumPy array or list).
    threshold : float, default 0.5
        Allowance subtracted from upward steps and added to downward steps.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 flags with the same length as ``series``. The first
        element is always 0 because it has no preceding sample; empty input
        yields an empty array.
    """
    values = np.asarray(series, dtype=float)
    if values.size == 0:
        # Nothing to monitor: keep the output aligned with the (empty) input.
        return np.array([], dtype=int)

    pos, neg = 0.0, 0.0
    out = [0]
    for diff in np.diff(values).tolist():
        pos = max(0.0, pos + diff - threshold)
        neg = min(0.0, neg + diff + threshold)
        out.append(1 if (pos > 1.0 or abs(neg) > 1.0) else 0)
    return np.array(out, dtype=int)

df_demo['cusum_alert'] = cusum(df_demo['price_avg'])
# A minute is rule-flagged when either the z-score rule or the CUSUM detector fires.
df_demo['rule_alert'] = ((df_demo['z_alert']==1) | (df_demo['cusum_alert']==1)).astype(int)

# 5) Quick evaluation of rule detector
rule_precision = precision_score(df_demo['label'], df_demo['rule_alert'], zero_division=0)
print('Rule detector precision:', round(rule_precision,3))

# 6) Train a RandomForest triage model with GroupKFold by date
feat_cols = ['vwap_dev','rel_vol_5m','obv_slope_5m','delta_oi','block_share','price_ret_1m','price_ret_5m']
X = df_demo[feat_cols].fillna(0)
y = df_demo['label']
df_demo['date'] = df_demo['minute_ts'].dt.date

# GroupKFold raises ValueError when n_splits exceeds the number of distinct groups,
# so cap the fold count at the number of dates actually present.
n_dates = df_demo['date'].nunique()
n_splits = min(4, n_dates)
if n_splits < 2:
    raise ValueError(f'Need at least 2 distinct dates for grouped cross-validation, got {n_dates}')

gkf = GroupKFold(n_splits=n_splits)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
# Out-of-fold scores: every row is scored by a model that never saw its date.
probs = np.zeros(len(df_demo))
for train_idx, test_idx in gkf.split(X, y, groups=df_demo['date']):
    rf.fit(X.iloc[train_idx], y.iloc[train_idx])
    fold_proba = rf.predict_proba(X.iloc[test_idx])
    # predict_proba has one column per class seen during fit. When a training fold holds a
    # single class there is no column 1, so look the positive class up explicitly and fall
    # back to a score of 0 if the model never saw a positive example.
    pos_cols = np.flatnonzero(rf.classes_ == 1)
    probs[test_idx] = fold_proba[:, pos_cols[0]] if pos_cols.size else 0.0

auc_score = roc_auc_score(y, probs)
print('Cross-validated ROC AUC (RF):', round(auc_score,3))

df_demo['prob'] = probs

# 7) Precision@20 (top 20 minutes)
topk = df_demo.sort_values('prob', ascending=False).head(20)
prec20 = topk['label'].sum() / 20.0
print('Precision@20:', round(prec20,3))

# 8) Plots: time series of prob and labels, and ROC curve
from pathlib import Path

import matplotlib.pyplot as plt

# Out-of-fold model score for every minute.
fig, ax = plt.subplots(figsize=(10,3))
ax.plot(df_demo['minute_ts'], df_demo['prob'])
ax.set_title('Model score (prob) over time — synthetic demo')
ax.set_xlabel('time')
ax.set_ylabel('prob')
fig.tight_layout()
plt.show()

# Ground-truth label, for visual comparison with the score above.
fig, ax = plt.subplots(figsize=(10,2))
ax.plot(df_demo['minute_ts'], df_demo['label'])
ax.set_title('Label (synthetic suspicious windows)')
ax.set_xlabel('time')
ax.set_ylabel('label')
fig.tight_layout()
plt.show()

fpr, tpr, _ = roc_curve(y, probs)
roc_auc = auc(fpr, tpr)
fig, ax = plt.subplots(figsize=(5,5))
ax.plot(fpr, tpr)
ax.set_title(f'ROC (AUC={roc_auc:.3f})')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
fig.tight_layout()
plt.show()

# Export location: /mnt/data when it exists or can be created, otherwise the current
# working directory, so the export does not fail on machines without that mount.
OUTPUT_DIR = Path('/mnt/data')
try:
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
except OSError:
    OUTPUT_DIR = Path.cwd()

# The 50 highest-scoring minutes with their features, for manual triage.
alerts = df_demo.sort_values('prob', ascending=False).head(50)[['minute_ts','symbol','prob','label']+feat_cols]
alerts_path = OUTPUT_DIR / 'demo_top_alerts.csv'
alerts.to_csv(alerts_path, index=False)
print(f'Saved demo top alerts to {alerts_path}')

minute_data_path = OUTPUT_DIR / 'demo_minute_data.csv'
df_demo.to_csv(minute_data_path, index=False)
print(f'Saved demo minute data to {minute_data_path}')
