# Project B — Jane Street LightGBM pipeline (Auto-select real data via Kaggle if available)

**Summary:** This notebook attempts to download the Jane Street dataset from Kaggle if `~/.kaggle/kaggle.json` is present and the Kaggle CLI is available. If the download succeeds, it prepares features from the real dataset and trains a LightGBM baseline with date-grouped cross-validation. If not, it falls back to a synthetic sample so you can run the whole pipeline immediately.

**Result:** After running all cells, you'll have either trained LightGBM models on the real Jane Street data (files in `data/janestreet/`) or trained models on a synthetic sample (saved under the same folder).

## 0 — Requirements
Run this cell to install requirements if not already installed.

In [None]:
%%bash
python -m pip install --upgrade pip
python -m pip install numpy pandas pyarrow scikit-learn lightgbm kaggle matplotlib tqdm

## 1 — Download real Jane Street dataset via Kaggle if available, else create synthetic sample
This cell checks for `~/.kaggle/kaggle.json`. If present it will attempt to download the competition dataset; otherwise a synthetic sample is created.

In [None]:

import os, subprocess, shutil
import zipfile
from pathlib import Path

import numpy as np, pandas as pd

# --- Configuration -----------------------------------------------------------
DATA_DIR = Path('data/janestreet')
KAGGLE_TOKEN = Path.home() / '.kaggle' / 'kaggle.json'
COMPETITION = 'jane-street-market-prediction'
# File names that count as "real" training data for the rest of the notebook.
REAL_TRAIN_FILES = ('train.parquet', 'train.csv')
DATA_DIR.mkdir(parents=True, exist_ok=True)


def _real_train_file():
    """Return the first real training file present in DATA_DIR, or None."""
    for name in REAL_TRAIN_FILES:
        candidate = DATA_DIR / name
        if candidate.exists():
            return candidate
    return None


# --- Step 1: try to obtain the real dataset ----------------------------------
downloaded = False
existing = _real_train_file()
if existing is not None:
    # Avoid re-downloading a multi-GB archive on every Restart & Run All.
    print('Found existing real data file', existing, '- skipping Kaggle download.')
    downloaded = True
elif not KAGGLE_TOKEN.exists():
    print('No Kaggle token at', KAGGLE_TOKEN, '- skipping Kaggle download.')
elif shutil.which('kaggle') is None:
    print('Kaggle token found but the `kaggle` CLI is not on PATH - skipping Kaggle download.')
else:
    print('Kaggle token found. Attempting to download Jane Street dataset via Kaggle CLI (may be large).')
    try:
        # `kaggle competitions download` has no `--unzip` option (only
        # `kaggle datasets download` does), so download the archive and
        # extract it ourselves.
        cmd = ['kaggle', 'competitions', 'download', '-c', COMPETITION, '-p', str(DATA_DIR)]
        print('Running:', ' '.join(cmd))
        subprocess.run(cmd, check=True)
        for archive in sorted(DATA_DIR.glob('*.zip')):
            print('Extracting', archive)
            with zipfile.ZipFile(archive) as zf:
                zf.extractall(DATA_DIR)
        # Only report success if a usable training file actually appeared.
        found = _real_train_file()
        downloaded = found is not None
        if downloaded:
            print('Download complete. Real training data available at', found)
        else:
            print('Download finished but no train.csv or train.parquet was found in', DATA_DIR)
    except (subprocess.CalledProcessError, OSError, zipfile.BadZipFile) as e:
        # Best effort: any CLI / filesystem / archive failure falls back to
        # the synthetic sample below instead of aborting the notebook.
        print('Kaggle download failed with error:', e)
        downloaded = False

# --- Step 2: fall back to a small synthetic sample ---------------------------
if not downloaded:
    print('Creating small synthetic sample (train_sample.parquet) for quick runs.')
    n_dates = 10
    rows_per_date = 200
    dates = np.repeat(np.arange(n_dates), rows_per_date)
    ids = np.arange(len(dates))
    # Local generator: same stream as `np.random.seed(42)` but without
    # mutating NumPy's global random state.
    rng = np.random.RandomState(42)
    data = {'date': dates, 'id': ids}
    for i in range(10):
        # Standard-normal noise plus a small drift that grows with the date.
        data[f'feature_{i}'] = rng.normal(0, 1, size=len(dates)) + (dates * 0.01)
    # Target: noise plus a weak linear signal from the first two features.
    data['resp'] = 0.001 * (rng.randn(len(dates)) + data['feature_0']*0.5 - data['feature_1']*0.3)
    df = pd.DataFrame(data)
    out_path = DATA_DIR / 'train_sample.parquet'
    df.to_parquet(out_path, index=False)
    print('Synthetic sample saved to', out_path)


## 2 — Prepare features (works for real or synthetic data)
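This cell reads the first available input file from `data/janestreet/` (preferring `train.parquet`, then `train.csv`, then `train_sample.parquet`), computes within-date lag and rolling-mean features, and saves the result as `prepared.parquet`. The input file is selected automatically; to use a different file, edit the `candidates` list in the cell.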

In [None]:

import pandas as pd, os
from pathlib import Path

# --- Configuration -----------------------------------------------------------
DATA_DIR = Path('data/janestreet')
PREPARED_NAME = 'prepared.parquet'   # output of this cell; never use it as input
LAGS = (1, 2, 3)                     # within-date row lags to generate
ROLL_WINDOW = 3                      # window of the rolling mean
N_ROLL_FEATURES = 10                 # rolling mean is computed for the first N features only

# --- Locate the input file ---------------------------------------------------
candidates = ['train.parquet','train.csv','train_sample.parquet']
data_path = next((DATA_DIR / c for c in candidates if (DATA_DIR / c).exists()), None)
if data_path is None:
    # Fall back to any other parquet file, in a deterministic order, but skip
    # this cell's own output so features are never derived from derived features.
    leftovers = sorted(p for p in DATA_DIR.glob('*.parquet') if p.name != PREPARED_NAME)
    if leftovers:
        data_path = leftovers[0]
if data_path is None:
    raise RuntimeError('No data file found in data/janestreet/. Place train.csv or train.parquet, or let the previous cell create a sample.')
print('Using data file:', data_path)
df = pd.read_parquet(data_path) if data_path.suffix == '.parquet' else pd.read_csv(data_path)

# --- Order rows within each date ---------------------------------------------
# The synthetic sample carries `id`; the real Jane Street file carries `ts_id`.
# If neither exists, a stable sort keeps the file's original order within a date.
row_id = next((c for c in ('id', 'ts_id') if c in df.columns), None)
sort_cols = ['date'] if row_id is None else ['date', row_id]
df = df.sort_values(sort_cols, kind='mergesort').reset_index(drop=True)

# --- Derived features --------------------------------------------------------
feat_cols = [c for c in df.columns if str(c).startswith('feature')]
derived = []
if feat_cols:
    by_date = df.groupby('date', sort=False)
    for lag in LAGS:
        # Shift inside each date so lags never cross a date boundary; both the
        # resulting gaps and NaNs inherited from the source are filled with 0.
        lagged = by_date[feat_cols].shift(lag).fillna(0.0)
        lagged.columns = [f'{c}_lag{lag}' for c in feat_cols]
        derived.append(lagged)
    roll_cols = feat_cols[:N_ROLL_FEATURES]
    rolled = by_date[roll_cols].rolling(ROLL_WINDOW, min_periods=1).mean()
    # Drop the `date` index level added by groupby so rows align with `df` again.
    rolled = rolled.reset_index(level=0, drop=True).reindex(df.index)[roll_cols]
    rolled.columns = [f'{c}_rm{ROLL_WINDOW}' for c in roll_cols]
    derived.append(rolled)
else:
    print("Warning: no columns starting with 'feature' were found; no lag/rolling features created.")
# One concat instead of hundreds of single-column inserts (avoids a fragmented frame).
df = pd.concat([df] + derived, axis=1)

# --- Keep labelled rows and save ---------------------------------------------
target = 'resp' if 'resp' in df.columns else 'target'
if target not in df.columns:
    raise KeyError("Neither 'resp' nor 'target' column found in " + str(data_path))
train_df = df[df[target].notnull()]
out_path = DATA_DIR / PREPARED_NAME
train_df.to_parquet(out_path, index=False)
print('Prepared features saved to', out_path, 'rows=', len(train_df))


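## 3 — Train LightGBM baseline with GroupKFold (date-grouped CV)
This cell trains LightGBM using `date` as the group for CV, so all rows from one date fall in the same fold. Note that `GroupKFold` does not order folds chronologically, so a validation fold may precede some of its training dates; treat the score as a date-grouped estimate rather than a strict walk-forward one. Works on real or sample prepared data.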

In [None]:

import pandas as pd, numpy as np, os
from sklearn.model_selection import GroupKFold
import lightgbm as lgb

# --- Configuration -----------------------------------------------------------
OUT_DIR = 'data/janestreet'
DATA = 'data/janestreet/prepared.parquet'
MAX_SPLITS = 5
NUM_BOOST_ROUND = 500
EARLY_STOPPING_ROUNDS = 50
LOG_PERIOD = 50
params = {'objective':'regression','metric':'rmse','verbosity':-1,'learning_rate':0.05,'num_leaves':31,'seed':42}

if not os.path.exists(DATA):
    raise RuntimeError('Prepared data not found. Run previous cells.')
df = pd.read_parquet(DATA)
target = 'resp' if 'resp' in df.columns else 'target'

# --- Feature selection -------------------------------------------------------
# Exclude grouping/identifier columns and the target. Also exclude every other
# `resp*` column: in the real dataset `resp_1..resp_4` are returns over other
# horizons, so training on them leaks the label.
non_features = {'date', 'id', 'ts_id', target}
feats = [c for c in df.columns if c not in non_features and not str(c).startswith('resp')]
if not feats:
    raise RuntimeError('No feature columns found in ' + DATA)

# --- Date-grouped cross-validation -------------------------------------------
groups = df['date'].values
n_dates = df['date'].nunique()
n_splits = min(MAX_SPLITS, n_dates)   # GroupKFold needs n_splits <= number of groups
if n_splits < 2:
    raise RuntimeError('Need at least 2 distinct dates for grouped CV; found %d.' % n_dates)
gkf = GroupKFold(n_splits=n_splits)

# Remove fold models left over from earlier runs so the evaluation cell never
# mixes models trained with a different fold count or feature set.
for stale in os.listdir(OUT_DIR):
    if stale.startswith('lgb_fold') and stale.endswith('.txt'):
        os.remove(os.path.join(OUT_DIR, stale))

oof = np.zeros(len(df))
importances = []
for fold, (tr_idx, val_idx) in enumerate(gkf.split(df, df[target], groups)):
    tr = df.iloc[tr_idx]; val = df.iloc[val_idx]
    dtrain = lgb.Dataset(tr[feats], label=tr[target])
    dval = lgb.Dataset(val[feats], label=val[target])
    # LightGBM 4.0 removed the `early_stopping_rounds` / `verbose_eval` keyword
    # arguments of `lgb.train`; callbacks are the supported replacement.
    # The training set stays in `valid_sets` for logging only; the early-stopping
    # callback ignores it and monitors the validation fold.
    model = lgb.train(
        params,
        dtrain,
        num_boost_round=NUM_BOOST_ROUND,
        valid_sets=[dtrain, dval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=EARLY_STOPPING_ROUNDS),
            lgb.log_evaluation(period=LOG_PERIOD),
        ],
    )
    oof[val_idx] = model.predict(val[feats], num_iteration=model.best_iteration)
    importances.append(pd.DataFrame({'feature':feats, 'gain':model.feature_importance(importance_type='gain')}))
    model.save_model(f'data/janestreet/lgb_fold{fold}.txt')

# --- Out-of-fold score and averaged importances ------------------------------
rmse = float(np.sqrt(np.mean((oof - df[target].values) ** 2)))
print('OOF RMSE:', rmse)
imp = pd.concat(importances).groupby('feature').mean().reset_index().sort_values('gain', ascending=False)
imp.to_csv('data/janestreet/feature_importances.csv', index=False)
print('Saved feature importances and fold models to data/janestreet/')

## 4 — Simple backtest / evaluation (uses saved fold models if present)
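Demonstrates a simple sign-based strategy: go long when the averaged fold-model prediction is positive and short when it is negative. The fold models are applied to the same rows they were trained on, so the resulting curve is in-sample and optimistic — use it as a smoke test of the pipeline, not as a performance estimate.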

In [None]:

import pandas as pd, numpy as np, os
import matplotlib.pyplot as plt

MODEL_DIR = 'data/janestreet'

df = pd.read_parquet('data/janestreet/prepared.parquet').reset_index(drop=True)
if len(df) == 0:
    raise RuntimeError('Prepared data is empty; nothing to evaluate.')

# Sorted so the set of models (and any log output) is deterministic.
models = sorted(p for p in os.listdir(MODEL_DIR) if p.startswith('lgb_fold') and p.endswith('.txt'))

# --- Average the predictions of all saved fold models ------------------------
preds = np.zeros(len(df))   # with no models the strategy stays flat (position 0)
if models:
    import lightgbm as lgb
    # Used only if a model's recorded feature names cannot be matched to columns.
    fallback_feats = [c for c in df.columns if c not in ['date','id','resp','target']]
    for m in models:
        mdl = lgb.Booster(model_file=os.path.join(MODEL_DIR, m))
        # Ask the model which columns it was trained on instead of re-deriving
        # the list, so extra columns in the prepared file cannot misalign inputs.
        model_feats = mdl.feature_name()
        feats = model_feats if all(f in df.columns for f in model_feats) else fallback_feats
        preds += mdl.predict(df[feats], num_iteration=mdl.best_iteration)
    preds /= len(models)
    print('Note: each fold model saw most of these rows during training, so this backtest is in-sample and optimistic.')

# --- Sign strategy vs. always-long -------------------------------------------
actual = df['resp'].values if 'resp' in df.columns else df['target'].values
positions = np.sign(preds)            # +1 long, -1 short, 0 flat
strategy_ret = positions * actual
# Compound the per-row returns in row order.
cum_strategy = np.cumprod(1 + strategy_ret) - 1
cum_buy = np.cumprod(1 + actual) - 1

fig, ax = plt.subplots(figsize=(8,4))
ax.plot(cum_strategy, label='Strategy')
ax.plot(cum_buy, label='Buy & Hold')
ax.set_xlabel('Row (sorted by date)')
ax.set_ylabel('Cumulative return')
ax.set_title('Cumulative returns (strategy vs buy & hold)')
ax.legend()
plt.show()
print('Final strategy return:', cum_strategy[-1], 'Final buy & hold:', cum_buy[-1])

----

**End of Notebook B.**
