In [None]:
#!pip install seedir

In [None]:
import os
import copy
import glob
import numpy as np
import pandas as pd
from sklearn import model_selection
import xgboost as xgb
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import shap
#import seedir as sd

In [None]:
#data_dir = '/kaggle/input/optiver-realized-volatility-prediction'
#sd.seedir(data_dir, style='emoji')

### Data structure

📁 input/<br>
└──📁 optiver-realized-volatility-prediction/<br>
      &emsp;&emsp;├──📁 trade_train.parquet/<br>
      &emsp;&emsp;│&emsp; ├──📁 stock_id=97/<br>
      &emsp;&emsp;│&emsp; │&emsp; └──📄 888f813404d8417ca8d6b8aebd5f2951.parquet<br>
      &emsp;&emsp;│&emsp; ├──📁 stock_id=43/<br>
      &emsp;&emsp;│&emsp; │&emsp;└──📄 bb0efa57f511470e817880842e3e2afa.parquet<br>
      &emsp;&emsp;│&emsp; ├──📁 ...<br>
      &emsp;&emsp;│&emsp; ├──📁 ...<br>
      &emsp;&emsp;├──📁 book_train.parquet/<br>
      &emsp;&emsp;│&emsp; ├──📁 stock_id=97/<br>
      &emsp;&emsp;│&emsp; │&emsp; └──📄 52e74e4ef0d84c5c989fc4704e46b527.parquet<br>
      &emsp;&emsp;│&emsp; ├──📁 stock_id=43/<br>
      &emsp;&emsp;│&emsp; │&emsp; └──📄 ce8f54442b1142338d2e4b02b9dc578a.parquet<br>
      &emsp;&emsp;│&emsp; ├──📁 ...<br>
      &emsp;&emsp;│&emsp; ├──📁 ...<br>
      &emsp;&emsp;├──📁 trade_test.parquet/<br>
      &emsp;&emsp;│&emsp; └──📁 stock_id=0/<br>
      &emsp;&emsp;│&emsp; │&emsp; └──📄 31c83a67d81349208e7d5eace9dbbac8.parquet<br>
      &emsp;&emsp;├──📁 book_test.parquet/<br>
      &emsp;&emsp;│&emsp; └──📁 stock_id=0/<br>
      &emsp;&emsp;│&emsp; │&emsp; └──📄 7832c05caae3489cbcbbb9b02cf61711.parquet<br>
      &emsp;&emsp;├──📄 sample_submission.csv<br>
      &emsp;&emsp;├──📄 train.csv<br>
      &emsp;&emsp;└──📄 test.csv

### Parquet files

From the notebook https://www.kaggle.com/sohier/working-with-parquet:

Apache Parquet is an efficient columnar storage format. Compared to saving this dataset as CSV files, using Parquet:

* Greatly reduces the necessary disk space
* Loads the data into Pandas with memory efficient datatypes
* Enables fast reads from disk
* Allows us to easily work with partitions of the data

Pandas has a Parquet integration that makes loading data into a DataFrame trivial.


### **Goal: predict the `target` column of train.csv using features built from book_train/ and trade_train/**

More info about data can be found here https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data

In [None]:
# Root folder of the competition dataset; the other cells build their paths from it.
data_dir = '/kaggle/input/optiver-realized-volatility-prediction'

# train.csv holds the labelled rows; show the first few as a sanity check.
train_csv_path = f'{data_dir}/train.csv'
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# 2x2 grid: histograms on the top row, box plots on the bottom row;
# `target` in the left column, `stock_id` in the right column.
plt.rcParams["figure.figsize"] = (12, 8)
fig, axes = plt.subplots(nrows=2, ncols=2)

ax_target_hist, ax_stock_hist = axes[0]
ax_target_box, ax_stock_box = axes[1]

target_col = df_train['target']
target_col.hist(bins=100, ax=ax_target_hist)
target_col.to_frame().boxplot(ax=ax_target_box)

stock_id_col = df_train['stock_id']
stock_id_col.hist(bins=100, ax=ax_stock_hist)
stock_id_col.to_frame().boxplot(ax=ax_stock_box)

ax_target_hist.set_title('Target distribution')
ax_target_box.set_title('Target box plot')
ax_stock_hist.set_title('Stock id distribution')
ax_stock_box.set_title('Stock id box plot')

In [None]:
# List every id between 0 and the largest stock_id in train.csv that has no
# rows. The upper bound is derived from the data instead of being hard-coded,
# so gaps near the top of the id range are not silently skipped and ids above
# the real maximum are not reported as gaps.
print('Gaps in the stock_id histogram above:')

present_stock_ids = set(df_train['stock_id'].unique().tolist())
max_stock_id = int(df_train['stock_id'].max())
missing_stock_ids = [
    stock_id for stock_id in range(max_stock_id + 1)
    if stock_id not in present_stock_ids
]
print(missing_stock_ids)

In [None]:
# Peek at the rows a submission has to cover.
test_csv_path = f'{data_dir}/test.csv'
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Load the order-book data of a single stock (stock_id == 0) for exploration.
# book_train.parquet is a directory partitioned by stock_id, so the `filters`
# argument lets the parquet engine skip the other partitions instead of
# loading every stock into memory and discarding all but one afterwards.
book_train_0 = pd.read_parquet(
    f'{data_dir}/book_train.parquet',
    filters=[('stock_id', '=', 0)],
)
# Kept as a safety net: guarantees the same rows even if the engine ignores `filters`.
book_train_0 = book_train_0[book_train_0['stock_id'] == 0]
book_train_0.head()

In [None]:
# Distribution of each order-book column for stock 0.
book_train_0.hist(bins=50, figsize=(15, 15))
plt.show()

In [None]:
# Load the executed trades of a single stock (stock_id == 0) for exploration.
# trade_train.parquet is a directory partitioned by stock_id, so the `filters`
# argument lets the parquet engine skip the other partitions instead of
# loading every stock into memory and discarding all but one afterwards.
trade_train_0 = pd.read_parquet(
    f'{data_dir}/trade_train.parquet',
    filters=[('stock_id', '=', 0)],
)
# Kept as a safety net: guarantees the same rows even if the engine ignores `filters`.
trade_train_0 = trade_train_0[trade_train_0['stock_id'] == 0]
trade_train_0.head()

In [None]:
# Distribution of each trade column for stock 0.
trade_train_0.hist(bins=50, figsize=(15, 15))
plt.show()

# Feature engineering

Features are constructed from the book_train/ and trade_train/ files. Besides `realized_volatility`, the feature set contains summary statistics (min, max, mean, median and std) for each of the raw variables.

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns.

    `series_log_return` must support element-wise `** 2`
    (for example a pandas Series or a NumPy array).
    """
    squared_returns = series_log_return ** 2
    sum_of_squares = np.sum(squared_returns)
    return np.sqrt(sum_of_squares)

def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input has to be a pandas object, because `.diff()` is called on the
    result of `np.log`. The first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def prepare_book_features(file_path, raw_features, agg_f):
    """Build per-time_id order-book features for one stock.

    Parameters
    ----------
    file_path : str
        Path of one stock's parquet partition, e.g.
        '.../book_train.parquet/stock_id=0'. The stock id is the text between
        the first and second '=' of the path (everything after the first '='
        when there is only one).
    raw_features : list of str
        Columns to aggregate. 'seconds_in_bucket' is renamed to
        'seconds_in_bucket_book' and 'wap' / 'log_return' are derived here,
        so those names may be used as well.
    agg_f : list
        Aggregations applied to every column of `raw_features`. This must be a
        list so that the aggregated columns come back as (column, aggregation)
        pairs that can be joined into '<column>_<aggregation>' names.

    Returns
    -------
    pandas.DataFrame
        One row per time_id with the columns 'time_id',
        'realized_volatility', one '<column>_<aggregation>' column per
        requested pair, and 'row_id' formatted as '<stock_id>-<time_id>'.
    """
    df_book_data = pd.read_parquet(file_path)
    df_book_data = df_book_data.rename(columns={'seconds_in_bucket': 'seconds_in_bucket_book'})

    # Weighted average price: each level-1 price is weighted by the size on
    # the opposite side of the book.
    df_book_data['wap'] = (
        df_book_data['bid_price1'] * df_book_data['ask_size1']
        + df_book_data['ask_price1'] * df_book_data['bid_size1']
    ) / (df_book_data['bid_size1'] + df_book_data['ask_size1'])

    # `transform` returns a result aligned with the original row index on
    # every pandas version. `apply` prepends the group key to the index on
    # pandas >= 2.0, which makes the column assignment fail.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    # The first row of every time_id has no previous price, hence a NaN return.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()]

    df_stat = df_book_data.groupby('time_id').agg({ckey: agg_f for ckey in raw_features})
    df_stat.columns = df_stat.columns.map('_'.join)
    df_stat = df_stat.reset_index()

    df_realized_vol_per_stock = (
        df_book_data.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .rename('realized_volatility')
        .reset_index()
    )

    # time_id is the only column the two frames share; name it explicitly.
    df_realized_vol_per_stock = df_realized_vol_per_stock.merge(df_stat, on='time_id', how='left')

    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )

    return df_realized_vol_per_stock

def prepare_trade_features(file_path, raw_features, agg_f):
    """Aggregate one stock's trade data into one row per time_id.

    `file_path` is the stock's parquet partition; the stock id is read from
    the text after the first '=' in the path. Every column in `raw_features`
    is aggregated with `agg_f` ('seconds_in_bucket' is available under the
    name 'seconds_in_bucket_trade'). The result has 'time_id', the joined
    '<column>_<aggregation>' columns and 'row_id' ('<stock_id>-<time_id>').
    """
    trades = pd.read_parquet(file_path).rename(
        columns={'seconds_in_bucket': 'seconds_in_bucket_trade'}
    )

    aggregation_spec = {}
    for column in raw_features:
        aggregation_spec[column] = agg_f

    per_time_id = trades.groupby('time_id').agg(aggregation_spec)
    # Flatten the (column, aggregation) pairs into single names.
    per_time_id.columns = ['_'.join(pair) for pair in per_time_id.columns]
    per_time_id = per_time_id.reset_index()

    stock_id = file_path.split('=')[1]
    per_time_id['row_id'] = per_time_id['time_id'].map(lambda x: f'{stock_id}-{x}')

    return per_time_id

def process_book_files(files_dir, book_raw_features, agg_f):
    """Compute book features for every path matched by a glob pattern.

    Parameters
    ----------
    files_dir : str
        Glob pattern, e.g. '.../book_train.parquet/*'.
    book_raw_features, agg_f
        Forwarded unchanged to `prepare_book_features`.

    Returns
    -------
    pandas.DataFrame
        The per-file feature frames stacked row-wise, each keeping its own
        index. An empty DataFrame is returned when the pattern matches nothing.
    """
    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and growing a frame inside a loop copies it on every pass.
    frames = [
        prepare_book_features(file, book_raw_features, agg_f)
        for file in tqdm(glob.glob(files_dir))
    ]

    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result instead.
        return pd.DataFrame()

    return pd.concat(frames)

def process_trade_files(files_dir, trade_raw_features, agg_f):
    """Compute trade features for every path matched by a glob pattern.

    Parameters
    ----------
    files_dir : str
        Glob pattern, e.g. '.../trade_train.parquet/*'.
    trade_raw_features, agg_f
        Forwarded unchanged to `prepare_trade_features`.

    Returns
    -------
    pandas.DataFrame
        The per-file feature frames stacked row-wise, each keeping its own
        index. An empty DataFrame is returned when the pattern matches nothing.
    """
    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and growing a frame inside a loop copies it on every pass.
    frames = [
        prepare_trade_features(file, trade_raw_features, agg_f)
        for file in tqdm(glob.glob(files_dir))
    ]

    if not frames:
        # pd.concat raises on an empty list; keep the empty-frame result instead.
        return pd.DataFrame()

    return pd.concat(frames)

In [None]:
# Aggregations computed for every raw column within each time_id.
agg_f = ['min', 'max', 'mean', 'std', 'median']

# Order-book columns to aggregate; 'wap' and 'log_return' are derived inside
# prepare_book_features, the rest come from the parquet files.
book_raw_features = [
    'seconds_in_bucket_book',
    'bid_price1', 'ask_price1', 'bid_price2', 'ask_price2',
    'bid_size1', 'ask_size1', 'bid_size2', 'ask_size2',
    'wap', 'log_return',
]

# Trade columns to aggregate.
trade_raw_features = [
    'seconds_in_bucket_trade',
    'price',
    'size',
    'order_count',
]

book_train_pattern = f'{data_dir}/book_train.parquet/*'
trade_train_pattern = f'{data_dir}/trade_train.parquet/*'

book_features = process_book_files(book_train_pattern, book_raw_features, agg_f)
trade_features = process_trade_files(trade_train_pattern, trade_raw_features, agg_f)

In [None]:
# First rows of the aggregated order-book features.
book_features.head(n=5)

In [None]:
# First rows of the aggregated trade features.
trade_features.head(n=5)

In [None]:
# Join book and trade features on row_id ('<stock_id>-<time_id>'), then attach
# them to the labelled rows of train.csv.

# time_id is already encoded in row_id. errors='ignore' keeps this cell
# re-runnable once the column has been dropped.
book_features = book_features.drop(columns='time_id', errors='ignore')
trade_features = trade_features.drop(columns='time_id', errors='ignore')
df_features = pd.merge(book_features, trade_features, on='row_id', how='left')
# Rows with book data but no trade data get 0 for every trade feature.
df_features = df_features.fillna(0)

df_train = pd.read_csv(f'{data_dir}/train.csv')
# Vectorised string concatenation replaces the row-wise apply with positional
# row[0] / row[1] access, which is deprecated on label-indexed Series and slow.
df_train['row_id'] = df_train['stock_id'].astype(str) + '-' + df_train['time_id'].astype(str)
df_train = df_train.drop(columns=['stock_id', 'time_id'])

df_train = pd.merge(df_train, df_features, on='row_id', how='left')
df_train.head()

In [None]:
# define indices for 5-fold CV

# Assign every training row to one of 5 cross-validation folds.
SEED = 42  # fixed so that the fold assignment is reproducible across runs

df_train['kfold'] = -1
y = df_train['target'].values

# KFold(shuffle=True) already randomises which rows go into which fold, so the
# former `df_train.sample(frac=1)` call, whose result was discarded anyway,
# is not needed. (`skf` is a plain, non-stratified KFold.)
skf = model_selection.KFold(n_splits=5, shuffle=True, random_state=SEED)

kfold_col = df_train.columns.get_loc('kfold')
for fold, (_, valid_idx) in enumerate(skf.split(X=df_train, y=y)):
    # split() yields positional indices, so assign by position rather than by label.
    df_train.iloc[valid_idx, kfold_col] = fold

df_train.head()

**Train and validate XGBoost using all features and 5-fold CV**

In [None]:
def rmspe(y_true, y_pred):
    """Root mean square percentage error of `y_pred` relative to `y_true`."""
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# Every column except the label, the row identifier and the fold marker is a model input.
all_features = df_train.columns.drop(['target', 'row_id', 'kfold']).tolist()

for fold in range(5):
    # Hold out the current fold for validation and train on the remaining ones.
    is_validation = df_train.kfold == fold
    df_tr = df_train[~is_validation].reset_index(drop=True)
    df_val = df_train[is_validation].reset_index(drop=True)

    x_tr, y_tr = df_tr[all_features].to_numpy(), df_tr['target'].to_numpy()
    x_val, y_val = df_val[all_features].to_numpy(), df_val['target'].to_numpy()

    model = xgb.XGBRegressor(n_estimators=50)
    model.fit(x_tr, y_tr)

    pred = model.predict(x_val)
    r = rmspe(y_val, pred)

    print(f'Fold {fold}, RMSPE:{r}')

**Show feature importance and effect for the last CV fold using a SHAP summary plot**

In [None]:
# `model` and `df_val` are the ones left over from the final CV fold.
val_features = df_val[all_features]

explainer = shap.Explainer(model, val_features)
shap_values = explainer.shap_values(val_features)
shap.summary_plot(
    shap_values,
    val_features,
    title='SHAP XGB summary plot',
    show=False,
)

**Train the XGBoost model using all training data and make submission**

In [None]:
# Build the test features with the same pipeline used for the training features.
book_features_test = process_book_files(f'{data_dir}/book_test.parquet/*', book_raw_features, agg_f)
trade_features_test = process_trade_files(f'{data_dir}/trade_test.parquet/*', trade_raw_features, agg_f)
df_features_test = pd.merge(book_features_test, trade_features_test, on='row_id', how='left')
# Rows with book data but no trade data get 0 for every trade feature.
df_features_test = df_features_test.fillna(0)

# Refit on all training rows, without a held-out fold.
model = xgb.XGBRegressor(n_estimators=50)
model.fit(df_train[all_features].values, df_train['target'].values)

pred = model.predict(df_features_test[all_features].values)

df_features_test['target'] = pred

df_test = pd.read_csv(f'{data_dir}/test.csv')
df_test = pd.merge(df_test, df_features_test, on='row_id', how='left')

# Rows of test.csv without any book data get no prediction from the merge.
# Filling them with 0 means a 100% relative error under RMSPE, so fall back to
# the mean training target for those rows.
df_test['target'] = df_test['target'].fillna(df_train['target'].mean())
# The remaining (feature) columns keep the previous fill value of 0.
df_test = df_test.fillna(0)

df_test[['row_id', 'target']].to_csv('submission.csv', index=False)

In [None]:
# Preview the two columns written to submission.csv.
submission_columns = ['row_id', 'target']
df_test[submission_columns].head()

References:
* https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data,
* https://www.kaggle.com/sohier/working-with-parquet,
* https://www.kaggle.com/konradb/we-need-to-go-deeper