In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import os
import glob

from joblib import Parallel, delayed

import lightgbm as lgb
from sklearn import preprocessing, model_selection
from sklearn.metrics import r2_score


In [None]:
# Root directory of the competition data: the book/trade parquet folders and the
# train/test CSV files are all read relative to this path.
data_path = '../input/optiver-realized-volatility-prediction/'

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the given prices.

    The input must still expose ``.diff()`` after ``np.log`` is applied
    (e.g. a pandas Series); the first element of the result is NaN because it
    has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def rmspe(y_true,y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    Each error is divided element-wise by ``y_true``; no guard is applied for
    targets equal to zero.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

def avg_tick_size(ticks):
    """Sum of absolute tick values divided by the number of non-zero ticks.

    No guard is applied when every tick is zero (the divisor is then 0).
    """
    total_movement = abs(ticks).sum()
    moving_ticks = np.count_nonzero(ticks)
    return total_movement / moving_ticks

def mad(series):
    """Median absolute deviation: median of ``|x - median(x)|``."""
    center = np.median(series)
    deviations = np.absolute(series - center)
    return np.median(deviations)


In [None]:
def _bucket_stat(frame, columns, func, names):
    """Aggregate ``columns`` of ``frame`` per (stock_id, time_id) bucket.

    Args:
        frame: DataFrame holding ``stock_id``, ``time_id`` and ``columns``.
        columns: list of column names to aggregate.
        func: aggregation handed to ``GroupBy.agg`` (a name such as 'std' or a callable).
        names: output names for the aggregated columns, aligned with ``columns``.

    Returns:
        Flat DataFrame with ``stock_id``, ``time_id`` and the renamed aggregated columns.
    """
    # Select with a list: indexing a groupby with a bare tuple of names
    # (gb['a', 'b']) is rejected by pandas >= 2.0.
    grouped = frame.groupby(by = ['stock_id', 'time_id'])[columns]
    return grouped.agg(func).reset_index().rename(columns = dict(zip(columns, names)))


def get_features(stock_id:int, dataType = 'train'):
    """Build per-(stock_id, time_id) order-book and trade features for one stock.

    Args:
        stock_id: id used to locate the ``stock_id=<id>`` parquet partitions
            under ``data_path``; also written into the ``stock_id`` column.
        dataType: dataset prefix used in the parquet folder names
            (``book_<dataType>.parquet`` / ``trade_<dataType>.parquet``).

    Returns:
        DataFrame with one row per (stock_id, time_id) present in the book
        data: book statistics left-joined with trade statistics, so buckets
        without trades carry NaN in the trade columns.
    """
    # Load data
    book_df = pd.read_parquet(os.path.join(data_path,'book_{}.parquet/stock_id={}'.format(dataType,stock_id)))
    trade_df =  pd.read_parquet(os.path.join(data_path,'trade_{}.parquet/stock_id={}'.format(dataType, stock_id)))

    #############################
    #### ORDER BOOK FEATURES ####
    #############################

    #### Preprocessing ####

    book_df['stock_id'] = stock_id

    # WAPs: combined over both levels, then each level on its own
    book_df['wap'] = (book_df['bid_price1'] * book_df['ask_size1'] +
                      book_df['ask_price1'] * book_df['bid_size1'] +
                      book_df['bid_price2'] * book_df['ask_size2'] +
                      book_df['ask_price2'] * book_df['bid_size2']) / (book_df['bid_size1'] + book_df['ask_size1'] + book_df['bid_size2'] + book_df['ask_size2'])

    book_df['wap1'] = (book_df['bid_price1'] * book_df['ask_size1'] +
                       book_df['ask_price1'] * book_df['bid_size1']) / (book_df['bid_size1'] + book_df['ask_size1'])

    book_df['wap2'] = (book_df['bid_price2'] * book_df['ask_size2'] +
                       book_df['ask_price2'] * book_df['bid_size2']) / (book_df['bid_size2'] + book_df['ask_size2'])

    wap_columns = ('wap', 'wap1', 'wap2')

    # Log returns within each time_id. transform() keeps the original row index
    # on every pandas version, whereas groupby.apply() prepends the group key
    # on pandas >= 2.0 and can then no longer be assigned back as a column.
    for col in wap_columns:
        book_df['log_return_' + col] = book_df.groupby(by = ['time_id'])[col].transform(log_return).fillna(0)

    # Duration until the next book update of the same time_id. The last update
    # of a bucket lasts until second 600. Computed per time_id so a bucket
    # boundary is never mistaken for a regular step.
    # Assumes rows are ordered by seconds_in_bucket inside each time_id - TODO confirm.
    seconds = book_df['seconds_in_bucket']
    next_seconds = book_df.groupby(by = ['time_id'])['seconds_in_bucket'].shift(-1)
    book_df['duration'] = (next_seconds - seconds).astype(float).fillna(600 - seconds)

    # Tick size: absolute WAP change versus the previous update of the same time_id
    for col in wap_columns:
        book_df['tick_size_' + col] = book_df.groupby(by = ['time_id'])[col].diff().abs().fillna(0)

    # Spreads
    book_df['spread1'] = (book_df['ask_price1'] - book_df['bid_price1'])
    book_df['spread2'] = (book_df['ask_price2'] - book_df['bid_price2'])

    # Time-weighted spreads - to sum
    book_df['spread1_duration'] = book_df['spread1'] * book_df['duration'] / 600
    book_df['spread2_duration'] = book_df['spread2'] * book_df['duration'] / 600

    # Slopes - to sum
    # Mid price is the average of best bid and best ask (ask - bid is the
    # spread, already stored as spread1).
    book_df['mid_price'] = (book_df['ask_price1'] + book_df['bid_price1']) / 2
    for side in ('bid', 'ask'):
        price1 = book_df[side + '_price1']
        price2 = book_df[side + '_price2']
        log_size1 = np.log(book_df[side + '_size1'])
        log_size2 = np.log(book_df[side + '_size2'])
        # Both sides use absolute relative price distances so they are treated alike.
        level1 = log_size1 / abs(price1 / book_df['mid_price'] - 1)
        level2 = (log_size2 / log_size1 - 1) / abs(price2 / price1 - 1)
        book_df[side + '_slope'] = (level1 + level2) * book_df['duration'] / 1200

    # Dispersion
    book_df['dispersion'] = (book_df['bid_size1']*(book_df['bid_price1'] - book_df['bid_price2']) + book_df['ask_size1']*(book_df['ask_price2'] - book_df['ask_price1'])) * book_df['duration'] / 1200

    # NOTE: duration, tick sizes, spreads, slopes and dispersion are computed
    # per row but not aggregated into the returned frame yet.

    #### Feature eng ####

    book_stat = _bucket_stat(book_df, ['log_return_wap', 'log_return_wap1', 'log_return_wap2'],
                             realized_volatility, ['vol_wap', 'vol_wap1', 'vol_wap2'])

    book_stats = [
        _bucket_stat(book_df, ['wap', 'wap1', 'wap2'], 'std', ['wap_std', 'wap1_std', 'wap2_std']),
        _bucket_stat(book_df, ['wap', 'wap1', 'wap2'], mad, ['wap_mad', 'wap1_mad', 'wap2_mad']),
        _bucket_stat(book_df, ['bid_price1', 'bid_price2'], 'std', ['bid1_std', 'bid2_std']),
        _bucket_stat(book_df, ['bid_price1', 'bid_price2'], mad, ['bid1_mad', 'bid2_mad']),
        _bucket_stat(book_df, ['ask_price1', 'ask_price2'], 'std', ['ask1_std', 'ask2_std']),
        _bucket_stat(book_df, ['ask_price1', 'ask_price2'], mad, ['ask1_mad', 'ask2_mad']),
        _bucket_stat(book_df, ['ask_price2'], 'max', ['high']),
        _bucket_stat(book_df, ['bid_price2'], 'min', ['low']),
    ]

    # Merging
    for df in book_stats:
        book_stat = book_stat.merge(df, on=['stock_id', 'time_id'], how='left')

    book_stat['range'] = book_stat['high'] - book_stat['low']

    #############################
    #### ORDER FLOW FEATURES ####
    #############################

    trade_df['stock_id'] = stock_id

    # Log return of traded prices within each time_id
    trade_df['log_return_price'] = trade_df.groupby(by = ['time_id'])['price'].transform(log_return).fillna(0)

    #### Feature eng ###
    trade_stat = _bucket_stat(trade_df, ['log_return_price'], realized_volatility, ['vol_price'])

    price_std_stat = _bucket_stat(trade_df, ['price'], 'std', ['price_std'])

    # NOTE(review): price_std_stat is merged twice, which makes pandas emit the
    # suffixed columns 'price_std_x' and 'price_std_y'. This looks accidental,
    # but the training feature list refers to exactly those two names, so the
    # duplicate is kept to leave the returned schema unchanged.
    trade_stats = [
        price_std_stat,
        price_std_stat,
        _bucket_stat(trade_df, ['price'], mad, ['price_mad']),
        _bucket_stat(trade_df, ['size'], 'std', ['size_std']),
        _bucket_stat(trade_df, ['size'], mad, ['size_mad']),
        _bucket_stat(trade_df, ['order_count'], 'std', ['ordercount_std']),
        _bucket_stat(trade_df, ['order_count'], mad, ['ordercount_mad']),
        _bucket_stat(trade_df, ['order_count'], 'max', ['ordercount_max']),
    ]

    # Merging
    for df in trade_stats:
        trade_stat = trade_stat.merge(df, on=['stock_id', 'time_id'], how='left')

    stock_stat = book_stat.merge(trade_stat, on=['stock_id','time_id'], how='left')

    return stock_stat


In [None]:
def get_dataset(stock_ids: list, dataType = 'train'):
    """Compute ``get_features`` for every stock in parallel and stack the results.

    Args:
        stock_ids: iterable of stock ids; one ``get_features`` call is issued per id.
        dataType: forwarded unchanged to ``get_features``.

    Returns:
        A single DataFrame with the per-stock frames concatenated and the
        index renumbered.
    """
    # One delayed task per stock; n_jobs=-1 asks joblib to use all available cores.
    tasks = (delayed(get_features)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(tasks)

    return pd.concat(per_stock_frames, ignore_index = True)

# Training model

In [None]:
train_df = pd.read_csv(os.path.join(data_path, 'train.csv'))
%time train_stock_stat_df = get_dataset(stock_ids = train_df['stock_id'].unique(), dataType = 'train')
train_df = pd.merge(train_df, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
print('Train shape: {}'.format(train_df.shape))

test_df = pd.read_csv(os.path.join(data_path, 'test.csv'))
test_stock_stat_df = get_dataset(stock_ids = test_df['stock_id'].unique(), dataType = 'test')
test_df = pd.merge(test_df, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left').fillna(0)
print('Test shape: {}'.format(test_df.shape))

In [None]:
# Snapshot the merged frames before the following cells modify train_df / test_df.
train_backup, test_backup = train_df.copy(), test_df.copy()

In [None]:
# Store stock_id as a pandas categorical column in both frames.
train_df['stock_id'], test_df['stock_id'] = (
    train_df['stock_id'].astype('category'),
    test_df['stock_id'].astype('category'),
)

In [None]:
# Cross-validation / boosting budget.
n_folds = 4
n_rounds = 5000

# Model inputs. 'price_std_x' / 'price_std_y' are the suffixed names pandas
# produces because get_features merges the price std frame twice.
features = ['vol_wap', 'vol_wap1', 'vol_wap2', 'wap_std',
       'wap1_std', 'wap2_std', 'wap_mad', 'wap1_mad', 'wap2_mad', 'bid1_std',
       'bid2_std', 'bid1_mad', 'bid2_mad', 'ask1_std', 'ask2_std', 'ask1_mad',
       'ask2_mad', 'high', 'low', 'range', 'vol_price', 'price_std_x',
       'price_std_y', 'price_mad', 'size_std', 'size_mad', 'ordercount_std',
       'ordercount_mad', 'ordercount_max']
# NOTE(review): stock_id is declared categorical here but is not part of
# `features`, so it is not fed to the model as written.
cat_features = ['stock_id']

# Out-of-fold prediction column. Initialised as float (0.0, not 0) because
# float predictions are later written into it with .loc; writing floats into
# an int64 column is deprecated in pandas >= 2.1.
train_df['pred'] = 0.0

params_lgbm = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'objective': 'regression',
        # No built-in metric: evaluation relies on the custom feval passed to lgb.train.
        'metric': 'None',
        'max_depth': -1,
        'n_jobs': -1,
        'feature_fraction': 0.7,
        # NOTE(review): per the LightGBM docs bagging_fraction only takes effect
        # when bagging_freq > 0, which is commented out below - confirm intent.
        'bagging_fraction': 0.7,
        'lambda_l2': 1,
        'verbose': -1
        #'bagging_freq': 5
}

def feval_RMSPE(preds, train_data):
    """Custom LightGBM evaluation function reporting RMSPE.

    Args:
        preds: predictions for the dataset being evaluated.
        train_data: dataset object exposing ``get_label()``.

    Returns:
        Tuple ``('RMSPE', score rounded to 5 decimals, False)``; in LightGBM's
        feval convention the trailing False marks the metric as lower-is-better.
    """
    y_true = train_data.get_label()
    score = rmspe(y_true = y_true, y_pred = preds)
    return 'RMSPE', round(score, 5), False

In [None]:
# K-fold training: each fold fits on the training split, scores the held-out
# split with RMSPE, stores out-of-fold predictions in train_df['pred'] and adds
# its clipped test predictions to test_df['target'] (a running SUM over folds).
kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=11)
scores_folds = []
test_df['target'] = 0

# The early_stopping_rounds / verbose_eval keyword arguments were removed from
# lgb.train in LightGBM 4.0; the callbacks below are the equivalent form.
# log_evaluation was called print_evaluation before LightGBM 3.3.
log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

# The fold indices are positional and are used as labels with .loc, which is
# valid because train_df carries a default 0..n-1 index after the merge.
for counter, (train_ind, test_ind) in enumerate(kf.split(range(len(train_df))), start=1):
    x_train = train_df.loc[train_ind, features]
    y_train = train_df.loc[train_ind, 'target'].values
    x_test = train_df.loc[test_ind, features]
    y_test = train_df.loc[test_ind, 'target'].values

    # Weights of 1 / target^2 are applied to every sample of both datasets.
    train_data = lgb.Dataset(x_train, label=y_train, weight=1/np.power(y_train,2))
    test_data = lgb.Dataset(x_test, label=y_test, weight=1/np.power(y_test,2))

    model = lgb.train(
        params_lgbm,
        train_data,
        n_rounds,
        valid_sets=[test_data],
        feval=feval_RMSPE,
        callbacks=[lgb.early_stopping(500), log_evaluation(250)],
    )

    # x_test is the same slice the validation Dataset was built from.
    preds = model.predict(x_test)
    train_df.loc[test_ind,'pred'] = preds
    score = round(rmspe(y_true = y_test, y_pred = preds),5)
    print('Fold {} : {}'.format(counter, score))
    scores_folds.append(score)
    test_df['target'] += model.predict(test_df[features]).clip(0,1e10)

del train_data, test_data



In [None]:
# Gain-based feature importances of `model`, i.e. the booster left over from
# the last fold of the training loop.
importances = pd.DataFrame({
    'Feature': model.feature_name(),
    'Importance': model.feature_importance(importance_type='gain'),
})
importances = importances.sort_values(by = 'Importance')

# Keep at most the 50 largest importances, ordered ascending so the largest bar ends up on top.
importances2 = importances.nlargest(50,'Importance', keep='first').sort_values(by='Importance', ascending=True)

ax = importances2[['Importance', 'Feature']].plot(kind = 'barh', x = 'Feature', figsize = (8,6), color = 'blue', fontsize=11)
ax.set_ylabel('Feature', fontsize=12)

In [None]:
# test_df['target'] holds the SUM of the per-fold predictions; the submission
# needs their mean. The mean goes into a separate frame instead of overwriting
# test_df['target'], so re-running this cell cannot divide by n_folds twice.
submission_df = test_df[['row_id']].assign(target = test_df['target'] / n_folds)
submission_df.to_csv('submission.csv',index = False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=5b95cd06-73d7-4903-b3de-e60226cfba1f' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>