Generate a series of short-term signals from the book and trade data of a fixed 10-minute window to predict the realized volatility of the next 10-minute window

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob

In [None]:
# Data directory: root folder of the competition input files.
# Every parquet/CSV path used below is built by appending to this string,
# so it must keep its trailing slash.
data_dir = '../input/optiver-realized-volatility-prediction/'

# Functions for preprocessing

In [None]:
# First-level weighted average price (WAP)
def calc_wap(df):
    """Return the level-1 WAP of an order book.

    Each side's best price is weighted by the size quoted on the opposite
    side, and the sum is divided by the total level-1 size.
    """
    bid_px, ask_px = df['bid_price1'], df['ask_price1']
    bid_sz, ask_sz = df['bid_size1'], df['ask_size1']
    weighted_prices = bid_px * ask_sz + ask_px * bid_sz
    total_size = bid_sz + ask_sz
    return weighted_prices / total_size

# Second-level weighted average price (WAP)
def calc_wap2(df):
    """Return the level-2 WAP of an order book.

    Same formula as the level-1 WAP, applied to the second-best bid/ask
    prices and sizes.
    """
    bid_px, ask_px = df['bid_price2'], df['ask_price2']
    bid_sz, ask_sz = df['bid_size2'], df['ask_size2']
    weighted_prices = bid_px * ask_sz + ask_px * bid_sz
    total_size = bid_sz + ask_sz
    return weighted_prices / total_size

# Log return between consecutive prices
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    The input must provide ``.diff()`` after ``np.log`` (e.g. a pandas
    Series); the first element of the result is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

# Realized volatility of a series of log returns
def realized_volatility(series):
    """Return the square root of the sum of squared values in ``series``."""
    squared = series ** 2
    return np.sqrt(np.sum(squared))

# Number of distinct values in a series
def count_unique(series):
    """Return how many distinct values ``series`` contains."""
    distinct = np.unique(series)
    return distinct.size

# Book data

In [None]:
# Load the order-book parquet partition of a single stock (stock_id=15)
# so its raw layout can be inspected before writing the preprocessing code.
book_train = pd.read_parquet(data_dir + "book_train.parquet/stock_id=15")

# Book data snapshot (first rows, displayed as the cell output)
book_train.head()

# Functions to preprocess book data

In [None]:
# To preprocess book data (for each stock_id)
def preprocessor_book(file_path):
    """Build one row of order-book features per time_id for a single stock.

    Args:
        file_path: Path of the stock's book parquet partition. The text after
            the last '=' is used as the stock id when building ``row_id``.

    Returns:
        DataFrame with a ``row_id`` column ("<stock_id>-<time_id>") plus the
        aggregates of every derived series over three cumulative windows of
        ``seconds_in_bucket``: the whole bucket (no suffix), rows with
        seconds >= 200 (suffix ``_200``) and rows with seconds >= 400
        (suffix ``_400``). A time_id with no rows in a later window gets NaN
        for that window's columns (left merge).
    """
    df = pd.read_parquet(file_path)

    # Calculate WAP
    df['wap'] = calc_wap(df)
    df['wap2'] = calc_wap2(df)

    # Calculate log return within each time_id.
    # transform() keeps the original row index, so the result can be assigned
    # straight back; groupby.apply() returns a (time_id, row) MultiIndex on
    # pandas >= 2.0 and the assignment then fails.
    df['log_return'] = df.groupby('time_id')['wap'].transform(log_return)
    df['log_return2'] = df.groupby('time_id')['wap2'].transform(log_return)

    # Calculate WAP balance
    df['wap_balance'] = abs(df['wap'] - df['wap2'])

    # Calculate spread
    df['price_spread'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1'])/2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Dict for aggregations. String names select the pandas reducers
    # (NaN-skipping sum/mean, sample std) that np.sum/np.mean/np.std were
    # being mapped to; passing the NumPy callables is deprecated.
    basic_stats = ['sum', 'mean', 'std']
    return_stats = ['sum', realized_volatility, 'mean', 'std']
    create_feature_dict = {
        'wap': basic_stats,
        'wap2': basic_stats,
        'log_return': return_stats,
        'log_return2': return_stats,
        'wap_balance': basic_stats,
        'price_spread': basic_stats,
        'bid_spread': basic_stats,
        'ask_spread': basic_stats,
        'total_volume': basic_stats,
        'volume_imbalance': basic_stats,
    }

    # Group stats over the rows with seconds_in_bucket >= the given start,
    # i.e. cumulative windows (start .. end of bucket), not disjoint slices
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only the rows of the window, then aggregate per time_id
        in_window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
        df_feature = in_window.groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the (column, stat) MultiIndex; time_id becomes time_id_
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate time windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get stats for 3 time windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature_200 = get_stats_window(seconds_in_bucket = 200, add_suffix = True)
    df_feature_400 = get_stats_window(seconds_in_bucket = 400, add_suffix = True)

    # Merge them all
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')

    # Drop the duplicated join keys time_id__200 and time_id__400
    df_feature.drop(['time_id__200', 'time_id__400'], axis = 1, inplace = True)

    # Create row_id to merge book and trade data
    stock_id = file_path.rsplit('=', 1)[-1]
    df_feature['row_id'] = stock_id + '-' + df_feature['time_id_'].astype(str)
    df_feature.drop(['time_id_'], axis = 1, inplace = True)

    return df_feature

In [None]:
%%time
# Smoke test: build (and time) the book features of a single stock (stock_id=0)
file_path = data_dir + "book_train.parquet/stock_id=0"
preprocessor_book(file_path)

# Trade data

In [None]:
# Load the trade parquet partition of a single stock (stock_id=0)
# so its raw layout can be inspected before writing the preprocessing code.
trade_train = pd.read_parquet(data_dir + "trade_train.parquet/stock_id=0")

# Trade data snapshot (first rows, displayed as the cell output)
trade_train.head()

## Functions to preprocess trade data

In [None]:
# To preprocess trade data (for each stock_id)
def preprocessor_trade(file_path):
    """Build one row of trade features per time_id for a single stock.

    Args:
        file_path: Path of the stock's trade parquet partition. The text
            after the last '=' is used as the stock id when building
            ``row_id``.

    Returns:
        DataFrame with a ``row_id`` column ("<stock_id>-<time_id>") plus
        ``trade_``-prefixed aggregates over three cumulative windows of
        ``seconds_in_bucket``: the whole bucket (no suffix), rows with
        seconds >= 200 (suffix ``_200``) and rows with seconds >= 400
        (suffix ``_400``). A time_id with no trades in a later window gets
        NaN for that window's columns (left merge).
    """
    df = pd.read_parquet(file_path)

    # Log return of the trade price within each time_id.
    # transform() keeps the original row index, so the result can be assigned
    # straight back; groupby.apply() returns a (time_id, row) MultiIndex on
    # pandas >= 2.0 and the assignment then fails.
    df['log_return'] = df.groupby('time_id')['price'].transform(log_return)

    # Dict for aggregations. String names select the pandas reducers that
    # np.sum/np.mean were being mapped to; the NumPy callables are deprecated.
    create_feature_dict = {
        'log_return': [realized_volatility],
        'seconds_in_bucket': [count_unique],
        'size': ['sum'],
        'order_count': ['mean'],
    }

    # Group stats over the rows with seconds_in_bucket >= the given start,
    # i.e. cumulative windows (start .. end of bucket), not disjoint slices
    def get_stats_window(seconds_in_bucket, add_suffix = False):
        # Keep only the rows of the window, then aggregate per time_id
        in_window = df[df['seconds_in_bucket'] >= seconds_in_bucket]
        df_feature = in_window.groupby(['time_id']).agg(create_feature_dict).reset_index()
        # Flatten the (column, stat) MultiIndex; time_id becomes time_id_
        df_feature.columns = ['_'.join(col) for col in df_feature.columns]
        # Add a suffix to differentiate time windows
        if add_suffix:
            df_feature = df_feature.add_suffix('_' + str(seconds_in_bucket))
        return df_feature

    # Get stats for 3 time windows
    df_feature = get_stats_window(seconds_in_bucket = 0, add_suffix = False)
    df_feature_200 = get_stats_window(seconds_in_bucket = 200, add_suffix = True)
    df_feature_400 = get_stats_window(seconds_in_bucket = 400, add_suffix = True)

    # Merge them all
    df_feature = df_feature.merge(df_feature_200, how = 'left', left_on = 'time_id_', right_on = 'time_id__200')
    df_feature = df_feature.merge(df_feature_400, how = 'left', left_on = 'time_id_', right_on = 'time_id__400')

    # Drop the duplicated join keys time_id__200 and time_id__400
    df_feature.drop(['time_id__200', 'time_id__400'], axis = 1, inplace = True)

    # Add prefix trade_ so the columns cannot collide with the book features
    df_feature = df_feature.add_prefix('trade_')

    # Create row_id to merge with the book features
    stock_id = file_path.rsplit('=', 1)[-1]
    df_feature['row_id'] = stock_id + '-' + df_feature['trade_time_id_'].astype(str)
    df_feature.drop(['trade_time_id_'], axis = 1, inplace = True)

    return df_feature

In [None]:
%%time
# Smoke test: build (and time) the trade features of a single stock (stock_id=0)
file_path = data_dir + "trade_train.parquet/stock_id=0"
preprocessor_trade(file_path)

# Functions to get group stats for the stock_id and time_id

In [None]:
def get_time_stock(df):
    """Append per-stock and per-time_id statistics of the volatility features.

    ``df`` must contain ``stock_id``, ``time_id`` and the nine realized
    volatility columns listed below. For each of them the mean, std, max and
    min are computed once per stock (columns suffixed ``_stock``) and once per
    time_id (columns suffixed ``_time``) and joined back onto every row.
    A new DataFrame is returned.
    """
    # Realized volatility columns to summarise
    vol_cols = [
        'log_return_realized_volatility',
        'log_return2_realized_volatility',
        'log_return_realized_volatility_200',
        'log_return2_realized_volatility_200',
        'log_return_realized_volatility_400',
        'log_return2_realized_volatility_400',
        'trade_log_return_realized_volatility',
        'trade_log_return_realized_volatility_200',
        'trade_log_return_realized_volatility_400',
    ]
    stat_names = ['mean', 'std', 'max', 'min']

    def _group_stats(key, tag):
        # Aggregate per key, flatten the (column, stat) names, then tag them;
        # the key column itself ends up named "<key>__<tag>"
        grouped = df.groupby([key])[vol_cols].agg(stat_names).reset_index()
        grouped.columns = ['_'.join(pair) for pair in grouped.columns]
        return grouped.add_suffix('_' + tag)

    # Statistics across all time_ids of a stock, and across all stocks of a time_id
    per_stock = _group_stats('stock_id', 'stock')
    per_time = _group_stats('time_id', 'time')

    # Join both summaries back onto the original rows
    merged = df.merge(per_stock, how = 'left', left_on = ['stock_id'], right_on = ['stock_id__stock'])
    merged = merged.merge(per_time, how = 'left', left_on = ['time_id'], right_on = ['time_id__time'])

    # The suffixed join keys duplicate stock_id / time_id
    return merged.drop(['stock_id__stock', 'time_id__time'], axis = 1)

# Combined preprocessor function

In [None]:
# To make preprocessing function in parallel (for each stock_id)
def preprocessor(list_stock_ids, is_train = True):
    """Build the merged book + trade features for many stocks in parallel.

    Args:
        list_stock_ids: Iterable of stock ids to process.
        is_train: Read the ``*_train.parquet`` partitions when True,
            the ``*_test.parquet`` partitions otherwise.

    Returns:
        One DataFrame with the rows of every stock stacked (fresh 0..n-1
        index). Trade features are left-joined onto the book features by
        ``row_id``. An empty id list yields an empty frame that still has a
        ``row_id`` column, so callers can merge on it.
    """
    stock_ids = list(list_stock_ids)
    if not stock_ids:
        # pd.concat([]) would raise; hand back an empty, mergeable frame
        return pd.DataFrame(columns = ['row_id'])

    from joblib import Parallel, delayed # parallel computing to save time

    split = 'train' if is_train else 'test'

    # Work done for one stock in a worker
    def for_joblib(stock_id):
        file_path_book = data_dir + "book_" + split + ".parquet/stock_id=" + str(stock_id)
        file_path_trade = data_dir + "trade_" + split + ".parquet/stock_id=" + str(stock_id)
        # Preprocess book & trade data and merge them; the per-stock frame is
        # returned as is, the stacking happens once after all workers finish
        return pd.merge(preprocessor_book(file_path_book), preprocessor_trade(file_path_trade), on = 'row_id', how = 'left')

    # Use parallel API to call parallel for loop
    frames = Parallel(n_jobs = -1, verbose = 1)(delayed(for_joblib)(stock_id) for stock_id in stock_ids)
    # Concatenate all the DataFrames returned by the workers
    return pd.concat(frames, ignore_index = True)

In [None]:
# Smoke test: run the combined book + trade pipeline on two training stocks
list_stock_ids = [0,1]
preprocessor(list_stock_ids, is_train = True)

# **Change data type to reduce memory usage**

In [None]:
def reduce_mem_usage(df, verbose=True, use_float16=True):
    """Downcast the numeric columns of ``df`` to the smallest dtype that fits.

    Only columns whose dtype is one of int8..int64 / float16..float64 are
    touched. The choice is based on each column's min and max value only.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: Print the resulting memory footprint and the reduction.
        use_float16: Allow float columns to be stored as float16 (the
            historical behaviour). float16 keeps only about 3 significant
            decimal digits, and arithmetic on it overflows above 65504, so
            pass False for columns that are later squared, inverted or used
            as a training target.

    Returns:
        The same ``df`` object, for call chaining.
    """
    numerics = ['int8','int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to a dtype's limit still fits it.
            # If min/max are NaN (empty column) nothing matches and the
            # column is left untouched.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Fall back to float64 when the range fits no smaller type
            # (this includes all-NaN and infinite columns)
            chosen = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    chosen = candidate
                    break
            df[col] = df[col].astype(chosen)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
%%time
# Load the training labels and downcast their numeric columns.
# NOTE(review): reduce_mem_usage stores float columns as float16 whenever
# their range fits, so `target` may end up as float16 here; later
# arithmetic on it (e.g. 1 / target**2) should upcast first.
train = reduce_mem_usage(pd.read_csv(data_dir + 'train.csv'))
#test = reduce_mem_usage(pd.read_csv(data_dir + 'test.csv'))
print("Shape of train set: ",train.shape)
#print("Shape of test set: ",test.shape)

# Training set

In [None]:
# Every distinct stock_id present in the training labels
train_ids = train.stock_id.unique()

In [None]:
%%time
# Build (and time) the book + trade features of all training stocks
df_train = preprocessor(list_stock_ids = train_ids, is_train = True)

In [None]:
# Combine stock_id and time_id into the same "<stock_id>-<time_id>" key
# that the preprocessors write into row_id
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
# Keep only the join key and the label.
# NOTE: this overwrites `train`, so the cell cannot be re-run without reloading train.csv.
train = train[['row_id','target']]

# Left join: every labelled row is kept, features are NaN where none were built
df_train = train.merge(df_train, on = ['row_id'], how = 'left')

In [None]:
# Preview the labelled feature table
df_train.head()

# Test set

In [None]:
# Load the rows to predict; the cells below read its stock_id and row_id columns
test = pd.read_csv(data_dir + 'test.csv')

In [None]:
# Every distinct stock_id present in the test set
test_ids = test.stock_id.unique()

In [None]:
%%time
# Build (and time) the book + trade features of all test stocks
df_test = preprocessor(list_stock_ids = test_ids, is_train = False)

In [None]:
# Left join: every test row is kept, features are NaN where none were built
df_test = test.merge(df_test, on = ['row_id'], how = 'left')

## Target encoding by stock_id

In [None]:
from sklearn.model_selection import KFold
#stock_id target encoding
# Recover stock_id (as a string) from the "<stock_id>-<time_id>" row_id
df_train['stock_id'] = df_train['row_id'].apply(lambda x:x.split('-')[0])
df_test['stock_id'] = df_test['row_id'].apply(lambda x:x.split('-')[0])

# Test rows are encoded with the mean target of their stock over the whole training set;
# a stock that does not occur in training maps to NaN
stock_id_target_mean = df_train.groupby('stock_id')['target'].mean() 
df_test['stock_id_target_enc'] = df_test['stock_id'].map(stock_id_target_mean) # test_set

#training
# Training rows are encoded out-of-fold: each row only sees the stock means
# computed from the other 19 folds, so its own target never enters its encoding
tmp = np.repeat(np.nan, df_train.shape[0])
kf = KFold(n_splits = 20, shuffle = True,random_state = 99)
for idx_1, idx_2 in kf.split(df_train):
    # idx_1: positions used to compute the means, idx_2: positions to encode
    target_mean = df_train.iloc[idx_1].groupby('stock_id')['target'].mean()

    tmp[idx_2] = df_train['stock_id'].iloc[idx_2].map(target_mean)
df_train['stock_id_target_enc'] = tmp

## Model Building

In [None]:
# Preview the training table after adding stock_id and its target encoding
df_train.head()

In [None]:
# Preview the test table after adding stock_id and its target encoding
df_test.head()

# LightGBM

In [None]:
import lightgbm as lgbm
from bayes_opt import BayesianOptimization

In [None]:
# Transform stock_id to integer: it was extracted from row_id as a string,
# and is later passed to LightGBM as a categorical feature
df_train['stock_id'] = df_train['stock_id'].astype(int)
df_test['stock_id'] = df_test['stock_id'].astype(int)

### Cross Validation

In [None]:
# Split features and target: row_id is only a join key, so it is dropped
# from the feature matrix together with the label
X = df_train.drop(['row_id','target'], axis=1)
y = df_train['target']

In [None]:
# Hyper-parameter tuning (Bayesian Optimization)
def bayes_parameter_opt_lgb(X, y, init_round = 15, opt_round = 25, n_folds = 3, random_seed = 6, n_estimators = 10000, output_process = False):
    """Tune LightGBM regression hyper-parameters with Bayesian optimisation.

    Every candidate is scored by the best (lowest) mean cross-validated RMSE
    it reaches. BayesianOptimization maximises its objective, so the score
    handed to it is the negated RMSE.

    Args:
        X: Feature matrix.
        y: Continuous regression target.
        init_round: Number of random exploration steps.
        opt_round: Number of Bayesian optimisation steps.
        n_folds: Number of cross-validation folds per candidate.
        random_seed: Seed passed to ``lgbm.cv``.
        n_estimators: Accepted for backward compatibility; not used.
        output_process: Accepted for backward compatibility; not used.

    Returns:
        Tuple ``(best_score, best_params)``: the highest negated RMSE and the
        raw (un-rounded, float) parameter dict that produced it.
    """
    # prepare data
    train_data = lgbm.Dataset(data = X, label = y, free_raw_data = False)
    # parameters
    def lgb_eval(learning_rate,num_leaves, feature_fraction, bagging_fraction, max_depth, max_bin, min_data_in_leaf,min_sum_hessian_in_leaf,subsample):
        # The target is continuous, so this is a regression problem scored
        # with RMSE (a binary objective / AUC does not apply to it)
        params = {'objective':'regression', 'metric':'rmse'}
        params['learning_rate'] = max(min(learning_rate, 1), 0)
        params["num_leaves"] = int(round(num_leaves))
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['max_depth'] = int(round(max_depth))
        # Use the sampled max_bin (this used to be fed max_depth by mistake)
        params['max_bin'] = int(round(max_bin))
        params['min_data_in_leaf'] = int(round(min_data_in_leaf))
        params['min_sum_hessian_in_leaf'] = min_sum_hessian_in_leaf
        # NOTE(review): LightGBM documents `subsample` as an alias of
        # `bagging_fraction`, so these two dimensions overlap; kept so the
        # returned parameter keys stay the same.
        params['subsample'] = max(min(subsample, 1), 0)

        # Stratified folds need class labels, so they are disabled for regression
        cv_result = lgbm.cv(params, train_data, nfold = n_folds, seed=random_seed, stratified=False, verbose_eval =200, metrics=['rmse'])
        # The result key is 'rmse-mean', optionally prefixed by the dataset name
        rmse_mean = next(values for key, values in cv_result.items() if key.endswith('rmse-mean'))
        # Lower RMSE is better; negate it because the optimiser maximises
        return -min(rmse_mean)

    lgbBO = BayesianOptimization(lgb_eval, {'learning_rate': (0.01, 1.0), # recommended to use smaller learning_rate with larger num_iterations
                                            'num_leaves': (24, 30000), # num_leaves = 2^(max_depth)
                                            'feature_fraction': (0.1, 0.9), # randomly select a subset of features on each iteration (tree) (%) -> speed + overfit
                                            'bagging_fraction': (0.8, 1),
                                            'max_depth': (5, 30),
                                            'max_bin':(20,90),
                                            'min_data_in_leaf': (20, 1000),
                                            'min_sum_hessian_in_leaf':(0,100),
                                           'subsample': (0.01, 1.0)}, random_state=200)


    #n_iter: How many steps of bayesian optimization you want to perform. The more steps the more likely to find a good maximum you are.
    #init_points: How many steps of random exploration you want to perform. Random exploration can help by diversifying the exploration space.

    lgbBO.maximize(init_points = init_round, n_iter = opt_round)

    # return best parameters (first entry with the highest score)
    best = max(lgbBO.res, key = lambda res: res['target'])
    return best['target'], best['params']

# Run the search: 5 random exploration steps + 10 optimisation steps, 3-fold CV each.
# opt_params is the (best score, best parameter dict) tuple returned by the function.
opt_params = bayes_parameter_opt_lgb(X, y, init_round = 5, opt_round = 10, n_folds = 3, random_seed = 6,n_estimators = 10000)

In [None]:
# Optimized parameters
# opt_params is the (best score, best parameter dict) tuple from the search;
# work on a copy of the dict so the search result itself is not mutated
best_params = dict(opt_params[1])
# The optimiser samples floats; these LightGBM parameters must be integers
for int_param in ('num_leaves', 'max_depth', 'min_data_in_leaf', 'max_bin'):
    best_params[int_param] = int(round(best_params[int_param]))
# The target is continuous: use the same regression objective/metric as the
# `params` dict used for training, not a binary/AUC setup
# (`is_unbalance` only applies to classification and is therefore dropped)
best_params['objective'] = 'rmse'
best_params['metric'] = 'rmse'
best_params['boost_from_average'] = False
opt_params = best_params
opt_params

In [None]:
# Root mean squared percentage error (RMSPE)
def rmspe(y_true, y_pred):
    """Return sqrt(mean(((y_true - y_pred) / y_true) ** 2)).

    Errors are relative to ``y_true``, so a zero in ``y_true`` gives a
    division by zero.
    """
    relative_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(relative_error)))

# Custom LightGBM evaluation metric, used for early stopping on RMSPE
def feval_RMSPE(preds, lgbm_train):
    """Score ``preds`` against the labels of a LightGBM dataset.

    Returns the ``(name, value, is_higher_better)`` triple LightGBM expects
    from a custom metric: the RMSPE rounded to 5 decimals, lower is better.
    """
    score = rmspe(y_true = lgbm_train.get_label(), y_pred = preds)
    return 'RMSPE', round(score, 5), False

# Single seed shared by every LightGBM random component below
seed = 42
# Fixed LightGBM parameters used to train the fold models.
# Regression with an RMSE objective; the fold loop supplies 1/target^2 row
# weights and the custom RMSPE metric.
# NOTE(review): the tree-shape values look like the output of an earlier
# tuning run rather than of `opt_params` above, which is not used here — confirm.
params = {
      'objective': 'rmse', 
      'metric': 'rmse', 
      'boosting_type': 'gbdt',
      'learning_rate': 0.01,        
        'num_leaves': 27293,
        'feature_fraction': 0.2812379369053784,
        'min_sum_hessian_in_leaf': 35.74236812873617,
        'bagging_fraction': 0.9895264513703341,
        'max_bin': 50,
        'min_data_in_leaf': 23,
        'max_depth': 24,
        'seed': seed,
        'feature_fraction_seed': seed,
        'bagging_seed': seed,
        'drop_seed': seed,
        'data_random_seed': seed,
        'verbosity': -1,
        'n_jobs': -1
  }

In [None]:
from sklearn.model_selection import KFold
# Create a KFold object (20 shuffled folds, fixed seed for reproducible splits)
kf = KFold(n_splits = 20, random_state = 19901028, shuffle = True)
oof = pd.DataFrame()                 # out-of-fold result (not filled by the cells shown here)
models = []                          # models, one appended per fold
scores = 0.0                         # validation score, accumulated as the mean fold RMSPE

In [None]:
# Iterate through each fold
#%%time
n_folds = kf.get_n_splits()  # used to average the per-fold scores
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):

    print("Fold", fold + 1)
    
    # Create dataset. KFold yields positional indices, so rows are selected
    # with iloc (label-based .loc only matches on a default 0..n-1 index)
    X_train, y_train = X.iloc[trn_idx], y.iloc[trn_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    
    # RMSPE weights: weighting squared errors by 1/y^2 makes the RMSE
    # objective optimise the relative error. The target may have been
    # downcast to float16 by reduce_mem_usage; 1/y^2 exceeds the float16
    # maximum (65504) for y below ~0.0039 and would become inf, so the
    # weights are computed in float64.
    train_weights = 1/np.square(y_train.astype(np.float64))
    lgbm_train = lgbm.Dataset(X_train, y_train, weight = train_weights)

    valid_weights = 1/np.square(y_valid.astype(np.float64))
    lgbm_valid = lgbm.Dataset(X_valid, y_valid, reference = lgbm_train, weight = valid_weights)
    
    # Model 
    model = lgbm.train(params = params,
                      train_set = lgbm_train,
                      valid_sets = [lgbm_train, lgbm_valid],
                      num_boost_round = 10000,
                      early_stopping_rounds = 100, # stop when the validation metric has not improved for 100 rounds
                      feval = feval_RMSPE,
                      verbose_eval = 200, # the eval metrics are printed every 200 boosting rounds
                      categorical_feature = ['stock_id']                
                     )
    
    # validation 
    y_pred = model.predict(X_valid, num_iteration = model.best_iteration)

    RMSPE = round(rmspe(y_true = y_valid, y_pred = y_pred),3)
    print(f'Performance of the　prediction: , RMSPE: {RMSPE}')

    #keep scores and models
    scores += RMSPE / n_folds
    models.append(model)
    print("*" * 100)

In [None]:
# Mean validation RMSPE over the folds (displayed as the cell output)
scores

# Test set

In [None]:
# List the test columns ...
df_test.columns

In [None]:
# ... and the training columns, to compare them by eye before predicting
df_train.columns

In [None]:
# Submission frame starts as the row_id column; the predictions are attached later
y_pred = df_test[['row_id']]
# Drop the columns that are not model features (time_id comes from test.csv, row_id is the key)
X_test = df_test.drop(['time_id', 'row_id'], axis = 1)

In [None]:
# Preview the test feature matrix
X_test

In [None]:
target = np.zeros(len(X_test))

# Feature columns in training order. Taken from the full training matrix X
# rather than from X_valid, which only exists as a leftover of the fold loop.
feature_cols = X.columns

# light gbm models: average the predictions of all fold models,
# each at its early-stopped best iteration
for model in models:
    pred = model.predict(X_test[feature_cols], num_iteration=model.best_iteration)
    target += pred / len(models)

In [None]:
# Attach the averaged predictions to the row_id column
y_pred = y_pred.assign(target = target)

In [None]:
# Preview the submission table
y_pred

In [None]:
# Write the submission file (row_id, target) without the DataFrame index
y_pred.to_csv('submission.csv',index = False)