For EDA on the dataset, refer to:
https://www.kaggle.com/ravinderkotwal/optiver-volatility-prediction-eda2

Here I will be working on features

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import glob
from tqdm.notebook import tqdm
import gc

First, let's look into **train.csv and test.csv**

In [None]:
# Load the training labels from train.csv and preview the first rows.
df_train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
df_train.head()

Train file has three columns.

1.stock_id: Id of the stock

2.time_id: Id of the time bucket

3.target: Realized volatility of the next 10 minute window under the same stock_id/time_id

Target value is given for different time_id for various stocks

In [None]:
# Read test.csv and preview the first rows.

df_test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
df_test.head()

The test file contains three columns

1 stock_id: Id of the stock

2 time_id: Id of the time

3 row_id: combined stock_id and time_id joined with a hyphen (-)

Let's take a look at how to submit the test results

In [None]:
# Load the sample submission file and preview the first rows.
submission = pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')
submission.head()

So for the submission we will use two columns

One is the row_id from the test file

And the other is the target value we have predicted for that row_id i.e. stock_id-time_id(stock_id at particular time_id)

**Functions for calculating realized volatility**

To understand the functions below and why they are used, refer to the notebook mentioned in the first cell.


**To calculate realized volatility we go through the following procedure:**



Calculate the **Weighted Average Price (WAP)** from the **bid price**, the **ask price** and **their sizes**. The WAP is used as the fixed price.

    
**𝑊𝐴𝑃** =( 𝐵𝑖𝑑𝑃𝑟𝑖𝑐𝑒1 ∗ 𝐴𝑠𝑘𝑆𝑖𝑧𝑒1 + 𝐴𝑠𝑘𝑃𝑟𝑖𝑐𝑒1 ∗ 𝐵𝑖𝑑𝑆𝑖𝑧𝑒1) /( 𝐵𝑖𝑑𝑆𝑖𝑧𝑒1 + 𝐴𝑠𝑘𝑆𝑖𝑧𝑒1 )
      
    
 Similarly, using the above formula we can calculate **WAP2 from bid_price2, ask_price2 and their sizes.**
    

 Then we calculate the **log return value of the WAP**
    
 $\huge r_{t-1, t} = \log \left( \frac{S_t}{S_{t-1}} \right)$
 
 where $S_t$ is the fixed price at time $t$. In book_train, the calculated WAP is used as the fixed price.
        
        
        
 Then we calculate the **realized volatility using the log return values**
 
  
 $\huge \sigma = \sqrt{\sum_{t}r_{t-1, t}^2}$   
 
 

       



In [None]:
def WAP1(df):
    """Return the level-1 weighted average price for every row of ``df``.

    Each price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    depth = bid_size + ask_size
    return cross_weighted / depth

def WAP2(df):
    """Return the level-2 weighted average price for every row of ``df``.

    Each price is weighted by the size quoted on the opposite side:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_size, ask_size = df['bid_size2'], df['ask_size2']
    cross_weighted = df['bid_price2'] * ask_size + df['ask_price2'] * bid_size
    depth = bid_size + ask_size
    return cross_weighted / depth

def log_return(WAP):
    """Return the difference between consecutive natural logs of ``WAP``.

    The first element has no predecessor, so its value is NaN.
    """
    log_price = np.log(WAP)
    return log_price.diff()

def realized_volatility(log_r):
    """Return the square root of the sum of squared log returns."""
    squared_returns = log_r ** 2
    total = squared_returns.sum()
    return np.sqrt(total)

**Functions for reading data**

In [None]:
def read_data(path):
    """Load the parquet data found at ``path`` and return it as a DataFrame."""
    return pd.read_parquet(path)


def consol_book_df(path):
    """Aggregate one stock's order-book data into per-time_id features.

    Args:
        path: Location of the stock's book parquet data. The text after the
            first "=" in the path must be the integer stock id.

    Returns:
        DataFrame with one row per (stock_id, time_id) and the columns:
            real_vol_1: realized volatility calculated from WAP1
            real_vol_2: realized volatility calculated from WAP2
            price_spread1: mean relative spread between ask_price1 and bid_price1
            price_spread2: mean relative spread between ask_price2 and bid_price2
            bid_spread: mean of bid_price1 - bid_price2
            ask_spread: mean of ask_price1 - ask_price2
            bid_ask_price_ratio1: mean of bid_price1 / ask_price1
            bid_ask_price_ratio2: mean of bid_price2 / ask_price2
            total_volume: sum of all four size columns over the time_id
            volume_imbalance: mean absolute difference between the total ask
                size and the total bid size
    """
    # read stock pq file
    df = read_data(path)

    # add stock-id column
    df['stock_id'] = int(path.split("=")[1])  # extract stock id by removing directory

    # Calculating WAP
    df['WAP1'] = WAP1(df)
    df['WAP2'] = WAP2(df)

    # Log returns are computed inside each time_id so that no return spans two
    # time buckets. transform() yields a result aligned to df's own row index;
    # groupby().apply() prepends the group key to the index in pandas >= 2.0,
    # which cannot be assigned back as a column.
    # The first row of every time_id has no previous price, hence fillna(0).
    df['book_log_ret1'] = df.groupby('time_id')['WAP1'].transform(log_return).fillna(0)
    df['book_log_ret2'] = df.groupby('time_id')['WAP2'].transform(log_return).fillna(0)

    # calculating spread
    # As explained in the dataset description, the difference between the bid
    # and the ask value (the spread) is correlated with how volatile the stock
    # is: the bigger the spread, the more volatile the stock.
    df['price_spread1'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df['bid_ask_price_ratio1'] = df['bid_price1'] / df['ask_price1']
    df['bid_ask_price_ratio2'] = df['bid_price2'] / df['ask_price2']

    total_ask_size = df['ask_size1'] + df['ask_size2']
    total_bid_size = df['bid_size1'] + df['bid_size2']
    df['total_volume'] = total_ask_size + total_bid_size
    df['volume_imbalance'] = abs(total_ask_size - total_bid_size)

    # Book features: collapse the row-level values to one row per time_id.
    final_book = df.groupby(['stock_id', 'time_id']).agg(
        real_vol_1=('book_log_ret1', realized_volatility),
        real_vol_2=('book_log_ret2', realized_volatility),
        price_spread1=('price_spread1', 'mean'),
        price_spread2=('price_spread2', 'mean'),
        bid_spread=('bid_spread', 'mean'),
        ask_spread=('ask_spread', 'mean'),
        bid_ask_price_ratio1=('bid_ask_price_ratio1', 'mean'),
        bid_ask_price_ratio2=('bid_ask_price_ratio2', 'mean'),
        total_volume=('total_volume', 'sum'),
        volume_imbalance=('volume_imbalance', 'mean'),
    ).reset_index()
    return final_book



# consol_trade_df works on the trade_train data.
# It returns the realized volatility calculated from the price column of the trade data.

def consol_trade_df(path):
    """Aggregate one stock's trade data into a per-time_id realized volatility.

    Args:
        path: Location of the stock's trade parquet data. The text after the
            first "=" in the path must be the integer stock id.

    Returns:
        DataFrame with the columns time_id, stock_id and real_vol_trade, where
        real_vol_trade is the realized volatility of the log returns of the
        ``price`` column within each time_id.
    """
    # read stock pq file
    df = read_data(path)

    # add stock-id column
    df['stock_id'] = int(path.split("=")[1])  # extract stock id by removing directory

    # Trade log return from the traded price, computed inside each time_id.
    # transform() yields a result aligned to df's own row index;
    # groupby().apply() prepends the group key to the index in pandas >= 2.0,
    # which cannot be assigned back as a column.
    # The first trade of every time_id has no previous price, hence fillna(0).
    df['trade_log_ret'] = df.groupby('time_id')['price'].transform(log_return).fillna(0)

    # Trade features
    final_trade = df.groupby(['time_id', 'stock_id']).agg(
        real_vol_trade=('trade_log_ret', realized_volatility),
    ).reset_index()

    return final_trade

In [None]:
# Function to combine and get features from train file, book_train file and trade_train file

def create_dataSet(df, book_paths, trade_paths):
    """Build the feature table for every stock that has book and trade data.

    Args:
        df: Frame with 'stock_id' and 'time_id' columns (for example the
            contents of train.csv); its remaining columns are left-joined
            onto the features.
        book_paths: Iterable of per-stock order-book parquet paths.
        trade_paths: Iterable of per-stock trade parquet paths.

    Returns:
        DataFrame holding, for each stock, the book features left-joined with
        the trade features and with ``df`` on (stock_id, time_id). Stocks are
        processed in ascending stock-id order; a stock that appears in only
        one of the two path lists is skipped. An empty DataFrame is returned
        when there is nothing to process.
    """
    def _stock_id(path):
        # Same extraction rule the per-stock readers use.
        return int(path.split("=")[1])

    # Pair the files by stock id rather than by list position: the two lists
    # usually come from separate glob calls, whose ordering is not guaranteed
    # to match.
    book_by_stock = {_stock_id(p): p for p in book_paths}
    trade_by_stock = {_stock_id(p): p for p in trade_paths}
    shared_ids = sorted(book_by_stock.keys() & trade_by_stock.keys())

    frames = []
    for stock in tqdm(shared_ids):
        book = consol_book_df(book_by_stock[stock])
        trade = consol_trade_df(trade_by_stock[stock])
        merged_df = (pd.merge(book, trade, on=['stock_id', 'time_id'], how='left')
                     .merge(df, on=['stock_id', 'time_id'], how='left'))
        frames.append(merged_df)
        # Free the per-stock intermediates before loading the next stock.
        gc.collect()

    if not frames:
        return pd.DataFrame()
    # A single concat at the end avoids re-copying the accumulated rows on
    # every iteration.
    return pd.concat(frames)

In [None]:
# Reading the data path for each stock.
# glob returns its matches in arbitrary order, so both lists are sorted to keep
# the book path and the trade path of the same stock at the same position.
order_book_training = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'))
trade_training = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*'))
train_df = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')


# passing the file_path for each stock to create_dataSet function
train_set = create_dataSet(train_df, order_book_training, trade_training)

In [None]:
# Preview the first rows of the combined feature table.
train_set.head()

In [None]:
# Summary statistics of every column of the combined feature table.
train_set.describe()

In [None]:
# Count the null values in each column of the dataset.
train_set.isnull().sum()

In [None]:
# handling null values

# real_vol_trade contains null values (19 rows when this notebook was run).
# Infinite values are first turned into NaN so that dropna() removes every row
# holding a missing or infinite value in any column.

train_set_final = train_set.replace([np.inf, -np.inf], np.nan).dropna()
train_set_final

In [None]:
# Summary statistics after the rows with null values were dropped.
train_set_final.describe()

In [None]:
# Preview the first rows of the cleaned feature table.
train_set_final.head()

In [None]:
# Correlation of every column with the target variable.

train_set_final.corr()['target']

In [None]:
# Annotated heatmap of the full correlation matrix, drawn on a 15x15 figure.
fig, ax = plt.subplots(figsize=(15,15))
sns.heatmap(train_set_final.corr(),annot=True,ax=ax)

**Plotting target vs other features**

In [None]:
# Plotting bid_ask_price_ratio1 against target
sns.scatterplot(x=train_set_final['bid_ask_price_ratio1'],y=train_set_final['target'])

In [None]:
# Plotting bid_ask_price_ratio2 against target
sns.scatterplot(x=train_set_final['bid_ask_price_ratio2'],y=train_set_final['target'])

In [None]:
# Plotting total_volume against target
sns.scatterplot(x=train_set_final['total_volume'],y=train_set_final['target'])

In [None]:
# Plotting volume_imbalance against target
sns.scatterplot(x=train_set_final['volume_imbalance'],y=train_set_final['target'])

In [None]:
# Plotting real_vol_1 against target
sns.scatterplot(x=train_set_final['real_vol_1'],y=train_set_final['target'])

In [None]:
# Plotting real_vol_2 against target
sns.scatterplot(x=train_set_final['real_vol_2'],y=train_set_final['target'])

In [None]:
# Plotting real_vol_trade against target
sns.scatterplot(x=train_set_final['real_vol_trade'],y=train_set_final['target'])

In [None]:
# Plotting price_spread1 against target
sns.scatterplot(x=train_set_final['price_spread1'],y=train_set_final['target'])

In [None]:
# Plotting price_spread2 against target
sns.scatterplot(x=train_set_final['price_spread2'],y=train_set_final['target'])

In [None]:
# Plotting bid_spread against target

sns.scatterplot(x=train_set_final['bid_spread'],y=train_set_final['target'])

In [None]:
# Plotting ask_spread against target
sns.scatterplot(x=train_set_final['ask_spread'],y=train_set_final['target'])

From the above plots we can see that all the features are correlated with the target variable.
All features are positively correlated except ask_spread. Ask_spread is negatively correlated with the target value.

The predicted values are evaluated with two metrics:

    RMSPE: Root Mean Square Percentage Error
    
    R squared

In [None]:
from sklearn.metrics import r2_score
def rmspe_R_squared(y_true, y_pred):
    """Return ``(RMSPE, R squared)`` for predictions against true values.

    Args:
        y_true: Array-like of true target values.
        y_pred: Array-like of predicted values, in the same order as y_true.

    Returns:
        Tuple ``(rmspe, r2)`` where rmspe is the root mean square percentage
        error and r2 is the value returned by ``r2_score``.

    Note:
        The percentage error divides by y_true, so a zero in y_true makes the
        RMSPE infinite or NaN.
    """
    # Work on plain arrays so the two inputs are compared by position. Two
    # pandas Series would otherwise be subtracted by index label, which can
    # pair up different rows than r2_score sees; plain lists would not support
    # the arithmetic at all.
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    rmspe = np.sqrt(np.mean(np.square((actual - predicted) / actual)))
    return (rmspe, r2_score(y_true, y_pred))