In [None]:


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from tqdm.notebook import tqdm


pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import probplot

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import glob
import gc

In [None]:
# Explicit column dtypes handed to read_csv for both CSV files, so pandas does
# not fall back to its default 64-bit integer inference for the id columns.
train_test_dtypes = {
    'stock_id': np.uint8,
    'time_id': np.uint16,
    'target': np.float64
}
# Training rows: one target value per (stock_id, time_id).
df_train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv', dtype=train_test_dtypes)
# Only the two id columns are read from the test file.
df_test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv', usecols=['stock_id', 'time_id'], dtype=train_test_dtypes)


In [None]:
df_train.head(5)

In [None]:
df_test.head(5)

We have to predict the target value for each stock at each time_id.
The target value is the realized volatility of the stock.


In [None]:
df_train.describe()

In [None]:
df_train.shape

In [None]:
df_train.info()

In [None]:
#Train csv file contains data of 112 unique stock at different time_id

print(df_train.stock_id.unique(),"Total stocks=",len(df_train.stock_id.unique()))

In [None]:
# Number of time_id's for each stock

df_train.stock_id.value_counts().sort_index()

In [None]:
def visualize_target(target):
    """Summarise one column of `df_train` as text and as two plots.

    Prints descriptive statistics (mean, median, std, quantiles, skew,
    kurtosis and the share of missing values), then draws a KDE with
    mean/median markers next to a probability plot.

    Parameters
    ----------
    target : str
        Name of the `df_train` column to describe.
    """
    # Title line, underlined with dashes of the same length.
    print(f'{target}\n{"-" * len(target)}')

    values = df_train[target]
    print(f'Mean: {values.mean():.4f}  -  Median: {values.median():.4f}  -  Std: {values.std():.4f}')
    print(f'Min: {values.min():.4f}  -  25%: {values.quantile(0.25):.4f}  -  50%: {values.quantile(0.5):.4f}  -  75%: {values.quantile(0.75):.4f}  -  Max: {values.max():.4f}')
    print(f'Skew: {values.skew():.4f}  -  Kurtosis: {values.kurtosis():.4f}')

    n_missing = df_train[values.isnull()].shape[0]
    n_rows = df_train.shape[0]
    print(f'Missing Values: {n_missing}/{n_rows} ({n_missing * 100 / n_rows:.4f}%)')

    # Left panel: density with mean/median markers. Right panel: probability plot.
    fig, (density_ax, prob_ax) = plt.subplots(ncols=2, figsize=(24, 8), dpi=100)
    sns.kdeplot(values, label=target, fill=True, ax=density_ax)
    density_ax.axvline(values.mean(), label=f'{target} Mean', color='r', linewidth=2, linestyle='--')
    density_ax.axvline(values.median(), label=f'{target} Median', color='b', linewidth=2, linestyle='--')
    probplot(values, plot=prob_ax)
    density_ax.legend(prop={'size': 16})

    # Same tick styling on both panels; axis labels are blanked.
    for panel in (density_ax, prob_ax):
        panel.tick_params(axis='x', labelsize=12.5, pad=10)
        panel.tick_params(axis='y', labelsize=12.5, pad=10)
        panel.set_xlabel('')
        panel.set_ylabel('')
    density_ax.set_title(f'{target} Distribution in Training Set', fontsize=20, pad=15)
    prob_ax.set_title(f'{target} Probability Plot', fontsize=20, pad=15)

    plt.show()

In [None]:
visualize_target('target')

In [None]:
# Mean and standard deviation of the target for every stock, ordered by mean
# so the bars grow from bottom to top.
target_means_and_stds = (
    df_train.groupby('stock_id')['target']
    .agg(['mean', 'std'])
    .sort_values(by='mean', ascending=True)
)
bar_positions = np.arange(len(target_means_and_stds))

fig, ax = plt.subplots(figsize=(32, 48))
ax.barh(
    y=bar_positions,
    width=target_means_and_stds['mean'],
    xerr=target_means_and_stds['std'],
    align='center',
    ecolor='black',
    capsize=3
)

# One tick per bar, labelled with the stock id.
ax.set_yticks(bar_positions)
ax.set_yticklabels(target_means_and_stds.index)
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Mean Realized Volatility of Stocks', size=25, pad=20)

plt.show()

# Release the helper objects; they are only needed for this chart.
del target_means_and_stds, bar_positions

Instead of entire stocks, individual time buckets from different stocks are ranked based on their realized volatility. The most volatile 10 time buckets can be seen below. The most volatile time bucket belongs to stock 77 and its time_id is 24600. The most volatile stock was stock 18 and it has 3 time buckets in this list.

In [None]:
# Rank individual (stock_id, time_id) buckets by target and keep the 10 largest.
# The slice is computed once and the labels are built from it, so the full
# frame is sorted a single time and df_train itself is never modified.
most_volatile_buckets = df_train.sort_values(by='target', ascending=True).tail(10)
most_volatile_labels = (
    most_volatile_buckets['stock_id'].astype(str) + '_' + most_volatile_buckets['time_id'].astype(str)
)
most_volatile_positions = np.arange(len(most_volatile_buckets))

fig, ax = plt.subplots(figsize=(32, 10))
ax.barh(
    y=most_volatile_positions,
    width=most_volatile_buckets['target'],
    align='center',
    ecolor='black',
)

# One tick per bar, labelled "<stock_id>_<time_id>".
ax.set_yticks(most_volatile_positions)
ax.set_yticklabels(most_volatile_labels)
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_time_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Top 10 Most Volatile Time Buckets', size=25, pad=20)

plt.show()

The least volatile 10 time buckets are also visualized and they can be seen below. All of the least volatile 10 time buckets belong to stock 31, even though it has an average volatility overall. This could be an anomaly and it must be explored further.

In [None]:
# Rank individual (stock_id, time_id) buckets by target and keep the 10 smallest.
# The slice is computed once and the labels are built from it, so the full
# frame is sorted a single time and df_train itself is never modified.
least_volatile_buckets = df_train.sort_values(by='target', ascending=True).head(10)
least_volatile_labels = (
    least_volatile_buckets['stock_id'].astype(str) + '_' + least_volatile_buckets['time_id'].astype(str)
)
least_volatile_positions = np.arange(len(least_volatile_buckets))

fig, ax = plt.subplots(figsize=(32, 10))
ax.barh(
    y=least_volatile_positions,
    width=least_volatile_buckets['target'],
    align='center',
    ecolor='black',
)

# One tick per bar, labelled "<stock_id>_<time_id>".
ax.set_yticks(least_volatile_positions)
ax.set_yticklabels(least_volatile_labels)
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_time_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Top 10 Least Volatile Time Buckets', size=25, pad=20)

plt.show()

**Book_Train.parquet**

In [None]:
# Book train parquet
#getting values for stock_id=0
book_trainparquet = pd.read_parquet("../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0")
book_trainparquet

Above data is of stock_id 0.

Bid is the price the buyer wants to buy the stock, and Ask is the price the seller wants to sell the stock.

**stock_id**: Stock (which stock) Parquet coerces this column to the categorical data type when loaded; you may wish to convert it to int8.
**time_id**: id of which time information (linked to time_id in submission file)
**seconds_in_bucket**: Number of seconds elapsed since the start of the time_id bucket, counted from 0. Since each bucket presumably covers a 10-minute window, seconds_in_bucket should go up to 600 seconds.
**bid_price1,2**: 1st and 2nd desired bid price of the stock (Normalized prices of the most / second most competitive buy level. )

**ask_price1,2**: Desired selling price of the stock(Normalized prices of the most/second most competitive sell level.)

**bid_size1,2**: The number of shares on the most/second most competitive buy level.

**ask_size1,2**: The number of shares on the most/second most competitive sell level.


In [None]:
#values in train.csv for stock_id=0
df_train.loc[df_train['stock_id'] == 0]

**Trade_train.parquet**

Contains data on trades that actually executed.

In [None]:
#data regarding stock_id 0

trade_example = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0")
trade_example

**price** - The average price of executed transactions happening in one second. Prices have been normalized and the average has been weighted by the number of shares traded in each transaction.
**size** - The total number of shares traded.

**order_count** - The number of unique trade orders taking place.

To calculate realized volatility first we calculate the weighted average price using the formula:

**𝑊𝐴𝑃** =( 𝐵𝑖𝑑𝑃𝑟𝑖𝑐𝑒1 ∗ 𝐴𝑠𝑘𝑆𝑖𝑧𝑒1 + 𝐴𝑠𝑘𝑃𝑟𝑖𝑐𝑒1 ∗ 𝐵𝑖𝑑𝑆𝑖𝑧𝑒1) /( 𝐵𝑖𝑑𝑆𝑖𝑧𝑒1 + 𝐴𝑠𝑘𝑆𝑖𝑧𝑒1 )

Then we calculate the log return of each stock using WAP:

$\huge r_{t-1, t} = \log \left( \frac{S_{t}}{S_{t-1}} \right)$

Using the log return then we calculate the Realized volatility

$\huge \sigma = \sqrt{\sum_{t}r_{t-1, t}^2}$

In [None]:
def WAP1(df):
    """Weighted average price of the first order-book level.

    Each side's price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_price, ask_price = df['bid_price1'], df['ask_price1']
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = bid_price * ask_size + ask_price * bid_size
    total_size = bid_size + ask_size
    return cross_weighted / total_size

def WAP2(df):
    """Weighted average price of the second order-book level.

    Each side's price is weighted by the size quoted on the opposite side:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_price, ask_price = df['bid_price2'], df['ask_price2']
    bid_size, ask_size = df['bid_size2'], df['ask_size2']
    cross_weighted = bid_price * ask_size + ask_price * bid_size
    total_size = bid_size + ask_size
    return cross_weighted / total_size

def log_return(WAP):
    """Return the first difference of log(WAP); the first element is NaN."""
    log_prices = np.log(WAP)
    return log_prices.diff()

def realized_volatility(log_r):
    """Square root of the sum of squared log returns."""
    squared_returns = log_r ** 2
    return np.sqrt(squared_returns.sum())

In [None]:
def read_data(path):
    """Read the parquet data at `path` and return it as a DataFrame."""
    return pd.read_parquet(path)


def consol_book_df(path):
    """Build realized-volatility features from one stock's order-book data.

    Parameters
    ----------
    path : str
        Location of the book parquet data for one stock. The stock id is
        parsed from the text that follows the first "=" in the path.

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) with columns `real_vol_1` and
        `real_vol_2`: the realized volatility of the WAP1 and WAP2 log returns.
    """
    # Read the stock's parquet data.
    df = read_data(path)

    # Add the stock id, taken from the "stock_id=<n>" part of the path.
    df['stock_id'] = int(path.split("=")[1])

    # Weighted average prices of book levels 1 and 2.
    df['WAP1'] = WAP1(df)
    df['WAP2'] = WAP2(df)

    # Log returns are computed within each time_id so no return spans two
    # buckets. transform() returns a result aligned with df's own index;
    # apply() yields a (time_id, row) MultiIndex on pandas >= 2.0, which cannot
    # be assigned back as a column. The first row of each bucket has no
    # predecessor and is filled with 0.
    by_time = df.groupby('time_id')
    df['book_log_ret1'] = by_time['WAP1'].transform(log_return).fillna(0)
    df['book_log_ret2'] = by_time['WAP2'].transform(log_return).fillna(0)

    # Aggregate the per-row log returns into one volatility per bucket.
    final_book = df.groupby(['stock_id', 'time_id']).agg(
        real_vol_1=('book_log_ret1', realized_volatility),
        real_vol_2=('book_log_ret2', realized_volatility),
    ).reset_index()
    return final_book

def consol_trade_df(path):
    """Build the realized-volatility feature from one stock's trade data.

    Parameters
    ----------
    path : str
        Location of the trade parquet data for one stock. The stock id is
        parsed from the text that follows the first "=" in the path.

    Returns
    -------
    pandas.DataFrame
        One row per (time_id, stock_id) with column `real_vol_trade`: the
        realized volatility of the traded-price log returns.
    """
    # Read the stock's parquet data.
    df = read_data(path)

    # Add the stock id, taken from the "stock_id=<n>" part of the path.
    df['stock_id'] = int(path.split("=")[1])

    # Log returns of the traded price within each time_id. transform() keeps
    # the result aligned with df's own index; apply() yields a (time_id, row)
    # MultiIndex on pandas >= 2.0, which cannot be assigned back as a column.
    # The first trade of each bucket has no predecessor and is filled with 0.
    df['trade_log_ret'] = df.groupby('time_id')['price'].transform(log_return).fillna(0)

    # Aggregate the per-row log returns into one volatility per bucket.
    final_trade = df.groupby(['time_id', 'stock_id']).agg(
        real_vol_trade=('trade_log_ret', realized_volatility),
    ).reset_index()

    return final_trade

In [None]:
def create_dataSet(df, book_paths, trade_paths):
    """Combine book features, trade features and `df` for every stock.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with `stock_id` and `time_id` columns that is left-joined onto
        the computed features.
    book_paths, trade_paths : iterable of str
        Per-stock parquet locations. Every path must contain "=<stock_id>";
        the two collections are matched on that id, not on their order.

    Returns
    -------
    pandas.DataFrame
        The per-stock merged frames stacked in ascending stock-id order, or an
        empty DataFrame when no paths are given.

    Raises
    ------
    ValueError
        If the book and trade paths do not cover the same stock ids.
    """
    def _stock_id(path):
        # Same parsing rule as consol_book_df / consol_trade_df.
        return int(path.split("=")[1])

    # Pair the files by stock id: the path lists usually come from two
    # independent directory listings whose order is not guaranteed to match.
    book_by_stock = {_stock_id(path): path for path in book_paths}
    trade_by_stock = {_stock_id(path): path for path in trade_paths}
    if book_by_stock.keys() != trade_by_stock.keys():
        unmatched = sorted(book_by_stock.keys() ^ trade_by_stock.keys())
        raise ValueError(f'book and trade paths cover different stock ids: {unmatched}')

    merged_frames = []
    for stock_id in tqdm(sorted(book_by_stock)):
        book = consol_book_df(book_by_stock[stock_id])
        trade = consol_trade_df(trade_by_stock[stock_id])
        merged_frames.append(
            pd.merge(book, trade, on=['stock_id', 'time_id'], how='left')
            .merge(df, on=['stock_id', 'time_id'], how='left')
        )
        # Free the per-stock intermediates before loading the next file.
        gc.collect()

    if not merged_frames:
        return pd.DataFrame()
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(merged_frames)

In [None]:
# glob returns paths in arbitrary order. Sorting both listings guarantees that
# the i-th book path and the i-th trade path belong to the same stock_id
# partition when create_dataSet walks them together.
order_book_training = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*'))
trade_training = sorted(glob.glob('/kaggle/input/optiver-realized-volatility-prediction/trade_train.parquet/*'))
train_df = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')

train_set = create_dataSet(train_df, order_book_training, trade_training)

In [None]:
train_set

In [None]:
train_set.describe()

In [None]:
#null values in the dataset
train_set.isnull().sum()

In [None]:
# real_vol_trade contains 19 null values, so those 19 rows are dropped.
# Infinite values are mapped to NaN first so that they are dropped as well.
train_set_final = train_set.replace([np.inf, -np.inf], np.nan).dropna()

# Sanity check: no row with a missing value may survive the cleaning step.
assert not train_set_final.isnull().any(axis=1).any()
train_set_final

In [None]:
train_set_final.describe()

In [None]:
train_set_final.to_csv('mycsvfile.csv',index=False)

**VERSION 2** 

DATE:13-09-2021

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

In [None]:
book_train_stock0 = pd.read_parquet("../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0")
book_train_stock0.head(5)

In [None]:
book_train_stock0.head(-5)

In [None]:
# Take an independent copy of the time_id == 5 rows: later cells add columns to
# this frame, and assigning on a filtered slice raises SettingWithCopyWarning.
book_stock0_example = book_train_stock0[book_train_stock0['time_id']==5].copy()

In [None]:
print("total entries for stock 0 at time_id =5 is ",len(book_stock0_example),"\n\n")
book_stock0_example.head()


In [None]:
book_stock0_example.describe()

**Comparing bid_price and ask_price for stock_0 at time_id_5**

In [None]:
plt.figure(figsize=(15,10)) 
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['bid_price1'],label='bid_price1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['bid_price2'],label='bid_price2')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['ask_price1'],label='ask_price1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['ask_price2'],label='ask_price2')

plt.xlabel("seconds_in_bucket")
plt.ylabel("bid_price and ask price")
plt.legend()
plt.title("bid_price and ask price of stock0 at time_id_5 w.r.t seconds in buckets")

In [None]:
bid_price1_mean=book_stock0_example['bid_price1'].mean()
bid_price2_mean=book_stock0_example['bid_price2'].mean()
ask_price1_mean=book_stock0_example['ask_price1'].mean()
ask_price2_mean=book_stock0_example['ask_price2'].mean()


print("bid_price1_mean= ",bid_price1_mean)
print("bid_price2_mean= ",bid_price2_mean)
print("ask_price1_mean= ",ask_price1_mean)
print("ask_price2_mean= ",ask_price2_mean)

**To calculate realized volatility we go through the following procedure:**

    1. calculate wap from the bid price and ask price and their size

    2. calculate the log return of  the wap

    3. then we calculate the realized volatility using log return value

**Calculating the wap**

In [None]:
# Level-1 weighted average price: each side's price is weighted by the size
# quoted on the opposite side, then divided by the total level-1 size.
book_stock0_example['wap1'] = (book_stock0_example['bid_price1'] * book_stock0_example['ask_size1'] +
                                book_stock0_example['ask_price1'] * book_stock0_example['bid_size1']) / (
                                       book_stock0_example['bid_size1']+ book_stock0_example['ask_size1'])

In [None]:

book_stock0_example.head(2)

In [None]:
book_stock0_example.describe()

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
plt.figure(figsize=(12,8)) 
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['wap1'])

plt.xlabel("seconds_in_bucket")
plt.ylabel("WAP1")
plt.title("WAP1 of stock0 at time_id_5 w.r.t seconds in buckets")

In [None]:
# Level-2 weighted average price: each side's price is weighted by the size
# quoted on the opposite side, then divided by the total level-2 size.
book_stock0_example['wap2'] = (book_stock0_example['bid_price2'] * book_stock0_example['ask_size2'] +
                                book_stock0_example['ask_price2'] * book_stock0_example['bid_size2']) / (
                                       book_stock0_example['bid_size2']+ book_stock0_example['ask_size2'])

In [None]:
book_stock0_example.head(2)

In [None]:
plt.figure(figsize=(12,8)) 
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['wap2'])

plt.xlabel("seconds_in_bucket")
plt.ylabel("WAP2")
plt.title("WAP2 of stock0 at time_id_5 w.r.t seconds in buckets")

In [None]:
# Overlay both weighted average prices on one axis for direct comparison.
plt.figure(figsize=(12,8))
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['wap1'],label='WAP1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['wap2'],label='WAP2')
plt.xlabel("seconds_in_bucket")
# The axis carries both series, so the label must not name WAP2 alone.
plt.ylabel("WAP1 and WAP2")
plt.legend()
plt.title("WAP1 and WAP2 of stock0 at time_id_5 w.r.t seconds in buckets")

**Calculating the log return**

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the log prices; the first element is NaN."""
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
book_stock0_example1=book_stock0_example.copy()

In [None]:
# assign() builds a new frame, so no column is written onto a filtered slice.
# The first row has no previous WAP (its log return is NaN) and is removed;
# copy() makes each filtered frame independent so later cells can add columns
# without a SettingWithCopyWarning.
book_stock0_example = book_stock0_example.assign(log_return1=log_return(book_stock0_example['wap1']))
book_stock0_example = book_stock0_example[~book_stock0_example['log_return1'].isnull()].copy()

book_stock0_example1 = book_stock0_example1.assign(log_return2=log_return(book_stock0_example1['wap2']))
book_stock0_example1 = book_stock0_example1[~book_stock0_example1['log_return2'].isnull()].copy()

In [None]:
print("total entries for stock 0 at time_id =5 with log_return1 is ",len(book_stock0_example),"\n\n")
book_stock0_example.head(5)

In [None]:
print("total entries for stock 0 at time_id =5 with log_return2 is ",len(book_stock0_example1),"\n\n")
book_stock0_example1.head(5)

In [None]:
book_stock0_example['log_return2']=book_stock0_example1['log_return2']
book_stock0_example.head()

In [None]:
book_stock0_example.describe()

To calculate the log return we compute the log ratio between two consecutive WAP.

That is why the number of entries in the table decreases by 1

For stock_0 it initially has 302 entries but now has 301

**Comparing log_return values with seconds_in_bucket for stock_0 at time_id_5**

In [None]:
plt.figure(figsize=(12,8)) 
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['log_return1'])

plt.xlabel("seconds_in_bucket")
plt.ylabel("log_return1")
plt.title("log_return1 of stock0 at time_id_5 w.r.t seconds in buckets")

In [None]:
plt.figure(figsize=(12,8)) 
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['log_return2'])

plt.xlabel("seconds_in_bucket")
plt.ylabel("log_return2")
plt.title("log_return2 of stock0 at time_id_5 w.r.t seconds in buckets")

In [None]:
# Both book log-return series on one axis.
plt.figure(figsize=(12,8))
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['log_return1'],label='log_return1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['log_return2'],label='log_return2')
plt.xlabel("seconds_in_bucket")
plt.ylabel("log_return")
plt.title("log_return of stock0 at time_id_5 w.r.t seconds in buckets")
# A single legend call; the earlier default-size legend was immediately replaced by this one.
plt.legend(fontsize=15)

**Calculating the realized volatility**

In [None]:
def realized_volatility(series_log_return):
    """Square root of the sum of squared log returns.

    Re-defines the helper of the same name from the feature-engineering
    section earlier in this notebook.
    """
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))


In [None]:
realized_vol1 = realized_volatility(book_stock0_example['log_return1'])
realized_vol2 = realized_volatility(book_stock0_example['log_return2'])
print("Calculated realized volatility for stock_id 0 on time_id 5 is")
print("Realized_volatiltiy1= ",realized_vol1)
print("Realized_volatiltiy2= ",realized_vol2)


**Trade_train file**

In [None]:
  
trade_train_stock0 = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0")
trade_train_stock0.head(5)

In [None]:
# Take an independent copy of the time_id == 5 rows: a later cell adds a
# log-return column, and assigning on a filtered slice raises SettingWithCopyWarning.
trade_stock0_example = trade_train_stock0[trade_train_stock0['time_id']==5].copy()

In [None]:
print("Number of entries for stock_0 at time_id_5 is= ",len(trade_stock0_example))
trade_stock0_example.head()

In [None]:
trade_stock0_example.describe()

In [None]:
# Traded price over the bucket.
plt.figure(figsize=(10,8))
plt.plot(trade_stock0_example['seconds_in_bucket'],trade_stock0_example['price'],label='price')

plt.xlabel("seconds_in_bucket")
plt.ylabel("traded_price")
plt.title("traded_price of stock0 at time_id_5 w.r.t seconds in buckets")
# One legend call is enough; the duplicate call was removed.
plt.legend()

> **Comparing traded price with bid_price and ask_price from book_train**

In [None]:
# Traded price against the two bid levels of the order book.
plt.figure(figsize=(12,8))
plt.plot(trade_stock0_example['seconds_in_bucket'],trade_stock0_example['price'],label='price')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['bid_price1'],label='bid_price1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['bid_price2'],label='bid_price2')
plt.xlabel("seconds_in_bucket")
plt.ylabel("traded_price and bid_price")
plt.title("traded_price and bid_price of stock0 at time_id_5 w.r.t seconds in buckets")
# One legend call is enough; the duplicate call was removed.
plt.legend()

In [None]:
# Traded price against the two ask levels of the order book.
plt.figure(figsize=(12,8))
plt.plot(trade_stock0_example['seconds_in_bucket'],trade_stock0_example['price'],label='price')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['ask_price1'],label='ask_price1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['ask_price2'],label='ask_price2')
plt.xlabel("seconds_in_bucket")
plt.ylabel("traded_price and ask_price")
plt.title("traded_price and ask_price of stock0 at time_id_5 w.r.t seconds in buckets")
# One legend call is enough; the duplicate call was removed.
plt.legend()

In [None]:
# Traded price against all four quoted book prices.
plt.figure(figsize=(15,10))
plt.plot(trade_stock0_example['seconds_in_bucket'],trade_stock0_example['price'],label='price')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['bid_price1'],label='bid_price1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['bid_price2'],label='bid_price2')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['ask_price1'],label='ask_price1')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['ask_price2'],label='ask_price2')
plt.xlabel("seconds_in_bucket")
plt.ylabel("traded_price,bid_price and ask_price")
plt.title("traded_price , bid_price and ask_price of stock0 at time_id_5 w.r.t seconds in buckets")
# One legend call is enough; the duplicate call was removed.
plt.legend()

Trade_train file contains data regarding the trade that has been executed

Here the price column indicates the price at which stock_0 was traded at time_id_5

We will be calculating the realized volatility using this price

Using the price we will calculate the log return 

Then calculate the realized volatility

**Calculating the log_return in trade_train**

In [None]:
trade_stock0_example.loc[:,'log_return1'] = log_return(trade_stock0_example['price'])

In [None]:
trade_stock0_example.head(2)

In [None]:
trade_stock0_example = trade_stock0_example[~trade_stock0_example['log_return1'].isnull()]
trade_stock0_example.head(2)

In [None]:
trade_stock0_example.describe()

In [None]:
# Book log returns (both levels) against the trade log returns.
plt.figure(figsize=(15,10))

plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['log_return1'],label='log_return1 of Book_train')
plt.plot(book_stock0_example['seconds_in_bucket'],book_stock0_example['log_return2'],label='log_return2 of Book_train')
plt.plot(trade_stock0_example['seconds_in_bucket'],trade_stock0_example['log_return1'],label='log_return of Trade_train')
plt.xlabel("seconds_in_bucket")
plt.ylabel("log_return of price in trade_train and book_train")
plt.title("log_return of price in trade_train and book_train of stock0 at time_id_5 w.r.t seconds in buckets")
# One legend call is enough; the duplicate call was removed.
plt.legend()

In [None]:
realized_vol_trade = realized_volatility(trade_stock0_example['log_return1'])
print("Calculated realized volatility for stock_id 0 on time_id 5 from trade_train_file is= ",realized_vol_trade)