In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

The objective of the competition is to predict the volatility (degree of variation) of the trading price of different stocks.

In the dataset we are given following files:

1.Train.csv , Test.csv

2.Book_train , Book_test

3.Trade_train , Trade_test

First let's look into **train.csv**

In [None]:

# Load the training labels from the competition input directory and preview the first rows.
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

train.csv file contains three columns

1.**stock_id**: Id of the stock

2.**time_id**: Id of the time bucket

3.**target**: Realized volatility of the next 10 minute window under the same stock_id/time_id

In [None]:
# Report the (rows, columns) shape of the training table.
print('Training Set Shape: ',train.shape)

In [None]:
# Number of distinct stock_id values present in train.

print("Total stocks=",len(train.stock_id.unique()))

In [None]:
# Number of rows (i.e. time_id entries) per stock, ordered by stock_id.
train.stock_id.value_counts().sort_index()

In [None]:
# Summary statistics for the columns of train.
train.describe()

In [None]:
# Mean of the target column (realized volatility) over the whole training set.

print("Mean value of volatility is= ",train.target.mean())

Plotting target value frequency in train file and its mean and median

In [None]:
# Distribution of the target, with its mean and median marked as dashed vertical lines.
target_mean = train['target'].mean()
target_median = train['target'].median()
print(" Mean of target= ",target_mean)
print(" Median of target= ",target_median)


plt.figure(figsize=(10, 8))
sns.histplot(train['target'], kde=True)
for position, colour, name in ((target_mean, 'r', "Mean"), (target_median, 'g', "Median")):
    plt.axvline(position, color=colour, linestyle='--', label=name)

plt.legend()



**Calculating mean target value for each stock**

In [None]:
# Mean target per stock. With as_index=False the groupby result is already a DataFrame
# with `stock_id` and `target` columns, so no pd.DataFrame(...) wrapper is needed.
stock_id_mean_target = train.groupby('stock_id', as_index=False)['target'].mean()

In [None]:

# Preview the per-stock mean target table.
stock_id_mean_target.head()

In [None]:
# Report the stocks with the highest / lowest mean target, then plot mean target against stock_id.
mean_target = stock_id_mean_target['target']
# Series.max()/.min() are vectorised and skip NaN, unlike the Python built-ins max()/min().
highest_mean_rows = stock_id_mean_target[mean_target == mean_target.max()]
lowest_mean_rows = stock_id_mean_target[mean_target == mean_target.min()]
print("Stock_id with highest volatile value=\n\n",highest_mean_rows.to_string(index=False),"\n")
print("Stock_id with lowest volatile value=\n\n",lowest_mean_rows.to_string(index=False),"\n")
plt.figure(figsize=(10, 8))
plt.plot(stock_id_mean_target.stock_id,stock_id_mean_target.target)
plt.xlabel("stock_id")
plt.ylabel("mean target value")


**Visualizing stock mean target value in sorted order**

In [None]:
# Order the per-stock means from least to most volatile (ascending mean target).

stock_id_mean_target_sorted = stock_id_mean_target.sort_values('target')
stock_id_mean_target_sorted.head()

In [None]:
# Horizontal bar chart of the mean realized volatility of every stock.
# Bars follow the row order of the ascending-sorted frame and are labelled with stock_id.

bar_positions = np.arange(len(stock_id_mean_target_sorted))

fig, ax = plt.subplots(figsize=(32, 48))
ax.barh(
    y=bar_positions,
    width=stock_id_mean_target_sorted['target'],
    align='center',
    ecolor='black',
    capsize=3,
)

ax.set_yticks(bar_positions)
ax.set_yticklabels(stock_id_mean_target_sorted.stock_id)
ax.set_title('Mean Realized Volatility of Stocks', size=25, pad=20)
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_id', size=20, labelpad=15)
for axis_name in ('x', 'y'):
    ax.tick_params(axis=axis_name, labelsize=20, pad=10)

plt.show()

**Visualizing most volatile time_id w.r.t stock_id**

In [None]:
# Plot the 10 individual (stock_id, time_id) rows with the largest target.
# The label column and the ranking are built once on a derived frame, so `train` is left
# untouched and the full table is sorted only a single time.

top10_most_volatile = (
    train
    .assign(stock_time_id=train['stock_id'].astype(str) + '_' + train['time_id'].astype(str))
    .sort_values(by='target', ascending=True)
    .tail(10)
)

fig, ax = plt.subplots(figsize=(32, 10))
ax.barh(
    y=np.arange(10),
    width=top10_most_volatile['target'],
    align='center',
    ecolor='black',
)

ax.set_yticks(np.arange(10))
ax.set_yticklabels(top10_most_volatile['stock_time_id'])
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_time_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Top 10 Most Volatile Time Buckets', size=25, pad=20)

plt.show()

**Visualizing least volatile time_id w.r.t stock_id**

In [None]:
# Plot the 10 individual (stock_id, time_id) rows with the smallest target.
# Bar widths and tick labels are both taken from the same 10 rows (the head of the
# ascending sort), so every bar is labelled with the bucket it actually represents.
# The helper label column lives on a derived frame, so `train` is left untouched.

top10_least_volatile = (
    train
    .assign(stock_time_id=train['stock_id'].astype(str) + '_' + train['time_id'].astype(str))
    .sort_values(by='target', ascending=True)
    .head(10)
)

fig, ax = plt.subplots(figsize=(32, 10))
ax.barh(
    y=np.arange(10),
    width=top10_least_volatile['target'],
    align='center',
    ecolor='black',
)

ax.set_yticks(np.arange(10))
ax.set_yticklabels(top10_least_volatile['stock_time_id'])
ax.set_xlabel('target', size=20, labelpad=15)
ax.set_ylabel('stock_time_id', size=20, labelpad=15)
ax.tick_params(axis='x', labelsize=20, pad=10)
ax.tick_params(axis='y', labelsize=20, pad=10)
ax.set_title('Top 10 Least Volatile Time Buckets', size=25, pad=20)

plt.show()

**Now lets see the test file**

In [None]:
# Load test.csv from the competition input directory and preview the first rows.
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')
test.head()

The test file contains three columns

1 stock_id: Id of the stock

2 time_id: Id of the time

3 row_id: stock_id and time_id joined with a hyphen (-)

**Let's take a look at how to submit the test results**

In [None]:
# Load the sample submission file to see the expected output format.
submission = pd.read_csv('../input/optiver-realized-volatility-prediction/sample_submission.csv')
submission.head()

So for the submission we will use two columns

One is the row_id from the test file

And the other is the target value we have predicted for that row_id i.e. stock_id-time_id(stock_id at particular time_id)

Now, let's take a look at the **book_train.parquet** file

**book_train.parquet is an order book**

An order book is a list of buy and sell orders. It also contains the number of shares being bid on or offered.

book_train.parquet contains data for stocks available in train.csv

Here for exploratory analysis we are using stock_0 as example

In [None]:
# Read only the stock_id=0 partition of the book_train parquet dataset and preview it.
book_train_stock0 = pd.read_parquet("../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0")
book_train_stock0.head(5)

There are 10 columns in each book data for every stock.

1.  time_id - ID of the time bucket
2.  seconds_in_bucket - Number of seconds passed since the start of the bucket
3.  bid_price1 - Highest buy price after normalization
4.  ask_price1 - Lowest sell price after normalization
5.  bid_price2 - Second highest buy price after normalization
6.  ask_price2 - Second lowest sell price after normalization
7.  bid_size1 - Number of shares on the highest buy price
8.  ask_size1 - Number of shares on the lowest sell price
9.  bid_size2 - Number of shares on the second highest buy price
10. ask_size2 - Number of shares on the second lowest sell price


**For every time bucket the order book data covers a 10-minute window, and the target we predict for each stock_id/time_id pair is the realized volatility over the following 10 minutes.**

Also, **seconds_in_bucket** is the number of seconds passed since the start of the bucket, so its **maximum value** is bounded by the 10-minute window, i.e. **600 seconds**

In [None]:
# Summary statistics of the order book columns for stock 0 (all time_ids).
book_train_stock0.describe()

There are **917553 rows** of data for stock_0.

For analysis lets break the data and analyse it for time_id=5

In [None]:
# Order-book rows of stock 0 for time_id == 5.
# .copy() makes this an independent frame, so the columns assigned to it later in the
# notebook (wap1, wap2, log returns) do not trigger pandas' SettingWithCopyWarning.

book_stock0__time_id5 = book_train_stock0[book_train_stock0['time_id']==5].copy()

In [None]:
# Print the number of book rows for stock 0 at time_id 5 and preview them.
print("total entries for stock 0 at time_id =5 is ",len(book_stock0__time_id5),"\n\n")
book_stock0__time_id5.head()

In [None]:
# Summary statistics of the book columns restricted to stock 0 / time_id 5.
book_stock0__time_id5.describe()

**Plotting the columns of book_train for stock0 at time_id5 w.r.t their count**

In [None]:


# One histogram per column of the stock 0 / time_id 5 book data.
book_stock0__time_id5.hist(figsize=(20,12))
plt.show()

**Comparing bid_price and ask_price for stock_0 at time_id_5**

In [None]:
# Level-1 bid vs. ask for stock 0 / time_id 5: price on the x-axis, seconds_in_bucket on the
# y-axis, with the mean of each series drawn as a dashed vertical line.
bid_price1_mean = book_stock0__time_id5['bid_price1'].mean()
ask_price1_mean = book_stock0__time_id5['ask_price1'].mean()
print("Difference between the mean value of ask_price1 and buy_price1= " ,ask_price1_mean-bid_price1_mean,"\n\n\n")

plt.figure(figsize=(15,10))
for column in ('bid_price1', 'ask_price1'):
    plt.plot(book_stock0__time_id5[column], book_stock0__time_id5['seconds_in_bucket'], label=column)

plt.axvline(bid_price1_mean, color='r', linestyle='--', label="Mean of bid_price1")
plt.axvline(ask_price1_mean, color='g', linestyle='--', label="Mean of ask_price1")

plt.xlabel("bid_price and ask price")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("bid_price1 and ask_price1 of stock0 at time_id_5 w.r.t seconds in buckets")

From the above plot we can see that for stock_0 at time_id 5, throughout the bucket, the **ask_price1 value is greater than the bid_price1**

Also visible through the plot of mean values.

In [None]:
# Level-2 bid vs. ask for stock 0 / time_id 5: price on the x-axis, seconds_in_bucket on the
# y-axis, with the mean of each series drawn as a dashed vertical line.

bid_price2_mean = book_stock0__time_id5['bid_price2'].mean()
ask_price2_mean = book_stock0__time_id5['ask_price2'].mean()
print("Difference between the mean value of ask_price2 and buy_price2= " ,ask_price2_mean-bid_price2_mean,"\n\n\n")

plt.figure(figsize=(15,10))
for column in ('bid_price2', 'ask_price2'):
    plt.plot(book_stock0__time_id5[column], book_stock0__time_id5['seconds_in_bucket'], label=column)

plt.axvline(bid_price2_mean, color='r', linestyle='--', label="Mean of bid_price2")
plt.axvline(ask_price2_mean, color='g', linestyle='--', label="Mean of ask_price2")

plt.xlabel("bid_price and ask price")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("bid_price2 and ask_price2 of stock0 at time_id_5 w.r.t seconds in buckets")

Similarly, **ask_price2 is greater than bid_price2**

Comparing **bid_price1** and **bid_price2**

In [None]:
# Best bid vs. second-best bid for stock 0 / time_id 5, with the mean of each series
# drawn as a dashed vertical line.
bid_price1_mean = book_stock0__time_id5['bid_price1'].mean()
bid_price2_mean = book_stock0__time_id5['bid_price2'].mean()
print("Difference between the mean value of bid_price1 and bid_price2= " ,bid_price1_mean-bid_price2_mean,"\n\n\n")

plt.figure(figsize=(15,10))
for column in ('bid_price1', 'bid_price2'):
    plt.plot(book_stock0__time_id5[column], book_stock0__time_id5['seconds_in_bucket'], label=column)

plt.axvline(bid_price1_mean, color='r', linestyle='--', label="Mean of bid_price1")
plt.axvline(bid_price2_mean, color='g', linestyle='--', label="Mean of bid_price2")

plt.xlabel("bid_price1 and bid_price2")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("bid_price1 and bid_price2 of stock0 at time_id_5 w.r.t seconds in buckets")

Comparing **ask_price1** and **ask_price2**

In [None]:
# Best ask vs. second-best ask for stock 0 / time_id 5, with the mean of each series
# drawn as a dashed vertical line.
ask_price1_mean = book_stock0__time_id5['ask_price1'].mean()
ask_price2_mean = book_stock0__time_id5['ask_price2'].mean()
print("Difference between the mean value of ask_price1 and ask_price2= " ,ask_price1_mean-ask_price2_mean,"\n\n\n")

plt.figure(figsize=(15,10))
for column in ('ask_price1', 'ask_price2'):
    plt.plot(book_stock0__time_id5[column], book_stock0__time_id5['seconds_in_bucket'], label=column)

plt.axvline(ask_price1_mean, color='r', linestyle='--', label="Mean of ask_price1")
plt.axvline(ask_price2_mean, color='g', linestyle='--', label="Mean of ask_price2")

plt.xlabel("ask_price1 and ask_price2")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("ask_price1 and ask_price2 of stock0 at time_id_5 w.r.t seconds in buckets")

Plotting both values of bid_price and ask_price

In [None]:
# Overlay both bid and both ask levels of stock 0 / time_id 5, with the mean of each series
# drawn as a dashed vertical line. Each legend label names the series whose mean it marks.
plt.figure(figsize=(15,10))
plt.plot(book_stock0__time_id5['bid_price1'],book_stock0__time_id5['seconds_in_bucket'],label='bid_price1')
plt.plot(book_stock0__time_id5['bid_price2'],book_stock0__time_id5['seconds_in_bucket'],label='bid_price2')
plt.plot(book_stock0__time_id5['ask_price1'],book_stock0__time_id5['seconds_in_bucket'],label='ask_price1')
plt.plot(book_stock0__time_id5['ask_price2'],book_stock0__time_id5['seconds_in_bucket'],label='ask_price2')

plt.axvline(book_stock0__time_id5['bid_price1'].mean(), color='r', linestyle='--', label="Mean of bid_price1")
plt.axvline(book_stock0__time_id5['ask_price1'].mean(), color='g', linestyle='--', label="Mean of ask_price1")
plt.axvline(book_stock0__time_id5['bid_price2'].mean(), color='k', linestyle='--', label="Mean of bid_price2")
plt.axvline(book_stock0__time_id5['ask_price2'].mean(), color='c', linestyle='--', label="Mean of ask_price2")

plt.ylabel("seconds_in_bucket")
plt.xlabel("bid_price and ask price")
plt.legend()
plt.title("bid_price and ask_price of stock0 at time_id_5 w.r.t seconds in buckets")

Now, using the features in book_train, we can calculate the realized volatility for that 10-minute window.

**To calculate realized volatility we go through the following procedure:**



Calculate **Weighted Averaged price(WAP)** from the **bid price** and **ask price** and **their size**. WAP is a fixed price.

    
**𝑊𝐴𝑃** =( 𝐵𝑖𝑑𝑃𝑟𝑖𝑐𝑒1 ∗ 𝐴𝑠𝑘𝑆𝑖𝑧𝑒1 + 𝐴𝑠𝑘𝑃𝑟𝑖𝑐𝑒1 ∗ 𝐵𝑖𝑑𝑆𝑖𝑧𝑒1) /( 𝐵𝑖𝑑𝑆𝑖𝑧𝑒1 + 𝐴𝑠𝑘𝑆𝑖𝑧𝑒1 )
      
    
 Similarly, using the above formula we can calculate **WAP2 from bid_price2, ask_price2 and their sizes.**
    

 Then we calculate the **log return value of the WAP**
    
 $\huge r_{t-1, t} = \log \left( \frac{S_t}{S_{t-1}} \right)$
 
 where $S_t$ is the price of the stock at time $t$. For book_train, the WAP computed above is used as this price.
        
        
        
 Then we calculate the **realized volatility from the log returns**
 
  
 $\huge \sigma = \sqrt{\sum_{t}r_{t-1, t}^2}$   
 
 

       

**Calculating the wap for stock0 at time_id5 using the above formula of WAP**

In [None]:
# Level-1 weighted average price: each side's price is weighted by the size quoted on the
# opposite side, then divided by the total level-1 size.
bid1_weighted = book_stock0__time_id5['bid_price1'] * book_stock0__time_id5['ask_size1']
ask1_weighted = book_stock0__time_id5['ask_price1'] * book_stock0__time_id5['bid_size1']
level1_size = book_stock0__time_id5['bid_size1'] + book_stock0__time_id5['ask_size1']
book_stock0__time_id5['wap1'] = (bid1_weighted + ask1_weighted) / level1_size

In [None]:

# Preview the frame with the new wap1 column.
book_stock0__time_id5.head(2)

In [None]:
# Summary statistics, now including wap1.
book_stock0__time_id5.describe()

In [None]:
# Plot WAP1 for stock 0 at time_id 5.
# WAP1 is on the x-axis and seconds_in_bucket on the y-axis, so the axis labels follow that
# orientation (the vertical dashed line marks the mean WAP1).

plt.figure(figsize=(12,8))
plt.plot(book_stock0__time_id5['wap1'],book_stock0__time_id5['seconds_in_bucket'])
plt.axvline(book_stock0__time_id5['wap1'].mean(), color='r', linestyle='--', label="Mean of WAP1")
plt.xlabel("WAP1")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("WAP1 of stock0 at time_id_5 w.r.t seconds in buckets")

**Calculating WAP2 for stock0 at time_id5 for bid_price2 and ask_price2**

In [None]:
# Level-2 weighted average price: same formula as wap1, applied to the second-level
# prices and sizes.
bid2_weighted = book_stock0__time_id5['bid_price2'] * book_stock0__time_id5['ask_size2']
ask2_weighted = book_stock0__time_id5['ask_price2'] * book_stock0__time_id5['bid_size2']
level2_size = book_stock0__time_id5['bid_size2'] + book_stock0__time_id5['ask_size2']
book_stock0__time_id5['wap2'] = (bid2_weighted + ask2_weighted) / level2_size

In [None]:
# Preview the frame with the new wap2 column.
book_stock0__time_id5.head(2)

In [None]:
# Plot WAP2 for stock 0 at time_id 5.
# WAP2 is on the x-axis and seconds_in_bucket on the y-axis, so the axis labels follow that
# orientation (the vertical dashed line marks the mean WAP2).

plt.figure(figsize=(12,8))
plt.plot(book_stock0__time_id5['wap2'],book_stock0__time_id5['seconds_in_bucket'])
plt.axvline(book_stock0__time_id5['wap2'].mean(), color='r', linestyle='--', label="Mean of WAP2")
plt.xlabel("WAP2")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("WAP2 of stock0 at time_id_5 w.r.t seconds in buckets")

**Comparing WAP1 and WAP2**

In [None]:
# Compare WAP1 and WAP2 for stock 0 at time_id 5.
# Both series are drawn with the price on the x-axis and seconds_in_bucket on the y-axis;
# each line carries its own legend label and the axis labels match that orientation.
print("Difference in mean valuse of WAP1 and WAP2= ",book_stock0__time_id5['wap1'].mean()-book_stock0__time_id5['wap2'].mean(),"\n\n\n")

plt.figure(figsize=(12,8))
plt.plot(book_stock0__time_id5['wap1'],book_stock0__time_id5['seconds_in_bucket'],label="WAP1")
plt.plot(book_stock0__time_id5['wap2'],book_stock0__time_id5['seconds_in_bucket'],label="WAP2")
plt.axvline(book_stock0__time_id5['wap1'].mean(), color='r', linestyle='--', label="Mean of WAP1")
plt.axvline(book_stock0__time_id5['wap2'].mean(), color='g', linestyle='--', label="Mean of WAP2")
plt.xlabel("WAP")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("WAP of stock0 at time_id_5 w.r.t seconds in buckets")

**Calculating the log return**

In [None]:
# Helper that turns a price series into log returns.

def log_return(list_stock_prices):
    """Return the log return between each price and the one before it.

    Takes the natural log of every price and then the difference between
    consecutive entries, i.e. log(p_t) - log(p_{t-1}).

    Parameters
    ----------
    list_stock_prices : pandas.Series
        Prices in time order. The result of ``np.log`` must provide ``.diff()``.

    Returns
    -------
    pandas.Series
        Same length and index as the input; the first entry is NaN because it
        has no previous price.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

In [None]:
# Keep a separate copy for the log_return2 computation: each log-return series starts with a
# NaN row that gets dropped, so the wap1 and wap2 returns are computed on their own frames
# and combined afterwards.
book_stock0__time_id5_1=book_stock0__time_id5.copy()

In [None]:
# Log returns of WAP1 and WAP2 for stock 0 / time_id 5.
# The first row of each series has no previous WAP, so its log return is NaN and the row is
# dropped. .copy() after filtering yields independent frames, so assigning further columns
# to them later does not trigger pandas' SettingWithCopyWarning.

#calculating log_return1 from WAP1
book_stock0__time_id5.loc[:,'log_return1'] = log_return(book_stock0__time_id5['wap1'])
book_stock0__time_id5 = book_stock0__time_id5[~book_stock0__time_id5['log_return1'].isnull()].copy()


#calculating log_return2 from WAP2
book_stock0__time_id5_1.loc[:,'log_return2'] = log_return(book_stock0__time_id5_1['wap2'])
book_stock0__time_id5_1 = book_stock0__time_id5_1[~book_stock0__time_id5_1['log_return2'].isnull()].copy()

In [None]:
# Row count after the NaN log_return1 row was removed, plus a preview.
print("total entries for stock 0 at time_id =5 with log_return1 is ",len(book_stock0__time_id5),"\n\n")
book_stock0__time_id5.head(5)

In [None]:
# Row count of the second frame after the NaN log_return2 row was removed, plus a preview.
print("total entries for stock 0 at time_id =5 with log_return2 is ",len(book_stock0__time_id5_1),"\n\n")
book_stock0__time_id5_1.head(5)

In [None]:
# Add the log_return2 column to the main frame.
# Column assignment from a Series aligns on the row index; both frames come from the same
# original rows, so each value lands on its matching row.

book_stock0__time_id5['log_return2']=book_stock0__time_id5_1['log_return2']
book_stock0__time_id5.head()

In [None]:
# Summary statistics, now including both log-return columns.
book_stock0__time_id5.describe()

To calculate the log return we compute the log ratio between two consecutive WAP.

That is why the number of entries in the table decreases by 1

For stock_0 at time_id 5 there were initially 302 entries; after dropping the first row (which has no previous WAP) there are 301

**Visualizing and Comparing log_return values with seconds_in_bucket for stock_0 at time_id_5**

In [None]:
# log_return1 (x-axis) against seconds_in_bucket (y-axis); the dashed line marks its mean.
log_return1_mean = book_stock0__time_id5['log_return1'].mean()
plt.figure(figsize=(12,8))
plt.plot(book_stock0__time_id5['log_return1'], book_stock0__time_id5['seconds_in_bucket'])
plt.axvline(log_return1_mean, color='r', linestyle='--', label="Mean of log_return1")
plt.xlabel("log_return1")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("log_return1 of stock0 at time_id_5 w.r.t seconds in buckets")

In [None]:
# log_return2 (x-axis) against seconds_in_bucket (y-axis); the dashed line marks its mean.
log_return2_mean = book_stock0__time_id5['log_return2'].mean()
plt.figure(figsize=(12,8))
plt.plot(book_stock0__time_id5['log_return2'], book_stock0__time_id5['seconds_in_bucket'])
plt.axvline(log_return2_mean, color='g', linestyle='--', label="Mean of log_return2")
plt.xlabel("log_return2")
plt.ylabel("seconds_in_bucket")
plt.legend()
plt.title("log_return2 of stock0 at time_id_5 w.r.t seconds in buckets")

In [None]:
# Plot log_return1 and log_return2 together for stock 0 at time_id 5.
# The log returns are on the x-axis and seconds_in_bucket on the y-axis, so the axis labels
# follow that orientation. The legend is created once, with the larger font size.

print("Mean of log_return1=  ",book_stock0__time_id5.log_return1.mean(),"\nMean of log_return2=  ",book_stock0__time_id5.log_return2.mean(),"\n\n")

plt.figure(figsize=(12,8))
plt.plot(book_stock0__time_id5['log_return1'],book_stock0__time_id5['seconds_in_bucket'],label='log_return1')
plt.plot(book_stock0__time_id5['log_return2'],book_stock0__time_id5['seconds_in_bucket'],label='log_return2')
plt.axvline(book_stock0__time_id5['log_return1'].mean(), color='r', linestyle='--', label="Mean of log_return1")
plt.axvline(book_stock0__time_id5['log_return2'].mean(), color='g', linestyle='--', label="Mean of log_return2")
plt.xlabel("log_return")
plt.ylabel("seconds_in_bucket")
plt.title("log_return of stock0 at time_id_5 w.r.t seconds in buckets")
plt.legend(fontsize=15)

**Calculating the realized volatility**

In [None]:
# Helper that turns a series of log returns into a single realized-volatility figure.

def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns.

    Parameters
    ----------
    series_log_return : pandas.Series or numpy.ndarray
        Log returns of one time bucket.

    Returns
    -------
    float
        sqrt(sum(r_t ** 2)) over all entries of ``series_log_return``.
    """
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# Realized volatility of stock 0 / time_id 5, computed once per WAP-based log-return series.

realized_vol1, realized_vol2 = (
    realized_volatility(book_stock0__time_id5[column])
    for column in ('log_return1', 'log_return2')
)
print("Calculated realized volatility for stock_id 0 on time_id 5 is")
print("Realized_volatiltiy1= ",realized_vol1)
print("Realized_volatiltiy2= ",realized_vol2)

**Now let's take a look at the Trade_train file**

The file contains trade data for every stock in the train file

Here, for analysis, we are using the trade data of stock0

In [None]:
  
# Read only the stock_id=0 partition of the trade_train parquet dataset and preview it.
trade_train_stock0 = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=0")
trade_train_stock0.head(5)

Trade data contains the data of trades that are executed.

There are 5 columns in every trade data partition. The columns are:

* time_id - ID of the time bucket

* seconds_in_bucket - Number of seconds passed since the start of the bucket

* price - Weighted average price of all executed trades happening in one second

* size - Total number of traded shares happening in one second

* order_count - Number of unique trade orders happening in one second

In [None]:
# Summary statistics of the trade columns for stock 0 (all time_ids).
trade_train_stock0.describe()

From the code below we can see that every time_id of stock0 in the train file also has entries in the trade file

In [None]:
# Number of unique time_id values for stock 0 in the train file.
train[train.stock_id==0]['time_id'].nunique()

In [None]:
# Number of unique time_id values for stock 0 in the trade data.
trade_train_stock0['time_id'].nunique()

Comparing the 'seconds_in_bucket' column in trade data with book data for stock0 at time_id=5

In [None]:
# Number of distinct seconds_in_bucket values in the book data of stock 0 / time_id 5.
# NOTE: at this point the frame no longer contains its first row, which was dropped
# together with the NaN log return.
book_stock0__time_id5['seconds_in_bucket'].nunique()

In [None]:
# Number of distinct seconds_in_bucket values in the trade data of stock 0 / time_id 5.

trade_train_stock0[trade_train_stock0.time_id==5]['seconds_in_bucket'].nunique()

The number of values of seconds_in_bucket in book data and trade data differ.

**This implies that many order book updates did not result in an executed trade: a second appears in the trade file only when a trade was actually executed in it.**

For stock0 trade_train file contains 123443 rows.

For analysis, let's look into the data for only time_id=5

In [None]:
# Trades of stock 0 for time_id == 5.
# .copy() makes this an independent frame, so the log_return1 column assigned to it later in
# the notebook does not trigger pandas' SettingWithCopyWarning.
trade_stock0_example = trade_train_stock0[trade_train_stock0['time_id']==5].copy()

In [None]:
# Print the number of trade rows for stock 0 at time_id 5 and preview them.
print("Number of entries for stock_0 at time_id_5 is= ",len(trade_stock0_example))
trade_stock0_example.head()

In [None]:
# Summary statistics of the trade columns restricted to stock 0 / time_id 5.
trade_stock0_example.describe()

In [None]:
# Traded price of stock 0 during time_id 5: price on the x-axis, seconds_in_bucket on the
# y-axis, with the mean traded price as a dashed vertical line.
# The legend is created once, after all labelled artists have been added.
plt.figure(figsize=(10,8))
plt.plot(trade_stock0_example['price'],trade_stock0_example['seconds_in_bucket'],label='price')
plt.axvline(trade_stock0_example['price'].mean(), color='g', linestyle='--', label="Mean of trade price ")
plt.ylabel("seconds_in_bucket")
plt.xlabel("traded_price")
plt.title("traded_price of stock0 at time_id_5 w.r.t seconds in buckets")
plt.legend()

> **Comparing traded price with bid_price and ask_price from book_train**

In [None]:
# Compare the traded price with both bid levels for stock 0 at time_id 5.
# Prices are on the x-axis, seconds_in_bucket on the y-axis; dashed vertical lines mark the
# mean of each series. The legend is created once, after all labelled artists are added.

print("Mean of trade price= ",trade_stock0_example['price'].mean() ,"\nMean of bid_price1 = ",book_stock0__time_id5['bid_price1'].mean() ,"\nMean of bid_price2= ",book_stock0__time_id5['bid_price2'].mean(),"\n\n\n")

plt.figure(figsize=(15,10))
plt.plot(trade_stock0_example['price'],trade_stock0_example['seconds_in_bucket'],label='price')
plt.plot(book_stock0__time_id5['bid_price1'],book_stock0__time_id5['seconds_in_bucket'],label='bid_price1')
plt.plot(book_stock0__time_id5['bid_price2'],book_stock0__time_id5['seconds_in_bucket'],label='bid_price2')

plt.axvline(trade_stock0_example['price'].mean(), color='r', linestyle='--', label="Mean of trade price")
plt.axvline(book_stock0__time_id5['bid_price1'].mean(), color='g', linestyle='--', label="Mean of bid_price1")
plt.axvline(book_stock0__time_id5['bid_price2'].mean(), color='c', linestyle='--', label="Mean of bid_price2")

plt.ylabel("seconds_in_bucket")
plt.xlabel("traded_price and bid_price")
plt.title("traded_price and bid_price of stock0 at time_id_5 w.r.t seconds in buckets")
plt.legend()

In [None]:
# Compare the traded price with both ask levels for stock 0 at time_id 5.
# Prices are on the x-axis, seconds_in_bucket on the y-axis; dashed vertical lines mark the
# mean of each series. The legend is created once, after all labelled artists are added.

print("Mean of trade price= ",trade_stock0_example['price'].mean() ,"\nMean of ask_price1 = ",book_stock0__time_id5['ask_price1'].mean() ,"\nMean of ask_price2= ",book_stock0__time_id5['ask_price2'].mean(),"\n\n\n")

plt.figure(figsize=(15,10))
plt.plot(trade_stock0_example['price'],trade_stock0_example['seconds_in_bucket'],label='price')
plt.plot(book_stock0__time_id5['ask_price1'],book_stock0__time_id5['seconds_in_bucket'],label='ask_price1')
plt.plot(book_stock0__time_id5['ask_price2'],book_stock0__time_id5['seconds_in_bucket'],label='ask_price2')

plt.axvline(trade_stock0_example['price'].mean(), color='r', linestyle='--', label="Mean of trade price")
plt.axvline(book_stock0__time_id5['ask_price1'].mean(), color='g', linestyle='--', label="Mean of ask_price1")
plt.axvline(book_stock0__time_id5['ask_price2'].mean(), color='c', linestyle='--', label="Mean of ask_price2")

plt.ylabel("seconds_in_bucket")
plt.xlabel("traded_price and ask_price")
plt.title("traded_price and ask_price of stock0 at time_id_5 w.r.t seconds in buckets")
plt.legend()

In [None]:
# Compare the traded price with both bid and both ask levels for stock 0 at time_id 5.
# Prices are on the x-axis, seconds_in_bucket on the y-axis; dashed vertical lines mark the
# mean of each series. The legend is created once, after all labelled artists are added.

print("Mean of trade price= ",trade_stock0_example['price'].mean() ,"\nMean of ask_price1 = ",book_stock0__time_id5['ask_price1'].mean() ,"\nMean of ask_price2= ",book_stock0__time_id5['ask_price2'].mean(),"\n\n\n")



print("\nMean of bid_price1 = ",book_stock0__time_id5['bid_price1'].mean() ,"\nMean of bid_price2= ",book_stock0__time_id5['bid_price2'].mean(),"\n\n\n")

plt.figure(figsize=(15,10))

# Bid side: both levels and their means.
plt.plot(book_stock0__time_id5['bid_price1'],book_stock0__time_id5['seconds_in_bucket'],label='bid_price1')
plt.plot(book_stock0__time_id5['bid_price2'],book_stock0__time_id5['seconds_in_bucket'],label='bid_price2')


plt.axvline(book_stock0__time_id5['bid_price1'].mean(), color='k', linestyle='--', label="Mean of bid_price1")
plt.axvline(book_stock0__time_id5['bid_price2'].mean(), color='m', linestyle='--', label="Mean of bid_price2")


# Traded price and ask side: both levels and their means.
plt.plot(trade_stock0_example['price'],trade_stock0_example['seconds_in_bucket'],label='price')
plt.plot(book_stock0__time_id5['ask_price1'],book_stock0__time_id5['seconds_in_bucket'],label='ask_price1')
plt.plot(book_stock0__time_id5['ask_price2'],book_stock0__time_id5['seconds_in_bucket'],label='ask_price2')

plt.axvline(trade_stock0_example['price'].mean(), color='r', linestyle='--', label="Mean of trade price")
plt.axvline(book_stock0__time_id5['ask_price1'].mean(), color='g', linestyle='--', label="Mean of ask_price1")
plt.axvline(book_stock0__time_id5['ask_price2'].mean(), color='c', linestyle='--', label="Mean of ask_price2")

plt.ylabel("seconds_in_bucket")
plt.xlabel("Price")
plt.title("traded_price bid_price and ask_price of stock0 at time_id_5 w.r.t seconds in buckets")
plt.legend()

Trade_train file contains data regarding the trade that has been executed

Here the price column indicate the price at which the stock_0 is traded at time_id_5

We will be calculating the realized volatility using this price

For calculating the realized volatility from trade data we will follow the same procedure as for the book data.

**Here we already have a fixed normalized price i.e. column price.**



Using this price column we will calculate the **log return value.**


Then calculate the **realized volatility.**

**Calculating the log_return in trade_train**

In [None]:
# Log return of the traded price between consecutive trade rows; the first row has no
# previous price and therefore gets NaN.
trade_stock0_example.loc[:,'log_return1'] = log_return(trade_stock0_example['price'])

In [None]:
# Preview the frame with the new log_return1 column (first row is NaN).
trade_stock0_example.head(2)

In [None]:
# Keep only the rows whose log_return1 is not null, i.e. drop the leading NaN row.
trade_stock0_example = trade_stock0_example[~trade_stock0_example['log_return1'].isnull()]
trade_stock0_example.head(2)

In [None]:
# Summary statistics of the trade frame, now including log_return1.
trade_stock0_example.describe()

Calculating the realized volatility from the log_return value calculated

In [None]:
# Realized volatility of stock 0 / time_id 5 computed from the traded-price log returns.
realized_vol_trade = realized_volatility(trade_stock0_example['log_return1'])
print("Calculated realized volatility for stock_id 0 on time_id 5 from trade_train_file is= ",realized_vol_trade)