<h1 id="title" style="color:white;background:black">
    <center>
        Optiver Realized Volatility Prediction
    </center>
    <center>
        Basic EDA + α
    </center>
</h1>

## This notebook explains basic EDA and how to use AutoML.

I will continue to do EDA and visualization for stock data little by little. :)

P.S. If you submit this notebook as-is, it runs into a `memory overflow` issue at submission time.

It is safer to split model training and inference into separate notebooks for submission.

## If this kernel is useful, <font color='orange'>please upvote</font>!

# CFG

In [None]:
# ====================================================
# CFG
# ====================================================
class CFG:   
    """Notebook-wide settings: the RNG seed plus the knobs referenced by the (currently disabled) H2O AutoML cell."""
    # Seed passed to seed_everything(); also referenced as H2OAutoML's `seed` in the disabled AutoML cell.
    seed=2021
    # Referenced as `nfolds` in the disabled AutoML cell.
    n_fold=5
    # Referenced as `max_models` in the disabled AutoML cell.
    max_model=10
    # Referenced as `max_runtime_secs` in the disabled AutoML cell; 10800 is the alternative value the author noted.
    max_runtime_secs=180 #10800

# Import Libraries

In [None]:
import os
import random
import glob
import gc
from tqdm import tqdm

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import missingno as msno

import pyarrow as pa
import pyarrow.parquet as pq

import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

import warnings
warnings.filterwarnings("ignore")

In [None]:
# ====================================================
# Utils
# ====================================================
def seed_everything(seed=42):
    """Make the run repeatable by seeding every RNG this notebook touches.

    Seeds Python's `random` module and NumPy's global generator, and exports
    the seed as the PYTHONHASHSEED environment variable (stored as a string).
    """
    random.seed(seed)
    os.environ.update({'PYTHONHASHSEED': str(seed)})
    np.random.seed(seed)

seed_everything(seed=CFG.seed)

In [None]:
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

In [None]:
msno.matrix(train, fontsize = 16)

There are no NaN values in `train`.

# Basic Utils

- https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data

In [None]:
def log_return(list_stock_prices):
    """Return the first difference of the natural log of a price series.

    The input must expose `.diff()` after `np.log` (e.g. a pandas Series);
    the first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()


def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)


def calculate_wap(df):
    '''
    Weighted average price over both order-book levels.

    For each level, the bid price is weighted by the ask size and the ask
    price by the bid size; the two level sums are added and divided by the
    total size across both levels. (A level-1-only variant would use just
    the first-level terms in numerator and denominator.)

    https://www.kaggle.com/konradb/we-need-to-go-deeper
    '''
    level1_cross = df['bid_price1'] * df['ask_size1'] + df['ask_price1'] * df['bid_size1']
    level2_cross = df['bid_price2'] * df['ask_size2'] + df['ask_price2'] * df['bid_size2']
    total_size = df['bid_size1'] + df['ask_size1'] + df['bid_size2'] + df['ask_size2']

    return (level1_cross + level2_cross) / total_size


def get_log_return_df_per_time_id(file_path):
    """Load one stock's order book and attach WAP, log return and row_id to every row.

    Parameters
    ----------
    file_path : str
        Path to a parquet partition whose name contains `=<stock_id>`; the
        text after the first '=' is used as the stock id inside `row_id`.

    Returns
    -------
    pandas.DataFrame
        The book rows with `wap`, `log_return` and `row_id` columns added.
        Rows whose `log_return` is NaN (the first row of each `time_id`)
        are dropped.
    """
    dataset = pq.ParquetDataset(file_path)
    book_dataset = dataset.read()
    df_book_data = book_dataset.to_pandas()

    df_book_data['wap'] = calculate_wap(df_book_data)
    # transform() always returns a result aligned to the original row index.
    # groupby.apply() prepends the group key on pandas >= 2.0, which makes the
    # column assignment fail with an incompatible-index error.
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    # Explicit copy so the row_id assignment below does not write into a slice.
    df_book_data = df_book_data[df_book_data['log_return'].notnull()].copy()

    stock_id = file_path.split('=')[1]
    df_book_data['row_id'] = stock_id + '-' + df_book_data['time_id'].astype(str)

    # Release the arrow objects before returning; the frames are large.
    del dataset, book_dataset
    gc.collect()

    return df_book_data



def get_realized_volatility_df_per_time_id(file_path):
    """Load one stock's order book and attach WAP, log return and realized volatility.

    Parameters
    ----------
    file_path : str
        Path to a parquet partition whose name contains `=<stock_id>`; the
        text after the first '=' is used as the stock id inside `row_id`.

    Returns
    -------
    pandas.DataFrame
        The book rows with `wap`, `log_return`, `realized_volatility` and
        `row_id` columns added. Rows whose `log_return` is NaN (the first row
        of each `time_id`) are dropped; every remaining row carries the
        realized volatility of its own `time_id`.
    """
    dataset = pq.ParquetDataset(file_path)
    book_dataset = dataset.read()
    df_book_data = book_dataset.to_pandas()

    # WAP must come from the frame loaded here, not from the notebook-level
    # `book_example` global (which belongs to whatever stock was last explored).
    df_book_data['wap'] = calculate_wap(df_book_data)
    # transform() keeps the original row index (groupby.apply() prepends the
    # group key on pandas >= 2.0 and breaks the assignment).
    df_book_data['log_return'] = df_book_data.groupby('time_id')['wap'].transform(log_return)
    df_book_data = df_book_data[df_book_data['log_return'].notnull()].copy()

    # Broadcast each bucket's realized volatility to all of its rows. Assigning
    # the time_id-indexed aggregate directly would align on row labels and put
    # values on unrelated rows.
    df_book_data['realized_volatility'] = (
        df_book_data.groupby('time_id')['log_return'].transform(realized_volatility)
    )
    df_book_data = df_book_data[df_book_data['realized_volatility'].notnull()].copy()

    stock_id = file_path.split('=')[1]
    df_book_data['row_id'] = stock_id + '-' + df_book_data['time_id'].astype(str)

    # Release the arrow objects before returning; the frames are large.
    del dataset, book_dataset
    gc.collect()

    return df_book_data


def realized_volatility_per_time_id(file_path, prediction_column_name):
    """Compute one realized-volatility value per `time_id` for a single stock.

    Parameters
    ----------
    file_path : str
        Path to a parquet partition whose name contains `=<stock_id>`; the
        text after the first '=' is used as the stock id inside `row_id`.
    prediction_column_name : str
        Name given to the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        Two columns, `row_id` (`"<stock_id>-<time_id>"`) and
        `prediction_column_name`, one row per `time_id` that has at least one
        non-NaN log return.
    """
    df_book = pd.read_parquet(file_path)
    df_book['wap'] = calculate_wap(df_book)
    # transform() keeps the original row index; groupby.apply() prepends the
    # group key on pandas >= 2.0, which makes this assignment fail.
    df_book['log_return'] = df_book.groupby('time_id')['wap'].transform(log_return)
    df_book = df_book[df_book['log_return'].notnull()]

    df_realized_vol_per_stock = (
        df_book.groupby('time_id')['log_return']
        .agg(realized_volatility)
        .rename(prediction_column_name)
        .reset_index()
    )

    stock_id = file_path.split('=')[1]
    df_realized_vol_per_stock['row_id'] = (
        stock_id + '-' + df_realized_vol_per_stock['time_id'].astype(str)
    )
    return df_realized_vol_per_stock[['row_id', prediction_column_name]]


# Using ParquetDataset

In [None]:
%%time
book_example = pd.read_parquet('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')

In [None]:
%%time
dataset = pq.ParquetDataset('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=0')
book_example = dataset.read()
book_example = book_example.to_pandas()

## Check Utils

I think most people will use `stock_id=0` as an example. I will use `stock_id=5` and `time_id=5` as an example here.

In [None]:
dataset = pq.ParquetDataset('../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=5')
book_example = dataset.read()
book_example = book_example.to_pandas()

In [None]:
trade_dataset = pq.ParquetDataset('../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=5')
trade_example = trade_dataset.read()
trade_example = trade_example.to_pandas()

In [None]:
# stock_id = 5, time_id = 5
stock_id = '5'
time_id = 5

# Take explicit copies of the filtered rows before adding a column. Assigning
# into the filtered slice is chained assignment, which the blanket warnings
# filter at the top of the notebook was hiding.
book_example = book_example[book_example['time_id'] == time_id].copy()
book_example['stock_id'] = stock_id

trade_example = trade_example[trade_example['time_id'] == time_id].copy()
trade_example['stock_id'] = stock_id

In [None]:
book_example.head()

In [None]:
trade_example.head()

In [None]:
msno.matrix(book_example, fontsize = 16)

In [None]:
msno.matrix(trade_example, fontsize = 16)

### WAP

Get `WAP` using the method below.

In [None]:
# Get WAP
book_example['wap'] = calculate_wap(book_example)

In [None]:
fig = px.line(book_example, x="seconds_in_bucket", y="wap", title='WAP of stock_id_5, time_id_5')
fig.show()

### log return

In [None]:
book_example

In [None]:
book_example.loc[:,'log_return'] = log_return(book_example['wap'])
book_example

One missing value occurs. I will remove `NaN` value.

In [None]:
book_example = book_example[~book_example['log_return'].isnull()]

fig = px.line(book_example, x="seconds_in_bucket", y="log_return", title='Log return of stock_id_5, time_id_5')
fig.show()

### Realized Volatility

Based on `time_id`, I will apply `realized_volatility` method to `log_return`.

In [None]:
%%time
book_example.groupby(['time_id'])['log_return'].apply(realized_volatility)

Currently `time_id=5`, so there is only one value.

In [None]:
# NOTE: the groupby result is indexed by time_id while book_example keeps its own
# row labels, so this assignment aligns on index labels. Only a row whose label
# equals a time_id receives a value; every other row gets NaN.
book_example.loc[:,'realized_volatility'] = book_example.groupby(['time_id'])['log_return'].apply(realized_volatility)
book_example

We can also use `agg` instead of `apply`; `agg` is slightly faster in CPU time.

In [None]:
%%time
book_example.groupby(['time_id'])['log_return'].agg(realized_volatility)

In [None]:
# Same as the `apply` version: the aggregate is indexed by time_id, so the
# assignment aligns on index labels and fills non-matching rows with NaN.
book_example.loc[:,'realized_volatility'] = book_example.groupby(['time_id'])['log_return'].agg(realized_volatility)
book_example

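After removing the rows with `NaN` values, only one row remains. (The assignment above aligns the `time_id`-indexed result with the frame's row index, so only a row whose index label equals the `time_id` receives a value.)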

In [None]:
new_book_example = book_example[~book_example['realized_volatility'].isnull()].reset_index()
new_book_example

And this is the same result as in the original example in `realized_volatility_per_time_id` method.
- https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data


In [None]:
new_book_example[['time_id', 'log_return']]

In [None]:
pd.DataFrame(book_example.groupby(['time_id'])['log_return'].agg(realized_volatility)).reset_index()

# Basic EDA

## bid_price, ask_price

### Min, max, mean, median

Several order-book quantities come in two levels, 1 and 2. For example, there are `bid_price1` and `bid_price2`.

In [None]:
book_example

Let's look at `bid_price1` and `bid_price2`.

In [None]:
# Both bid levels plus the traded price, all against seconds_in_bucket.
fig = go.Figure()

for trace_frame, trace_column, trace_name in (
    (book_example, "bid_price1", 'bid_price1'),
    (book_example, "bid_price2", 'bid_price2'),
    (trade_example, "price", 'trade_price'),
):
    fig.add_trace(
        go.Scatter(
            x=trace_frame["seconds_in_bucket"],
            y=trace_frame[trace_column],
            mode='lines',
            name=trace_name,
        )
    )

fig.show()

We can also see `ask_price1`, `ask_price2`.

In [None]:
# Both ask levels plus the traded price, all against seconds_in_bucket.
fig = go.Figure()

for trace_frame, trace_column, trace_name in (
    (book_example, "ask_price1", 'ask_price1'),
    (book_example, "ask_price2", 'ask_price2'),
    (trade_example, "price", 'trade_price'),
):
    fig.add_trace(
        go.Scatter(
            x=trace_frame["seconds_in_bucket"],
            y=trace_frame[trace_column],
            mode='lines',
            name=trace_name,
        )
    )

fig.show()

In [None]:
# Bid and ask levels together with the traded price.
fig = go.Figure()

for trace_frame, trace_column, trace_name in (
    (book_example, "bid_price1", 'bid_price1'),
    (book_example, "bid_price2", 'bid_price2'),
    (book_example, "ask_price1", 'ask_price1'),
    (book_example, "ask_price2", 'ask_price2'),
    (trade_example, "price", 'trade_price'),
):
    fig.add_trace(
        go.Scatter(
            x=trace_frame["seconds_in_bucket"],
            y=trace_frame[trace_column],
            mode='lines',
            name=trace_name,
        )
    )

fig.show()

In [None]:
book_example

In [None]:
# Per-time_id price summaries. For each side this takes the level-1 minimum,
# the level-2 maximum, and the mean and median of both levels. Output columns
# are named '<side>_price<level>_<stat>' and keep the order listed here.
temp_aggs = book_example.groupby(['time_id']).agg(**{
    f'{side}_price{level}_{stat}': (f'{side}_price{level}', stat)
    for side in ('bid', 'ask')
    for level, stat in (
        (1, 'min'), (2, 'max'),
        (1, 'mean'), (2, 'mean'),
        (1, 'median'), (2, 'median'),
    )
})

In [None]:
aggs_book_example = pd.merge(new_book_example, temp_aggs, on=['time_id'], how='left')
aggs_book_example

In [None]:
aggs_book_example.columns

Now, we have `min`, `max`, `mean`, `median` values for `bid_price` and `ask_price`.

- bid_price1, 2

In [None]:
# Bid price lines and traded price, with the per-bucket summary statistics
# drawn as large markers.
plt.figure(figsize=(16, 8))

for price_column in ('bid_price1', 'bid_price2'):
    plt.plot(book_example['seconds_in_bucket'], book_example[price_column])

plt.plot(trade_example['seconds_in_bucket'], trade_example['price'])

for stat_column, stat_label in (
    ('bid_price1_min', 'min'),
    ('bid_price2_max', 'max'),
    ('bid_price1_mean', 'mean1'),
    ('bid_price2_mean', 'mean2'),
    ('bid_price1_median', 'median1'),
    ('bid_price2_median', 'median2'),
):
    plt.scatter(
        x=aggs_book_example['seconds_in_bucket'],
        y=aggs_book_example[stat_column],
        s=150,
        label=stat_label,
    )

plt.xlabel('seconds_in_bucket', fontsize=12)
plt.ylabel('bid_price', fontsize=12)

plt.legend()
plt.show()

- ask_price1, 2

In [None]:
# Ask price lines and traded price, with the per-bucket summary statistics
# drawn as large markers.
plt.figure(figsize=(16, 8))

for price_column in ('ask_price1', 'ask_price2'):
    plt.plot(book_example['seconds_in_bucket'], book_example[price_column])

plt.plot(trade_example['seconds_in_bucket'], trade_example['price'])

for stat_column, stat_label in (
    ('ask_price1_min', 'min'),
    ('ask_price2_max', 'max'),
    ('ask_price1_mean', 'mean1'),
    ('ask_price2_mean', 'mean2'),
    ('ask_price1_median', 'median1'),
    ('ask_price2_median', 'median2'),
):
    plt.scatter(
        x=aggs_book_example['seconds_in_bucket'],
        y=aggs_book_example[stat_column],
        s=150,
        label=stat_label,
    )

plt.xlabel('seconds_in_bucket', fontsize=12)
plt.ylabel('ask_price', fontsize=12)

plt.legend()
plt.show()

In [None]:
# Combined view: both bid and ask levels, the traded price, and every
# per-bucket summary statistic as a large marker.
plt.figure(figsize=(16, 8))

for price_column in ('bid_price1', 'bid_price2', 'ask_price1', 'ask_price2'):
    plt.plot(book_example['seconds_in_bucket'], book_example[price_column])

plt.plot(trade_example['seconds_in_bucket'], trade_example['price'])

# Legend labels carry the side. With bare 'min', 'max', ... each label showed
# up twice in the legend and bid markers could not be told from ask markers.
for side in ('bid', 'ask'):
    for level, stat, stat_label in (
        (1, 'min', 'min'),
        (2, 'max', 'max'),
        (1, 'mean', 'mean1'),
        (2, 'mean', 'mean2'),
        (1, 'median', 'median1'),
        (2, 'median', 'median2'),
    ):
        plt.scatter(
            x=aggs_book_example['seconds_in_bucket'],
            y=aggs_book_example[f'{side}_price{level}_{stat}'],
            s=150,
            label=f'{side} {stat_label}',
        )

plt.xlabel('seconds_in_bucket', fontsize=12)
plt.ylabel('bid_price & ask_price', fontsize=12)

plt.legend()
plt.show()

We can see the place for `max`, `min`, `mean`, and `median` values in the plot.

## bid_size, ask_size

Now let's look at `bid_size` and `ask_size`.

- bid_size1, 2 - The number of shares on the most/second most competitive buy level.
- ask_size1, 2 - The number of shares on the most/second most competitive sell level.

In [None]:
# Size at both bid levels against seconds_in_bucket.
fig = go.Figure()

for size_column in ("bid_size1", "bid_size2"):
    fig.add_trace(
        go.Scatter(
            x=book_example["seconds_in_bucket"],
            y=book_example[size_column],
            mode='lines',
            name=size_column,
        )
    )

fig.show()

- `bid_size1` is a little higher than `bid_size2`.

In [None]:
# Size at both ask levels against seconds_in_bucket.
fig = go.Figure()

for size_column in ("ask_size1", "ask_size2"):
    fig.add_trace(
        go.Scatter(
            x=book_example["seconds_in_bucket"],
            y=book_example[size_column],
            mode='lines',
            name=size_column,
        )
    )

fig.show()

- `ask_size1` is a little higher than `ask_size2`.

In this case, the stock price is expected to rise because the buy level is higher than the sell level.

Volatility seems to be relatively high.

In [None]:
# Per-time_id size summaries: min and max, then mean, then median, for both
# levels of each side. Output columns are named '<side>_size<level>_<stat>'
# and keep the order listed here.
temp_aggs = book_example.groupby(['time_id']).agg(**{
    f'{side}_size{level}_{stat}': (f'{side}_size{level}', stat)
    for side in ('bid', 'ask')
    for level, stat in (
        (1, 'min'), (1, 'max'),
        (2, 'min'), (2, 'max'),
        (1, 'mean'), (2, 'mean'),
        (1, 'median'), (2, 'median'),
    )
})

In [None]:
aggs_book_example2 = pd.merge(aggs_book_example, temp_aggs, on=['time_id'], how='left')
aggs_book_example2

In [None]:
# Summary statistics of bid_size, one x position per book level (1 or 2).
plt.figure(figsize=(16, 8))

for level, stat in (
    (1, 'min'), (1, 'max'),
    (2, 'min'), (2, 'max'),
    (1, 'mean'), (2, 'mean'),
    (1, 'median'), (2, 'median'),
):
    stat_values = aggs_book_example2[f'bid_size{level}_{stat}']
    plt.scatter(
        x=level,
        y=stat_values,
        # The max marker is scaled by its value; all others use a fixed size.
        s=stat_values * 1.5 if stat == 'max' else 50,
        label=f'{stat}{level}',
    )

# x is the book level, not time, so use the same axis label as the ask_size plot.
plt.xlabel('size1 & size2', fontsize=12)
plt.ylabel('bid_size', fontsize=12)

plt.legend()
plt.show()

The difference between min and max for `bid_size` is large.

In [None]:
# Summary statistics of ask_size, one x position per book level (1 or 2).
plt.figure(figsize=(16, 8))

for level, stat in (
    (1, 'min'), (1, 'max'),
    (2, 'min'), (2, 'max'),
    (1, 'mean'), (2, 'mean'),
    (1, 'median'), (2, 'median'),
):
    stat_values = aggs_book_example2[f'ask_size{level}_{stat}']
    plt.scatter(
        x=level,
        y=stat_values,
        # Only the max marker is scaled by its value. The level-2 min marker
        # used the raw value as its size, unlike every other non-max marker
        # here and in the bid_size plot, so it now uses the fixed size too.
        s=stat_values * 1.5 if stat == 'max' else 50,
        label=f'{stat}{level}',
    )

plt.xlabel('size1 & size2', fontsize=12)
plt.ylabel('ask_size', fontsize=12)

plt.legend()
plt.show()

`ask_size` also has a big difference between min and max.

## WIP... Later, I will add feature-engineering methods and AutoML training.

For now, I just use the submission from the original notebook.

# AutoML - H2O

In [None]:
import h2o
from h2o.automl import H2OAutoML

In [None]:
h2o.init(max_mem_size='10G', nthreads=16)

In [None]:
list_order_book_file_train = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_train.parquet/*')

In [None]:
# ==============================================================================
# past_realized_volatility_per_stock -> get_realized_volatility_df_per_time_id
# ==============================================================================

# def past_realized_volatility_per_stock(list_file, isRV=True):
#     df_past_realized = pd.DataFrame()
#     for file in list_file:
#         if isRV is True:
#             df_past_realized = pd.concat([df_past_realized,
#                                          get_realized_volatility_df_per_time_id(file)])
#         else:
#             df_past_realized = pd.concat([df_past_realized,
#                                          get_log_return_df_per_time_id(file)])
#     return df_past_realized
#
# Later, I will use the above method.

def past_realized_volatility_per_stock(list_file,prediction_column_name):
    """Stack the per-time_id realized volatility of every book file in `list_file`.

    Parameters
    ----------
    list_file : iterable of str
        Paths handed one by one to `realized_volatility_per_time_id`.
    prediction_column_name : str
        Name of the realized-volatility column in the result.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated in input order (each keeps its own
        index). An empty DataFrame is returned when `list_file` is empty.
    """
    # Collect first and concatenate once: growing a frame with pd.concat inside
    # the loop re-copies all accumulated rows on every iteration.
    per_stock_frames = [
        realized_volatility_per_time_id(file, prediction_column_name)
        for file in tqdm(list_file)
    ]
    if not per_stock_frames:
        # pd.concat raises on an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    return pd.concat(per_stock_frames)

In [None]:
#df_past_realized_train = past_realized_volatility_per_stock(list_order_book_file_train, isRV=False)

In [None]:
#msno.matrix(df_past_realized_train, fontsize=16)

In [None]:
#df_past_realized_train

In [None]:
'''
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
new_train = train.merge(df_past_realized_train.loc[:, 'seconds_in_bucket':'row_id'], on=['row_id'], how='left')

new_train = new_train[~new_train['seconds_in_bucket'].isnull()].reset_index(drop=True)

new_train = new_train.loc[:, :'log_return'] # later, realized_volatility will be added.
#new_train = new_train.loc[:, :'realized_volatility'] # later, realized_volatility will be added.
new_train
'''

In [None]:
'''
train_df = h2o.H2OFrame(new_train)

x = train_df.columns[4:]
y = 'target'

train_df

aml = H2OAutoML(
    max_models=CFG.max_model,
    seed=CFG.seed, 
    max_runtime_secs=CFG.max_runtime_secs, #10800
    nfolds = CFG.n_fold,
    exclude_algos = ["DeepLearning"]
)
aml.train(x=x, y=y, training_frame=train_df)

lb = aml.leaderboard 
lb.head(rows = lb.nrows)

del new_train, df_past_realized_train, train
gc.collect()
'''

# Make a submission

In [None]:
test = pd.read_csv('../input/optiver-realized-volatility-prediction/test.csv')

In [None]:
# list_order_book_file_test = glob.glob('/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*')

# df_past_realized_test = past_realized_volatility_per_stock(list_order_book_file_test, isRV=False)

# test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
# new_test = test.merge(df_past_realized_test.loc[:, 'seconds_in_bucket':'row_id'], on=['row_id'], how='left')
# new_test = new_test[~new_test['seconds_in_bucket'].isnull()].reset_index(drop=True)
# temp_test = new_test.loc[:,'seconds_in_bucket':]

In [None]:
# Naive prediction for the test set: the realized volatility of each test
# book bucket is used directly as 'target'.
list_order_book_file_test = glob.glob(
    '/kaggle/input/optiver-realized-volatility-prediction/book_test.parquet/*'
)
# Despite its name, df_past_realized_train refers to the same test-set frame.
df_past_realized_train = past_realized_volatility_per_stock(
    list_file=list_order_book_file_test,
    prediction_column_name='target',
)
df_naive_pred_test = df_past_realized_train

In [None]:
# The previous cell already globbed the test book files and computed
# df_naive_pred_test (and its alias df_past_realized_train). Recomputing them
# here only repeated the full parquet pass, so this cell just writes the file.
df_naive_pred_test.to_csv('submission.csv',index = False)

In [None]:
df_naive_pred_test

## If this kernel is useful, <font color='orange'>please upvote</font>!