This notebook walks through a simple end-to-end flow for getting started with the competition. I appreciate the Kaggle community.
I referred to the following notebooks.

(Reference)  
**Introduction to financial concepts and data**  
https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data  

**LGB Starter**  
https://www.kaggle.com/manels/lgb-starter/notebook

# Agenda

1. Import modules  
2. Common settings
3. Function Definition
4. Preprocessing  
  4-1. Book parquet data processing  
  4-2. Trade parquet data processing  
  4-3. Merge book and trade data  
  4-4. Train data preprocessing  
  4-5. Test data preprocessing  
5. Training  
  5-1. Training function1 - Light GBM  
  5-2. Cross Validation  
6. Evaluation
7. Prediction  
8. Submission


# 1. Import modules

In [None]:
import os
import sys
import time
import glob
from pathlib import Path

import pandas as pd
import numpy as np

# Parallel processing
from joblib import Parallel
from joblib import delayed

# Preprocess
from sklearn import preprocessing
from sklearn import model_selection

# Evaluation
from sklearn.metrics import r2_score

# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import lightgbm as lgb

# Others
import warnings
warnings.simplefilter("ignore")


# 2. Common Settings

In [None]:
# Show up to 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

# Directory that holds the competition input files
data_path = Path('../input/optiver-realized-volatility-prediction')

In [None]:
# Name of the column used as the objective variable
target = 'target'

# Submission output: identifier column name and file name
Id_column = 'row_id'
submit_file = 'submission.csv'

# 3. Functions Definition  

In [None]:
# Log Return
def log_return(list_stock_prices):
    """Return the first difference of the natural log of the prices.

    ``np.log`` of the input must expose ``.diff()`` (e.g. a pandas Series).
    The first element of the result has no predecessor and is NaN.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

# Realized Volatility
def realized_volatility(series_log_return):
    """Return the square root of the sum of the squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

In [None]:
# WAP calculation
def wap_calculation1(df):
    """Compute the WAP from the ``*1`` bid/ask price and size columns.

    Each price is weighted by the size quoted on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_size, ask_size = df['bid_size1'], df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    total_size = bid_size + ask_size
    return cross_weighted / total_size

def wap_calculation2(df):
    """Compute the WAP from the ``*2`` bid/ask price and size columns.

    Each price is weighted by the size quoted on the opposite side:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_size, ask_size = df['bid_size2'], df['ask_size2']
    cross_weighted = df['bid_price2'] * ask_size + df['ask_price2'] * bid_size
    total_size = bid_size + ask_size
    return cross_weighted / total_size

In [None]:
# RMSPE
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of ``y_pred``.

    Every error is divided by its true value, so a zero in ``y_true``
    makes the result non-finite.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

# 4. Preprocessing dataset

## 4-1. Book parquet data processing

In [None]:
def book_preprocessing(stock_id : int, data_type = 'train'):
    """Build realized-volatility features from one stock's book data.

    Args:
        stock_id: Stock whose ``stock_id=<id>`` parquet partition is read.
        data_type: Dataset prefix used in the path
            (``book_<data_type>.parquet``).

    Returns:
        DataFrame with one row per (stock_id, time_id). Its ``log_return1``
        and ``log_return2`` columns hold the realized volatility of the
        wap1 and wap2 log returns.
    """
    # read data
    df = pd.read_parquet(data_path / f'book_{data_type}.parquet/stock_id={stock_id}/')

    # set stock_id
    df['stock_id'] = stock_id

    # WAP calculation
    df['wap1'] = wap_calculation1(df)
    df['wap2'] = wap_calculation2(df)

    # Log return within each time_id.
    # transform() returns a result aligned with df's own index on every pandas
    # version. Since pandas 2.0, apply() prepends the group key to the index,
    # which makes the column assignment fail.
    # The first row of each time_id has no previous price, so its NaN becomes 0.
    df['log_return1'] = df.groupby(['time_id'])['wap1'].transform(log_return).fillna(0)
    df['log_return2'] = df.groupby(['time_id'])['wap2'].transform(log_return).fillna(0)

    # Realized volatility of the log returns for each stock_id and time_id
    df_realized_vol_per_stock = (
        df.groupby(['stock_id', 'time_id'])[['log_return1', 'log_return2']]
        .agg(realized_volatility)
        .reset_index()
    )

    return df_realized_vol_per_stock

Check the output of the `book_preprocessing` function on a single stock,  
e.g. stock_id = 97

In [None]:
# Build the book features of stock 97 and preview the first rows
df_book = book_preprocessing(stock_id=97, data_type='train')
df_book.head()

## 4-2. Trade parquet data processing

In [None]:
def trade_preprocessing(stock_id : int, data_type = 'train'):
    """Build realized-volatility features from one stock's trade data.

    Args:
        stock_id: Stock whose ``stock_id=<id>`` parquet partition is read.
        data_type: Dataset prefix used in the path
            (``trade_<data_type>.parquet``).

    Returns:
        DataFrame with one row per (stock_id, time_id). Its
        ``trade_log_return1`` column holds the realized volatility of the
        trade-price log returns.
    """
    # read data
    df = pd.read_parquet(data_path / f'trade_{data_type}.parquet/stock_id={stock_id}/')

    # Order rows chronologically within each time_id so that diff() compares
    # consecutive trades
    df = df.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # set stock_id
    df['stock_id'] = stock_id

    # Log return within each time_id.
    # transform() returns a result aligned with df's own index on every pandas
    # version. Since pandas 2.0, apply() prepends the group key to the index,
    # which makes the column assignment fail.
    # The first row of each time_id has no previous price, so its NaN becomes 0.
    df['trade_log_return1'] = df.groupby(by=['time_id'])['price'].transform(log_return).fillna(0)

    # Realized volatility of the log returns for each stock_id and time_id
    df = (
        df.groupby(['stock_id', 'time_id'])[['trade_log_return1']]
        .agg(realized_volatility)
        .reset_index()
    )

    return df

Check the output of the `trade_preprocessing` function on a single stock,  
e.g. stock_id = 0

In [None]:
# Build the trade features of stock 0 and preview the first rows
df_trade = trade_preprocessing(stock_id=0, data_type='train')
df_trade.head()

## 4-3. Merge book and trade data  
Merge the two feature tables produced by the `book_preprocessing` and `trade_preprocessing` functions.

In [None]:
def get_stock_stat(stock_id : int, data_type = 'train'):
    """Return the combined book and trade features of one stock.

    The book features are the left side of the join, so every book
    (stock_id, time_id) row is kept. Any NaN left in the merged frame,
    such as a time_id without trade features, is replaced by -999.
    """
    join_keys = ['stock_id', 'time_id']

    # Per-stock features from each parquet source
    book_features = book_preprocessing(stock_id, data_type)
    trade_features = trade_preprocessing(stock_id, data_type)

    # Left join keeps all book rows; fill the gaps with the sentinel value
    merged = book_features.merge(trade_features, on=join_keys, how='left')
    return merged.fillna(-999)

In [None]:
def get_dataSet(stock_ids : list, data_type = 'train'):
    """Compute the features of every stock in parallel and stack them.

    ``get_stock_stat`` is run once per stock id on all available cores
    (``n_jobs=-1``). The per-stock frames are concatenated row-wise with a
    fresh integer index.
    """
    # One delayed get_stock_stat call per stock
    jobs = (delayed(get_stock_stat)(stock_id, data_type) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)

    # Stack vertically (axis=0 is the default)
    return pd.concat(per_stock_frames, ignore_index=True)

## 4-4. Train data preprocessing

In [None]:
# Load train.csv and add a "<stock_id>-<time_id>" row identifier
train = pd.read_csv(data_path / 'train.csv').assign(
    row_id=lambda frame: frame['stock_id'].astype(str) + '-' + frame['time_id'].astype(str)
)

display(train.head())
print('train data shape:', train.shape)

In [None]:
# Compute book/trade features for every stock that appears in train
train_stock_stat_df = get_dataSet(stock_ids=train['stock_id'].unique(), data_type='train')

# Left-join the features onto train so that every train row is kept
train = train.merge(train_stock_stat_df, how='left', on=['stock_id', 'time_id'])
print(f'Train shape: {train.shape}')
display(train.head(5))

# Save the merged frame to train.pkl
train.to_pickle('train.pkl')