This notebook shows a simple end-to-end flow for getting started with the competition. I appreciate the Kaggle community.
I referred to the following notebooks.

(Reference)  
**Introduction to financial concepts and data**  
https://www.kaggle.com/jiashenliu/introduction-to-financial-concepts-and-data  

**LGB Starter**  
https://www.kaggle.com/manels/lgb-starter/notebook

<a id='agenda'></a>

# Agenda

[1. Import modules  ](#1)   
[2. Common settings](#2)  
[3. Function Definition](#3)  
[4. Preprocessing  ](#4)  
 [4-1. Book parquet data processing](#4-1)   
 [4-2. Trade parquet data processing](#4-2)    
 [ 4-3. Merge book and trade data ](#4-3)   
  [4-4. Train data preprocessing ](#4-4)   
  [4-5. Test data preprocessing](#4-5)  
[5. Training](#5)   
  [5-1. Training function1 - Light GBM](#5-1)  
  [5-2. Cross Validation](#5-2)  
[6. Evaluation](#6)  
[7. Submission](#7)  

<a id='1'></a>

# 1. Import modules  
[Link to Agenda](#Agenda)

In [None]:
import os
import sys
import time
import glob
from pathlib import Path

import pandas as pd
import numpy as np

# Parallel processing
from joblib import Parallel
from joblib import delayed

# Preprocess
from sklearn import preprocessing
from sklearn import model_selection

# Evaluation
from sklearn.metrics import r2_score

# Visualize
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
import lightgbm as lgb

# Others
import warnings
warnings.simplefilter("ignore")


<a id='2'></a>

# 2. Common Settings  
[Link to Agenda](#Agenda)

In [None]:
# Show at most 50 columns when a DataFrame is displayed
pd.options.display.max_columns = 50

# Directory that holds the competition files read by the cells below
data_path = Path('../input/optiver-realized-volatility-prediction')

In [None]:
# Submission output: name of the id column and of the file that is written
Id_column = 'row_id'
submit_file = 'submission.csv'

# Name of the column used as the training label and as the prediction column
target = 'target'

<a id='3'></a>

# 3. Functions Definition  
[Link to Agenda](#Agenda)

In [None]:
# Log Return
def log_return(list_stock_prices):
    """Return the element-to-element difference of the natural log of the prices.

    ``.diff()`` is called on the result of ``np.log``, so the input must be an
    object for which that result offers ``diff`` (e.g. a pandas Series).
    The first element of the result is NaN because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

# Realized Volatility
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# WAP calculation
def wap_calculation1(df):
    """Return the weighted average price built from the level-1 quotes.

    Each side's price is weighted by the size on the opposite side:
    (bid_price1 * ask_size1 + ask_price1 * bid_size1) / (bid_size1 + ask_size1).
    """
    bid_price, ask_price = df['bid_price1'], df['ask_price1']
    bid_size, ask_size = df['bid_size1'], df['ask_size1']

    cross_weighted = bid_price * ask_size + ask_price * bid_size
    total_size = bid_size + ask_size
    return cross_weighted / total_size

def wap_calculation2(df):
    """Return the weighted average price built from the level-2 quotes.

    Each side's price is weighted by the size on the opposite side:
    (bid_price2 * ask_size2 + ask_price2 * bid_size2) / (bid_size2 + ask_size2).
    """
    bid_price, ask_price = df['bid_price2'], df['ask_price2']
    bid_size, ask_size = df['bid_size2'], df['ask_size2']

    cross_weighted = bid_price * ask_size + ask_price * bid_size
    total_size = bid_size + ask_size
    return cross_weighted / total_size

In [None]:
# RMSPE
def rmspe(y_true, y_pred):
    """Return the root mean squared percentage error of y_pred against y_true.

    The error of each element is taken relative to y_true, i.e.
    (y_true - y_pred) / y_true, before squaring and averaging.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

<a id='4'></a>

# 4. Preprocessing dataset  
[Link to Agenda](#Agenda)

<a id='4-1'></a>

## 4-1. Book parquet data processing  
[Link to Agenda](#Agenda)

In [None]:
def book_preprocessing(stock_id : int, data_type = 'train'):
    """Build realized-volatility features from the order book of one stock.

    Parameters
    ----------
    stock_id : int
        Stock whose ``book_{data_type}.parquet/stock_id={stock_id}/`` partition
        under ``data_path`` is read.
    data_type : str, default 'train'
        Part of the parquet directory name (the notebook passes 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) with the columns ``log_return1`` and
        ``log_return2``: the realized volatility of the WAP1 / WAP2 log returns.
    """
    # read data
    df = pd.read_parquet(data_path / f'book_{data_type}.parquet/stock_id={stock_id}/')

    # Log returns are row-to-row differences, so fix the row order explicitly,
    # the same way trade_preprocessing does.
    # NOTE(review): assumes the book data carries 'seconds_in_bucket' like the trade data - confirm.
    df = df.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # set stock_id
    df['stock_id'] = stock_id

    # WAP calculation
    df['wap1'] = wap_calculation1(df)
    df['wap2'] = wap_calculation2(df)

    # Log return within each time_id. transform() returns a result indexed like
    # df, so the assignment aligns row by row regardless of the pandas
    # group_keys default. The first row of each time_id has no predecessor -> 0.
    grouped_by_time = df.groupby('time_id')
    df['log_return1'] = grouped_by_time['wap1'].transform(log_return).fillna(0)
    df['log_return2'] = grouped_by_time['wap2'].transform(log_return).fillna(0)

    # Realized volatility of the log returns for each stock_id and time_id
    df_realized_vol_per_stock = (
        df.groupby(['stock_id', 'time_id'])[['log_return1', 'log_return2']]
        .agg(realized_volatility)
        .reset_index()
    )

    return df_realized_vol_per_stock

Check the output of the book_preprocessing function on one sample stock,  
e.g. stock_id = 97

In [None]:
# Run the book feature pipeline for a single stock and preview the first rows
df_book = book_preprocessing(stock_id=97, data_type='train')
df_book.head()

<a id='4-2'></a>

## 4-2. Trade parquet data processing  
[Link to Agenda](#Agenda)

In [None]:
def trade_preprocessing(stock_id : int, data_type = 'train'):
    """Build a realized-volatility feature from the trades of one stock.

    Parameters
    ----------
    stock_id : int
        Stock whose ``trade_{data_type}.parquet/stock_id={stock_id}/`` partition
        under ``data_path`` is read.
    data_type : str, default 'train'
        Part of the parquet directory name (the notebook passes 'train' or 'test').

    Returns
    -------
    pandas.DataFrame
        One row per (stock_id, time_id) with the column ``trade_log_return1``:
        the realized volatility of the trade-price log returns.
    """
    # read data
    df = pd.read_parquet(data_path / f'trade_{data_type}.parquet/stock_id={stock_id}/')

    # Log returns are row-to-row differences, so fix the row order first
    df = df.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # set stock_id
    df['stock_id'] = stock_id

    # Log return within each time_id. transform() returns a result indexed like
    # df, so the assignment aligns row by row regardless of the pandas
    # group_keys default. The first row of each time_id has no predecessor -> 0.
    df['trade_log_return1'] = df.groupby('time_id')['price'].transform(log_return).fillna(0)

    # Realized volatility of the log returns for each stock_id and time_id
    df = (
        df.groupby(['stock_id', 'time_id'])[['trade_log_return1']]
        .agg(realized_volatility)
        .reset_index()
    )

    return df

Check the output of the trade_preprocessing function on one sample stock,  
e.g. stock_id = 0

In [None]:
# Run the trade feature pipeline for a single stock and preview the first rows
df_trade = trade_preprocessing(stock_id=0, data_type='train')
df_trade.head()

<a id='4-3'></a>

## 4-3. Merge book and trade data  
[Link to Agenda](#Agenda)  
Merge the two feature tables produced by the book_preprocessing and trade_preprocessing functions.

In [None]:
def get_stock_stat(stock_id : int, data_type = 'train'):
    """Return the combined book and trade features of one stock.

    The book features are the left side of the join on (stock_id, time_id), so
    every book row is kept; values that are missing after the join are filled
    with -999.
    """
    merged = pd.merge(
        book_preprocessing(stock_id, data_type),
        trade_preprocessing(stock_id, data_type),
        on=['stock_id', 'time_id'],
        how='left',
    )
    return merged.fillna(-999)

In [None]:
def get_dataSet(stock_ids : list, data_type = 'train'):
    """Compute get_stock_stat for every stock id and stack the results.

    The per-stock calls are dispatched through joblib with ``n_jobs=-1``; the
    resulting frames are concatenated row-wise with a fresh integer index.
    """
    # One delayed get_stock_stat call per stock id
    jobs = (delayed(get_stock_stat)(stock_id, data_type) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)

    return pd.concat(per_stock_frames, ignore_index = True)

<a id='4-4'></a>

## 4-4. Train data preprocessing  
[Link to Agenda](#Agenda)  

In [None]:
# Load the training table and build the '<stock_id>-<time_id>' row identifier
train = pd.read_csv(data_path / 'train.csv')
train['row_id'] = train['stock_id'].astype(str).str.cat(train['time_id'].astype(str), sep='-')

display(train.head())
print('train data shape:', train.shape)

In [None]:
# Build the book/trade features for every stock that appears in train
train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), data_type = 'train')

# Attach the features to the train rows; the left join keeps every train row
train = train.merge(train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
print(f'Train shape: {train.shape}')
display(train.head(5))

<a id='4-5'></a>

## 4-5. Test data Preprocessing  
[Link to Agenda](#Agenda)

In [None]:
# Load the test table and build the '<stock_id>-<time_id>' row identifier
test = pd.read_csv(data_path / 'test.csv')
test['row_id'] = test['stock_id'].astype(str).str.cat(test['time_id'].astype(str), sep='-')

display(test.head())
print('test data shape:', test.shape)

In [None]:
# Build the book/trade features for every stock that appears in test
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), data_type = 'test')

# Attach the features to the test rows; rows left without features get 0
test = test.merge(test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')
test = test.fillna(0)
print(f'Test shape: {test.shape}')
display(test.head(5))

<a id='5'></a>

# 5. Training  
[Link to Agenda](#Agenda)

<a id='5-1'></a>

## 5-1. Training function1 - Light GBM  
[Link to Agenda](#Agenda)

In [None]:
# LightGBM parameters passed to lgb.train.
# 'metric' is the string 'None' because evaluation is done by the custom
# feval (feval_RMSPE) handed to lgb.train.
# NOTE(review): bagging_freq is not set; per the LightGBM docs bagging_fraction
# only takes effect with a non-zero bagging_freq - confirm this is intended.
params_lgbm = dict(
    task='train',
    boosting_type='gbdt',
    learning_rate=0.01,
    objective='regression',
    metric='None',
    max_depth=-1,
    n_jobs=-1,
    feature_fraction=0.7,
    bagging_fraction=0.7,
    lambda_l2=1,
    verbose=-1,
)

In [None]:
# Custom evaluation metric (RMSPE) handed to lgb.train through `feval`
def feval_RMSPE(preds, train_data):
    """Return ('RMSPE', value, False) for the predictions on a LightGBM dataset.

    The labels are read from ``train_data.get_label()``; the value is rounded
    to 5 decimals. The trailing False is the third element of the returned
    tuple (for this metric, lower values are better).
    """
    metric_value = rmspe(y_true = train_data.get_label(), y_pred = preds)
    higher_is_better = False
    return 'RMSPE', round(metric_value, 5), higher_is_better

The following function trains a model with LightGBM. If you would like to try another model, define another training function and call it instead.

In [None]:
# training function
def light_gbm(X_train, y_train, X_val ,y_val, cats):
    """Train one LightGBM model on a train/validation split and score it.

    Relies on the module-level ``params_lgbm``, ``n_rounds``, ``feval_RMSPE``
    and ``rmspe`` definitions.

    Parameters
    ----------
    X_train, X_val
        Feature tables of the training and validation parts.
    y_train, y_val
        Matching target values. They are also used to build the sample
        weights 1 / y**2, so a zero target yields an infinite weight.
    cats
        Names of the columns LightGBM should treat as categorical.

    Returns
    -------
    tuple
        ``(score, model)``: the validation RMSPE rounded to 5 decimals and the
        model returned by ``lgb.train``.
    """
    # Create datasets. With the squared-error objective in params_lgbm, the
    # weight 1 / y**2 turns each sample's loss into its squared percentage error.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cats, weight=1/np.power(y_train,2))
    val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cats, weight=1/np.power(y_val,2))

    # training
    # NOTE(review): verbose_eval / early_stopping_rounds are passed as keyword
    # arguments; newer LightGBM releases may expect callbacks instead - confirm
    # against the installed version.
    model = lgb.train(params_lgbm,
                      train_data,
                      n_rounds,
                      valid_sets=val_data,
                      feval=feval_RMSPE,
                      verbose_eval= 250,
                      early_stopping_rounds=500
                     )

    # Prediction w/ validation data
    preds_val = model.predict(X_val)

    # RMSPE calculation
    score = round(rmspe(y_true = y_val, y_pred = preds_val),5)

    # The test-set prediction is left to the caller: computing it here used the
    # globals `test` / `features_columns` and the result was thrown away.

    # delete dataset
    del train_data, val_data

    return score, model

<a id='5-2'></a>

## 5-2. Cross Validation  
[Link to Agenda](#Agenda)

In [None]:
# Categorical data column list
cats = ['stock_id']

model_name = 'lgb1'
pred_name = f'pred_{model_name}'

features_columns = ['stock_id', 'log_return1', 'log_return2', 'trade_log_return1']
print(f'Train dataset columns : {len(features_columns)} features')

# train[pred_name] receives the out-of-fold predictions; test[target] accumulates
# the per-fold test predictions. Both start as floats so that storing model
# outputs does not change the column dtype.
train[pred_name] = 0.0
test[target] = 0.0

# k-folds ensemble training
n_folds = 4
n_rounds = 10000

kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Per-model list of fold scores, filled in during CV
scores_folds = {model_name: []}

# --- Cross Validation ---
for cv_trial, (train_index, val_index) in enumerate(kf.split(range(len(train))), start=1):

    print(f'CV trial : {cv_trial} /{n_folds}')

    # Divide dataset into train and validation parts for this fold
    X_train = train.loc[train_index, features_columns]
    y_train = train.loc[train_index, target].values
    X_val = train.loc[val_index, features_columns]
    y_val = train.loc[val_index, target].values

    # train with Light GBM
    rmspe_score, model = light_gbm(X_train, y_train, X_val ,y_val, cats)

    # record score data at each train in CV
    scores_folds[model_name].append(rmspe_score)

    # Store the out-of-fold predictions so that an overall RMSPE over the whole
    # train set can be computed from train[pred_name] afterwards
    train.loc[val_index, pred_name] = model.predict(X_val)

    # Each validation Summary
    print(f'Fold-{cv_trial} Model-{model_name} RMSPE: {rmspe_score}')
    print('-'*50)

    # Prediction w/ test data (clipped to be non-negative), summed over the folds
    test_preds = model.predict(test[features_columns]).clip(0,1e10)
    test[target] += test_preds

<a id='6'></a>

# 6. Evaluation  
[Link to Agenda](#Agenda)

In [None]:
# The CV loop summed one test prediction per fold, so divide by n_folds to
# turn the sum into an average
test[target] = test[target] / n_folds

# RMSPE between the train targets and the values stored in train[pred_name]
overall_rmspe = rmspe(y_true = train[target].values, y_pred = train[pred_name].values)
score = round(overall_rmspe, 5)
print(f'RMSPE {model_name}: {score} - Folds: {scores_folds[model_name]}')

# Preview the first two rows of the submission columns
display(test[[Id_column, target]].head(2))

<a id='7'></a>

# 7. Submission  
[Link to Agenda](#Agenda)

Write the submission file as output.

In [None]:
# Keep only the id and prediction columns and write them without the index
submission = test[[Id_column, target]]
submission.to_csv(submit_file, index = False)