# 0. Import Libraries:

In [None]:
import pandas as pd
import numpy as np 
import glob 
import warnings 
from collections import Counter
warnings.filterwarnings("ignore")
import plotly.express as px 
import seaborn as sns 
import matplotlib.pyplot as plt 
import lightgbm as lgbm 
from sklearn.model_selection import StratifiedKFold
import math
import os 
import random
import torch 
import torch.nn as nn
from transformers import AdamW
from torch.utils.data import Dataset , DataLoader
from colorama import Fore , Style
r__=Fore.RED
g__=Fore.GREEN
st__=Style.RESET_ALL

In [None]:
def seed_everything(seed):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices), stores the seed in the ``PYTHONHASHSEED`` environment variable
    and configures cuDNN for deterministic behaviour.

    Parameters
    ----------
    seed : int
        Seed value shared by all generators.
    """
    random.seed(seed)
    # Exported for processes started later on; the hash randomization of the
    # interpreter that is already running is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Safe to call when CUDA is unavailable (the call is then ignored).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may select different kernels from run to run, so it
    # has to stay off when reproducible results are wanted.
    torch.backends.cudnn.benchmark = False

In [None]:
seed_everything(42)

# 1. Helper functions :

In [None]:
def wap(row):
    """Weighted average price of the first order-book level.

    Each side's price is weighted by the size quoted on the opposite side and
    the sum is divided by the total first-level size. ``row`` must expose
    ``bid_price1``, ``ask_price1``, ``bid_size1`` and ``ask_size1`` as
    attributes.
    """
    weighted_bid = row.bid_price1 * row.ask_size1
    weighted_ask = row.ask_price1 * row.bid_size1
    total_size = row.ask_size1 + row.bid_size1
    return (weighted_bid + weighted_ask) / total_size

In [None]:
def log_return(list_prices):
    """Return the successive differences of the natural log of ``list_prices``.

    ``np.log`` is applied first and ``.diff()`` is then called on its result,
    so the input must be an object whose logarithm supports ``.diff()`` (the
    notebook passes pandas Series). With a pandas Series the first element has
    no predecessor and comes out as NaN.
    """
    log_prices = np.log(list_prices)
    return log_prices.diff()

In [None]:
def realized_volatility(series_log_return):
    """Return the square root of the sum of squared log returns."""
    squared_returns = series_log_return ** 2
    return np.sqrt(np.sum(squared_returns))

In [None]:
def custom_loss(ytrue,ypred) :
    """Gradient and Hessian of the squared percentage error.

    The per-sample loss is ``((ytrue - ypred) / ytrue) ** 2``, the quantity
    that ``rmspe`` averages. Differentiating with respect to ``ypred`` gives

        grad = -2 * (ytrue - ypred) / ytrue ** 2
        hess =  2 / ytrue ** 2

    Parameters
    ----------
    ytrue : array-like
        Observed values; must be non-zero.
    ypred : array-like
        Predicted values, same length as ``ytrue``.

    Returns
    -------
    grad, hess : tuple of numpy.ndarray
        First and second derivative of the loss for every sample.
    """
    ytrue = np.asarray(ytrue, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    inverse_square = 1.0 / ytrue ** 2
    grad = -2.0 * (ytrue - ypred) * inverse_square
    hess = 2.0 * inverse_square

    return grad,hess

In [None]:
def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` against ``y_true``.

    The error of each sample is taken relative to its true value, so
    ``y_true`` must not contain zeros.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

In [None]:
def feval_RMSPE(preds, train_data):
    """Evaluation callback returning the RMSPE of ``preds``.

    The labels are read from ``train_data.get_label()`` and the score is
    rounded to 5 decimals. The result is the triple ``('RMSPE', score, False)``.
    NOTE(review): the (name, value, flag) shape looks like LightGBM's custom
    ``feval`` contract, where the flag means "higher is better" - confirm
    against the training call.
    """
    labels = train_data.get_label()
    score = rmspe(y_true = labels, y_pred = preds)
    return 'RMSPE', round(score, 5), False


In [None]:
def custom_rmspe_valid(y_true, y_pred):
    """Root mean squared percentage error, reported as an evaluation triple.

    The residual is divided by ``y_true`` before it is squared, so the value
    matches ``rmspe`` (dividing the squared residual by ``y_true`` only once
    does not yield a percentage error).

    Parameters
    ----------
    y_true : array-like
        Observed values; must be non-zero.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    tuple
        ``("eval_RMSPE", score, False)`` with ``score`` a Python float.
    """
    y_true = np.asarray(y_true, dtype="float")
    y_pred = np.asarray(y_pred, dtype="float")
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(relative_error ** 2)
    return "eval_RMSPE", math.sqrt(mean_squared), False

In [None]:
def simple_volatility(series_prix):
    """Relative position of the mean inside the value range.

    Returns ``(mean - min) / (max - min)`` of ``series_prix``.
    """
    lowest = np.min(series_prix)
    highest = np.max(series_prix)
    average = np.mean(series_prix)
    return (average - lowest) / (highest - lowest)

In [None]:
def count_unique(series):
    """Return the number of distinct values in ``series``."""
    distinct_values = np.unique(series)
    return len(distinct_values)

# 2. A First Look at the Data:

In [None]:
# load train datas 
train = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")

In [None]:
# get a look at train data.
train.head() 

In [None]:
# get a look at test data 
test = pd.read_csv("../input/optiver-realized-volatility-prediction/test.csv")
test.head()

In [None]:
# load book train for stock_id = 1
book_train_stock_id_1 = pd.read_parquet("../input/optiver-realized-volatility-prediction/book_train.parquet/stock_id=1")

In [None]:
# get a look at book train for stock_id = 1
book_train_stock_id_1.head()

In [None]:
# load trade train for stock_id = 1
trade_train_stock_id_1 = pd.read_parquet("../input/optiver-realized-volatility-prediction/trade_train.parquet/stock_id=1")

In [None]:
# get a look at trade train .
trade_train_stock_id_1.head()

# 3. EDA : 

In [None]:
# Count how many rows of train belong to each stock_id and attach that count
# to every row.
nb_time_id_per_stock_id = Counter(train["stock_id"])
train["nb_time_id"] = train["stock_id"].map(nb_time_id_per_stock_id)
# Pie chart with one slice per distinct count value.
fig = px.pie(
    train,
    names="nb_time_id",
    values="nb_time_id",
    title="Percentage of number of time_id for each stock_id",
)
fig.update_layout(title=dict(xanchor="center", x=0.5))
fig.show()

We notice that more than 95% of the stock_ids have the same number of time_ids, namely 3830.

## Book order features explorations :

In [None]:
# Compute the weighted average price (WAP) of the first book level for every
# row. The arithmetic runs on whole columns, like wap1 below, instead of
# calling wap() once per row through DataFrame.apply(axis=1).
book_train_stock_id_1["wap"] = (
    book_train_stock_id_1["bid_price1"] * book_train_stock_id_1["ask_size1"]
    + book_train_stock_id_1["ask_price1"] * book_train_stock_id_1["bid_size1"]
) / (book_train_stock_id_1["ask_size1"] + book_train_stock_id_1["bid_size1"])

In [None]:
book_train_stock_id_1["squared_wap"] = book_train_stock_id_1["wap"] **2

In [None]:
# Compute the second weighted averaged price for each option 
book_train_stock_id_1.loc[:,"wap1"] = (book_train_stock_id_1['bid_price2'] * book_train_stock_id_1['ask_size2']+book_train_stock_id_1['ask_price2'] * book_train_stock_id_1['bid_size2'])  / (
                                      book_train_stock_id_1['bid_size2']+ book_train_stock_id_1[
                                  'ask_size2'])

In [None]:
book_train_stock_id_1.loc[:,"supply_demand"] = (book_train_stock_id_1.loc[:,"ask_size1"]+book_train_stock_id_1.loc[:,"ask_size2"])\
/(book_train_stock_id_1.loc[:,"bid_size1"] + book_train_stock_id_1.loc[:,"bid_size2"])

In [None]:
# Log return of the supply/demand ratio inside each time_id. transform() puts
# the result on the frame's own index, which the column assignment needs
# (groupby.apply prepends the group key to the index on pandas >= 2.0).
book_train_stock_id_1.loc[:,"log_supply_demand"] = book_train_stock_id_1.groupby("time_id")["supply_demand"].transform(log_return)

In [None]:
# Mean of the two weighted average prices. The sum is parenthesised so that
# both terms are halved; without the parentheses only wap1 is divided by 2.
book_train_stock_id_1.loc[:,"avg_wap"] = (book_train_stock_id_1.loc[:,"wap"] + \
book_train_stock_id_1.loc[:,"wap1"])/2

In [None]:
# Log return of the WAP, computed separately inside each time_id.
# transform() puts the result on the frame's own index, which the column
# assignment needs (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_return"]=book_train_stock_id_1.groupby("time_id")["wap"].\
transform(log_return)

In [None]:
# Log return of the second-level WAP inside each time_id. transform() keeps the
# frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1.loc[:,"log_return1"] = book_train_stock_id_1.groupby("time_id")["wap1"].transform(log_return)

In [None]:
# Log return of the squared WAP inside each time_id. transform() keeps the
# frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1.loc[:,"log_squared_wap"] = book_train_stock_id_1.groupby("time_id")["squared_wap"].transform(log_return)

In [None]:
# Log return of avg_wap inside each time_id. transform() keeps the frame's own
# index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1.loc[:,"log_avg_wap"] = book_train_stock_id_1.groupby("time_id")\
["avg_wap"].transform(log_return)

In [None]:
book_train_stock_id_1["diff_bid_price"] = book_train_stock_id_1["bid_price1"]-\
book_train_stock_id_1["bid_price2"]

In [None]:
book_train_stock_id_1["diff_ask_price"] = book_train_stock_id_1["ask_price2"]-\
book_train_stock_id_1["ask_price1"]

In [None]:
# Log return of the ask-price gap inside each time_id. transform() keeps the
# frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_diff_ask_price"] = book_train_stock_id_1.groupby("time_id")["diff_ask_price"].\
transform(log_return)

In [None]:
# Log return of the bid-price gap inside each time_id. transform() keeps the
# frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_diff_bid_price"] = book_train_stock_id_1.groupby("time_id")["diff_bid_price"].\
transform(log_return)

In [None]:
# Size-weighted price of the first book level: each side's price is weighted
# by its own size and the sum is divided by the total first-level size
# (bid_size1 + ask_size1), so the result is a price.
book_train_stock_id_1["wap3"] = ((book_train_stock_id_1["bid_price1"] * book_train_stock_id_1["bid_size1"])+\
                                 (book_train_stock_id_1["ask_price1"] * book_train_stock_id_1["ask_size1"]))/\
(book_train_stock_id_1["bid_size1"] + book_train_stock_id_1["ask_size1"])

In [None]:
# Log return of wap3 inside each time_id. transform() keeps the frame's own
# index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_wap3"] = book_train_stock_id_1.groupby("time_id")["wap3"].\
transform(log_return)

In [None]:
book_train_stock_id_1["wap_balance"] = abs(book_train_stock_id_1["wap"]-\
                                           book_train_stock_id_1["wap1"])+1

In [None]:
# Log return of wap_balance inside each time_id. transform() keeps the frame's
# own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_wap_balance"] = book_train_stock_id_1.groupby("time_id")["wap_balance"]\
.transform(log_return)

In [None]:
book_train_stock_id_1["price_spread"] = book_train_stock_id_1["ask_price1"] - \
book_train_stock_id_1["bid_price1"]

In [None]:
# Log return of the first-level spread inside each time_id. transform() keeps
# the frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_price_spread"] = book_train_stock_id_1.groupby("time_id")["price_spread"]\
.transform(log_return)

In [None]:
book_train_stock_id_1["price_spread_1"] = book_train_stock_id_1["ask_price2"] - \
book_train_stock_id_1["bid_price2"]

In [None]:
# Log return of the second-level spread inside each time_id. transform() keeps
# the frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_price_spread_1"] = book_train_stock_id_1.groupby("time_id")["price_spread_1"].\
transform(log_return)

In [None]:
# Mean of the two weighted average prices. The sum is parenthesised so that
# both terms are halved; without the parentheses only wap1 is divided by 2.
book_train_stock_id_1.loc[:,"avg_wap"] = (book_train_stock_id_1.loc[:,"wap"] + \
book_train_stock_id_1.loc[:,"wap1"])/2

In [None]:
# Mean of the first- and second-level spreads. The sum is parenthesised so
# that both spreads are halved, not only price_spread_1.
book_train_stock_id_1["price_spread_avg"] = (book_train_stock_id_1["price_spread"] + \
book_train_stock_id_1["price_spread_1"]) /2

In [None]:
book_train_stock_id_1["price_spread_diff"] = book_train_stock_id_1["price_spread_1"] - \
book_train_stock_id_1["price_spread"]

In [None]:
# Log return of avg_wap inside each time_id. transform() keeps the frame's own
# index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_avg_wap"] = book_train_stock_id_1.groupby("time_id")["avg_wap"].transform(log_return)

In [None]:
# Log return of price_spread_avg inside each time_id. transform() keeps the
# frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_price_spread_avg"] = book_train_stock_id_1.groupby("time_id")["price_spread_avg"].transform(log_return)

In [None]:
# Log return of price_spread_diff inside each time_id. transform() keeps the
# frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_price_spread_diff"] = book_train_stock_id_1.groupby("time_id")["price_spread_diff"].\
transform(log_return)

In [None]:
book_train_stock_id_1["total_volume"] = book_train_stock_id_1["ask_size1"]+\
book_train_stock_id_1["bid_size1"] + book_train_stock_id_1["ask_size2"] +\
book_train_stock_id_1["bid_size2"]

In [None]:
book_train_stock_id_1["volume_imbalance"] = (book_train_stock_id_1["bid_size1"] +\
book_train_stock_id_1["bid_size2"]) - (book_train_stock_id_1["ask_size1"]+\
                                       book_train_stock_id_1["ask_size2"])

In [None]:
book_train_stock_id_1["volume_imbalance"] = 1 + book_train_stock_id_1["volume_imbalance"].abs()

In [None]:

# Log return of volume_imbalance inside each time_id. transform() keeps the
# frame's own index (groupby.apply prepends the group key on pandas >= 2.0).
book_train_stock_id_1["log_volume_imbalance"] = book_train_stock_id_1.groupby("time_id")["volume_imbalance"]\
.transform(log_return)

In [None]:
book_train_stock_id_1["bid_size"] = book_train_stock_id_1.loc[:,"bid_size1"] + book_train_stock_id_1.loc[:,"bid_size2"]

In [None]:
book_train_stock_id_1["ask_size"] = book_train_stock_id_1.loc[:,"ask_size1"] + book_train_stock_id_1.loc[:,"ask_size2"]

In [None]:
# Keep only the rows in which log_return, log_return1 and log_supply_demand
# are all present.
book_train_stock_id_1 = book_train_stock_id_1.dropna(
    subset=["log_return", "log_return1", "log_supply_demand"]
)

In [None]:
# book order for stock_id= 1 and time_id=5
book_train_stock_id_1_time_id_5 = book_train_stock_id_1.loc[book_train_stock_id_1["time_id"]==5,:]

In [None]:
fig = px.line(data_frame=book_train_stock_id_1_time_id_5,x="seconds_in_bucket",y="wap",\
             title="WAP of stock_id_1,time_id_5")
fig.show()

In [None]:
fig = px.line(data_frame=book_train_stock_id_1_time_id_5,x="seconds_in_bucket",y="log_return",\
             title ="log return for stock_id_1,time_id_5")
fig.show()

In [None]:
fig = px.line(data_frame=book_train_stock_id_1_time_id_5,x="seconds_in_bucket",y="wap1",\
             title="WAP1 of stock_id_1,time_id_5")
fig.show()

In [None]:
fig = px.line(data_frame=book_train_stock_id_1_time_id_5,x="seconds_in_bucket",y="log_return1",\
             title="log_return1 of stock_id_1,time_id_5")
fig.show()

In [None]:
 create_feature_dict = {
        'log_return':[realized_volatility],
        'log_return1':[realized_volatility],
        'log_squared_wap':[realized_volatility],
        "log_supply_demand":[realized_volatility],
        "log_avg_wap" :[realized_volatility],
        "supply_demand":[np.mean],
        "diff_bid_price":[np.mean],
        "diff_ask_price":[np.mean],
        "log_diff_ask_price":[realized_volatility],
        "log_diff_bid_price" :[realized_volatility],
        "log_wap3":[realized_volatility],
        'wap_balance':[np.mean],
        "price_spread_1" :[np.mean],
        'price_spread':[np.mean],
        "price_spread_avg":[np.mean],
        "price_spread_diff":[np.mean],
        'volume_imbalance':[np.mean],
        'total_volume':[np.mean],
        'wap':[np.mean],
        "log_wap_balance":[realized_volatility],
        "log_price_spread":[realized_volatility],
        "log_price_spread_1":[realized_volatility],
        "log_price_spread_avg":[realized_volatility],
        "log_price_spread_diff":[realized_volatility],
        "log_volume_imbalance":[realized_volatility],
        "bid_size":[np.sum],
        "ask_size":[np.sum]
            }

In [None]:
book_features = book_train_stock_id_1.groupby("time_id").agg(create_feature_dict).reset_index()

In [None]:
# Flatten the two-level (column, aggregation) labels produced by agg() into
# single "column_aggregation" strings.
book_features.columns = ["_".join(levels) for levels in book_features.columns]

In [None]:
book_features.rename(columns={"time_id_":"time_id"},inplace=True)

# Per-time_id aggregates computed directly from the book columns.
realized_volatility_stock_id_1 = book_train_stock_id_1.groupby("time_id")["log_return"].\
apply(realized_volatility)

realized_volatility1_stock_id_1 =  book_train_stock_id_1.groupby("time_id")["log_return1"].\
apply(realized_volatility)

supply_demand_volatility = book_train_stock_id_1.groupby("time_id")["log_supply_demand"].apply(realized_volatility)

realized_volatility_squared_price = book_train_stock_id_1.groupby("time_id")["log_squared_wap"].apply(realized_volatility)

# Stored under its own name: assigning the result to `simple_volatility` would
# replace the helper function of that name with a Series.
simple_volatility_stock_id_1 = book_train_stock_id_1.groupby("time_id")["wap"].apply(simple_volatility)

# NOTE(review): these two aggregate the raw price gaps, not their log returns -
# confirm that this is intended.
realized_diff_ask_price = book_train_stock_id_1.groupby("time_id")["diff_ask_price"].apply(\
                                                                                          realized_volatility)

realized_diff_bid_price = book_train_stock_id_1.groupby("time_id")["diff_bid_price"].apply(\
                                                                                          realized_volatility)

In [None]:
# Rows of train that belong to stock_id 1. An explicit copy is taken because
# new columns are assigned to this frame afterwards; assigning onto a slice of
# `train` triggers pandas' SettingWithCopyWarning.
data_stock_id_1 = train.loc[train["stock_id"]==1,:].copy()

In [None]:
data_stock_id_1["row_id"] = data_stock_id_1["stock_id"].astype("str").str.cat(data_stock_id_1\
                                                                             ["time_id"].astype("str"),sep="-")

In [None]:
book_features["stock_id"] = 1
book_features["row_id"] = book_features["stock_id"].astype("str").str.cat(book_features["time_id"]\
                                                                         .astype("str"),sep="-")

In [None]:
data_stock_id_1 = data_stock_id_1.merge(book_features,how="left",on="row_id")

In [None]:
data_stock_id_1.rename(columns={"stock_id_x":"stock_id","time_id_x":"time_id"},inplace=True)

In [None]:
del(data_stock_id_1["time_id_y"])
del(data_stock_id_1["stock_id_y"])

In [None]:
data_stock_id_1.rename(columns={"log_return_realized_volatility":"realized_volatility",\
                               "log_return1_realized_volatility":"realized1_volatility",\
                               "log_squared_wap_realized_volatility":"realized_squared_volatility",\
                               "log_supply_demand_realized_volatility":"supply_demand_volatilty",\
                               "log_diff_ask_price_realized_volatility":"realized_diff_ask_price",\
                               "log_diff_bid_price_realized_volatility":"realized_diff_bid_price"},\
                      inplace=True)

In [None]:
data_stock_id_1 = data_stock_id_1[["stock_id","time_id","target","realized_volatility","realized1_volatility",\
                                 'realized_squared_volatility','supply_demand_volatilty','log_avg_wap_realized_volatility',\
                                 'supply_demand_mean','diff_bid_price_mean','diff_ask_price_mean','realized_diff_ask_price',\
                                 'realized_diff_bid_price','log_wap3_realized_volatility','wap_balance_mean','price_spread_1_mean',\
                                  'price_spread_mean','price_spread_avg_mean','price_spread_diff_mean','volume_imbalance_mean',\
                                   'total_volume_mean','wap_mean','log_wap_balance_realized_volatility',
       'log_price_spread_realized_volatility',
       'log_price_spread_1_realized_volatility',
       'log_price_spread_avg_realized_volatility',
       'log_price_spread_diff_realized_volatility',
       'log_volume_imbalance_realized_volatility', 'bid_size_sum',
       'ask_size_sum']]

In [None]:
data_stock_id_1.rename(columns={"realized_volatility":"realized_t","realized1_volatility":"realized1_t"},inplace=True)

### Exploration and feature engineering related to realized_t feature:

In [None]:
for i in range(1,6) :
    data_stock_id_1[f"realized_t_{i}"] = data_stock_id_1["realized_t"].shift(i)

In [None]:
data_stock_id_1 = data_stock_id_1.reset_index(drop=True)

In [None]:
missing_realized_t_1 = np.where(data_stock_id_1["realized_t_1"].isna())[0]
missing_realized_t_2 = np.where(data_stock_id_1["realized_t_2"].isna())[0]
missing_realized_t_3 = np.where(data_stock_id_1["realized_t_3"].isna())[0]
missing_realized_t_4 = np.where(data_stock_id_1["realized_t_4"].isna())[0]
missing_realized_t_5 = np.where(data_stock_id_1["realized_t_5"].isna())[0]
for i in missing_realized_t_1 : 
    data_stock_id_1.loc[i,"realized_t_1"] = data_stock_id_1.loc[i,"realized_t"]
for i in missing_realized_t_2 : 
    data_stock_id_1.loc[i,"realized_t_2"] = data_stock_id_1.loc[i,"realized_t_1"]
for i in missing_realized_t_3 : 
    data_stock_id_1.loc[i,"realized_t_3"] = data_stock_id_1.loc[i,"realized_t_2"]
for i in missing_realized_t_4 : 
    data_stock_id_1.loc[i,"realized_t_4"] = data_stock_id_1.loc[i,"realized_t_3"]
for i in missing_realized_t_5 : 
    data_stock_id_1.loc[i,"realized_t_5"] = data_stock_id_1.loc[i,"realized_t_4"]

In [None]:
# 3x2 grid: target plotted against realized_t (first panel) and against each
# of its five lagged versions.
fix,axes = plt.subplots(3,2,figsize=(15,20))
for i in range(6) :
    feature = "realized_t" if i == 0 else f"realized_t_{i}"
    axis = axes[i//2,i%2]
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # as a positional argument.
    sns.lineplot(x=data_stock_id_1.loc[:,feature].values,
                 y=data_stock_id_1.loc[:,"target"].values,ax=axis)
    axis.set_xlabel(feature,size=15)
    axis.set_ylabel("target",size=15)
    axis.set_title(f"target=f({feature})",size=15,color="green")
    axis.set_xlim([0,0.04])

In [None]:
table_correlation = pd.DataFrame({f"realized_t_{i}":data_stock_id_1["target"].\
                                  corr(data_stock_id_1[f"realized_t_{i}"]) for i in range(1,6)},index=["target"])

In [None]:
table_correlation["realized_t"] = data_stock_id_1["target"].corr(data_stock_id_1["realized_t"])

In [None]:
fig, ax = plt.subplots(1,1,figsize=(15,5))
sns.heatmap(table_correlation,annot=True,square=True)

> ==> High positive correlation between the realized volatility at time t and the target at time t. Moreover, we do not find any correlation between the realized volatility of previous times and the current target. Besides, the charts above suggest studying a possible quadratic relation between the target feature and the realized_t_i features for i in {1,2,3,4,5}.

In [None]:
for i in range(5) :
    if i == 0 : 
       data_stock_id_1["diff_t"] = data_stock_id_1["realized_t"] - data_stock_id_1[f"realized_t_{i+1}"]
    else :
        data_stock_id_1[f"diff_t_{i}"] = data_stock_id_1[f"realized_t_{i}"] - data_stock_id_1[f"realized_t_{i+1}"]

In [None]:
# One panel per difference feature: diff_t first, then diff_t_1 .. diff_t_4.
fig,ax = plt.subplots(5,1,figsize=(10,30))
for i in range(5) :
    feature = "diff_t" if i == 0 else f"diff_t_{i}"
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # as a positional argument.
    sns.lineplot(x=data_stock_id_1[feature],y=data_stock_id_1["target"],ax=ax[i])
    # The title names the column that is actually plotted.
    ax[i].set_title(f"target = f('{feature}')",size=15,color="green")

In [None]:
table_correlation = pd.DataFrame({f"diff_t_{i}":data_stock_id_1["target"].\
                                  corr(data_stock_id_1[f"diff_t_{i}"]) for i in range(1,5)},index=["target"])
table_correlation["diff_t"] = data_stock_id_1["target"].corr(data_stock_id_1["diff_t"])

In [None]:
fig, ax = plt.subplots(1,1,figsize=(15,5))
sns.heatmap(table_correlation,annot=True,square=True)

> The above analysis shows a high positive correlation between diff_t and the target feature. Besides, the chart above also shows a quadratic relation between the diff_t_i features and the target feature for i in {1,2,3,4}.

In [None]:
# study quadratic relation between realized_t_i features for i in {1,2,3,4,5} and target feature.
for i in range(1,6):
    data_stock_id_1[f"realized_t_{i}^2"] = data_stock_id_1[f"realized_t_{i}"] ** 2

In [None]:
# One panel per squared lag feature realized_t_1^2 .. realized_t_5^2.
fig,ax = plt.subplots(5,1,figsize=(10,30))
for i in range(1,6) :
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # as a positional argument.
    sns.lineplot(x=data_stock_id_1[f"realized_t_{i}^2"],y=data_stock_id_1["target"],ax=ax[i-1])
    ax[i-1].set_title(f"Target = f(realized_t_{i}^2)",size=15,color="green")

In [None]:
table_correlation = pd.DataFrame({f"realized_t_{i}^2": data_stock_id_1["target"].corr(data_stock_id_1[f"realized_t_{i}^2"])\
                                                                                      for i in range(1,6)},index=["target"])

In [None]:
fig = plt.figure(figsize=(15,5))
sns.heatmap(table_correlation,annot=True,square=True)

==> No correlation is shown between the squared realized_t_i features and the target feature for i in {1,2,3,4,5}.

In [None]:
# Study correlation between target feature and diff_t_i features for i in {1,2,3,4}.
for i in range(1,5) : 
    data_stock_id_1[f"diff_t_{i}^2"] = data_stock_id_1[f"diff_t_{i}"] ** 2

In [None]:
# One panel per squared difference feature diff_t_1^2 .. diff_t_4^2.
fig,ax = plt.subplots(4,1,figsize=(10,30))
for i in range(1,5) :
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # as a positional argument.
    sns.lineplot(x=data_stock_id_1[f"diff_t_{i}^2"],y=data_stock_id_1["target"],ax=ax[i-1])
    ax[i-1].set_title(f"target=f(diff_t_{i}^2)",size=15,color="green")

In [None]:
table_correlation = pd.DataFrame({f"diff_t_{i}^2":data_stock_id_1[f"diff_t_{i}^2"].\
                                  corr(data_stock_id_1["target"])for i in range(1,5)},index=\
                                ["target"])

In [None]:
fig = plt.figure(figsize=(15,5))
sns.heatmap(table_correlation,annot=True,square=True)

> ==> No correlation is shown between the diff_t_i^2 features for i in {1,2,3,4} and the target feature.

In [None]:
# Let's study correlation between the quadratic of realized_t feature, the quadratic of 
# diff_t feature and the target feature.
data_stock_id_1["realized_t^2"] = data_stock_id_1["realized_t"] ** 2
data_stock_id_1["diff_t^2"] = data_stock_id_1["diff_t"] ** 2

In [None]:
# Target against realized_t^2 (left) and against diff_t^2 (right).
fig,ax = plt.subplots(1,2,figsize=(20,5))

# x and y are passed by keyword: seaborn >= 0.12 accepts only `data` as a
# positional argument.
sns.lineplot(x=data_stock_id_1["realized_t^2"],y=data_stock_id_1["target"],ax=ax[0])
ax[0].set_title("target=f(realized_t^2)",size=15,color="green")
sns.lineplot(x=data_stock_id_1["diff_t^2"],y=data_stock_id_1["target"],ax=ax[1])
ax[1].set_title("target=f(diff_t^2)",size=15,color="green")

In [None]:
table_correlation = pd.DataFrame({"realized_t^2":data_stock_id_1["realized_t^2"].\
                                  corr(data_stock_id_1["target"]),"diff_t^2":data_stock_id_1["diff_t^2"].corr(data_stock_id_1["target"])},\
                                index=["target"])

In [None]:
fig = plt.figure(figsize=(50,2))
sns.heatmap(table_correlation,annot=True,square=True)

> ==> Very high positive correlation between the realized_t^2 feature and the target feature. We can also underline some positive correlation between diff_t^2 and the target feature.

### Exploration and feature engineering related to realized1_t feature:

In [None]:
for i in range(1,6):
    data_stock_id_1[f"realized1_t_{i}"] = data_stock_id_1["realized1_t"].shift(i)

In [None]:
# Row positions at which each lagged realized1_t column is missing.
missing_realized1_t_1 = np.where(data_stock_id_1["realized1_t_1"].isna())[0]
missing_realized1_t_2 = np.where(data_stock_id_1["realized1_t_2"].isna())[0]
missing_realized1_t_3 = np.where(data_stock_id_1["realized1_t_3"].isna())[0]
missing_realized1_t_4 = np.where(data_stock_id_1["realized1_t_4"].isna())[0]
missing_realized1_t_5 = np.where(data_stock_id_1["realized1_t_5"].isna())[0]
# Fill every missing lag with the value of the previous lag on the same row
# (realized1_t itself for lag 1). Lags are processed in increasing order so
# that a filled value can feed the next lag. Each column is filled from its
# own missing_realized1_t_* positions.
for lag, missing_rows in enumerate([missing_realized1_t_1, missing_realized1_t_2,
                                    missing_realized1_t_3, missing_realized1_t_4,
                                    missing_realized1_t_5], start=1):
    source = "realized1_t" if lag == 1 else f"realized1_t_{lag-1}"
    for i in missing_rows :
        data_stock_id_1.loc[i,f"realized1_t_{lag}"] = data_stock_id_1.loc[i,source]

In [None]:
# 3x2 grid: target plotted against realized1_t (first panel) and against each
# of its five lagged versions.
fix,axes = plt.subplots(3,2,figsize=(15,20))
for i in range(6) :
    feature = "realized1_t" if i == 0 else f"realized1_t_{i}"
    axis = axes[i//2,i%2]
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # as a positional argument.
    sns.lineplot(x=data_stock_id_1.loc[:,feature].values,
                 y=data_stock_id_1.loc[:,"target"].values,ax=axis)
    axis.set_xlabel(feature,size=15)
    axis.set_ylabel("target",size=15)
    axis.set_title(f"target=f({feature})",size=15,color="green")
    axis.set_xlim([0,0.04])

In [None]:
table_correlation = pd.DataFrame({f"realized1_t_{i}":data_stock_id_1["target"].\
                                  corr(data_stock_id_1[f"realized1_t_{i}"]) for i in range(1,6)},index=["target"])
table_correlation["realized1_t"] = data_stock_id_1["target"].corr(data_stock_id_1["realized1_t"])

In [None]:
fig, ax = plt.subplots(1,1,figsize=(15,5))
sns.heatmap(table_correlation,annot=True,square=True)

> ==> High positive correlation between the realized volatility of the second price at time t and the target at time t. Moreover, we do not find any correlation between the realized volatility of the second price at previous times and the current target.

In [None]:
data_stock_id_1.loc[:,"diff1_t"] = data_stock_id_1.loc[:,"realized1_t"] - data_stock_id_1.loc[:,"realized1_t_1"]
data_stock_id_1.loc[:,"realized1_t^2"] = data_stock_id_1.loc[:,"realized1_t"] ** 2
data_stock_id_1.loc[:,"diff1_t^2"] = data_stock_id_1.loc[:,"diff1_t"] ** 2

In [None]:
# Target against diff1_t, realized1_t^2 and diff1_t^2, one panel each.
features = ["diff1_t","realized1_t^2","diff1_t^2"]
fig,ax = plt.subplots(3,1,figsize=(15,20))
for i in range(3) :
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # as a positional argument.
    sns.lineplot(x=data_stock_id_1.loc[:,f"{features[i]}"].values,
                 y=data_stock_id_1.loc[:,"target"].values,
                 ax=ax[i])
    ax[i].set_title(f"target=f({features[i]})",size=15,color="green")

In [None]:
table_correlation = pd.DataFrame({features[i]:data_stock_id_1["target"].corr(data_stock_id_1[features[i]])\
                                 for i in range(3)},index=["target"])

In [None]:
fig = plt.figure(figsize=(35,2))
sns.heatmap(table_correlation,annot=True,square=True)

> ==> The chart above shows a respectable positive correlation of the diff1_t and realized1_t^2 features with the target feature. We can also see a very slim positive correlation between the target feature and the diff1_t^2 feature.

### Exploration and feature engineering related to supply_demand_volatility feature:

In [None]:
data_stock_id_1["supply_demand_1"] = data_stock_id_1["supply_demand_volatilty"].shift(1)

In [None]:
# Rows whose lagged value is missing take the current value of the same row.
ind = np.where(data_stock_id_1["supply_demand_1"].isna())[0]
data_stock_id_1.loc[ind,"supply_demand_1"] = data_stock_id_1.loc[ind,"supply_demand_volatilty"]

In [None]:
data_stock_id_1["diff_supply_demand"] = data_stock_id_1["supply_demand_volatilty"]-data_stock_id_1[\
                                                                                                  "supply_demand_1"]

In [None]:
supply_features = ["supply_demand_volatilty","supply_demand_1","diff_supply_demand",\
                   "supply_demand_mean"]

In [None]:
table_correlation = pd.DataFrame({f:data_stock_id_1[f].corr(data_stock_id_1["target"]) for \
                                 f in supply_features},index=["target"])

In [None]:
sns.heatmap(table_correlation,annot=True,square=True)

> ==> Very slim correlation between the target feature and the supply_demand_volatilty and diff_supply_demand features.

### Exploration and feature engineering related to others features:

In [None]:
others_features = ["realized_squared_volatility","realized_diff_bid_price","realized_diff_ask_price",\
                  "diff_bid_price_mean","diff_ask_price_mean",'log_wap3_realized_volatility','wap_balance_mean',\
                  'price_spread_mean','volume_imbalance_mean','total_volume_mean','wap_mean',\
                  'log_avg_wap_realized_volatility','price_spread_1_mean','price_spread_avg_mean',\
                   'price_spread_diff_mean','log_wap_balance_realized_volatility',
       'log_price_spread_realized_volatility',
       'log_price_spread_1_realized_volatility',
       'log_price_spread_avg_realized_volatility',
       'log_price_spread_diff_realized_volatility',
       'log_volume_imbalance_realized_volatility', 'bid_size_sum',
       'ask_size_sum']
table_correlation = pd.DataFrame({f:data_stock_id_1[f].corr(data_stock_id_1["target"]) for f in others_features},\
                                index=["target"])

In [None]:
fig = plt.figure(figsize=(25,10))
sns.heatmap(table_correlation,annot=True,square=True)

> ==> High correlation between the realized_squared_volatility feature and the target feature.

## Trade features explorations :

In [None]:
# Log return of the trade price inside each time_id. transform() puts the
# result on the frame's own index, which the column assignment needs
# (groupby.apply prepends the group key to the index on pandas >= 2.0).
trade_train_stock_id_1["log_price"] = trade_train_stock_id_1.groupby("time_id")["price"].\
transform(log_return)

In [None]:
trade_train_stock_id_1["size_order"] = trade_train_stock_id_1["size"]/trade_train_stock_id_1["order_count"]


In [None]:
trade_train_stock_id_1 = trade_train_stock_id_1.loc[~(trade_train_stock_id_1.loc[:,"log_price"].isnull())]

In [None]:
# Aggregations applied to the trade data of each time_id.
aggregate_dictionary={
    "price" : [np.mean],
    "log_price" : [realized_volatility],
    "size_order" : [np.mean],
    "size" : [np.mean,np.sum,np.std],
    # A dict literal keeps only the last value given for a repeated key, so
    # order_count is listed once, with the full set of aggregations.
    "order_count" : [np.sum,np.std,np.mean],
    "seconds_in_bucket":[count_unique],
}

In [None]:
stock_features = trade_train_stock_id_1.groupby("time_id").agg(aggregate_dictionary).reset_index()

In [None]:
# Flatten the two-level (column, aggregation) labels produced by agg() into
# single "column_aggregation" strings.
stock_features.columns = ["_".join(levels) for levels in stock_features.columns]

In [None]:
stock_features.rename(columns={"log_price_realized_volatility":"vol_price"},inplace=True)

In [None]:
stock_features.rename(columns={"time_id_":"time_id"},inplace=True)

In [None]:
data_stock_id_1 = data_stock_id_1.merge(stock_features,how="left",on="time_id")

In [None]:
data_stock_id_1["vol_price_1"] = data_stock_id_1["vol_price"].shift()

In [None]:
# Rows whose lagged vol_price is missing take the current vol_price of the
# same row.
indices = np.where(data_stock_id_1["vol_price_1"].isna())[0]

data_stock_id_1.loc[indices,"vol_price_1"] = data_stock_id_1.loc[indices,"vol_price"]

In [None]:
data_stock_id_1["diff_price"] = data_stock_id_1["vol_price_1"] - data_stock_id_1["vol_price"]

In [None]:
data_stock_id_1["vol_price^2"] = data_stock_id_1["vol_price"] ** 2

In [None]:
data_stock_id_1["balance_wap_price"] = abs(data_stock_id_1["price_mean"]- \
                                           data_stock_id_1["wap_mean"])

In [None]:
data_stock_id_1["rapp_price"] = (data_stock_id_1["price_mean"]+1)/(data_stock_id_1["wap_mean"]+1)

In [None]:
data_stock_id_1["shares_vs_bid"] = data_stock_id_1["size_sum"] - data_stock_id_1["bid_size_sum"]

In [None]:
data_stock_id_1["shares_vs_ask"] = data_stock_id_1["size_sum"] - data_stock_id_1["ask_size_sum"]

In [None]:
f = list(stock_features.columns[1:]) + ["vol_price_1","diff_price","vol_price^2",\
                                        "balance_wap_price","rapp_price","shares_vs_bid",\
                                       "shares_vs_ask"]

In [None]:
# Two-column grid with one panel per trade feature listed in `f`.
fig,ax = plt.subplots(math.ceil(len(f)/2),2,figsize=(15,45))
for i in range(len(f)) :
    # x and y are passed by keyword: seaborn >= 0.12 accepts only `data`
    # as a positional argument.
    sns.lineplot(x=data_stock_id_1[f[i]],y=data_stock_id_1["target"],ax=ax[i//2,i%2])
    ax[i//2,i%2].set_title(f"target=f({f[i]})",size=15,color="green")

In [None]:
table_correlation = pd.DataFrame({f[i]:data_stock_id_1[f[i]].corr(data_stock_id_1["target"]) for i in range(len(f))},\
                                index=["target"])

In [None]:
fig = plt.figure(figsize=(15,2))
sns.heatmap(table_correlation,annot=True,square=True)
plt.show()

> ===> We can see from the chart above that there is a high correlation between the target feature and the vol_price feature. We also notice a positive correlation between the vol_price^2 feature and the target feature, and a negative correlation between the diff_price feature and the target feature.

## Top insights :
- High positive correlation between realized squared volatility and target feature.
- High positive correlation between vol_price feature and target feature.
- High positive correlation between realized_t feature and target feature.
- Positive correlation between vol_price^2 feature and target feature.
- negative correlation between diff_price feature and target feature.
- Positive correlation between diff_t feature and target feature.
- Positive correlation between realized_t^2 feature and target feature.
- High positive correlation between realized1_t feature and target feature.  
- Positive correlation between diff1_t feature and target feature.
- Positive correlation between realized1_t^2 feature and target feature.

# 4. Preprocessing the data:

In [None]:
def preprocessing_stock_id_time_id(st_file):
    """Aggregate one stock's order-book snapshots into one feature row per time_id.

    Parameters
    ----------
    st_file : str
        Path handed to ``pd.read_parquet``. The stock id is the path segment
        that follows the first "=" (``st_file.split("=")[1]``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` holding the aggregated columns listed in
        ``create_feature_dict`` (several of them renamed at the end of the
        function) plus ``row_id`` formatted as ``"<stock_id>-<time_id>"``.
    """
    book = pd.read_parquet(st_file)
    stock_id = st_file.split("=")[1]

    # Weighted average prices. "wap"/"wap1" weight each side's price by the
    # size on the opposite side (level 1 / level 2); "wap3" weights each
    # level-1 price by the size on its own side.
    book["wap"] = (
        book["bid_price1"] * book["ask_size1"] + book["ask_price1"] * book["bid_size1"]
    ) / (book["bid_size1"] + book["ask_size1"])
    book["wap1"] = (
        book["bid_price2"] * book["ask_size2"] + book["ask_price2"] * book["bid_size2"]
    ) / (book["bid_size2"] + book["ask_size2"])
    book["wap3"] = (
        book["bid_price1"] * book["bid_size1"] + book["ask_price1"] * book["ask_size1"]
    ) / (book["ask_size1"] + book["bid_size1"])
    book["squared_wap"] = book["wap"] ** 2
    book["squared_wap_1"] = book["wap1"] ** 2
    # The average of the two levels needs the parentheses: "wap + wap1 / 2"
    # would only halve wap1.
    book["avg_wap"] = (book["wap"] + book["wap1"]) / 2
    # Shifted by 1 so that the logarithm taken below never sees a zero.
    book["wap_balance"] = (book["wap"] - book["wap1"]).abs() + 1

    # Sizes.
    book["bid_size"] = book["bid_size1"] + book["bid_size2"]
    book["ask_size"] = book["ask_size1"] + book["ask_size2"]
    book["supply_demand"] = (book["ask_size1"] + book["ask_size2"]) / (
        book["bid_size1"] + book["bid_size2"]
    )
    book["total_volume"] = (
        book["ask_size1"] + book["bid_size1"] + book["ask_size2"] + book["bid_size2"]
    )
    # volume_imbalance itself is not aggregated; only its log-return
    # volatility is. The +1 keeps the logarithm defined when both sides match.
    book["volume_imbalance"] = 1 + (
        (book["bid_size1"] + book["bid_size2"]) - (book["ask_size1"] + book["ask_size2"])
    ).abs()

    # Price gaps and spreads. Every operand comes from this stock's own frame.
    book["diff_bid_price"] = book["bid_price1"] - book["bid_price2"]
    book["diff_ask_price"] = book["ask_price2"] - book["ask_price1"]
    book["price_spread"] = book["ask_price1"] - book["bid_price1"]
    book["price_spread_1"] = book["ask_price2"] - book["bid_price2"]
    book["price_spread_avg"] = (book["price_spread"] + book["price_spread_1"]) / 2
    book["price_spread_diff"] = book["price_spread_1"] - book["price_spread"]

    # Log returns computed separately inside each time_id.
    log_return_sources = {
        "log_supply_demand": "supply_demand",
        "log_return": "wap",
        "log_return1": "wap1",
        "log_squared_wap": "squared_wap",
        "log_squared_wap_1": "squared_wap_1",
        "log_diff_bid_price": "diff_bid_price",
        "log_diff_ask_price": "diff_ask_price",
        "log_wap3": "wap3",
        "log_avg_wap": "avg_wap",
        "log_wap_balance": "wap_balance",
        "log_price_spread": "price_spread",
        "log_price_spread_1": "price_spread_1",
        "log_price_spread_avg": "price_spread_avg",
        "log_price_spread_diff": "price_spread_diff",
        "log_volume_imbalance": "volume_imbalance",
    }
    for new_column, source_column in log_return_sources.items():
        # transform returns a result indexed like the frame itself, so the
        # assignment lines up row by row.
        book[new_column] = book.groupby("time_id")[source_column].transform(log_return)

    # The first snapshot of every time_id has no previous value to diff
    # against; rows without these three log returns are discarded.
    book = book.dropna(subset=["log_return", "log_return1", "log_supply_demand"])

    create_feature_dict = {
        "log_return": [realized_volatility],
        "log_return1": [realized_volatility],
        "log_squared_wap": [realized_volatility],
        "log_squared_wap_1": [realized_volatility],
        "log_supply_demand": [realized_volatility],
        "diff_bid_price": [np.mean],
        "diff_ask_price": [np.mean],
        "log_diff_ask_price": [realized_volatility],
        "log_diff_bid_price": [realized_volatility],
        "log_wap3": [realized_volatility],
        "wap_balance": [np.mean],
        "price_spread": [np.mean],
        "total_volume": [np.mean],
        "wap": [np.mean],
        "log_avg_wap": [realized_volatility],
        "price_spread_1": [np.mean],
        "price_spread_avg": [np.mean],
        "price_spread_diff": [np.mean],
        "log_wap_balance": [realized_volatility],
        "log_price_spread": [realized_volatility],
        "log_price_spread_1": [realized_volatility],
        "log_price_spread_avg": [realized_volatility],
        "log_price_spread_diff": [realized_volatility],
        "log_volume_imbalance": [realized_volatility],
        "bid_size": [np.sum],
        "ask_size": [np.sum],
    }
    book_features = book.groupby("time_id").agg(create_feature_dict).reset_index()
    # Flatten the (column, aggregation) pairs into "column_aggregation" names;
    # the index column comes out as "time_id_" and is renamed just below.
    book_features.columns = ["_".join(col) for col in book_features.columns]
    book_features.rename(
        columns={
            "time_id_": "time_id",
            "log_return_realized_volatility": "realized_volatility",
            "log_return1_realized_volatility": "realized_volatility1",
            "log_squared_wap_realized_volatility": "squared_wap_vol",
            "log_squared_wap_1_realized_volatility": "squared_wap1_vol",
            "log_supply_demand_realized_volatility": "realized_supply_demand",
            "log_diff_ask_price_realized_volatility": "realized_ask_price",
            "log_diff_bid_price_realized_volatility": "realized_bid_price",
        },
        inplace=True,
    )
    book_features["row_id"] = book_features["time_id"].map(lambda x: f"{stock_id}-{x}")

    return book_features

In [None]:
def preprocessing_all_files(all_files):
    """Run preprocessing_stock_id_time_id on every path and stack the results.

    Parameters
    ----------
    all_files : iterable of str
        Paths passed one by one to ``preprocessing_stock_id_time_id``.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated row-wise, each keeping its own
        index. An empty DataFrame when ``all_files`` is empty.
    """
    # Collect first and concatenate once: concatenating inside the loop
    # re-copies everything accumulated so far on every iteration.
    frames = [preprocessing_stock_id_time_id(file) for file in all_files]
    if not frames:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Every entry directly under book_train.parquet (presumably one per stock —
# confirm against the dataset layout).
all_files = glob.glob("../input/optiver-realized-volatility-prediction/book_train.parquet/*")

In [None]:
# Book features of all those entries, stacked into a single frame.
data_train_episodes = preprocessing_all_files(all_files)

In [None]:
def preprocessing_trade_stock_time_id(file):
    """Aggregate one stock's trades into one feature row per time_id.

    Parameters
    ----------
    file : str
        Path handed to ``pd.read_parquet``. The stock id is the path segment
        that follows the first "=" (``file.split("=")[1]``).

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` with the flattened aggregation columns
        (``log_price_realized_volatility`` renamed to ``vol_price``), plus
        ``stock_id`` and ``row_id`` formatted as ``"<stock_id>-<time_id>"``.
    """
    trade_id = pd.read_parquet(file)
    stock_id = file.split("=")[1]

    # transform returns a result indexed like the frame itself, so the
    # assignment lines up row by row.
    trade_id["log_price"] = trade_id.groupby("time_id")["price"].transform(log_return)
    # Rows without a log return (the first trade of each time_id) are dropped.
    # The copy makes the filtered frame independent before a column is added.
    trade_id = trade_id.dropna(subset=["log_price"]).copy()
    trade_id["size_order"] = trade_id["size"] / trade_id["order_count"]

    # "order_count" appears once, carrying the three aggregations that were
    # actually applied (a repeated dict key keeps only its last value).
    aggregate_dictionary = {
        "price": [np.mean],
        "log_price": [realized_volatility],
        "size_order": [np.mean],
        "size": [np.mean, np.sum, np.std],
        "order_count": [np.sum, np.std, np.mean],
        "seconds_in_bucket": [count_unique],
    }
    df = trade_id.groupby("time_id").agg(aggregate_dictionary).reset_index(drop=False)
    # Flatten the (column, aggregation) pairs into "column_aggregation" names;
    # the index column comes out as "time_id_".
    df.columns = ["_".join(col) for col in df.columns]
    df.rename(
        columns={"log_price_realized_volatility": "vol_price", "time_id_": "time_id"},
        inplace=True,
    )

    df["stock_id"] = stock_id
    df["row_id"] = df["time_id"].apply(lambda x: f"{stock_id}-{x}")

    return df
    

In [None]:
def preprocessing_trade_all_file(files):
    """Run preprocessing_trade_stock_time_id on every path and stack the results.

    Parameters
    ----------
    files : iterable of str
        Paths passed one by one to ``preprocessing_trade_stock_time_id``.

    Returns
    -------
    pandas.DataFrame
        The per-file frames concatenated row-wise, each keeping its own
        index. An empty DataFrame when ``files`` is empty.
    """
    # Collect first and concatenate once: concatenating inside the loop
    # re-copies everything accumulated so far on every iteration.
    frames = [preprocessing_trade_stock_time_id(path) for path in files]
    if not frames:
        # pd.concat refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
# Every entry directly under trade_train.parquet (presumably one per stock —
# confirm against the dataset layout).
files = glob.glob("../input/optiver-realized-volatility-prediction/trade_train.parquet/*")

In [None]:
# Trade features of all those entries, stacked into a single frame.
trade_stock = preprocessing_trade_all_file(files)

In [None]:
def merge_and_create_feature_engineering(labeled_df,data_episodes):
    """Attach the book features to the labelled rows and derive change features.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Must contain ``stock_id`` and ``time_id``. It is left untouched; the
        merge works on a copy without ``time_id`` and with a ``row_id`` built
        as ``"<stock_id>-<time_id>"``.
    data_episodes : pandas.DataFrame
        Must contain ``row_id`` and ``realized_volatility``.

    Returns
    -------
    pandas.DataFrame
        Left merge of the labelled rows with ``data_episodes`` on ``row_id``,
        where ``realized_volatility`` is renamed ``realized_t`` and three
        columns are added: ``diff_t`` (change of ``realized_t`` from the
        previous row of the same stock, 0 where there is no previous value),
        ``diff_t^2`` and ``realized_t^2``.
    """
    row_id = labeled_df["stock_id"].astype(str).str.cat(
        labeled_df["time_id"].astype(str), sep="-"
    )
    # drop/assign return new frames, so the caller's frame keeps its time_id
    # column and the cell calling this function can be run again.
    labeled = labeled_df.drop(columns="time_id").assign(row_id=row_id)

    comb_data = labeled.merge(data_episodes,on="row_id",how="left")
    comb_data.rename(columns={"realized_volatility":"realized_t"},inplace=True)

    # Previous realized_t inside each stock. Where it is missing, the row's
    # own realized_t is used instead, which makes diff_t equal to 0 there.
    previous = comb_data.groupby("stock_id")["realized_t"].shift(1)
    previous = previous.fillna(comb_data["realized_t"])

    comb_data["diff_t"] = comb_data["realized_t"] - previous
    comb_data["diff_t^2"] = comb_data["diff_t"] ** 2
    comb_data["realized_t^2"] = comb_data["realized_t"] ** 2

    return comb_data

In [None]:
# Labelled training rows.
train = pd.read_csv("../input/optiver-realized-volatility-prediction/train.csv")

In [None]:
# Labels joined with the level-1 book features, plus the realized_t change features.
train_df = merge_and_create_feature_engineering(train,data_train_episodes)

In [None]:
def merge_and_create_feature_engineering_for_second_level(labeled_df):
    """Add the change features for the second-level realized volatility.

    Parameters
    ----------
    labeled_df : pandas.DataFrame
        Must contain ``stock_id`` and ``realized_volatility1`` (or an already
        renamed ``realized1_t``). The frame is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object, with ``realized_volatility1`` renamed to
        ``realized1_t`` and three columns added: ``diff1_t`` (change of
        ``realized1_t`` from the previous row of the same stock, 0 where
        there is no previous value), ``diff1_t^2`` and ``realized1_t^2``.
    """
    comb_data = labeled_df
    comb_data.rename(columns={"realized_volatility1":"realized1_t"},inplace=True)

    # Previous realized1_t inside each stock. Where it is missing, the row's
    # own value is used instead, which makes diff1_t equal to 0 there.
    # fillna aligns on the index, so this works for any index labels.
    previous = comb_data.groupby("stock_id")["realized1_t"].shift(1)
    previous = previous.fillna(comb_data["realized1_t"])

    comb_data["diff1_t"] = comb_data["realized1_t"] - previous
    comb_data["diff1_t^2"] = comb_data["diff1_t"] ** 2
    comb_data["realized1_t^2"] = comb_data["realized1_t"] ** 2
    return comb_data

In [None]:
train_df = merge_and_create_feature_engineering_for_second_level(train_df)

In [None]:
train_df = train_df.merge(trade_stock,how="inner",on="row_id")

In [None]:
train_df["vol_price^2"] = train_df["vol_price"] ** 2

In [None]:
del(train_df['stock_id_y'])
del(train_df['time_id_y'])

In [None]:
train_df.rename(columns={"stock_id_x":"stock_id","time_id_x":"time_id"},inplace=True)

In [None]:
# Previous vol_price inside each stock. Where there is none, the row's own
# vol_price is used instead. fillna aligns on the index, so no per-row loop
# (and no assumption that positions equal labels) is needed.
train_df["vol_price_1"] = (
    train_df.groupby("stock_id")["vol_price"].shift(1).fillna(train_df["vol_price"])
)

In [None]:
train_df["diff_price"] = train_df["vol_price"] - train_df["vol_price_1"]
del(train_df["vol_price_1"])

In [None]:
train_df["balance_wap_price"] = abs(train_df["price_mean"]- \
                                           train_df["wap_mean"])

In [None]:
train_df.columns

In [None]:
#retained_features = ["stock_id","realized_t","diff_t","realized_t^2","diff_t^2","realized1_t",\
                    #"diff1_t","diff1_t^2","realized1_t^2",'realized_supply_demand','squared_wap_vol',\
                     #'squared_wap1_vol',"vol_price","vol_price^2","diff_price","realized_bid_price","realized_ask_price"]
retained_features = [f for f in train_df.columns if f not in ("target","row_id","time_id")]

In [None]:
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=42)

# 5. Modeling :

### 5.1 LGBM :

In [None]:
def create_lgbm_model(X_train,Y_train,X_val,Y_val):
    """Train one LightGBM regression model on a fold and return it.

    Both datasets declare "stock_id" as categorical and weight every row by
    1 / label**2. Training runs for at most 5000 rounds, is scored on the
    validation set through feval_RMSPE (the built-in metric is set to 'None'),
    reports every 250 rounds and stops early after 500 rounds.
    """
    def weighted_dataset(features, labels):
        # Same construction for the training and the validation side.
        return lgbm.Dataset(
            features,
            label=labels,
            categorical_feature=["stock_id"],
            weight=1/np.power(labels,2),
        )

    booster_params = dict(
        task='train',
        boosting_type='gbdt',
        learning_rate=0.01,
        objective='regression',
        metric='None',
        max_depth=-1,
        n_jobs=-1,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        lambda_l2=1,
        verbose=-1,
    )
    fold_train = weighted_dataset(X_train, Y_train)
    fold_valid = weighted_dataset(X_val, Y_val)

    return lgbm.train(
        booster_params,
        fold_train,
        5000,
        valid_sets=fold_valid,
        feval=feval_RMSPE,
        verbose_eval=250,
        early_stopping_rounds=500,
    )

In [None]:
# One model per fold; the folds are stratified on stock_id.
models = []

for i ,(tr_ind,ts_ind) in enumerate(skf.split(train_df,train_df["stock_id"])) :
    # split() yields integer positions, so rows are picked with iloc; label
    # lookup would only agree with it on a default 0..n-1 index.
    X_tr = train_df[retained_features].iloc[tr_ind]
    X_val = train_df[retained_features].iloc[ts_ind]
    Y_tr = train_df["target"].iloc[tr_ind]
    Y_val = train_df["target"].iloc[ts_ind]
    print(f"{r__} Training model_{i} starting{st__}")
    md = create_lgbm_model(X_tr,Y_tr,X_val,Y_val)
    models.append(md)

In [None]:
# One importance chart per trained model. The panel count follows
# len(models) so that no fold model is left out, and the figure height grows
# with it (10 per panel). squeeze=False keeps `ax` an array for a single model.
fig ,ax  = plt.subplots(len(models),1,figsize=(15,10*len(models)),squeeze=False)
for axis, fold_model in zip(ax.flat, models):
    lgbm.plot_importance(fold_model,ax=axis)

In [None]:
train_df["cible"] = np.mean(np.vstack([model.predict(train_df[retained_features]) for model in models]),axis=0)

In [None]:
RMSPE = round(rmspe(y_true = train_df['target'], y_pred = train_df['cible']),3)
print(f'Performance of LGBM model  RMSPE: {RMSPE}')

# 6. Submission :

In [None]:
# Every entry directly under book_test.parquet.
test_all_files = glob.glob("../input/optiver-realized-volatility-prediction/book_test.parquet/*")

In [None]:
# Book features for the test entries, built with the same function as for training.
data_test_all_files = preprocessing_all_files(test_all_files)

In [None]:
# NOTE(review): `test` is not defined in this part of the notebook —
# presumably the test.csv frame loaded in an earlier cell; confirm.
test_df = merge_and_create_feature_engineering(test,data_test_all_files)

In [None]:
test_df = merge_and_create_feature_engineering_for_second_level(test_df)

In [None]:
files_test = glob.glob("../input/optiver-realized-volatility-prediction/trade_test.parquet/*")
# Rebinds trade_stock: from here on it holds the test trade features, not the training ones.
trade_stock = preprocessing_trade_all_file(files_test)

In [None]:
# Left join (training used an inner join): test rows without trade features are kept.
test_df = test_df.merge(trade_stock,how="left",on="row_id")

In [None]:
test_df["vol_price^2"] = test_df["vol_price"] ** 2

In [None]:
# Discard the trade-side copies of the columns the merge suffixed.
del(test_df["stock_id_y"])
del(test_df["time_id_y"])

In [None]:
# Give the surviving copies their plain names back.
test_df.rename(columns={"time_id_x":"time_id"},inplace=True)
test_df.rename(columns={"stock_id_x":"stock_id"},inplace=True)


In [None]:
# Previous vol_price inside each stock. Where there is none, the row's own
# vol_price is used instead. fillna aligns on the index, so no per-row loop
# (and no assumption that positions equal labels) is needed.
test_df["vol_price_1"] = (
    test_df.groupby("stock_id")["vol_price"].shift(1).fillna(test_df["vol_price"])
)

In [None]:
# Change of vol_price from the previous row; the helper column is then removed.
test_df["diff_price"] = test_df["vol_price"] - test_df["vol_price_1"]
del(test_df["vol_price_1"])

In [None]:
# Absolute gap between the mean price and the mean wap.
test_df["balance_wap_price"] = abs(test_df["price_mean"]- \
                                           test_df["wap_mean"])

In [None]:
# Mean of the fold models' predictions; retained_features is the column list
# derived from train_df, so test_df must provide every one of those columns.
test_df["prediction"] = np.mean(np.vstack([model.predict(test_df[retained_features]) for model in models]),axis=0)

In [None]:
# The submission file uses "target" as the name of the prediction column.
test_df.rename(columns={"prediction":"target"},inplace=True)

In [None]:
test_df[["row_id","target"]].to_csv("submission.csv",index=False)

In [None]:
# Writes the full training frame to a file literally named "train_df" (no extension).
train_df.to_csv("train_df",index=False)