In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found in it.
for folder, _, file_names in os.walk('/kaggle/input'):
    full_paths = [os.path.join(folder, file_name) for file_name in file_names]
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px

In [None]:
import glob

# Checking out data

In [None]:
train = pd.read_csv('../input/optiver-realized-volatility-prediction/train.csv')
train.head()

In [None]:
train.info()

## Only getting df for stock id 0

In [None]:
# Exact match on the integer id. A substring test on the string form would also
# select every stock whose id merely contains the digit 0 (10, 20, 100, ...).
train_sid0 = train[train['stock_id'] == 0]

In [None]:
train_sid0.head()

In [None]:
train_sid0.info()

In [None]:
fig = px.line(train_sid0, x="time_id", y="target", title='Volatility of stock_id_0')
fig.show()

In [None]:
# Volatility for just the first 100 rows of train_sid0.
# NOTE(review): the variable is named first_10 but holds 100 rows, and the plot title
# does not mention that only a subset is shown - consider renaming both.
first_10 = train_sid0.head(100)
fig = px.line(first_10, x="time_id", y="target", title='Volatility of stock_id_0')
fig.show()

# LGB Starter 
Taking help from this notebook to get importance of different features: https://www.kaggle.com/manels/lgb-starter/notebook

In [None]:
from IPython.core.display import display, HTML

import pandas as pd
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import glob
import os
import gc

from joblib import Parallel, delayed

from sklearn import preprocessing, model_selection
import lightgbm as lgb

from sklearn.metrics import r2_score

import matplotlib.pyplot as plt 
import seaborn as sns

path_root = '../input/optiver-realized-volatility-prediction'
path_data = '../input/optiver-realized-volatility-prediction'
path_submissions = '/'

target_name = 'target'
scores_folds = {}

In [None]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None) 

In [None]:
def log_return(list_stock_prices):
    """Return the element-to-element change of the natural log of the prices.

    The input must yield an object with a ``.diff()`` method once passed through
    ``np.log`` (e.g. a pandas Series). The first element of the result is NaN
    because it has no predecessor.
    """
    log_prices = np.log(list_stock_prices)
    return log_prices.diff()

def realized_volatility(series_log_return):
    """Return the square root of the sum of the squared log returns."""
    squared_returns = series_log_return ** 2
    total = np.sum(squared_returns)
    return np.sqrt(total)

def rmspe(y_true, y_pred):
    """Root mean squared percentage error of ``y_pred`` relative to ``y_true``.

    Each error is divided by the corresponding true value before squaring, so
    a zero in ``y_true`` produces a division by zero.
    """
    relative_error = (y_true - y_pred) / y_true
    mean_squared = np.mean(np.square(relative_error))
    return np.sqrt(mean_squared)

In [None]:
params_lgbm = {
        'task': 'train',
        'boosting_type': 'gbdt',
        'learning_rate': 0.01,
        'objective': 'regression',
        'metric': 'None',
        'max_depth': -1,
        'n_jobs': -1,
        'feature_fraction': 0.7,
        'bagging_fraction': 0.7,
        'lambda_l2': 1,
        'verbose': -1
        #'bagging_freq': 5
}

# Data preprocessing for stock id = 0 only

In [None]:
train = pd.read_csv(os.path.join(path_data, 'train.csv'))

In [None]:
key = ['stock_id', 'time_id', 'seconds_in_bucket']
dataType = 'train'
stock_id = 0

## Book feature

In [None]:
#As the train data in book doesn't contain a column for stock_id, that's manually added
df_book = pd.read_parquet(os.path.join(path_data, 'book_{}.parquet/stock_id={}/'.format(dataType, stock_id)))
df_book['stock_id'] = stock_id
cols = key + [col for col in df_book.columns if col not in key]
df_book = df_book[cols]

In [None]:
print(df_book.head(10))

In [None]:
# Weighted average price (WAP) for book levels 1 and 2: each side's price is weighted
# by the size quoted on the opposite side.
df_book['wap1'] = (df_book['bid_price1'] * df_book['ask_size1'] +
                                df_book['ask_price1'] * df_book['bid_size1']) / (df_book['bid_size1'] + df_book['ask_size1'])
df_book['wap2'] = (df_book['bid_price2'] * df_book['ask_size2'] +
                                df_book['ask_price2'] * df_book['bid_size2']) / (df_book['bid_size2'] + df_book['ask_size2'])
# Log returns are computed inside each time_id; the first row of every time_id has no
# predecessor and is set to 0. transform() returns a result aligned to df_book's own
# index on every pandas version, whereas groupby().apply() prepends the group key to
# the index on pandas >= 2 and the result can no longer be assigned as a column.
df_book['log_return1'] = df_book.groupby(by = ['time_id'])['wap1'].transform(log_return).fillna(0)
df_book['log_return2'] = df_book.groupby(by = ['time_id'])['wap2'].transform(log_return).fillna(0)

In [None]:
print(df_book.head(10))

In [None]:
#calculating volatility for each time id
features_to_apply_realized_volatility = ['log_return'+str(i+1) for i in range(2)]
stock_stat = df_book.groupby(by = ['stock_id', 'time_id'])[features_to_apply_realized_volatility]\
                    .agg(realized_volatility).reset_index()

In [None]:
print(stock_stat.head(10))

## Trade features

In [None]:
trade_stat =  pd.read_parquet(os.path.join(path_data,'trade_{}.parquet/stock_id={}'.format(dataType, stock_id)))
trade_stat = trade_stat.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)
trade_stat['stock_id'] = stock_id
cols = key + [col for col in trade_stat.columns if col not in key]
trade_stat = trade_stat[cols]

In [None]:
print(trade_stat.head(10))

In [None]:
# Log returns of the traded price inside each time_id (the trade price plays the role
# the WAP plays for the book data). transform() keeps the result aligned to
# trade_stat's index on every pandas version; groupby().apply() does not on pandas >= 2.
trade_stat['trade_log_return1'] = trade_stat.groupby(by = ['time_id'])['price'].transform(log_return).fillna(0)

In [None]:
print(trade_stat.head(10))

In [None]:
trade_stat = trade_stat.groupby(by = ['stock_id', 'time_id'])[['trade_log_return1']]\
                       .agg(realized_volatility).reset_index()

In [None]:
print(trade_stat.head(10))

## Merging both trade and book

In [None]:
stock_stat = stock_stat.merge(trade_stat, on=['stock_id', 'time_id'], how='left').fillna(-999)

In [None]:
print(stock_stat.head(10))

### The dataframe above now holds, for each stock_id and time_id, the realized volatility calculated from wap1, wap2 and the trade prices

# Now doing this all in a function and making dataframe for all of the stock ids

In [None]:
train = pd.read_csv(os.path.join(path_data, 'train.csv'))

In [None]:
#function combining all that we did above
def get_stock_stat(stock_id : int, dataType = 'train'):
    """Build realized-volatility features for one stock.

    Reads the book and trade parquet partitions of ``stock_id`` from ``path_data``
    (``book_{dataType}.parquet`` / ``trade_{dataType}.parquet``).

    Args:
        stock_id: id of the stock whose partition is read.
        dataType: dataset prefix used in the parquet directory names.

    Returns:
        DataFrame with one row per (stock_id, time_id) and the columns
        ``log_return1`` / ``log_return2`` (realized volatility of the level-1 and
        level-2 WAP log returns) and ``trade_log_return1`` (realized volatility of
        the trade-price log returns). time_ids that have book data but no trades
        get -999 in ``trade_log_return1``.
    """
    key = ['stock_id', 'time_id', 'seconds_in_bucket']

    # --- Book features ---
    df_book = pd.read_parquet(os.path.join(path_data, 'book_{}.parquet/stock_id={}/'.format(dataType, stock_id)))
    # The partition does not carry a stock_id column, so it is added by hand.
    df_book['stock_id'] = stock_id
    df_book = df_book[key + [col for col in df_book.columns if col not in key]]

    features_to_apply_realized_volatility = []
    for level in (1, 2):
        bid_price = df_book['bid_price{}'.format(level)]
        ask_price = df_book['ask_price{}'.format(level)]
        bid_size = df_book['bid_size{}'.format(level)]
        ask_size = df_book['ask_size{}'.format(level)]
        wap_col = 'wap{}'.format(level)
        ret_col = 'log_return{}'.format(level)
        # WAP: each side's price weighted by the size quoted on the opposite side.
        df_book[wap_col] = (bid_price * ask_size + ask_price * bid_size) / (bid_size + ask_size)
        # transform() keeps the result aligned to df_book's index on every pandas
        # version; groupby().apply() adds the group key to the index on pandas >= 2.
        df_book[ret_col] = df_book.groupby(by = ['time_id'])[wap_col].transform(log_return).fillna(0)
        features_to_apply_realized_volatility.append(ret_col)

    stock_stat = df_book.groupby(by = ['stock_id', 'time_id'])[features_to_apply_realized_volatility]\
                        .agg(realized_volatility).reset_index()

    # --- Trade features ---
    trade_stat = pd.read_parquet(os.path.join(path_data,'trade_{}.parquet/stock_id={}'.format(dataType, stock_id)))
    trade_stat = trade_stat.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)
    trade_stat['stock_id'] = stock_id
    trade_stat = trade_stat[key + [col for col in trade_stat.columns if col not in key]]
    trade_stat['trade_log_return1'] = trade_stat.groupby(by = ['time_id'])['price'].transform(log_return).fillna(0)
    trade_stat = trade_stat.groupby(by = ['stock_id', 'time_id'])[['trade_log_return1']]\
                           .agg(realized_volatility).reset_index()

    # --- Join book and trade features; -999 marks time_ids without trades ---
    stock_stat = stock_stat.merge(trade_stat, on=['stock_id', 'time_id'], how='left').fillna(-999)
    return stock_stat

In [None]:
#function to loop over all the stocks
def get_dataSet(stock_ids : list, dataType = 'train'):
    """Run ``get_stock_stat`` for every stock id and stack the results.

    The per-stock calls are dispatched through joblib ``Parallel`` with
    ``n_jobs=-1``; the resulting frames are concatenated with a fresh index.
    """
    jobs = (delayed(get_stock_stat)(stock_id, dataType) for stock_id in stock_ids)
    per_stock_frames = Parallel(n_jobs=-1)(jobs)
    return pd.concat(per_stock_frames, ignore_index = True)

In [None]:
#to caculate loss
def feval_RMSPE(preds, train_data):
    """Custom evaluation function passed to ``lgb.train`` via ``feval``.

    Returns the tuple ``('RMSPE', value, False)`` where ``value`` is the RMSPE of
    ``preds`` against the labels of ``train_data`` rounded to 5 decimals; the
    trailing ``False`` marks the metric as lower-is-better.
    """
    score = rmspe(y_true = train_data.get_label(), y_pred = preds)
    return 'RMSPE', round(score, 5), False

In [None]:
%time train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), dataType = 'train')

In [None]:
train_stock_stat_df.head(10)

In [None]:
#adding the actual target value from the train.csv
train = pd.merge(train, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')

In [None]:
train.head(10)

In [None]:
print('Train shape: {}'.format(train.shape))

In [None]:
#loading in the test data
test = pd.read_csv(os.path.join(path_data, 'test.csv'))
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), dataType = 'test')
test = pd.merge(test, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left').fillna(0)
print('Test shape: {}'.format(test.shape))
display(test.head())

In [None]:
cats = ['stock_id']

In [None]:
model_name = 'lgb1'
pred_name = 'pred_{}'.format(model_name)
features_to_consider = ['stock_id', 'log_return1', 'log_return2', 'trade_log_return1']

In [None]:
# Float placeholders: these columns later receive float predictions, and writing
# floats into an integer column with .loc is deprecated on recent pandas versions.
train[pred_name] = 0.0
test['target'] = 0.0

In [None]:
n_folds = 4
n_rounds = 5000
kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=2016)
scores_folds[model_name] = []
counter = 1

In [None]:
# K-fold cross-validation: each fold trains LightGBM on the dev rows with the parameters
# defined in params_lgbm, stores the out-of-fold predictions in train[pred_name] and adds
# the fold's clipped test predictions to test[target_name] (averaged over folds afterwards).
# kf.split yields positions while .loc selects by label, so this relies on `train`
# having a default 0..n-1 index.
for dev_index, val_index in kf.split(range(len(train))):
    print('CV {}/{}'.format(counter, n_folds))
    X_train = train.loc[dev_index, features_to_consider]
    y_train = train.loc[dev_index, target_name].values
    X_val = train.loc[val_index, features_to_consider]
    y_val = train.loc[val_index, target_name].values
    
    #############################################################################################
    #LGB
    #############################################################################################
    # Weighting every row by 1/y^2 turns the squared error into a squared relative error.
    # A target of exactly 0 would produce an infinite weight.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cats, weight=1/np.power(y_train,2))
    val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cats, weight=1/np.power(y_val,2))
    
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword arguments of older
    # LightGBM releases - confirm against the installed version.
    model = lgb.train(params_lgbm, 
                      train_data, 
                      n_rounds, 
                      valid_sets=val_data, 
                      feval=feval_RMSPE,
                      verbose_eval= 250,
                      early_stopping_rounds=500
                     )
    # Same rows and columns as X_val.
    preds = model.predict(train.loc[val_index, features_to_consider])
    train.loc[val_index, pred_name] = preds
    score = round(rmspe(y_true = y_val, y_pred = preds),5)
    print('Fold {} {}: {}'.format(counter, model_name, score))
    scores_folds[model_name].append(score)
    counter += 1
    # Negative predictions are clipped to 0 before being accumulated.
    test[target_name] += model.predict(test[features_to_consider]).clip(0,1e10)

In [None]:
del train_data, val_data
test[target_name] = test[target_name]/n_folds

In [None]:
print(test)

In [None]:
score = round(rmspe(y_true = train[target_name].values, y_pred = train[pred_name].values),5)
print('RMSPE {}: {} - Folds: {}'.format(model_name, score, scores_folds[model_name]))

In [None]:
display(test[['row_id', target_name]].head(2))
test[['row_id', target_name]].to_csv('submission.csv',index = False)

importances = pd.DataFrame({'Feature': model.feature_name(), 
                            'Importance': model.feature_importance(importance_type='gain')})
importances.sort_values(by = 'Importance', inplace=True)
importances2 = importances.nlargest(50,'Importance', keep='first').sort_values(by='Importance', ascending=True)
importances2[['Importance', 'Feature']].plot(kind = 'barh', x = 'Feature', figsize = (8,6), color = 'blue', fontsize=11);plt.ylabel('Feature', fontsize=12)

# Training with LightGBM, following another notebook

In [None]:
#importing again
import os
import sys
import time
import glob
from pathlib import Path

import pandas as pd
import numpy as np

# Parallel processing
from joblib import Parallel
from joblib import delayed

# Preprocess
from sklearn import preprocessing
from sklearn import model_selection

# Evaluation
from sklearn.metrics import r2_score

# Visullize
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
#import lightgbm as lgb
import optuna.integration.lightgbm as lgb


# Others
import warnings
warnings.simplefilter("ignore")

In [None]:
# Dataset path
data_path = Path('../input/optiver-realized-volatility-prediction')

In [None]:
# setting display option
pd.options.display.max_columns = 50
pd.set_option('display.max_rows', 10)
#pd.set_option('display.max_columns', None) 

In [None]:
# Objective variable
target = 'target'

# submission file setting
submit_file = 'submission.csv'
Id_column = 'row_id'

# Functions

In [None]:
# Log Return
def log_return(list_stock_prices):
    """Difference between consecutive natural-log prices.

    Expects an input whose log exposes ``.diff()`` (e.g. a pandas Series); the
    first entry of the result is NaN.
    """
    logged = np.log(list_stock_prices)
    return logged.diff()

# Realized Volatility
def realized_volatility(series_log_return):
    """Square root of the sum of squared log returns."""
    sum_of_squares = np.sum(series_log_return ** 2)
    return np.sqrt(sum_of_squares)

In [None]:
# WAP calculation
def wap_calculation1(df):
    """Level-1 weighted average price, each price weighted by the opposite side's size."""
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    cross_weighted = df['bid_price1'] * ask_size + df['ask_price1'] * bid_size
    return cross_weighted / (bid_size + ask_size)

def wap_calculation2(df):
    """Level-2 weighted average price, each price weighted by the opposite side's size."""
    bid_size = df['bid_size2']
    ask_size = df['ask_size2']
    cross_weighted = df['bid_price2'] * ask_size + df['ask_price2'] * bid_size
    return cross_weighted / (bid_size + ask_size)

def wap_calculation3(df):
    """Level-1 weighted average price, each price weighted by its own side's size."""
    bid_size = df['bid_size1']
    ask_size = df['ask_size1']
    same_side_weighted = df['bid_price1'] * bid_size + df['ask_price1'] * ask_size
    return same_side_weighted / (bid_size + ask_size)

def wap_calculation4(df):
    """Level-2 weighted average price, each price weighted by its own side's size."""
    bid_size = df['bid_size2']
    ask_size = df['ask_size2']
    same_side_weighted = df['bid_price2'] * bid_size + df['ask_price2'] * ask_size
    return same_side_weighted / (bid_size + ask_size)

In [None]:
# my params
# askprice1 - bidprice1
# askprice2 - bidprice2
# askprice2 - askprice1
# bidprice1 - bidprice2
def price_ask1_bid1_diff(df):
    """Return ``ask_price1 - bid_price1``."""
    ask = df['ask_price1']
    bid = df['bid_price1']
    return ask - bid
def price_ask2_bid2_diff(df):
    """Return ``ask_price2 - bid_price2``."""
    ask = df['ask_price2']
    bid = df['bid_price2']
    return ask - bid
def price_ask2_bid1_diff(df):
    """Return ``ask_price2 - bid_price1``."""
    ask = df['ask_price2']
    bid = df['bid_price1']
    return ask - bid
def price_ask1_bid2_diff(df):
    """Return ``ask_price1 - bid_price2``."""
    ask = df['ask_price1']
    bid = df['bid_price2']
    return ask - bid
def price_wap1_wap2_diff(df):
    """Return ``wap1 - wap2``; both columns must already exist in ``df``."""
    first = df['wap1']
    second = df['wap2']
    return first - second
def std_per_mean(df):
    """Return ``np.std(df)`` divided by ``np.mean(df)``."""
    spread = np.std(df)
    centre = np.mean(df)
    return spread / centre

In [None]:
# RMSPE
def rmspe(y_true, y_pred):
    """Root mean squared percentage error; errors are taken relative to ``y_true``."""
    pct_error = (y_true - y_pred) / y_true
    return np.sqrt(np.mean(np.square(pct_error)))

# Pre-processing data

## Book data

In [None]:
def book_preprocessing(stock_id : int, data_type = 'train'):
    """Build order-book features for one stock.

    Reads ``book_{data_type}.parquet/stock_id={stock_id}/`` below ``data_path``.

    Args:
        stock_id: id of the stock whose partition is read.
        data_type: dataset prefix used in the parquet directory name.

    Returns:
        DataFrame with one row per (stock_id, time_id) containing

        * ``realized_volatility1`` .. ``realized_volatility4``: realized volatility
          of the log returns of wap1 .. wap4 inside the time_id;
        * ``<column>_skew``, ``<column>_sem``, ``<column>_std_per_mean``: statistics
          of every other book / derived column. These are computed over all rows of
          the stock (time_id is dropped before grouping), so they have the same
          value on every row of the result.
    """
    # read data
    df = pd.read_parquet(data_path / f'book_{data_type}.parquet/stock_id={stock_id}/')

    # set stock_id (the partition itself does not carry the column)
    df['stock_id'] = stock_id

    # WAP calculation
    df['wap1'] = wap_calculation1(df)
    df['wap2'] = wap_calculation2(df)
    df['wap3'] = wap_calculation3(df)
    df['wap4'] = wap_calculation4(df)

    # Log returns inside each time_id; the first row of a time_id has no predecessor
    # and is set to 0. transform() keeps the result aligned to df's index on every
    # pandas version, whereas groupby().apply() adds the group key to the index on
    # pandas >= 2 and the result can no longer be assigned as a column.
    for i in range(1, 5):
        df[f'log_return{i}'] = df.groupby(['time_id'])[f'wap{i}'].transform(log_return).fillna(0)

    # Calculate wap balance
    df['wap_balance12'] = abs(df['wap1'] - df['wap2'])
    df['wap_balance34'] = abs(df['wap3'] - df['wap4'])
    # Calculate spread
    df['price_spread1'] = (df['ask_price1'] - df['bid_price1']) / ((df['ask_price1'] + df['bid_price1']) / 2)
    df['price_spread2'] = (df['ask_price2'] - df['bid_price2']) / ((df['ask_price2'] + df['bid_price2']) / 2)
    df['bid_spread'] = df['bid_price1'] - df['bid_price2']
    df['ask_spread'] = df['ask_price1'] - df['ask_price2']
    df["bid_ask_spread"] = abs(df['bid_spread'] - df['ask_spread'])
    df['total_volume'] = (df['ask_size1'] + df['ask_size2']) + (df['bid_size1'] + df['bid_size2'])
    df['volume_imbalance'] = abs((df['ask_size1'] + df['ask_size2']) - (df['bid_size1'] + df['bid_size2']))

    # Realized volatility of each log-return series per (stock_id, time_id)
    feat_to_calc_rv = ['log_return1','log_return2','log_return3','log_return4']
    return_values = (
        df.groupby(['stock_id','time_id'])[feat_to_calc_rv]
        .agg(realized_volatility)
        .reset_index()
        .rename(columns={f'log_return{i}': f'realized_volatility{i}' for i in range(1, 5)})
    )

    df = df.drop(['time_id', 'seconds_in_bucket'], axis=1)

    # Per-stock statistics of every remaining column. The suffix is added to the
    # statistic columns explicitly: merge(suffixes=...) only renames columns whose
    # names collide, so the first statistic merged in would otherwise keep the raw
    # column names (e.g. a column called 'wap1' that actually holds the skew).
    per_stock = df.groupby(['stock_id'])
    stat_frames = [
        ('_skew', per_stock.skew()),
        ('_sem', per_stock.sem()),
        ('_std_per_mean', per_stock.agg(std_per_mean)),
    ]
    for suffix, stats in stat_frames:
        return_values = return_values.merge(
            stats.add_suffix(suffix),
            on='stock_id',
            how='left'
        )

    return return_values

In [None]:
df_book = book_preprocessing(97, 'train')
df_book

## Trade book

In [None]:
def trade_preprocessing(stock_id : int, data_type = 'train'):
    """Build trade features for one stock.

    Reads ``trade_{data_type}.parquet/stock_id={stock_id}/`` below ``data_path``.

    Args:
        stock_id: id of the stock whose partition is read.
        data_type: dataset prefix used in the parquet directory name.

    Returns:
        DataFrame with the columns ``stock_id``, ``time_id`` and
        ``trade_log_return1`` (realized volatility of the trade-price log returns
        inside the time_id), one row per time_id that has trades.
    """
    # read data
    df = pd.read_parquet(data_path / f'trade_{data_type}.parquet/stock_id={stock_id}/')

    # Log returns are order dependent, so rows are put in time order first.
    df = df.sort_values(by=['time_id', 'seconds_in_bucket']).reset_index(drop=True)

    # set stock_id (the partition itself does not carry the column)
    df['stock_id'] = stock_id

    # Log returns of the trade price inside each time_id. transform() keeps the result
    # aligned to df's index on every pandas version; groupby().apply() adds the group
    # key to the index on pandas >= 2 and the result can no longer be assigned.
    df['trade_log_return1'] = df.groupby(by = ['time_id'])['price'].transform(log_return).fillna(0)

    # Realized volatility per (stock_id, time_id)
    trade_features = (
        df.groupby(['stock_id','time_id'])[['trade_log_return1']]
        .agg(realized_volatility)
        .reset_index()
    )

    return trade_features

In [None]:
df_trade = trade_preprocessing(0,'train')
df_trade.head()

## Merging trade and book functions

In [None]:
def get_stock_stat(stock_id : int, data_type = 'train'):
    """Combine the book and trade features of one stock.

    The book features are the left side of the join on (stock_id, time_id);
    every missing value in the joined frame is replaced by -999.
    """
    book_features = book_preprocessing(stock_id, data_type)
    trade_features = trade_preprocessing(stock_id, data_type)

    joined = book_features.merge(trade_features, on=['stock_id', 'time_id'], how='left')
    return joined.fillna(-999)

In [None]:
def get_dataSet(stock_ids : list, data_type = 'train'):
    """Compute ``get_stock_stat`` for every stock id and stack the frames.

    The per-stock calls are dispatched through joblib ``Parallel`` with
    ``n_jobs=-1``; the frames are concatenated row-wise with a fresh index.
    """
    tasks = (delayed(get_stock_stat)(stock_id, data_type) for stock_id in stock_ids)
    frames = Parallel(n_jobs=-1)(tasks)
    return pd.concat(frames, ignore_index = True)

## train data

In [None]:
train=pd.read_csv(data_path / 'train.csv')
train['row_id'] = train['stock_id'].astype(str) + '-' + train['time_id'].astype(str)
display(train.head())
print('train data shape:', train.shape)

In [None]:
def miff_max_min(x):
    """Return the largest element of ``x`` minus the smallest."""
    largest = max(x)
    smallest = min(x)
    return largest - smallest

def add_my_param(data):
    """Append per-stock std / skew / sem of every numeric column to each row.

    Args:
        data: DataFrame with a ``stock_id`` column. A ``target`` column is optional.

    Returns:
        A new DataFrame with the rows of ``data`` (without ``target``) plus, for
        every numeric column ``c`` of ``data`` other than ``stock_id``, the columns
        ``c_std``, ``c_skew`` and ``c_sem`` holding that statistic of ``c`` over all
        rows of the same stock. ``data`` itself is not modified.
    """
    # errors='ignore' makes the optional target explicit instead of swallowing
    # every possible exception.
    data_ = data.drop(columns=['target'], errors='ignore')

    # Only numeric columns have these statistics; identifier strings such as row_id
    # would make the aggregation fail on current pandas versions.
    numeric_cols = [
        col for col in data.select_dtypes(include='number').columns
        if col != 'stock_id'
    ]
    if not numeric_cols:
        return data_

    grouped = data.groupby('stock_id')[numeric_cols]
    for stat_name in ('std', 'skew', 'sem'):
        # The suffix is added explicitly: merge(suffixes=...) only renames colliding
        # names, which used to bring the std of the target back in under the bare
        # name 'target'.
        stats = grouped.agg(stat_name).add_suffix(f'_{stat_name}')
        data_ = data_.merge(stats, on='stock_id', how='left')

    return data_

In [None]:
train_stock_stat_df = get_dataSet(stock_ids = train['stock_id'].unique(), data_type = 'train')
# train_stock_stat_df = add_my_param(train_stock_stat_df)

# Merge train with train_stock_stat_df
train = pd.merge(train, train_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left')

train

# LOFO

In [None]:
!pip install git+https://github.com/aerdem4/lofo-importance

In [None]:
from sklearn.model_selection import KFold
from lofo import LOFOImportance, Dataset, plot_importance
%matplotlib inline

target="target"

# LOFO is run on a 1% random sample of the training rows.
sample_df = train.sample(frac=0.01, random_state=0)

# Folds are taken in order from the randomly drawn sample. random_state is left out
# because scikit-learn rejects it when shuffle=False.
cv = KFold(n_splits=4, shuffle=False)

# Every column except the regression target is a candidate feature. row_id is skipped
# as well: it is only stock_id and time_id joined into one string per row.
features = [col for col in sample_df.columns if col not in (target, 'row_id')]

# define the regression target and the features
dataset = Dataset(df=sample_df, target=target, features=features)

# get the mean and standard deviation of the importances in pandas format
lofo = LOFOImportance(dataset, cv=cv, scoring="neg_mean_absolute_error")
importance_df = lofo.get_importance()

# plot the means and standard deviations of the importances
plot_importance(importance_df, figsize=(12, 40))

# test data

In [None]:
test = pd.read_csv(data_path /'test.csv')
test['row_id'] = test['stock_id'].astype(str) + '-' + test['time_id'].astype(str)
test

In [None]:
test_stock_stat_df = get_dataSet(stock_ids = test['stock_id'].unique(), data_type = 'test')
test = pd.merge(test, test_stock_stat_df, on = ['stock_id', 'time_id'], how = 'left').fillna(0)
test

# Training

In [None]:
# Parameters of Light GBM
# first try
params_lgbm = {
         'task': 'train',
         'boosting_type': 'gbdt',
         'learning_rate': 0.01,
         'objective': 'regression',
         'metric': 'None',
         'max_depth': -1,
         'n_jobs': -1,
         'feature_fraction': 0.7,
         'bagging_fraction': 0.7,
         'lambda_l2': 1,
         'verbose': -1,
         #'bagging_freq': 5
 }

In [None]:
# Define loss function for lightGBM training
def feval_RMSPE(preds, train_data):
    """Evaluation callback handed to ``lgb.train`` through ``feval``.

    Returns ``('RMSPE', value, False)``: the metric name, the RMSPE of ``preds``
    against the dataset labels rounded to 5 decimals, and ``False`` to mark the
    metric as lower-is-better.
    """
    true_values = train_data.get_label()
    value = round(rmspe(y_true = true_values, y_pred = preds), 5)
    return 'RMSPE', value, False


In [None]:
# training function
def light_gbm(X_train, y_train, X_val ,y_val,cats):
    """Train one cross-validation fold and predict on validation and test data.

    Args:
        X_train, y_train: features and targets of the training part of the fold.
        X_val, y_val: features and targets of the validation part of the fold.
            ``X_val`` must be a slice of the global ``train`` frame, because its
            index is used to store the out-of-fold predictions.
        cats: names of the categorical feature columns.

    Returns:
        ``(score, test_preds, model)``: validation RMSPE rounded to 5 decimals,
        predictions for the global ``test`` frame clipped to [0, 1e10], and the
        trained model.

    Side effects:
        Writes the validation predictions into ``train[pred_name]``.
        Reads the globals ``params_lgbm``, ``n_rounds``, ``train``, ``test`` and
        ``pred_name``.
    """
    print(cats)

    # Weighting every row by 1/y^2 turns the squared error into a squared relative
    # error. A target of exactly 0 would produce an infinite weight.
    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cats, weight=1/np.power(y_train,2))
    val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cats, weight=1/np.power(y_val,2))

    # training
    model = lgb.train(params_lgbm,
                      train_data,
                      n_rounds,
                      valid_sets=val_data,
                      feval=feval_RMSPE,
                      verbose_eval= 250,
                      early_stopping_rounds=500
                     )

    # Predict on the validation frame that was passed in, and store the result at its
    # own index, instead of re-slicing the global frame with the caller's loop variable.
    preds_val = model.predict(X_val)
    train.loc[X_val.index, pred_name] = preds_val

    # RMSPE calculation
    score = round(rmspe(y_true = y_val, y_pred = preds_val),5)

    # Prediction on the test data, using exactly the columns the model was trained on
    test_preds = model.predict(test[X_val.columns]).clip(0,1e10)

    # delete dataset
    del train_data, val_data

    return score, test_preds, model

# Preparing data to train

In [None]:
# Categorical data column list
# Categorical data column list
cats = ['stock_id']

model_name = 'lgb1'
pred_name = f'pred_{model_name}'

# Columns that must never be model inputs: the row identifier, the label, and the
# out-of-fold prediction column this cell adds to `train`. The prediction column is
# excluded by name so that re-running the cell cannot turn earlier predictions into
# a feature.
drop_feat = ['row_id', 'target', pred_name]
features_columns = [col for col in train.columns if col not in drop_feat]

print(f'Train dataset columns : {len(features_columns)} features')

# Float placeholders: both columns later receive float predictions.
train[pred_name] = 0.0
test[target] = 0.0

# Training

In [None]:
params = {
    'objective': 'mean_squared_error',
    'metric': 'mae',
    "verbosity": -1,
    "boosting_type": "gbdt",
}

best_params, history = {}, []

# Hold out part of the labelled training rows for validation. The test frame cannot be
# used here: its target column is only a zero-filled placeholder, so scoring against it
# would steer the parameter search and the early stopping with meaningless labels.
tune_dev, tune_val = model_selection.train_test_split(train, test_size=0.2, random_state=42)

lgb_train = lgb.Dataset(tune_dev[features_columns], tune_dev[target])
lgb_eval = lgb.Dataset(tune_val[features_columns], tune_val[target], reference=lgb_train)

# `lgb` is optuna.integration.lightgbm in this part of the notebook
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=200,
                valid_sets=[lgb_train, lgb_eval],
                early_stopping_rounds=50
               )

# The parameters of the returned booster replace the hand-written params_lgbm for the
# cross-validation below.
best_params = gbm.params
params_lgbm = best_params
params_lgbm

# cross validation

In [None]:
# k-fold Ensemble Training
n_folds = 4
n_rounds = 10000

kf = model_selection.KFold(n_splits=n_folds, shuffle=True, random_state=42)

# Initialize scores dict
scores_folds = {}
# Initialize value in scores_folds(dict) to record each step in CV
scores_folds[model_name] = []

# Initial value
cv_trial = 1

In [None]:
# --- Cross Validation ---
# kf.split yields positions while .loc selects by label, so this relies on `train`
# having a default 0..n-1 index.
for train_index, val_index in kf.split(range(len(train))):

    print(f'CV trial : {cv_trial} /{n_folds}')

    # Split the training frame into the train and validation part of this fold
    X_train = train.loc[train_index, features_columns]
    y_train = train.loc[train_index, target].values
    X_val = train.loc[val_index, features_columns]
    y_val = train.loc[val_index, target].values

    # train with Light GBM
    rmspe_score, test_preds, model = light_gbm(X_train, y_train, X_val ,y_val,cats)

    # record the score of this fold
    scores_folds[model_name].append(rmspe_score)

    # Summary of this fold
    print(f'Fold-{cv_trial} Model-{model_name} RMSPE: {rmspe_score}')
    print('-'*50)

    # light_gbm already returns the clipped test predictions of this fold, so they
    # are accumulated directly instead of predicting on the test data a second time.
    print(test_preds)
    test[target] += test_preds
    cv_trial += 1
    
# delete dataset
# del train, test

# eval

In [None]:
# divide the test target by n_folds, because the predictions of all CV folds were summed
test[target] = test[target]/n_folds

# score calculation
score = round(rmspe(y_true = train[target].values, y_pred = train[pred_name].values),5)
print(f'RMSPE {model_name}: {score} - Folds: {scores_folds[model_name]}')

display(test[[Id_column, target]].head(2))

In [None]:
test[[Id_column, target]].to_csv(submit_file, index = False)