In [1]:
#load libraries

import numpy as np
import pandas as pd
from sklearn.metrics import \
    r2_score, get_scorer
from sklearn.linear_model import \
    LinearRegression
from sklearn.preprocessing import \
    StandardScaler

from sklearn.preprocessing import MinMaxScaler

from scipy.stats.mstats import winsorize

from tqdm import tqdm

from sklearn.ensemble import GradientBoostingRegressor

from sklearn.model_selection import ParameterGrid

import matplotlib.pyplot as plt

import xgboost as xgb

In [2]:
# Load data --> winsorize data --> scale data
#
# NOTE(review): the CSV location is an absolute, machine-specific path, so the
# notebook only runs where this exact file exists.
raw_csv_path = 'C:\\Users\\Martijn\\Documents\\Dataset_unscaled.csv'

# Read the panel, order it chronologically and renumber the rows 0..n-1.
df = (
    pd.read_csv(raw_csv_path, index_col=0)
    .sort_values(by=['year', 'month'])
    .reset_index(drop=True)
)


def _clip_tails(column):
    """Winsorize one column, clipping the lowest and highest 10% of its values."""
    return winsorize(column, (0.1, 0.1))


# Winsorize every column from position 16 onward, column by column.
df.iloc[:, 16:] = df.iloc[:, 16:].apply(_clip_tails)

# Rescale every column from position 15 onward to the range [-1, 1].
# NOTE(review): scaling starts at column 15 while winsorizing starts at 16, so
# column 15 is scaled but never winsorized -- confirm this offset is intended.
# NOTE(review): both steps are fitted on the full sample, before the later
# cells split the data by year.
cols_to_scale = df.columns[15:].tolist()
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))

df[cols_to_scale] = min_max_scaler.fit_transform(df[cols_to_scale])

In [3]:
# Hyper-parameter candidates for the boosted-tree models.

n_estimators = [1000]         # number of trees
learning_rate = [0.01, 0.1]   # shrinkage level
max_depth = [1, 2]            # maximum tree depth

# Every combination of the lists above becomes one candidate configuration.
param_grid = dict(
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    max_depth=max_depth,
)
tuning_parameters = ParameterGrid(param_grid)

In [4]:
# Walk-forward evaluation of XGBoost for each target column (df columns 9..14),
# using every column from position 16 onward as features.
#
# For each target, five expanding windows (j = 1..5) are evaluated:
#   train: years 1996 .. 2012 + j
#   val:   years 2013 + j .. 2015 + j   (used only to pick hyper-parameters)
#   test:  year  2016 + j
# Test predictions from all five windows are pooled and a single R^2 (in
# percent) per target is appended to R2_list_gbm as [target_name, r2].
#
# The splits are held in ordinary variables. The previous version wrote them
# through locals()['df_train_N'] and read them back through globals(), which
# only works at module scope and relies on mutating the locals() dict.
R2_list_gbm = []
feature_importance_list = []  # initialised here but not filled in this cell

for name in df.columns[9:15]:

    # Keep only rows whose target is at or below its full-sample 95th percentile.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in range(1, 6):
        train_years = np.arange(1996, 2013 + j, 1)
        val_years = [2013 + j, 2014 + j, 2015 + j]
        test_years = [2016 + j]

        df_train = df_iter[df_iter.year.isin(train_years)]
        df_val = df_iter[df_iter.year.isin(val_years)]
        df_test = df_iter[df_iter.year.isin(test_years)]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        X_train = df_train.iloc[:, 16:]
        X_val = df_val.iloc[:, 16:]
        X_test = df_test.iloc[:, 16:]

        # Grid search: fit on train, score each candidate on validation.
        val_r2_list = []

        for i in tqdm(range(len(tuning_parameters))):
            params = tuning_parameters[i]

            # The grid's 'learning_rate' is handed to XGBoost under its alias 'eta'.
            GBM = xgb.XGBRegressor(n_estimators=params['n_estimators'],
                                   eta=params['learning_rate'],
                                   max_depth=params['max_depth'], seed=123)

            GBM.fit(X_train, y_train)

            y_pred_val = GBM.predict(X_val)

            val_r2_list.append(r2_score(y_val, y_pred_val) * 100)

        # Select the best candidate once, after the whole grid has been scored.
        opt = tuning_parameters[np.argmax(val_r2_list)]

        # Refit the winning configuration on the training window only.
        GBM = xgb.XGBRegressor(n_estimators=opt['n_estimators'],
                               eta=opt['learning_rate'],
                               max_depth=opt['max_depth'], seed=123)

        GBM.fit(X_train, y_train)

        y_pred_test = GBM.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

        print(name, j, opt)

    # Pooled out-of-sample R^2 across the five test years, in percent.
    R2_list_gbm.append([name, r2_score(true, predicted) * 100])

print(R2_list_gbm)

100%|██████████| 4/4 [00:52<00:00, 13.21s/it]


IV_D_C_30 1 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [00:55<00:00, 13.89s/it]


IV_D_C_30 2 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:04<00:00, 16.14s/it]


IV_D_C_30 3 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:08<00:00, 17.17s/it]


IV_D_C_30 4 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:12<00:00, 18.07s/it]


IV_D_C_30 5 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [00:56<00:00, 14.19s/it]


IV_D_P_30 1 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:00<00:00, 15.23s/it]


IV_D_P_30 2 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:04<00:00, 16.19s/it]


IV_D_P_30 3 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:08<00:00, 17.24s/it]


IV_D_P_30 4 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:12<00:00, 18.13s/it]


IV_D_P_30 5 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [00:56<00:00, 14.19s/it]


IV_D_C_60 1 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:00<00:00, 15.12s/it]


IV_D_C_60 2 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:04<00:00, 16.05s/it]


IV_D_C_60 3 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:08<00:00, 17.04s/it]


IV_D_C_60 4 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:11<00:00, 18.00s/it]


IV_D_C_60 5 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [00:56<00:00, 14.18s/it]


IV_D_P_60 1 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:00<00:00, 15.15s/it]


IV_D_P_60 2 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:04<00:00, 16.05s/it]


IV_D_P_60 3 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:08<00:00, 17.17s/it]


IV_D_P_60 4 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:13<00:00, 18.34s/it]


IV_D_P_60 5 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [00:56<00:00, 14.18s/it]


IV_D_C_91 1 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:01<00:00, 15.46s/it]


IV_D_C_91 2 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:04<00:00, 16.15s/it]


IV_D_C_91 3 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:07<00:00, 16.93s/it]


IV_D_C_91 4 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:11<00:00, 17.96s/it]


IV_D_C_91 5 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [00:56<00:00, 14.15s/it]


IV_D_P_91 1 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:00<00:00, 15.05s/it]


IV_D_P_91 2 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:04<00:00, 16.03s/it]


IV_D_P_91 3 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:07<00:00, 16.99s/it]


IV_D_P_91 4 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01}


100%|██████████| 4/4 [01:12<00:00, 18.09s/it]


IV_D_P_91 5 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}
[['IV_D_C_30', 11.584502819383413], ['IV_D_P_30', 12.150676223030466], ['IV_D_C_60', 11.940242235384302], ['IV_D_P_60', 14.31966262442267], ['IV_D_C_91', 12.444738249251852], ['IV_D_P_91', 12.14539968376469]]


In [5]:
#FINVAR ONLY
#
# Same walk-forward XGBoost evaluation, restricted to the first target
# (df column 9) and to the feature columns at positions 16..59.
# NOTE(review): presumably columns 16..59 are the "FINVAR" feature group --
# confirm against the dataset layout.
#
# Five expanding windows (j = 1..5) are evaluated:
#   train: years 1996 .. 2012 + j
#   val:   years 2013 + j .. 2015 + j   (used only to pick hyper-parameters)
#   test:  year  2016 + j
# Test predictions from all five windows are pooled and a single R^2 (in
# percent) is appended to R2_list_gbm as [target_name, r2].
#
# The splits are held in ordinary variables. The previous version wrote them
# through locals()['df_train_N'] and read them back through globals(), which
# only works at module scope and relies on mutating the locals() dict.

R2_list_gbm = []
feature_importance_list = []  # initialised here but not filled in this cell

for name in df.columns[9:10]:

    # Keep only rows whose target is at or below its full-sample 95th percentile.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in range(1, 6):
        train_years = np.arange(1996, 2013 + j, 1)
        val_years = [2013 + j, 2014 + j, 2015 + j]
        test_years = [2016 + j]

        df_train = df_iter[df_iter.year.isin(train_years)]
        df_val = df_iter[df_iter.year.isin(val_years)]
        df_test = df_iter[df_iter.year.isin(test_years)]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        X_train = df_train.iloc[:, 16:60]
        X_val = df_val.iloc[:, 16:60]
        X_test = df_test.iloc[:, 16:60]

        # Grid search: fit on train, score each candidate on validation.
        val_r2_list = []

        for i in tqdm(range(len(tuning_parameters))):
            params = tuning_parameters[i]

            # The grid's 'learning_rate' is handed to XGBoost under its alias 'eta'.
            GBM = xgb.XGBRegressor(n_estimators=params['n_estimators'],
                                   eta=params['learning_rate'],
                                   max_depth=params['max_depth'], seed=123)

            GBM.fit(X_train, y_train)

            y_pred_val = GBM.predict(X_val)

            val_r2_list.append(r2_score(y_val, y_pred_val) * 100)

        # Select the best candidate once, after the whole grid has been scored.
        opt = tuning_parameters[np.argmax(val_r2_list)]

        # Refit the winning configuration on the training window only.
        GBM = xgb.XGBRegressor(n_estimators=opt['n_estimators'],
                               eta=opt['learning_rate'],
                               max_depth=opt['max_depth'], seed=123)

        GBM.fit(X_train, y_train)

        y_pred_test = GBM.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

        print(name, j, opt)

    # Pooled out-of-sample R^2 across the five test years, in percent.
    R2_list_gbm.append([name, r2_score(true, predicted) * 100])

print(R2_list_gbm)

100%|██████████| 4/4 [00:51<00:00, 12.94s/it]


IV_D_C_30 1 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.01}


100%|██████████| 4/4 [00:55<00:00, 13.78s/it]


IV_D_C_30 2 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.01}


100%|██████████| 4/4 [00:58<00:00, 14.68s/it]


IV_D_C_30 3 {'n_estimators': 1000, 'max_depth': 1, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:02<00:00, 15.71s/it]


IV_D_C_30 4 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}


100%|██████████| 4/4 [01:05<00:00, 16.43s/it]


IV_D_C_30 5 {'n_estimators': 1000, 'max_depth': 2, 'learning_rate': 0.1}
[['IV_D_C_30', 11.386394138460599]]
