In [1]:
#load libraries

import numpy as np
import pandas as pd
from sklearn.metrics import \
    r2_score, get_scorer
from sklearn.linear_model import \
    LinearRegression
from sklearn.preprocessing import \
    StandardScaler

from sklearn.preprocessing import MinMaxScaler

from scipy.stats.mstats import winsorize

from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import ParameterGrid

In [2]:
# Load the raw panel, order it chronologically, winsorize the predictors
# and rescale them to [-1, 1].

# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this exact file location exists.
df = (
    pd.read_csv('C:\\Users\\Martijn\\Documents\\Dataset_unscaled.csv', index_col=0)
    .sort_values(by=['year', 'month'])
    .reset_index(drop=True)
)

# Clip the lowest and highest 10% of every column from position 16 onward.
df.iloc[:, 16:] = df.iloc[:, 16:].apply(lambda col: winsorize(col, (0.1, 0.1)))

# Scaling starts one column earlier (position 15) than the winsorization above.
# NOTE(review): confirm that this offset is intentional.
cols_to_scale = df.columns[15:].tolist()
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))

# NOTE(review): both the winsorization limits and the min/max used here are
# derived from the full sample, before any split by year is made.
df[cols_to_scale] = min_max_scaler.fit_transform(df[cols_to_scale])

In [5]:
# Candidate values for the random-forest hyper-parameter search.

# Number of trees in each forest (single value, so effectively fixed).
# Disabled alternatives: 50, 150, 300.
n_estimators = [300]

# Number of features considered at every split.
max_features = ['sqrt']

# Maximum number of levels in a tree.
max_depth = [3, 6, 12]

# Minimum number of samples a node must hold before it may be split.
min_samples_split = [15, 30, 50]

# Whether each tree is fitted on a bootstrap sample of the training rows.
bootstrap = [True]

# Disabled search dimensions:
#   max_leaf_nodes = [25, 50]
#   min_samples_leaf = [1, 3, 5]

In [6]:
# Collect the candidate lists into one search space and expand it into every
# hyper-parameter combination.
param_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    bootstrap=bootstrap,
)

# Despite its name, train_features holds hyper-parameter combinations,
# not model input columns.
train_features = ParameterGrid(param_grid)

In [7]:
# Walk-forward random-forest evaluation using every predictor column.
#
# For each target (column positions 9 through 14 of df) five expanding-window
# folds are run. In fold j the model is trained on 1996 .. 2012+j, tuned on
# the three following years and tested on year 2016+j. Test predictions of
# all folds are pooled and a single out-of-sample R^2 (in percent) is stored
# per target in R2_list_rf.

R2_list_rf = []
feature_importance_list = []  # not filled in this cell

for name in df.iloc[:, 9:15]:

    # Drop rows whose target lies above its 95th percentile.
    # NOTE(review): the percentile is taken over the full sample, so the
    # filter also removes rows from the validation and test years.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in range(1, 6):
        # Build the fold as ordinary variables instead of writing and reading
        # dynamically named entries through locals()/globals().
        train_years = np.arange(1996, 2013 + j)
        val_years = [2013 + j, 2014 + j, 2015 + j]
        test_years = [2016 + j]

        df_train = df_iter[df_iter.year.isin(train_years)]
        df_val = df_iter[df_iter.year.isin(val_years)]
        df_test = df_iter[df_iter.year.isin(test_years)]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        # Predictors are every column from position 16 onward.
        X_train = df_train.iloc[:, 16:]
        X_val = df_val.iloc[:, 16:]
        X_test = df_test.iloc[:, 16:]

        val_r2_list = []

        for i in tqdm(range(len(train_features))):
            params = train_features[i]

            regressor = RandomForestRegressor(**params, random_state=123)
            regressor.fit(X_train, y_train)

            y_pred_val = regressor.predict(X_val)
            val_r2_list.append(r2_score(y_val, y_pred_val) * 100)

        # Pick the combination with the highest validation R^2 once, after
        # the whole grid has been scored.
        opt = train_features[np.argmax(val_r2_list)]

        # Refit on the training window only with the selected settings.
        regressor = RandomForestRegressor(**opt, random_state=123)
        regressor.fit(X_train, y_train)

        y_pred_test = regressor.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

        print(name, j, opt)

    # Pooled out-of-sample R^2 over all five test years, in percent.
    R2_list_rf.append([name, r2_score(true, predicted) * 100])

print(R2_list_rf)


100%|██████████| 9/9 [07:46<00:00, 51.82s/it]


IV_D_C_30 1 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [08:18<00:00, 55.37s/it]


IV_D_C_30 2 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [08:55<00:00, 59.47s/it]


IV_D_C_30 3 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [09:28<00:00, 63.22s/it]


IV_D_C_30 4 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [08:55<00:00, 59.53s/it]


IV_D_C_30 5 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [05:59<00:00, 39.89s/it]


IV_D_P_30 1 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:25<00:00, 42.89s/it]


IV_D_P_30 2 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [06:53<00:00, 45.95s/it]


IV_D_P_30 3 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:20<00:00, 48.98s/it]


IV_D_P_30 4 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:48<00:00, 52.08s/it]


IV_D_P_30 5 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [05:57<00:00, 39.74s/it]


IV_D_C_60 1 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:25<00:00, 42.84s/it]


IV_D_C_60 2 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:51<00:00, 45.67s/it]


IV_D_C_60 3 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:19<00:00, 48.86s/it]


IV_D_C_60 4 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:47<00:00, 51.98s/it]


IV_D_C_60 5 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [05:57<00:00, 39.77s/it]


IV_D_P_60 1 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:25<00:00, 42.81s/it]


IV_D_P_60 2 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:51<00:00, 45.68s/it]


IV_D_P_60 3 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:18<00:00, 48.70s/it]


IV_D_P_60 4 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:47<00:00, 51.94s/it]


IV_D_P_60 5 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [05:55<00:00, 39.49s/it]


IV_D_C_91 1 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:23<00:00, 42.57s/it]


IV_D_C_91 2 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:50<00:00, 45.56s/it]


IV_D_C_91 3 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:18<00:00, 48.69s/it]


IV_D_C_91 4 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:45<00:00, 51.75s/it]


IV_D_C_91 5 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [05:54<00:00, 39.43s/it]


IV_D_P_91 1 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:22<00:00, 42.51s/it]


IV_D_P_91 2 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [06:48<00:00, 45.44s/it]


IV_D_P_91 3 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:16<00:00, 48.55s/it]


IV_D_P_91 4 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [07:44<00:00, 51.63s/it]


IV_D_P_91 5 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}
[['IV_D_C_30', 7.877108474840466], ['IV_D_P_30', 8.255414677659456], ['IV_D_C_60', 9.882378795732317], ['IV_D_P_60', 9.733361622710824], ['IV_D_C_91', 11.03872139617852], ['IV_D_P_91', 10.444898615878472]]


In [8]:
#FINVAR ONLY
#
# Walk-forward random-forest evaluation restricted to the predictor columns
# at positions 16 through 59 and to the single target at column position 9.
#
# Five expanding-window folds are run. In fold j the model is trained on
# 1996 .. 2012+j, tuned on the three following years and tested on year
# 2016+j. Test predictions of all folds are pooled into one out-of-sample
# R^2 (in percent).

# NOTE(review): this rebinds R2_list_rf, so any results stored under that
# name by a previously executed cell are discarded.
R2_list_rf = []
feature_importance_list = []  # not filled in this cell

for name in df.iloc[:, 9:10]:

    # Drop rows whose target lies above its 95th percentile.
    # NOTE(review): the percentile is taken over the full sample, so the
    # filter also removes rows from the validation and test years.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in range(1, 6):
        # Build the fold as ordinary variables instead of writing and reading
        # dynamically named entries through locals()/globals().
        train_years = np.arange(1996, 2013 + j)
        val_years = [2013 + j, 2014 + j, 2015 + j]
        test_years = [2016 + j]

        df_train = df_iter[df_iter.year.isin(train_years)]
        df_val = df_iter[df_iter.year.isin(val_years)]
        df_test = df_iter[df_iter.year.isin(test_years)]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        # Predictors are limited to column positions 16 through 59.
        X_train = df_train.iloc[:, 16:60]
        X_val = df_val.iloc[:, 16:60]
        X_test = df_test.iloc[:, 16:60]

        val_r2_list = []

        for i in tqdm(range(len(train_features))):
            params = train_features[i]

            regressor = RandomForestRegressor(**params, random_state=123)
            regressor.fit(X_train, y_train)

            y_pred_val = regressor.predict(X_val)
            val_r2_list.append(r2_score(y_val, y_pred_val) * 100)

        # Pick the combination with the highest validation R^2 once, after
        # the whole grid has been scored.
        opt = train_features[np.argmax(val_r2_list)]

        # Refit on the training window only with the selected settings.
        regressor = RandomForestRegressor(**opt, random_state=123)
        regressor.fit(X_train, y_train)

        y_pred_test = regressor.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

        print(name, j, opt)

    # Pooled out-of-sample R^2 over all five test years, in percent.
    R2_list_rf.append([name, r2_score(true, predicted) * 100])

print(R2_list_rf)


100%|██████████| 9/9 [04:43<00:00, 31.52s/it]


IV_D_C_30 1 {'n_estimators': 300, 'min_samples_split': 30, 'max_features': 'sqrt', 'max_depth': 6, 'bootstrap': True}


100%|██████████| 9/9 [05:05<00:00, 34.00s/it]


IV_D_C_30 2 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [05:28<00:00, 36.47s/it]


IV_D_C_30 3 {'n_estimators': 300, 'min_samples_split': 50, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [05:49<00:00, 38.85s/it]


IV_D_C_30 4 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}


100%|██████████| 9/9 [06:12<00:00, 41.43s/it]


IV_D_C_30 5 {'n_estimators': 300, 'min_samples_split': 15, 'max_features': 'sqrt', 'max_depth': 12, 'bootstrap': True}
[['IV_D_C_30', 8.717796480542738]]
