In [1]:
#load libraries

import numpy as np
import pandas as pd
from sklearn.metrics import \
    r2_score, get_scorer
from sklearn.linear_model import \
    Lasso, Ridge, LassoCV,LinearRegression
from sklearn.preprocessing import \
    StandardScaler, PolynomialFeatures
from sklearn.model_selection import \
    KFold, RepeatedKFold, GridSearchCV, \
    cross_validate, train_test_split


from sklearn.preprocessing import MinMaxScaler

from scipy.stats.mstats import winsorize

from tqdm import tqdm

In [3]:
#load data --> winsorize data --> scale data
#
# Reads the unscaled dataset, orders it chronologically, winsorizes the columns
# from position 16 onwards and rescales the columns from position 15 onwards to
# the range [-1, 1].

# NOTE(review): absolute path on an external volume; the notebook only runs
# on a machine where this drive is mounted.
dataset_path = '/Volumes/USB THOMAS/THESIS NEW/DATA/8. MERGE FILES (FINAL DATASET)/Dataset_unscaled.csv'

df = (
    pd.read_csv(dataset_path, index_col=0)
    .sort_values(by=['year', 'month'])
    .reset_index(drop=True)
)

# Clip the lowest and highest 10% of every column from position 16 onwards.
# NOTE(review): the limits are computed over all rows, i.e. over every year in
# the file at once — confirm this is acceptable for the later year-based splits.
df.iloc[:, 16:] = df.iloc[:, 16:].apply(lambda col: winsorize(col, (0.1, 0.1)))

# NOTE(review): scaling starts at column 15 while winsorizing starts at column
# 16 — confirm the one-column difference is intentional.
cols_to_scale = df.columns[15:].tolist()
min_max_scaler = MinMaxScaler(feature_range=(-1, 1))

# The scaler is fitted on the full frame and the scaled values are written back in place.
df[cols_to_scale] = min_max_scaler.fit_transform(df[cols_to_scale])


In [6]:
#create alpha/lambda penalty grid
# 100 evenly spaced penalty values from 0.0001 to 0.1 (both endpoints included).
alphas = np.linspace(start=0.0001, stop=0.1, num=100)

In [7]:
# Display the penalty grid to check its range and spacing.
alphas

array([0.0001    , 0.00110909, 0.00211818, 0.00312727, 0.00413636,
       0.00514545, 0.00615455, 0.00716364, 0.00817273, 0.00918182,
       0.01019091, 0.0112    , 0.01220909, 0.01321818, 0.01422727,
       0.01523636, 0.01624545, 0.01725455, 0.01826364, 0.01927273,
       0.02028182, 0.02129091, 0.0223    , 0.02330909, 0.02431818,
       0.02532727, 0.02633636, 0.02734545, 0.02835455, 0.02936364,
       0.03037273, 0.03138182, 0.03239091, 0.0334    , 0.03440909,
       0.03541818, 0.03642727, 0.03743636, 0.03844545, 0.03945455,
       0.04046364, 0.04147273, 0.04248182, 0.04349091, 0.0445    ,
       0.04550909, 0.04651818, 0.04752727, 0.04853636, 0.04954545,
       0.05055455, 0.05156364, 0.05257273, 0.05358182, 0.05459091,
       0.0556    , 0.05660909, 0.05761818, 0.05862727, 0.05963636,
       0.06064545, 0.06165455, 0.06266364, 0.06367273, 0.06468182,
       0.06569091, 0.0667    , 0.06770909, 0.06871818, 0.06972727,
       0.07073636, 0.07174545, 0.07275455, 0.07376364, 0.07477

In [8]:
#run lasso model and save R2 scores 
#
# For each target column (positions 9-14 of df) five expanding-window folds are
# run. Fold j fits on years 1996..2012+j, picks alpha on years 2013+j..2015+j
# and predicts year 2016+j. The test predictions of all five folds are pooled
# into one out-of-sample R2 (in percent) per target.
# Features are every column from position 16 onwards.

R2_list_lasso = []

for name in df.columns[9:15]:

    # Keep only rows whose target is at or below its 95th percentile.
    # NOTE(review): the percentile is taken over all years, including the
    # years later used for testing — confirm this is intended.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in tqdm(range(1,6)):
        # Build this fold's frames as ordinary variables. The previous version
        # wrote df_train_<j>/df_val_<j>/df_test_<j> through locals() and read
        # them back through globals(), which only works at module scope.
        df_train = df_iter[df_iter.year.isin(np.arange(1996, 2013 + j, 1))]
        df_val = df_iter[df_iter.year.isin([2013 + j, 2014 + j, 2015 + j])]
        df_test = df_iter[df_iter.year.isin([2016 + j])]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        X_train = df_train.iloc[:,16:]
        X_val = df_val.iloc[:,16:]
        X_test = df_test.iloc[:,16:]

        # Validation R2 (in percent) for every candidate penalty.
        val_r2_list = []

        for alpha in alphas:
            model = Lasso(alpha=alpha,random_state=123)
            model.fit(X_train,y_train)

            val_r2 = r2_score(y_val, model.predict(X_val))
            val_r2_list.append(val_r2*100)

        # np.argmax returns the first maximum, so ties go to the smaller alpha.
        best_alpha = alphas[np.argmax(val_r2_list)]
        print(name,j,[best_alpha])

        # Refit with the chosen penalty on the training window only
        # (validation rows are not added back) and predict the test year.
        model = Lasso(alpha=best_alpha,random_state=123)
        model.fit(X_train,y_train)

        y_pred_test = model.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

    # One pooled R2 over all test years of this target.
    R2_list_lasso.append([name,r2_score(true, predicted)*100])

print(R2_list_lasso)

 20%|█████████                                    | 1/5 [00:04<00:17,  4.42s/it]

IV_D_C_30 1 [0.002118181818181818]


 40%|██████████████████                           | 2/5 [00:09<00:13,  4.56s/it]

IV_D_C_30 2 [0.002118181818181818]


 60%|███████████████████████████                  | 3/5 [00:14<00:09,  4.94s/it]

IV_D_C_30 3 [0.0011090909090909092]
IV_D_C_30 4 [0.0001]


100%|█████████████████████████████████████████████| 5/5 [00:25<00:00,  5.19s/it]

IV_D_C_30 5 [0.0001]



 20%|█████████                                    | 1/5 [00:04<00:17,  4.46s/it]

IV_D_P_30 1 [0.002118181818181818]


 40%|██████████████████                           | 2/5 [00:08<00:13,  4.48s/it]

IV_D_P_30 2 [0.002118181818181818]


 60%|███████████████████████████                  | 3/5 [00:13<00:09,  4.67s/it]

IV_D_P_30 3 [0.0001]
IV_D_P_30 4 [0.0001]


 80%|████████████████████████████████████         | 4/5 [00:19<00:05,  5.17s/it]

IV_D_P_30 5 [0.0001]


100%|█████████████████████████████████████████████| 5/5 [00:26<00:00,  5.27s/it]
 20%|█████████                                    | 1/5 [00:05<00:22,  5.54s/it]

IV_D_C_60 1 [0.002118181818181818]


 40%|██████████████████                           | 2/5 [00:11<00:17,  5.67s/it]

IV_D_C_60 2 [0.002118181818181818]


 60%|███████████████████████████                  | 3/5 [00:17<00:11,  6.00s/it]

IV_D_C_60 3 [0.0011090909090909092]


 80%|████████████████████████████████████         | 4/5 [00:24<00:06,  6.24s/it]

IV_D_C_60 4 [0.0011090909090909092]
IV_D_C_60 5 [0.0001]


100%|█████████████████████████████████████████████| 5/5 [00:31<00:00,  6.32s/it]
 20%|█████████                                    | 1/5 [00:05<00:23,  5.82s/it]

IV_D_P_60 1 [0.0011090909090909092]


 40%|██████████████████                           | 2/5 [00:12<00:18,  6.22s/it]

IV_D_P_60 2 [0.002118181818181818]


 60%|███████████████████████████                  | 3/5 [00:18<00:12,  6.14s/it]

IV_D_P_60 3 [0.0011090909090909092]
IV_D_P_60 4 [0.0001]


 80%|████████████████████████████████████         | 4/5 [00:24<00:06,  6.21s/it]

IV_D_P_60 5 [0.0001]


100%|█████████████████████████████████████████████| 5/5 [00:31<00:00,  6.22s/it]
 20%|█████████                                    | 1/5 [00:05<00:20,  5.10s/it]

IV_D_C_91 1 [0.002118181818181818]


 40%|██████████████████                           | 2/5 [00:10<00:15,  5.23s/it]

IV_D_C_91 2 [0.002118181818181818]


 60%|███████████████████████████                  | 3/5 [00:15<00:10,  5.37s/it]

IV_D_C_91 3 [0.0011090909090909092]


 80%|████████████████████████████████████         | 4/5 [00:21<00:05,  5.62s/it]

IV_D_C_91 4 [0.0011090909090909092]
IV_D_C_91 5 [0.0001]


100%|█████████████████████████████████████████████| 5/5 [00:28<00:00,  5.63s/it]
 20%|█████████                                    | 1/5 [00:05<00:20,  5.04s/it]

IV_D_P_91 1 [0.002118181818181818]


 40%|██████████████████                           | 2/5 [00:10<00:15,  5.17s/it]

IV_D_P_91 2 [0.002118181818181818]


 60%|███████████████████████████                  | 3/5 [00:15<00:10,  5.37s/it]

IV_D_P_91 3 [0.0011090909090909092]


 80%|████████████████████████████████████         | 4/5 [00:21<00:05,  5.54s/it]

IV_D_P_91 4 [0.0011090909090909092]
IV_D_P_91 5 [0.0001]


100%|█████████████████████████████████████████████| 5/5 [00:28<00:00,  5.61s/it]

[['IV_D_C_30', 8.960208637141232], ['IV_D_P_30', 9.231823952930263], ['IV_D_C_60', 8.355737642291562], ['IV_D_P_60', 11.204098117989025], ['IV_D_C_91', 9.187590957937408], ['IV_D_P_91', 8.35522802737525]]





In [9]:
#run lasso model financial variables only
#
# Same expanding-window procedure as the all-features lasso run, restricted to
# the first target column (position 9) and to the feature columns at
# positions 16-59.
# NOTE(review): this rebinds R2_list_lasso, so results stored under that name
# by an earlier cell are discarded — use a distinct name if both are needed.

R2_list_lasso = []

for name in df.columns[9:10]:

    # Keep only rows whose target is at or below its 95th percentile.
    # NOTE(review): the percentile is taken over all years, including the
    # years later used for testing — confirm this is intended.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in tqdm(range(1,6)):
        # Build this fold's frames as ordinary variables. The previous version
        # wrote df_train_<j>/df_val_<j>/df_test_<j> through locals() and read
        # them back through globals(), which only works at module scope.
        df_train = df_iter[df_iter.year.isin(np.arange(1996, 2013 + j, 1))]
        df_val = df_iter[df_iter.year.isin([2013 + j, 2014 + j, 2015 + j])]
        df_test = df_iter[df_iter.year.isin([2016 + j])]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        X_train = df_train.iloc[:,16:60]
        X_val = df_val.iloc[:,16:60]
        X_test = df_test.iloc[:,16:60]

        # Validation R2 (in percent) for every candidate penalty.
        val_r2_list = []

        for alpha in alphas:
            model = Lasso(alpha=alpha,random_state=123)
            model.fit(X_train,y_train)

            val_r2 = r2_score(y_val, model.predict(X_val))
            val_r2_list.append(val_r2*100)

        # np.argmax returns the first maximum, so ties go to the smaller alpha.
        best_alpha = alphas[np.argmax(val_r2_list)]
        print(name,j,[best_alpha])

        # Refit with the chosen penalty on the training window only
        # (validation rows are not added back) and predict the test year.
        model = Lasso(alpha=best_alpha,random_state=123)
        model.fit(X_train,y_train)

        y_pred_test = model.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

    # One pooled R2 over all test years of this target.
    R2_list_lasso.append([name,r2_score(true, predicted)*100])

print(R2_list_lasso)

 20%|█████████                                    | 1/5 [00:03<00:14,  3.72s/it]

IV_D_C_30 1 [0.002118181818181818]


 40%|██████████████████                           | 2/5 [00:07<00:11,  3.82s/it]

IV_D_C_30 2 [0.002118181818181818]


 60%|███████████████████████████                  | 3/5 [00:11<00:07,  3.84s/it]

IV_D_C_30 3 [0.002118181818181818]
IV_D_C_30 4 [0.0001]


100%|█████████████████████████████████████████████| 5/5 [00:20<00:00,  4.09s/it]

IV_D_C_30 5 [0.0001]
[['IV_D_C_30', 8.716157059467145]]





In [10]:
#run ridge model and save R2 scores
#
# For each target column (positions 9-14 of df) five expanding-window folds are
# run. Fold j fits on years 1996..2012+j, picks alpha on years 2013+j..2015+j
# and predicts year 2016+j. The test predictions of all five folds are pooled
# into one out-of-sample R2 (in percent) per target.
# Features are every column from position 16 onwards.
# NOTE(review): in the saved run output every selected alpha sits at an edge
# of the grid (0.1 or 0.0001), which suggests the grid range may be too narrow
# for Ridge — consider widening it.

R2_list_ridge = []

for name in df.columns[9:15]:

    # Keep only rows whose target is at or below its 95th percentile.
    # NOTE(review): the percentile is taken over all years, including the
    # years later used for testing — confirm this is intended.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in tqdm(range(1,6)):
        # Build this fold's frames as ordinary variables. The previous version
        # wrote df_train_<j>/df_val_<j>/df_test_<j> through locals() and read
        # them back through globals(), which only works at module scope.
        df_train = df_iter[df_iter.year.isin(np.arange(1996, 2013 + j, 1))]
        df_val = df_iter[df_iter.year.isin([2013 + j, 2014 + j, 2015 + j])]
        df_test = df_iter[df_iter.year.isin([2016 + j])]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        X_train = df_train.iloc[:,16:]
        X_val = df_val.iloc[:,16:]
        X_test = df_test.iloc[:,16:]

        # Validation R2 (in percent) for every candidate penalty.
        val_r2_list = []

        for alpha in alphas:
            model = Ridge(alpha=alpha,random_state=123)
            model.fit(X_train,y_train)

            val_r2 = r2_score(y_val, model.predict(X_val))
            val_r2_list.append(val_r2*100)

        # np.argmax returns the first maximum, so ties go to the smaller alpha.
        best_alpha = alphas[np.argmax(val_r2_list)]
        print(name,j,[best_alpha])

        # Refit with the chosen penalty on the training window only
        # (validation rows are not added back) and predict the test year.
        model = Ridge(alpha=best_alpha,random_state=123)
        model.fit(X_train,y_train)

        y_pred_test = model.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

    # One pooled R2 over all test years of this target.
    R2_list_ridge.append([name,r2_score(true, predicted)*100])

print(R2_list_ridge)

 20%|█████████                                    | 1/5 [00:03<00:15,  3.93s/it]

IV_D_C_30 1 [0.1]


 40%|██████████████████                           | 2/5 [00:08<00:12,  4.18s/it]

IV_D_C_30 2 [0.1]


 60%|███████████████████████████                  | 3/5 [00:12<00:08,  4.29s/it]

IV_D_C_30 3 [0.1]


 80%|████████████████████████████████████         | 4/5 [00:17<00:04,  4.53s/it]

IV_D_C_30 4 [0.1]


100%|█████████████████████████████████████████████| 5/5 [00:22<00:00,  4.49s/it]

IV_D_C_30 5 [0.0001]



 20%|█████████                                    | 1/5 [00:04<00:16,  4.19s/it]

IV_D_P_30 1 [0.1]


 40%|██████████████████                           | 2/5 [00:08<00:12,  4.21s/it]

IV_D_P_30 2 [0.1]


 60%|███████████████████████████                  | 3/5 [00:12<00:08,  4.35s/it]

IV_D_P_30 3 [0.0001]


 80%|████████████████████████████████████         | 4/5 [00:17<00:04,  4.48s/it]

IV_D_P_30 4 [0.1]


100%|█████████████████████████████████████████████| 5/5 [00:22<00:00,  4.53s/it]

IV_D_P_30 5 [0.0001]



 20%|█████████                                    | 1/5 [00:04<00:16,  4.14s/it]

IV_D_C_60 1 [0.1]


 40%|██████████████████                           | 2/5 [00:08<00:12,  4.29s/it]

IV_D_C_60 2 [0.1]


 60%|███████████████████████████                  | 3/5 [00:13<00:08,  4.37s/it]

IV_D_C_60 3 [0.1]


 80%|████████████████████████████████████         | 4/5 [00:17<00:04,  4.56s/it]

IV_D_C_60 4 [0.1]


100%|█████████████████████████████████████████████| 5/5 [00:22<00:00,  4.56s/it]

IV_D_C_60 5 [0.0001]



 20%|█████████                                    | 1/5 [00:04<00:16,  4.05s/it]

IV_D_P_60 1 [0.1]


 40%|██████████████████                           | 2/5 [00:08<00:12,  4.19s/it]

IV_D_P_60 2 [0.1]


 60%|███████████████████████████                  | 3/5 [00:12<00:08,  4.37s/it]

IV_D_P_60 3 [0.0001]


 80%|████████████████████████████████████         | 4/5 [00:17<00:04,  4.55s/it]

IV_D_P_60 4 [0.1]


100%|█████████████████████████████████████████████| 5/5 [00:22<00:00,  4.52s/it]

IV_D_P_60 5 [0.0001]



 20%|█████████                                    | 1/5 [00:04<00:16,  4.18s/it]

IV_D_C_91 1 [0.1]


 40%|██████████████████                           | 2/5 [00:08<00:13,  4.40s/it]

IV_D_C_91 2 [0.1]


 60%|███████████████████████████                  | 3/5 [00:13<00:09,  4.59s/it]

IV_D_C_91 3 [0.1]


 80%|████████████████████████████████████         | 4/5 [00:18<00:04,  4.84s/it]

IV_D_C_91 4 [0.1]


100%|█████████████████████████████████████████████| 5/5 [00:24<00:00,  4.94s/it]

IV_D_C_91 5 [0.0001]



 20%|█████████                                    | 1/5 [00:04<00:16,  4.15s/it]

IV_D_P_91 1 [0.1]


 40%|██████████████████                           | 2/5 [00:08<00:13,  4.44s/it]

IV_D_P_91 2 [0.1]


 60%|███████████████████████████                  | 3/5 [00:13<00:09,  4.61s/it]

IV_D_P_91 3 [0.1]


 80%|████████████████████████████████████         | 4/5 [00:19<00:04,  4.95s/it]

IV_D_P_91 4 [0.1]


100%|█████████████████████████████████████████████| 5/5 [00:24<00:00,  4.91s/it]

IV_D_P_91 5 [0.0001]
[['IV_D_C_30', 9.024523579256805], ['IV_D_P_30', 9.483418442706636], ['IV_D_C_60', 11.354552002898], ['IV_D_P_60', 11.72925662935722], ['IV_D_C_91', 11.471763318284534], ['IV_D_P_91', 10.940373947007853]]





In [11]:
#run ridge model financial variables only
#
# Same expanding-window procedure as the all-features ridge run, restricted to
# the first target column (position 9) and to the feature columns at
# positions 16-59.
# NOTE(review): this rebinds R2_list_ridge, so results stored under that name
# by an earlier cell are discarded — use a distinct name if both are needed.
# NOTE(review): in the saved run output every selected alpha sits at an edge
# of the grid (0.1 or 0.0001), which suggests the grid range may be too narrow
# for Ridge — consider widening it.

R2_list_ridge = []

for name in df.columns[9:10]:

    # Keep only rows whose target is at or below its 95th percentile.
    # NOTE(review): the percentile is taken over all years, including the
    # years later used for testing — confirm this is intended.
    df_iter = df[df[name] <= df[name].quantile(0.95)].copy()

    true = []
    predicted = []

    for j in tqdm(range(1,6)):
        # Build this fold's frames as ordinary variables. The previous version
        # wrote df_train_<j>/df_val_<j>/df_test_<j> through locals() and read
        # them back through globals(), which only works at module scope.
        df_train = df_iter[df_iter.year.isin(np.arange(1996, 2013 + j, 1))]
        df_val = df_iter[df_iter.year.isin([2013 + j, 2014 + j, 2015 + j])]
        df_test = df_iter[df_iter.year.isin([2016 + j])]

        y_train = df_train[name]
        y_val = df_val[name]
        y_test = df_test[name]

        X_train = df_train.iloc[:,16:60]
        X_val = df_val.iloc[:,16:60]
        X_test = df_test.iloc[:,16:60]

        # Validation R2 (in percent) for every candidate penalty.
        val_r2_list = []

        for alpha in alphas:
            model = Ridge(alpha=alpha,random_state=123)
            model.fit(X_train,y_train)

            val_r2 = r2_score(y_val, model.predict(X_val))
            val_r2_list.append(val_r2*100)

        # np.argmax returns the first maximum, so ties go to the smaller alpha.
        best_alpha = alphas[np.argmax(val_r2_list)]
        print(name,j,[best_alpha])

        # Refit with the chosen penalty on the training window only
        # (validation rows are not added back) and predict the test year.
        model = Ridge(alpha=best_alpha,random_state=123)
        model.fit(X_train,y_train)

        y_pred_test = model.predict(X_test)

        true.extend(y_test)
        predicted.extend(y_pred_test)

    # One pooled R2 over all test years of this target.
    R2_list_ridge.append([name,r2_score(true, predicted)*100])

print(R2_list_ridge)

 20%|█████████                                    | 1/5 [00:03<00:12,  3.04s/it]

IV_D_C_30 1 [0.1]


 40%|██████████████████                           | 2/5 [00:06<00:09,  3.18s/it]

IV_D_C_30 2 [0.1]


 60%|███████████████████████████                  | 3/5 [00:09<00:06,  3.17s/it]

IV_D_C_30 3 [0.1]


 80%|████████████████████████████████████         | 4/5 [00:12<00:03,  3.19s/it]

IV_D_C_30 4 [0.1]


100%|█████████████████████████████████████████████| 5/5 [00:16<00:00,  3.39s/it]

IV_D_C_30 5 [0.0001]
[['IV_D_C_30', 8.939705747988501]]



