# Preliminaries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import sys
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

#sys.path.append("/Users/paolo/Documents/methods/CMI_FS")
#from feature_selection import forwardFeatureSelection

sys.path.append("../LinCFA")
from LinCFA import LinCFA

sys.path.append("../NonLinCFA")
from NonLinCFA import NonLinCFA

sys.path.append("../GenLinCFA")
from GenLinCFA import GenLinCFA

sys.path.append("../droughts")
from aux import prepare_target,prepare_features,compare_methods

#from aux import standardize,unfold_dataset,compute_r2,prepare_target,prepare_features,aggregate_unfolded_data,aggregate_unfolded_data_onlyTrain,FS_with_linearWrapper,compare_methods, compute_r2


# Data

In [2]:
### Load and standardize target
target_df_train,target_df_val,target_df_test,target_df_trainVal = prepare_target('',max_train='2010-01-01', max_val='2015-01-01', max_test='2020-01-01', path='../droughts/Emiliani1.csv')


target samples:            date      mean  median  year  week  mean_std
0    2001-01-05  0.379890    0.50  2001     1 -0.382765
1    2001-01-13  0.482679    0.58  2001     2  0.319215
2    2001-01-21  0.516259    0.59  2001     3  0.548542
3    2001-01-29  0.434421    0.50  2001     5 -0.010351
4    2001-02-06  0.494805    0.54  2001     6  0.402030
..          ...       ...     ...   ...   ...       ...
406  2009-11-27  0.427085    0.43  2009    48 -0.060454
407  2009-12-05  0.547380    0.57  2009    49  0.761079
408  2009-12-13  0.531070    0.58  2009    50  0.649694
409  2009-12-21  0.295704    0.00  2009    52 -0.957702
410  2009-12-29  0.027861    0.00  2009    53 -2.786888

[411 rows x 6 columns]
 target shapes: ((411, 6), (228, 6), (639, 6), (228, 6))


In [3]:
### Load and standardize features
variables_list = ['cyclostationary_mean_tg', 
                 'cyclostationary_mean_tg_1w',
                 'cyclostationary_mean_tg_4w', 
                 'cyclostationary_mean_tg_8w',
                 'cyclostationary_mean_tg_12w', 
                 'cyclostationary_mean_tg_16w',
                 'cyclostationary_mean_tg_24w',
                 'cyclostationary_mean_rr', 
                 'cyclostationary_mean_rr_1w',
                 'cyclostationary_mean_rr_4w', 
                 'cyclostationary_mean_rr_8w',
                 'cyclostationary_mean_rr_12w', 
                 'cyclostationary_mean_rr_16w',
                 'cyclostationary_mean_rr_24w'
                 ]

df_train = pd.DataFrame()
df_val = pd.DataFrame()
df_test = pd.DataFrame()
df_trainVal = pd.DataFrame()

for variable in variables_list:
    df_train_unfolded_std, df_val_unfolded_std, df_test_unfolded_std,df_trainVal_unfolded_std = prepare_features('../droughts/Emiliani1_aggreg.csv',variable,False,max_train='2010-01-01', max_val='2015-01-01', max_test='2020-01-01')
    df_train_unfolded_std = df_train_unfolded_std.add_prefix(variable)
    df_val_unfolded_std = df_val_unfolded_std.add_prefix(variable)
    df_test_unfolded_std = df_test_unfolded_std.add_prefix(variable)
    df_trainVal_unfolded_std = df_trainVal_unfolded_std.add_prefix(variable)
    df_train = pd.concat((df_train,df_train_unfolded_std),axis=1)
    df_val = pd.concat((df_val,df_val_unfolded_std),axis=1)
    df_test = pd.concat((df_test,df_test_unfolded_std),axis=1)
    df_trainVal = pd.concat((df_trainVal,df_trainVal_unfolded_std),axis=1)
    
df_trainVal

Unnamed: 0,cyclostationary_mean_tgmean_12.149860342381333_43.74986055078544,cyclostationary_mean_tgmean_12.149860342381333_43.8498605504681,cyclostationary_mean_tgmean_12.149860342381333_43.94986055015075,cyclostationary_mean_tgmean_12.149860342381333_44.04986054983341,cyclostationary_mean_tgmean_12.149860342381333_44.14986054951606,cyclostationary_mean_tgmean_12.149860342381333_44.24986054919872,cyclostationary_mean_tgmean_12.149860342381333_44.34986054888138,cyclostationary_mean_tgmean_12.149860342381333_44.44986054856403,cyclostationary_mean_tgmean_12.149860342381333_44.54986054824669,cyclostationary_mean_tgmean_12.149860342381333_44.64986054792934,...,cyclostationary_mean_rr_24wmean_11.349860345581744_44.849860547294654,cyclostationary_mean_rr_24wmean_11.449860345181692_44.14986054951606,cyclostationary_mean_rr_24wmean_11.449860345181692_44.24986054919872,cyclostationary_mean_rr_24wmean_11.449860345181692_44.34986054888138,cyclostationary_mean_rr_24wmean_11.449860345181692_44.44986054856403,cyclostationary_mean_rr_24wmean_11.449860345181692_44.54986054824669,cyclostationary_mean_rr_24wmean_11.449860345181692_44.64986054792934,cyclostationary_mean_rr_24wmean_11.449860345181692_44.749860547612,cyclostationary_mean_rr_24wmean_11.449860345181692_44.849860547294654,cyclostationary_mean_rr_24wmean_10.749860347982052_44.24986054919872
0,0.719982,0.737633,0.791074,0.848973,0.983768,1.172439,1.201792,1.127842,0.986823,0.794499,...,2.331970,6.168345,6.215139,5.406085,4.240808,4.540304,2.472063,2.105030,2.296531,3.363161
1,1.845931,2.162888,2.278888,2.313132,2.330406,2.363919,2.119857,2.008556,1.797622,1.626867,...,1.670322,3.527914,3.841691,3.108603,2.947959,2.824932,1.593425,1.462249,1.728906,3.004583
2,-0.312107,-0.404970,-0.361520,-0.116017,0.037568,0.199148,0.215971,0.233859,0.106281,-0.117314,...,1.118337,1.915294,2.358315,2.981883,2.858264,2.382361,1.067353,1.006560,1.055914,1.950734
3,1.666516,1.661740,1.601745,1.595421,1.579804,1.588420,1.391649,1.185715,1.141724,0.968893,...,0.534584,2.429799,2.789340,2.202089,1.991279,1.546552,0.466434,0.489244,0.506180,2.157730
4,-0.031846,-0.003213,-0.013352,0.122002,0.295776,0.473944,0.532393,0.421560,0.238369,-0.025356,...,1.056396,2.718182,3.000691,3.082463,3.061446,2.457115,1.281718,1.138486,1.142762,2.350811
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,0.578850,0.609135,0.689113,0.967141,1.273710,1.237462,1.007356,1.032989,1.194737,1.011196,...,0.690343,1.041006,0.388026,0.123181,0.347596,0.465708,-0.240666,0.449174,0.650788,1.164679
635,1.243632,1.043466,0.783567,0.922940,1.124030,1.209410,1.205085,1.367888,1.900526,1.924692,...,0.666434,0.955184,0.294397,0.090659,0.410255,0.581733,-0.169828,0.462829,0.652467,1.096017
636,0.267909,0.598843,0.778573,1.011130,1.340522,1.491446,1.432183,1.526572,1.965525,1.990127,...,0.894777,1.047647,0.425352,0.262704,0.623916,0.826442,0.059008,0.692457,0.885935,1.242053
637,0.957833,1.053095,1.249241,1.296098,1.319729,1.135229,0.976383,1.250418,1.632408,1.738173,...,0.878993,1.036936,0.456154,0.304824,0.500595,0.733759,0.026283,0.689393,0.807281,1.119456


In [4]:
### Together
df_trainVal_withTar = pd.concat((df_trainVal,target_df_trainVal['mean_std']), axis=1)
df_trainVal_withTar

Unnamed: 0,cyclostationary_mean_tgmean_12.149860342381333_43.74986055078544,cyclostationary_mean_tgmean_12.149860342381333_43.8498605504681,cyclostationary_mean_tgmean_12.149860342381333_43.94986055015075,cyclostationary_mean_tgmean_12.149860342381333_44.04986054983341,cyclostationary_mean_tgmean_12.149860342381333_44.14986054951606,cyclostationary_mean_tgmean_12.149860342381333_44.24986054919872,cyclostationary_mean_tgmean_12.149860342381333_44.34986054888138,cyclostationary_mean_tgmean_12.149860342381333_44.44986054856403,cyclostationary_mean_tgmean_12.149860342381333_44.54986054824669,cyclostationary_mean_tgmean_12.149860342381333_44.64986054792934,...,cyclostationary_mean_rr_24wmean_11.449860345181692_44.14986054951606,cyclostationary_mean_rr_24wmean_11.449860345181692_44.24986054919872,cyclostationary_mean_rr_24wmean_11.449860345181692_44.34986054888138,cyclostationary_mean_rr_24wmean_11.449860345181692_44.44986054856403,cyclostationary_mean_rr_24wmean_11.449860345181692_44.54986054824669,cyclostationary_mean_rr_24wmean_11.449860345181692_44.64986054792934,cyclostationary_mean_rr_24wmean_11.449860345181692_44.749860547612,cyclostationary_mean_rr_24wmean_11.449860345181692_44.849860547294654,cyclostationary_mean_rr_24wmean_10.749860347982052_44.24986054919872,mean_std
0,0.719982,0.737633,0.791074,0.848973,0.983768,1.172439,1.201792,1.127842,0.986823,0.794499,...,6.168345,6.215139,5.406085,4.240808,4.540304,2.472063,2.105030,2.296531,3.363161,-0.382765
1,1.845931,2.162888,2.278888,2.313132,2.330406,2.363919,2.119857,2.008556,1.797622,1.626867,...,3.527914,3.841691,3.108603,2.947959,2.824932,1.593425,1.462249,1.728906,3.004583,0.319215
2,-0.312107,-0.404970,-0.361520,-0.116017,0.037568,0.199148,0.215971,0.233859,0.106281,-0.117314,...,1.915294,2.358315,2.981883,2.858264,2.382361,1.067353,1.006560,1.055914,1.950734,0.548542
3,1.666516,1.661740,1.601745,1.595421,1.579804,1.588420,1.391649,1.185715,1.141724,0.968893,...,2.429799,2.789340,2.202089,1.991279,1.546552,0.466434,0.489244,0.506180,2.157730,-0.010351
4,-0.031846,-0.003213,-0.013352,0.122002,0.295776,0.473944,0.532393,0.421560,0.238369,-0.025356,...,2.718182,3.000691,3.082463,3.061446,2.457115,1.281718,1.138486,1.142762,2.350811,0.402030
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
634,0.578850,0.609135,0.689113,0.967141,1.273710,1.237462,1.007356,1.032989,1.194737,1.011196,...,1.041006,0.388026,0.123181,0.347596,0.465708,-0.240666,0.449174,0.650788,1.164679,-0.320516
635,1.243632,1.043466,0.783567,0.922940,1.124030,1.209410,1.205085,1.367888,1.900526,1.924692,...,0.955184,0.294397,0.090659,0.410255,0.581733,-0.169828,0.462829,0.652467,1.096017,0.137056
636,0.267909,0.598843,0.778573,1.011130,1.340522,1.491446,1.432183,1.526572,1.965525,1.990127,...,1.047647,0.425352,0.262704,0.623916,0.826442,0.059008,0.692457,0.885935,1.242053,0.305368
637,0.957833,1.053095,1.249241,1.296098,1.319729,1.135229,0.976383,1.250418,1.632408,1.738173,...,1.036936,0.456154,0.304824,0.500595,0.733759,0.026283,0.689393,0.807281,1.119456,-0.586199


# NonLinCFA

In [10]:
### Run with script to parallelize
import pickle
with open('../results/droughts_NonLinCFA.pkl', 'rb') as f:
    nonLinCFA_res = pickle.load(f)

nonLinCFA_res
#for eps in [0.01,0.001,0.0001,0.00001,0.000001]:
#    for curr_seed in [0,1,2,3]:
#        curr_df_trainVal = df_trainVal[np.random.default_rng(seed=curr_seed).permutation(df_trainVal.columns.values)]
#        curr_df_test = df_test[np.random.default_rng(seed=curr_seed).permutation(df_test.columns.values)]
#        curr_df_trainVal_withTar = pd.concat((curr_df_trainVal,target_df_trainVal['mean_std']), axis=1)
#        
#        output = NonLinCFA(curr_df_trainVal_withTar,'mean_std', eps, -5 , 0).compute_clusters()
#        
#        aggregate_trainVal = pd.DataFrame()
#        aggregate_test = pd.DataFrame()
#        for i in range(len(output)):
#            aggregate_trainVal[str(i)] = curr_df_trainVal_withTar[output[i]].mean(axis=1)
#            aggregate_trainVal = aggregate_trainVal.copy()
#            aggregate_test[str(i)] = curr_df_test[output[i]].mean(axis=1)
#            aggregate_test = aggregate_test.copy()
#        print(f'Number of aggregated features: {len(output)}\n')
#        compare_methods(aggregate_trainVal, aggregate_test, target_df_trainVal, target_df_test, list(aggregate_trainVal.columns))


[[0.01, 0, 7, 0.3185643694838668],
 [0.01, 1, 9, 0.27697135091242076],
 [0.01, 2, 7, 0.2805071762678476],
 [0.01, 3, 6, 0.284424091830241],
 [0.01, 4, 6, 0.3145161825852978],
 [0.001, 0, 13, 0.22432066655583394],
 [0.001, 1, 11, 0.27781824096242425],
 [0.001, 2, 11, 0.27404608133044606],
 [0.001, 3, 12, 0.23752764598521392],
 [0.001, 4, 10, 0.2597891626933886],
 [0.0001, 0, 13, 0.27147372750196175],
 [0.0001, 1, 11, 0.2524542658278436],
 [0.0001, 2, 12, 0.25650136455459227],
 [0.0001, 3, 13, 0.22976989266115388],
 [0.0001, 4, 12, 0.2607828450665214],
 [1e-05, 0, 13, 0.27219577058779654],
 [1e-05, 1, 12, 0.253784182174689],
 [1e-05, 2, 13, 0.2572360525137376],
 [1e-05, 3, 13, 0.22147710794539166],
 [1e-05, 4, 11, 0.2599739911061125],
 [1e-06, 0, 13, 0.27219577058779676],
 [1e-06, 1, 12, 0.2543218592176427],
 [1e-06, 2, 13, 0.25723605251373727],
 [1e-06, 3, 13, 0.22214496392204675],
 [1e-06, 4, 11, 0.2593453992896675]]

# GenLinCFA

In [12]:
#for variable in ['cyclostationary_mean_tg']:#variables_list:
#actual_df_trainVal = df_trainVal_unfolded_std[df_trainVal_unfolded_std.columns[pd.Series(df_trainVal_unfolded_std.columns).str.startswith(variable)]]
for eps in [0.45,0.455,0.46,0.465]:
    for curr_seed in [0,1,2,3]:
        curr_df_trainVal = df_trainVal[np.random.default_rng(seed=curr_seed).permutation(df_trainVal.columns.values)]
        curr_df_test = df_test[np.random.default_rng(seed=curr_seed).permutation(df_test.columns.values)]
        curr_df_trainVal_withTar = pd.concat((curr_df_trainVal,target_df_trainVal['mean_std']), axis=1)
        
        output = GenLinCFA(curr_df_trainVal_withTar,'mean_std', eps, -5 , 0, 1).compute_clusters()
        
        aggregate_trainVal = pd.DataFrame()
        aggregate_test = pd.DataFrame()
        for i in range(len(output)):
            aggregate_trainVal[str(i)] = curr_df_trainVal_withTar[output[i]].mean(axis=1)
            aggregate_trainVal = aggregate_trainVal.copy()
            aggregate_test[str(i)] = curr_df_test[output[i]].mean(axis=1)
            aggregate_test = aggregate_test.copy()
        print(f'Number of aggregated features: {len(output)}\n')
        compare_methods(aggregate_trainVal, aggregate_test, target_df_trainVal, target_df_test, list(aggregate_trainVal.columns))


Number of aggregated features: 7

Full aggregate regression train score: 0.3419645417141758, test score: 0.2734277423665955
Aggregate regression train score with FS: 0.3419645417141758, test score: 0.2734277423665955
Number of aggregated features: 6

Full aggregate regression train score: 0.34770244087100066, test score: 0.29004806013374373
Aggregate regression train score with FS: 0.34770244087100066, test score: 0.29004806013374373
Number of aggregated features: 9

Full aggregate regression train score: 0.35276158568859106, test score: 0.28546968251694593
Aggregate regression train score with FS: 0.35276158568859106, test score: 0.28546968251694593
Number of aggregated features: 6

Full aggregate regression train score: 0.34325568020302477, test score: 0.28828584371562394
Aggregate regression train score with FS: 0.34325568020302477, test score: 0.28828584371562394
Number of aggregated features: 4

Full aggregate regression train score: 0.32878601231391014, test score: 0.264842127790

# LinCFA

In [5]:
for curr_seed in [0,1,2,3,4]:
    curr_df_trainVal = df_trainVal[np.random.default_rng(seed=curr_seed).permutation(df_trainVal.columns.values)]
    curr_df_test = df_test[np.random.default_rng(seed=curr_seed).permutation(df_test.columns.values)]
    curr_df_trainVal_withTar = pd.concat((curr_df_trainVal,target_df_trainVal['mean_std']), axis=1)
    
    output = LinCFA(curr_df_trainVal_withTar,'mean_std', 0, neigh=0).compute_clusters()
    
    aggregate_trainVal = pd.DataFrame()
    aggregate_test = pd.DataFrame()
    for i in range(len(output)):
        aggregate_trainVal[str(i)] = curr_df_trainVal_withTar[output[i]].mean(axis=1)
        aggregate_trainVal = aggregate_trainVal.copy()
        aggregate_test[str(i)] = curr_df_test[output[i]].mean(axis=1)
        aggregate_test = aggregate_test.copy()
    print(f'Number of aggregated features: {len(output)}\n')
    compare_methods(aggregate_trainVal, aggregate_test, target_df_trainVal, target_df_test, list(aggregate_trainVal.columns))


Number of aggregated features: 225

Full aggregate regression train score: 0.6761288593584261, test score: -1.2694691757547836
Aggregate regression train score with FS: 0.6761288593584261, test score: -1.2694691757547836
Number of aggregated features: 223

Full aggregate regression train score: 0.6666080375874754, test score: -1.7018292139423892
Aggregate regression train score with FS: 0.6666080375874754, test score: -1.7018292139423892
Number of aggregated features: 223

Full aggregate regression train score: 0.6580759552486883, test score: -0.8736910841363972
Aggregate regression train score with FS: 0.6580759552486883, test score: -0.8736910841363972
Number of aggregated features: 223

Full aggregate regression train score: 0.6827870220312666, test score: -1.230733284058061
Aggregate regression train score with FS: 0.6827870220312666, test score: -1.230733284058061
Number of aggregated features: 216

Full aggregate regression train score: 0.6640882303882194, test score: -1.35456987

# PCA

In [31]:
def compute_PCA(max_components, train_df, val_df, train_target, val_target, variables):
    r2=[]
    for i in range(max_components):
        train_pca = pd.DataFrame()
        val_pca = pd.DataFrame()
        for var in variables:
            pca = PCA(n_components=i+1)
            actual_train = train_df[train_df.columns[pd.Series(train_df.columns).str.startswith(var)]]
            actual_val = val_df[val_df.columns[pd.Series(val_df.columns).str.startswith(var)]]
            train_pca = pd.concat([train_pca,pd.DataFrame(pca.fit_transform(actual_train))], axis=1)
            val_pca = pd.concat([val_pca,pd.DataFrame(pca.transform(actual_val))],axis=1)

        actual_r2 = compare_methods(train_pca, val_pca, train_target, val_target, list(train_pca.columns))
        r2.append(actual_r2)
        print(f'Components: {i+1}, R2: {actual_r2}\n')

    return r2 

In [32]:
compute_PCA(50, curr_df_trainVal, curr_df_test, target_df_trainVal, target_df_test, variables_list)

Full aggregate regression train score: 0.3631682549719679, test score: 0.3389444124989692
Aggregate regression train score with FS: 0.35709845618058467, test score: 0.33210909651343656
Components: 1, R2: 0.33210909651343656

Full aggregate regression train score: 0.3934005751537262, test score: 0.2651627976959352
Aggregate regression train score with FS: 0.3934169545949662, test score: 0.2631179639111214
Components: 2, R2: 0.2631179639111214

Full aggregate regression train score: 0.4302115238554136, test score: 0.15703973245335057
Aggregate regression train score with FS: 0.42690132463475305, test score: 0.16311659178706983
Components: 3, R2: 0.16311659178706983

Full aggregate regression train score: 0.467287375699011, test score: 0.2156302310329189
Aggregate regression train score with FS: 0.44033551942770777, test score: 0.12441762387134603
Components: 4, R2: 0.12441762387134603

Full aggregate regression train score: 0.50135674243842, test score: 0.02528660455668874
Aggregate regr

Full aggregate regression train score: 0.9406660722359005, test score: -109.80585243350828
Aggregate regression train score with FS: 0.9339454949985999, test score: -114.8051551775824
Components: 38, R2: -114.8051551775824

Full aggregate regression train score: 0.9503134654875937, test score: -76.1053445959032
Aggregate regression train score with FS: 0.9475488480425156, test score: -76.8862580312614
Components: 39, R2: -76.8862580312614

Full aggregate regression train score: 0.9546272657098303, test score: -62.50043960308911
Aggregate regression train score with FS: 0.9500316475683116, test score: -66.07667400296108
Components: 40, R2: -66.07667400296108

Full aggregate regression train score: 0.9607043492407835, test score: -93.0384420181453
Aggregate regression train score with FS: 0.9125485231611962, test score: -99.35487278189366
Components: 41, R2: -99.35487278189366

Full aggregate regression train score: 0.9700215105778527, test score: -222.19659021356102
Aggregate regression

[0.33210909651343656,
 0.2631179639111214,
 0.16311659178706983,
 0.12441762387134603,
 -0.013751811187005636,
 -0.1030683035204758,
 -0.8364687963375967,
 -0.31363683342506277,
 -0.39950683594293257,
 -0.7664303801305066,
 -1.591600447719867,
 -1.8463143467614294,
 -2.862703810471825,
 -2.3085481751193817,
 -2.7293069659968445,
 -2.444106461961161,
 -2.7025706884292666,
 -5.064147920366556,
 -2.7023040995329484,
 -5.685257127138058,
 -8.408402127780164,
 -7.556797525955677,
 -7.499411859814208,
 -11.219305377951507,
 -6.956331306783634,
 -7.513097428666136,
 -6.446748073742082,
 -10.787130559981165,
 -15.669873840715773,
 -19.962080058555493,
 -24.13244343922712,
 -40.915942687243884,
 -51.1308495725757,
 -64.93312943124737,
 -61.5427706138912,
 -80.78757549067149,
 -98.7462770988711,
 -114.8051551775824,
 -76.8862580312614,
 -66.07667400296108,
 -99.35487278189366,
 -216.88453879600465,
 -443.1570792141089,
 -950.3528987056748,
 -6198.795416462183,
 -6494.222782162498,
 -1802.3185580

In [26]:
from sklearn.decomposition import PCA
for curr_seed in [0,1,2,3,4]: 
    for i in range(50):
        pca = PCA(n_components=i+1, svd_solver='randomized', random_state=curr_seed)
        trainVal_pca = pd.DataFrame(pca.fit_transform(curr_df_trainVal))
        test_pca = pd.DataFrame(pca.transform(curr_df_test))
        actual_r2 = compare_methods(trainVal_pca, test_pca, target_df_trainVal, target_df_test, list(trainVal_pca.columns))
        r2.append([curr_seed,i,actual_r2])

Full aggregate regression train score: 0.19023370351560964, test score: 0.19622927730772388
Aggregate regression train score with FS: 0.19023370351560964, test score: 0.19622927730772388
Full aggregate regression train score: 0.2871035147853612, test score: 0.2851720515271383
Aggregate regression train score with FS: 0.2871035147853612, test score: 0.2851720515271383
Full aggregate regression train score: 0.33443279190005926, test score: 0.363152826834618
Aggregate regression train score with FS: 0.33443279190005926, test score: 0.36315282683461814
Full aggregate regression train score: 0.33493514496336974, test score: 0.3502842756986605
Aggregate regression train score with FS: 0.33493514496336974, test score: 0.3502842756986605
Full aggregate regression train score: 0.33794220405473774, test score: 0.35594978108250264
Aggregate regression train score with FS: 0.33794220405473774, test score: 0.3559497810825024
Full aggregate regression train score: 0.33801175931456817, test score: 0.

Full aggregate regression train score: 0.4854647700428454, test score: 0.1985617699516563
Aggregate regression train score with FS: 0.4854647700428454, test score: 0.19856176995165553
Full aggregate regression train score: 0.4852523324270911, test score: 0.1978785060461531
Aggregate regression train score with FS: 0.4852523324270911, test score: 0.19787850604615254
Full aggregate regression train score: 0.4867383247724053, test score: 0.19471461415503244
Aggregate regression train score with FS: 0.4867383247724054, test score: 0.19471461415503233
Full aggregate regression train score: 0.48714914831723843, test score: 0.18275297064368212
Aggregate regression train score with FS: 0.48714914831723843, test score: 0.18275297064368212
Full aggregate regression train score: 0.490307248680128, test score: 0.1644256240339831
Aggregate regression train score with FS: 0.490307248680128, test score: 0.1644256240339821
Full aggregate regression train score: 0.1902337035155891, test score: 0.196229

Full aggregate regression train score: 0.4703211495469708, test score: 0.2265509842125173
Aggregate regression train score with FS: 0.4703211495469707, test score: 0.22655098421251707
Full aggregate regression train score: 0.47015773536717864, test score: 0.22634248755070174
Aggregate regression train score with FS: 0.47015773536717875, test score: 0.22634248755070097
Full aggregate regression train score: 0.4708337027102376, test score: 0.23420461102831713
Aggregate regression train score with FS: 0.4708337027102376, test score: 0.2342046110283177
Full aggregate regression train score: 0.4857414281737912, test score: 0.19463890858501764
Aggregate regression train score with FS: 0.4857414281737912, test score: 0.19463890858501742
Full aggregate regression train score: 0.4862848168114532, test score: 0.1974940073029483
Aggregate regression train score with FS: 0.4862848168114533, test score: 0.1974940073029483
Full aggregate regression train score: 0.4868995767883817, test score: 0.1926

Full aggregate regression train score: 0.44213357837341705, test score: 0.25598462604901895
Aggregate regression train score with FS: 0.44213357837341705, test score: 0.2559846260490197
Full aggregate regression train score: 0.44309704674017814, test score: 0.24532442352153194
Aggregate regression train score with FS: 0.44309704674017814, test score: 0.24532442352153194
Full aggregate regression train score: 0.4458440813196314, test score: 0.241076047349137
Aggregate regression train score with FS: 0.4458440813196314, test score: 0.24107604734913735
Full aggregate regression train score: 0.47276619805052444, test score: 0.21377059880362392
Aggregate regression train score with FS: 0.47276619805052444, test score: 0.2137705988036228
Full aggregate regression train score: 0.470416597389417, test score: 0.22409305965980064
Aggregate regression train score with FS: 0.470416597389417, test score: 0.22409305965980097
Full aggregate regression train score: 0.4701562099677675, test score: 0.22

Full aggregate regression train score: 0.4391505417993963, test score: 0.255267286679286
Aggregate regression train score with FS: 0.4391505417993963, test score: 0.2552672866792859
Full aggregate regression train score: 0.43919541102548376, test score: 0.24647811159624222
Aggregate regression train score with FS: 0.43919541102548376, test score: 0.2464781115962421
Full aggregate regression train score: 0.4393372753771907, test score: 0.2465545974863398
Aggregate regression train score with FS: 0.4393372753771907, test score: 0.24655459748633968
Full aggregate regression train score: 0.4396564941781169, test score: 0.24763065676539575
Aggregate regression train score with FS: 0.43965649417811703, test score: 0.24763065676539597
Full aggregate regression train score: 0.4403328894961952, test score: 0.24783832601363054
Aggregate regression train score with FS: 0.4403328894961953, test score: 0.24783832601363043
Full aggregate regression train score: 0.4421338557036326, test score: 0.2561

Full aggregate regression train score: 0.4177095692666739, test score: 0.21143607194894787
Aggregate regression train score with FS: 0.4177095692666738, test score: 0.2114360719489482
Full aggregate regression train score: 0.41794965111278826, test score: 0.21790176443333076
Aggregate regression train score with FS: 0.41794965111278826, test score: 0.21790176443333054
Full aggregate regression train score: 0.4376180973273841, test score: 0.24273953441718743
Aggregate regression train score with FS: 0.4376180973273841, test score: 0.2427395344171882
Full aggregate regression train score: 0.43930702604148275, test score: 0.25585204128428596
Aggregate regression train score with FS: 0.43930702604148275, test score: 0.2558520412842865
Full aggregate regression train score: 0.43953675729559794, test score: 0.24650864277476814
Aggregate regression train score with FS: 0.4395367572955978, test score: 0.2465086427747678
Full aggregate regression train score: 0.43952689819612667, test score: 0.

In [37]:
from sklearn.decomposition import PCA
r2 = []
for curr_seed in [0,1,2,3,4]: 
    pca = PCA(n_components=0.9)
    trainVal_pca = pd.DataFrame(pca.fit_transform(curr_df_trainVal))
    test_pca = pd.DataFrame(pca.transform(curr_df_test))
    actual_r2 = compare_methods(trainVal_pca, test_pca, target_df_trainVal, target_df_test, list(trainVal_pca.columns))
    r2.append([curr_seed,i,actual_r2])
    print(test_pca.shape)
    

Full aggregate regression train score: 0.38037334157878255, test score: 0.2971017670536156
Aggregate regression train score with FS: 0.38037334157878255, test score: 0.2971017670536156
(228, 16)
Full aggregate regression train score: 0.38037334157878255, test score: 0.2971017670536156
Aggregate regression train score with FS: 0.38037334157878255, test score: 0.2971017670536156
(228, 16)
Full aggregate regression train score: 0.38037334157878255, test score: 0.2971017670536156
Aggregate regression train score with FS: 0.38037334157878255, test score: 0.2971017670536156
(228, 16)
Full aggregate regression train score: 0.38037334157878255, test score: 0.2971017670536156
Aggregate regression train score with FS: 0.38037334157878255, test score: 0.2971017670536156
(228, 16)
Full aggregate regression train score: 0.38037334157878255, test score: 0.2971017670536156
Aggregate regression train score with FS: 0.38037334157878255, test score: 0.2971017670536156
(228, 16)


In [None]:
from sklearn.manifold import Isomap

for i in range(50):
    lda = Isomap(n_components=i+1)
    trainVal_lda = pd.DataFrame(lda.fit_transform(df.iloc[:-392,:-1]))
    test_lda = pd.DataFrame(lda.transform(df.iloc[-392:,:-1]))
    actual_r2 = compare_methods(trainVal_lda, test_lda, df.iloc[:-392,:], df.iloc[-392:,:], list(trainVal_lda.columns))
    r2.append([curr_seed,i,actual_r2])
    print(test_lda.shape)