<a href="https://colab.research.google.com/github/RituAnilkumar/pt-gmb-ml/blob/main/CompleteRegressionGMBwithSubsets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import general use libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns # Better plots
sns.set()

# Sklearn imports for regression
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from scipy import stats # Error Assessment
import pickle # Saving models and importing other saved models

In [None]:
# Folder paths
fld='drive/MyDrive/PhD/EGU22/CSV/Variablewise/'
fld_pc='drive/MyDrive/PhD/EGU22/CSV/PCA/'
fd_2std='Std2'

# Read labels
lab_2std=np.genfromtxt(fld+fd_2std+'/label.csv', delimiter=',')

# Set a zero array to append file entries to
data_arr_2std=np.zeros((9166,1));

# Loop through the files to extract data as a 3 dimensional nparray
for i in range(14):
  # Create file names
  fn_2std=fld+fd_2std+'/feat_var_'+str(i)+'.csv'

  # Read files and stack to numpy array
  f_2std=np.genfromtxt(fn_2std,delimiter=',')
  data_arr_2std=np.append(data_arr_2std,f_2std,axis=1)

data_arr_2std=np.delete(data_arr_2std,0,1)

# Reshape labels to samples,1
lab=np.reshape(lab_2std,newshape=(-1,1))

In [None]:
# Define the number samples in the sampled datasets to be created
samp_perc=np.arange(10,101,10)
samp_num=np.round(data_arr_2std.shape[0]*samp_perc/100)
# Locations to save files
loc_save='drive/MyDrive/PhD/ML_GMB_Regress/'

In [None]:
rf_feat_imp=np.zeros(shape=(1,data_arr_2std.shape[1]))
rf_feat_imp_perm=np.zeros(shape=(1,data_arr_2std.shape[1]))
rf_train_test_metrics=pd.DataFrame()

gb_feat_imp=np.zeros(shape=(1, data_arr_2std.shape[1]))
gb_feat_imp_perm=np.zeros(shape=(1, data_arr_2std.shape[1]))
gb_train_test_metrics=pd.DataFrame()

svm_feat_imp_perm=np.zeros(shape=(1, data_arr_2std.shape[1]))
svm_train_test_metrics=pd.DataFrame()

nn_feat_imp_perm=np.zeros(shape=(1,data_arr_2std.shape[1]))
nn_train_test_metrics=pd.DataFrame()

linreg_feat_imp_perm=np.zeros(shape=(1, data_arr_2std.shape[1]))
linreg_train_test_metrics=pd.DataFrame()

np.random.seed(42)

for i in range(len(samp_num)):
  # Generate sampled dataset
  rand_sel=np.random.randint(0,data_arr_2std.shape[0],int(samp_num[i]))
  feat_samp=data_arr_2std[rand_sel,:]
  lab_samp=lab[rand_sel]
  # Generating the train and test features
  train_features, test_features, train_labels, test_labels = train_test_split(feat_samp,lab_samp, test_size = 0.3,random_state=42)

  # Extract min and max of input features
  inp1_min=np.min(train_features,axis=0)
  inp1_max=np.max(train_features,axis=0)
  # Normalize the training samples between 0 and 1 using min max
  train_features_array=(train_features-inp1_min)/(inp1_max-inp1_min)
  test_features_array=(test_features-inp1_min)/(inp1_max-inp1_min)

  # Extract min and max of input features
  inp2_min=np.min(train_labels,axis=0)
  inp2_max=np.max(train_labels,axis=0)
  # Normalize the training samples between 0 and 1 using min max
  train_labels_array=(train_labels-inp2_min)/(inp2_max-inp2_min)

  # # Not scaling the test labels as the predictions are scaled back instead
  # test_labels_array=(test_labels-inp2_min)/(inp2_max-inp2_min)
  # Reshaping labels to a single dimension for running accuracy metrics
  test_labels_resh=np.reshape(test_labels, newshape=(-1,))

  #------------------------- Random Forest Classifier---------------------------
  rf_regressor = RandomForestRegressor(oob_score=True,bootstrap=True)
  # grid_param = {'n_estimators':[2,5]
  # }
  grid_param = {'n_estimators':[50,100,200]
  }

  gd_sr_rf = GridSearchCV(estimator=rf_regressor,
                      param_grid=grid_param,
                      scoring='neg_mean_absolute_error',
                      cv=3,
                      n_jobs=-1)
  gd_sr_rf.fit(train_features_array, train_labels_array)
  df_rf = pd.DataFrame(gd_sr_rf.cv_results_)
  # Write RF training and validation results to CSV
  df_rf.to_csv(loc_save+'rf_grid_'+str(i)+'samp.csv')
  tmp_rf1=gd_sr_rf.best_estimator_.feature_importances_
  rf_feat_imp=np.append(rf_feat_imp,np.reshape(tmp_rf1,newshape=(1,-1)),axis=0)
  perm_imp_rf = permutation_importance(gd_sr_rf.best_estimator_,train_features_array, train_labels_array, n_repeats=10)
  tmp_rf2=perm_imp_rf.importances_mean
  rf_feat_imp_perm=np.append(rf_feat_imp_perm,np.reshape(tmp_rf2,newshape=(1,-1)),axis=0)

  # Assessment of the best RF classifier with training data
  rf_out_train=gd_sr_rf.predict(train_features_array)
  # Invert the normalization for labels to compare with test
  rf_out_train_unnorm=((inp2_max-inp2_min)*rf_out_train)+inp2_min
  rf_regstats_train=stats.linregress(rf_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))
  rf_rmse_train=mean_absolute_error(rf_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))

  # Assessment of the best RF classifier with testing data
  rf_out_test=gd_sr_rf.predict(test_features_array)
  # Invert the normalization for labels to compare with test
  rf_out_test_unnorm=((inp2_max-inp2_min)*rf_out_test)+inp2_min
  rf_regstats_test=stats.linregress(rf_out_test_unnorm,test_labels_resh)
  rf_rmse_test=mean_absolute_error(rf_out_test_unnorm,test_labels_resh)
  
  dict_cor={'Data':str(samp_perc[i])+'% data','Training CorrCoef':rf_regstats_train.rvalue, 'Training Intercept':rf_regstats_train.intercept,'Training Slope':rf_regstats_train.slope,'Training R2':rf_regstats_train.rvalue**2,'Training RMSE':rf_rmse_train,'Testing CorrCoef':rf_regstats_test.rvalue, 'Testing Intercept':rf_regstats_test.intercept,'Testing Slope':rf_regstats_test.slope,'Testing R2':rf_regstats_test.rvalue**2,'Testing RMSE':rf_rmse_test}
  # print(dict_cor)
  rf_train_test_metrics=rf_train_test_metrics.append(dict_cor,ignore_index=True)

  #------------------------------- Gradient Boost-------------------------------
  gb_regressor = GradientBoostingRegressor(max_depth=10)
  # grid_param = {'n_estimators':[2,5]
  # }
  grid_param = {'n_estimators':[50,100,200,300,400,500]
  }
  gd_sr_gb = GridSearchCV(estimator=gb_regressor,
                      param_grid=grid_param,
                      scoring='neg_mean_absolute_error',
                      cv=3,
                      n_jobs=-1)
  gd_sr_gb.fit(train_features_array, train_labels_array)
  df_gb = pd.DataFrame(gd_sr_gb.cv_results_)
  # Write RF training and validation results to CSV
  df_gb.to_csv(loc_save+'gb_grid_'+str(i)+'samp.csv')
  tmp_gb1=gd_sr_gb.best_estimator_.feature_importances_
  gb_feat_imp=np.append(gb_feat_imp,np.reshape(tmp_gb1,newshape=(1,-1)),axis=0)
  perm_imp_gb = permutation_importance(gd_sr_gb.best_estimator_,train_features_array, train_labels_array, n_repeats=10)
  tmp_gb2=perm_imp_gb.importances_mean
  gb_feat_imp_perm=np.append(gb_feat_imp_perm,np.reshape(tmp_gb2,newshape=(1,-1)),axis=0)

  gb_out_train=gd_sr_gb.predict(train_features_array)
  # Invert the normalization for labels to compare with test
  gb_out_train_unnorm=((inp2_max-inp2_min)*gb_out_train)+inp2_min
  gb_regstats_train=stats.linregress(gb_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))
  gb_rmse_train=mean_absolute_error(gb_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))

  gb_out_test=gd_sr_gb.predict(test_features_array)
  # Invert the normalization for labels to compare with test
  gb_out_test_unnorm=((inp2_max-inp2_min)*gb_out_test)+inp2_min
  gb_regstats_test=stats.linregress(gb_out_test_unnorm,test_labels_resh)
  gb_rmse_test=mean_absolute_error(gb_out_test_unnorm,test_labels_resh)

  dict_cor_gb={'Data':str(samp_perc[i])+'% data','Training CorrCoef':gb_regstats_train.rvalue, 'Training Intercept':gb_regstats_train.intercept,'Training Slope':gb_regstats_train.slope,'Training R2':gb_regstats_train.rvalue**2,'Training RMSE':gb_rmse_train,'Testing CorrCoef':gb_regstats_test.rvalue, 'Testing Intercept':gb_regstats_test.intercept,'Testing Slope':gb_regstats_test.slope,'Testing R2':gb_regstats_test.rvalue**2,'Testing RMSE':gb_rmse_test}
  # print(dict_cor)
  gb_train_test_metrics=gb_train_test_metrics.append(dict_cor_gb,ignore_index=True)

  #--------------------------Support Vector Machine-----------------------------
  svm_GS=svm.SVR()
  # grid_param = {'C':[0.001],
  #               'kernel':['poly'],
  #               'degree':[2]
  # }

  grid_param = {'C':[0.1,1,10],
                'kernel':['rbf','poly'],
                'degree':[4,5,6,7]
  }

  gd_sr = GridSearchCV(estimator=svm_GS,
                     param_grid=grid_param,
                     scoring='neg_mean_absolute_error',
                     cv=3,
                     n_jobs=-1)
  gd_sr.fit(train_features_array, train_labels_array)
  df_svm = pd.DataFrame(gd_sr.cv_results_)
  # Write RF training and validation results to CSV
  df_svm.to_csv(loc_save+'svm_grid_'+str(i)+'samp.csv')
  perm_imp_svm = permutation_importance(gd_sr.best_estimator_,train_features_array, train_labels_array, n_repeats=10)
  tmp_svm=perm_imp_svm.importances_mean
  svm_feat_imp_perm=np.append(svm_feat_imp_perm,np.reshape(tmp_svm,newshape=(1,-1)),axis=0)

  svm_out_train=gd_sr.predict(train_features_array)
  # Invert the normalization for labels to compare with test
  svm_out_train_unnorm=((inp2_max-inp2_min)*svm_out_train)+inp2_min
  svm_regstats_train=stats.linregress(svm_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))
  svm_rmse_train=mean_absolute_error(svm_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))

  svm_out_test=gd_sr.predict(test_features_array)
  # Invert the normalization for labels to compare with test
  svm_out_test_unnorm=((inp2_max-inp2_min)*svm_out_test)+inp2_min
  svm_regstats_test=stats.linregress(svm_out_test_unnorm,test_labels_resh)
  svm_rmse_test=mean_absolute_error(svm_out_test_unnorm,test_labels_resh)

  dict_cor_svm={'Data':str(samp_perc[i])+'% data','Training CorrCoef':svm_regstats_train.rvalue, 'Training Intercept':svm_regstats_train.intercept,'Training Slope':svm_regstats_train.slope,'Training R2':svm_regstats_train.rvalue**2,'Training RMSE':svm_rmse_train,'Testing CorrCoef':svm_regstats_test.rvalue, 'Testing Intercept':svm_regstats_test.intercept,'Testing Slope':svm_regstats_test.slope,'Testing R2':svm_regstats_test.rvalue**2,'Testing RMSE':svm_rmse_test}
  # print(dict_cor)
  svm_train_test_metrics=svm_train_test_metrics.append(dict_cor_svm,ignore_index=True)

  #------------------------- Neural Network Classifier--------------------------
  nn_regressor = MLPRegressor(max_iter=500,early_stopping=True)
  # grid_param = {'hidden_layer_sizes':[(10,),(20,)]}
  grid_param_nn = {'hidden_layer_sizes':[(10,),(50,),(100,),(200,),(300,),(400,),(500,),(400,200),(400,200,100),(500,200,100),(200,100,50),(100,50,10),(300,200,100,50),(200,100,50,10)]}

  gd_sr_nn = GridSearchCV(estimator=nn_regressor,
                      param_grid=grid_param_nn,
                      scoring='neg_mean_absolute_error',
                      cv=3,
                      n_jobs=-1)
  gd_sr_nn.fit(train_features_array, train_labels_array)
  df_nn = pd.DataFrame(gd_sr_nn.cv_results_)
  # Write RF training and validation results to CSV
  df_nn.to_csv(loc_save+'nn_grid_'+str(i)+'samp.csv')
  perm_imp_nn = permutation_importance(gd_sr_nn.best_estimator_,train_features_array, train_labels_array, n_repeats=10)
  tmp_nn2=perm_imp_nn.importances_mean
  nn_feat_imp_perm=np.append(nn_feat_imp_perm,np.reshape(tmp_nn2,newshape=(1,-1)),axis=0)

  # Assessment of the best RF classifier with training data
  nn_out_train=gd_sr_nn.predict(train_features_array)
  # Invert the normalization for labels to compare with test
  nn_out_train_unnorm=((inp2_max-inp2_min)*nn_out_train)+inp2_min
  nn_regstats_train=stats.linregress(nn_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))
  nn_rmse_train=mean_absolute_error(nn_out_train_unnorm,np.reshape(train_labels, newshape=(-1,)))

  # Assessment of the best RF classifier with testing data
  nn_out_test=gd_sr_nn.predict(test_features_array)
  # Invert the normalization for labels to compare with test
  nn_out_test_unnorm=((inp2_max-inp2_min)*nn_out_test)+inp2_min
  nn_regstats_test=stats.linregress(nn_out_test_unnorm,test_labels_resh)
  nn_rmse_test=mean_absolute_error(nn_out_test_unnorm,test_labels_resh)
  
  dict_cor={'Data':str(samp_perc[i])+'% data','Training CorrCoef':nn_regstats_train.rvalue, 'Training Intercept':nn_regstats_train.intercept,'Training Slope':nn_regstats_train.slope,'Training R2':nn_regstats_train.rvalue**2,'Training RMSE':nn_rmse_train,'Testing CorrCoef':nn_regstats_test.rvalue, 'Testing Intercept':nn_regstats_test.intercept,'Testing Slope':nn_regstats_test.slope,'Testing R2':nn_regstats_test.rvalue**2,'Testing RMSE':nn_rmse_test}
  # print(dict_cor)
  nn_train_test_metrics=nn_train_test_metrics.append(dict_cor,ignore_index=True)

  #------------------Ordinary Least Squares Linear Regression-------------------
  lin_regressor = LinearRegression(n_jobs=-1)
  lin_regressor.fit(train_features_array, train_labels_array)

  perm_imp_linreg = permutation_importance(lin_regressor,train_features_array, train_labels_array, n_repeats=10)
  tmp_linreg=perm_imp_linreg.importances_mean
  linreg_feat_imp_perm=np.append(linreg_feat_imp_perm,np.reshape(tmp_linreg,newshape=(1,-1)),axis=0)


  # Assessment of the best RF classifier with training data
  linreg_out_train=lin_regressor.predict(train_features_array)
  # Invert the normalization for labels to compare with test
  linreg_out_train_unnorm=((inp2_max-inp2_min)*linreg_out_train)+inp2_min
  print(linreg_out_train_unnorm.shape)
  print(train_labels.shape)
  linreg_regstats_train=stats.linregress(np.reshape(linreg_out_train_unnorm, newshape=(-1,)),np.reshape(train_labels, newshape=(-1,)))
  linreg_rmse_train=mean_absolute_error(np.reshape(linreg_out_train_unnorm, newshape=(-1,)),np.reshape(train_labels, newshape=(-1,)))

  # Assessment of the best RF classifier with testing data
  linreg_out_test=lin_regressor.predict(test_features_array)
  # Invert the normalization for labels to compare with test
  linreg_out_test_unnorm=((inp2_max-inp2_min)*linreg_out_test)+inp2_min
  linreg_regstats_test=stats.linregress(np.reshape(linreg_out_test_unnorm, newshape=(-1,)),test_labels_resh)
  linreg_rmse_test=mean_absolute_error(np.reshape(linreg_out_test_unnorm, newshape=(-1,)),test_labels_resh)
  
  dict_cor={'Data':str(samp_perc[i])+'% data','Training CorrCoef':linreg_regstats_train.rvalue, 'Training Intercept':linreg_regstats_train.intercept,'Training Slope':linreg_regstats_train.slope,'Training R2':linreg_regstats_train.rvalue**2,'Training RMSE':linreg_rmse_train,'Testing CorrCoef':linreg_regstats_test.rvalue, 'Testing Intercept':linreg_regstats_test.intercept,'Testing Slope':linreg_regstats_test.slope,'Testing R2':linreg_regstats_test.rvalue**2,'Testing RMSE':linreg_rmse_test}
  # print(dict_cor)
  linreg_train_test_metrics=linreg_train_test_metrics.append(dict_cor,ignore_index=True)

  print("1 Loop run")

np.savetxt(loc_save+'rf_featImp.csv',rf_feat_imp,delimiter=',')
np.savetxt(loc_save+'rf_permImp.csv',rf_feat_imp_perm,delimiter=',')
rf_train_test_metrics.to_csv(loc_save+'rf_metrics.csv')
np.savetxt(loc_save+'gb_featImp.csv',gb_feat_imp,delimiter=',')
np.savetxt(loc_save+'gb_permImp.csv',gb_feat_imp_perm,delimiter=',')
gb_train_test_metrics.to_csv(loc_save+'gb_metrics.csv')
np.savetxt(loc_save+'svm_permImp.csv',svm_feat_imp_perm,delimiter=',')
svm_train_test_metrics.to_csv(loc_save+'svm_metrics.csv')
np.savetxt(loc_save+'nn_permImp.csv',nn_feat_imp_perm,delimiter=',')
nn_train_test_metrics.to_csv(loc_save+'nn_metrics.csv')
np.savetxt(loc_save+'linreg_permImp.csv',linreg_feat_imp_perm,delimiter=',')
linreg_train_test_metrics.to_csv(loc_save+'linreg_metrics.csv')

In [None]:
rf_train_test_metrics

Unnamed: 0,Data,Training CorrCoef,Training Intercept,Training Slope,Training R2,Training RMSE,Testing CorrCoef,Testing Intercept,Testing Slope,Testing R2,Testing RMSE
0,10% data,0.899343,181.101758,1.154079,0.808818,517.765951,0.450752,-416.307238,0.751585,0.203177,1043.548601
1,20% data,0.852601,130.695586,1.109373,0.726929,593.52195,0.617236,-84.479921,0.87988,0.38098,906.10968
2,30% data,0.844892,85.010412,1.067853,0.713842,609.046156,0.655629,-119.641896,0.908477,0.429849,862.423385
3,40% data,0.840189,57.265371,1.04996,0.705917,610.476103,0.656773,-116.611556,0.854052,0.431351,847.077661
4,50% data,0.818535,54.032613,1.044607,0.67,641.258928,0.70862,-76.387075,0.947089,0.502143,800.258026
5,60% data,0.81915,42.26308,1.03466,0.671006,644.767837,0.729925,-37.678034,0.959797,0.532791,790.286876
6,70% data,0.811446,42.294339,1.031558,0.658445,652.565253,0.713928,-30.553922,0.933707,0.509693,793.475373
7,80% data,0.79434,34.410138,1.027679,0.630976,679.053463,0.717545,-56.270633,0.937973,0.514871,775.319736
8,90% data,0.812545,24.620032,1.019816,0.66023,652.567281,0.728248,-87.252415,0.919662,0.530345,762.648508
9,100% data,0.803658,24.932744,1.019144,0.645866,671.910638,0.728321,-51.850483,0.950767,0.530452,781.984845


In [None]:
gb_train_test_metrics

Unnamed: 0,Data,Training CorrCoef,Training Intercept,Training Slope,Training R2,Training RMSE,Testing CorrCoef,Testing Intercept,Testing Slope,Testing R2,Testing RMSE
0,10% data,0.917351,3.536034e-06,1.0,0.841533,356.955213,0.298108,-873.944012,0.386283,0.088868,1183.381326
1,20% data,0.864761,7.814194,1.006984,0.747812,505.121203,0.583635,-260.798387,0.733331,0.34063,923.255739
2,30% data,0.852798,0.0001656194,1.0,0.727264,550.939901,0.645801,-237.25399,0.809351,0.417059,865.986414
3,40% data,0.846489,9.177802e-08,1.0,0.716544,567.42111,0.65045,-203.662721,0.776155,0.423085,842.43656
4,50% data,0.823747,0.0003272088,1.0,0.67856,610.069539,0.694154,-171.736678,0.867982,0.48185,804.215134
5,60% data,0.823135,1.790067e-06,1.0,0.677552,618.977687,0.722962,-108.737305,0.901067,0.522674,784.3504
6,70% data,0.814724,8.775796,1.007348,0.663776,632.996007,0.712975,-80.877558,0.894888,0.508334,780.268297
7,80% data,0.796994,0.0001251523,1.0,0.635199,661.738809,0.714307,-96.424091,0.900463,0.510234,771.597821
8,90% data,0.814988,3.160825e-05,1.0,0.664205,636.920106,0.728771,-111.963246,0.897775,0.531107,759.517859
9,100% data,0.805519,0.004657984,1.000004,0.64886,659.128371,0.727531,-90.355918,0.918143,0.529302,774.001666


In [None]:
svm_train_test_metrics

Unnamed: 0,Data,Training CorrCoef,Training Intercept,Training Slope,Training R2,Training RMSE,Testing CorrCoef,Testing Intercept,Testing Slope,Testing R2,Testing RMSE
0,10% data,0.799906,253.508258,1.206223,0.639849,761.367637,0.463856,-306.435291,0.805149,0.215162,1050.122661
1,20% data,0.814192,87.425396,1.070174,0.662908,704.655007,0.614328,-97.223499,0.839856,0.377399,937.219746
2,30% data,0.812599,104.20832,1.077574,0.660317,708.657187,0.67704,-84.99534,0.936284,0.458383,865.585503
3,40% data,0.811356,53.068761,1.030619,0.658299,701.620643,0.656854,-143.893336,0.83588,0.431458,862.472578
4,50% data,0.790245,74.986409,1.040448,0.624486,731.249448,0.698245,-56.18642,0.929678,0.487547,833.120272
5,60% data,0.793996,66.898567,1.029429,0.63043,719.580949,0.730764,-11.75403,0.954337,0.534016,811.611848
6,70% data,0.788513,67.090437,1.028541,0.621753,728.011338,0.717391,-4.771363,0.926957,0.51465,817.253508
7,80% data,0.771439,56.686561,1.025196,0.595117,744.838798,0.722562,-37.053026,0.942633,0.522095,790.772503
8,90% data,0.793248,59.896325,1.019159,0.629243,721.73461,0.72602,-39.749516,0.934251,0.527106,790.881601
9,100% data,0.784964,21.37981,0.998391,0.616168,726.680361,0.729111,-78.574562,0.921203,0.531603,796.797832


In [None]:
nn_train_test_metrics

Unnamed: 0,Data,Training CorrCoef,Training Intercept,Training Slope,Training R2,Training RMSE,Testing CorrCoef,Testing Intercept,Testing Slope,Testing R2,Testing RMSE
0,10% data,0.73726,247.262161,1.193312,0.543552,840.64606,0.480583,-242.447886,0.849859,0.23096,1046.438642
1,20% data,0.717878,300.494707,1.160974,0.515349,865.21312,0.55309,89.348715,0.919061,0.305909,1038.199023
2,30% data,0.731759,-87.926827,1.085904,0.535471,818.20605,0.666945,-133.189675,1.053909,0.444815,890.94114
3,40% data,0.775466,108.212919,1.080431,0.601348,755.272787,0.641351,-69.15992,0.905576,0.411331,887.971634
4,50% data,0.754408,30.716086,1.044659,0.569131,770.790811,0.689523,-62.06187,0.984557,0.475442,856.341926
5,60% data,0.779803,-45.291586,1.047876,0.608092,727.640997,0.726006,-81.623403,0.999349,0.527084,814.939547
6,70% data,0.773003,138.785908,1.09654,0.597533,748.150546,0.722409,91.961348,1.030634,0.521875,819.171522
7,80% data,0.743918,-106.247235,1.090498,0.553415,791.703267,0.709929,-145.151918,1.041831,0.503999,826.177709
8,90% data,0.762988,59.999439,1.009483,0.582151,758.312598,0.724865,-14.70733,0.948301,0.52543,802.175805
9,100% data,0.762808,187.922897,1.044871,0.581876,772.795629,0.716377,128.381208,0.995108,0.513196,835.691963


In [None]:
linreg_train_test_metrics

Unnamed: 0,Data,Training CorrCoef,Training Intercept,Training Slope,Training R2,Training RMSE,Testing CorrCoef,Testing Intercept,Testing Slope,Testing R2,Testing RMSE
0,10% data,0.775402,-1.607532e-10,1.0,0.601248,751.53948,0.261335,-931.059862,0.314338,0.068296,1305.662726
1,20% data,0.654347,-2.273737e-11,1.0,0.42817,899.131914,0.419975,-388.918113,0.549395,0.176379,1091.126053
2,30% data,0.641693,-9.708856e-11,1.0,0.41177,915.375721,0.530957,-161.957965,0.862545,0.281915,1016.264214
3,40% data,0.652146,-2.864908e-11,1.0,0.425295,908.666288,0.547932,-142.920497,0.854143,0.300229,996.237294
4,50% data,0.611293,-5.888978e-11,1.0,0.373679,947.238213,0.586684,-96.022684,0.920958,0.344198,976.713145
5,60% data,0.637977,-2.382876e-10,1.0,0.407014,918.922956,0.580825,-83.444843,0.909953,0.337357,983.293108
6,70% data,0.610042,-1.421085e-10,1.0,0.372151,941.642583,0.590902,-0.059398,0.963151,0.349165,981.520976
7,80% data,0.607711,1.057288e-10,1.0,0.369312,940.263805,0.570274,-91.625156,0.926772,0.325212,961.519225
8,90% data,0.624305,-2.59206e-10,1.0,0.389757,937.52068,0.601365,-94.455595,0.949808,0.36164,948.429624
9,100% data,0.624081,-3.228706e-11,1.0,0.389477,935.084657,0.6086,-36.127815,0.980767,0.370394,941.050349


In [None]:
print(np.argmax(rf_feat_imp,axis=1))
print(np.argmax(gb_feat_imp,axis=1))

[  0 120 120 120 120 120 112 112 120 120 120]
[  0 120 120 112  23  22 112  23 120  12 112]


In [None]:
print(np.argmax(rf_feat_imp_perm,axis=1))
print(np.argmax(gb_feat_imp_perm,axis=1))
print(np.argmax(svm_feat_imp_perm,axis=1))
print(np.argmax(nn_feat_imp_perm,axis=1))
print(np.argmax(linreg_feat_imp_perm,axis=1))

[  0 120 120 120 120 120  54  23 120 120 120]
[  0 120 120 120  23 120 112 112 120 112 112]
[  0  35 110  34  34 110  34 110 110  64  18]
[  0  34  19  90  93 110  44 110 110  87  35]
[ 0 44 43 43 44 43 43 38 43 43 43]
