In [1]:
import os
import re
import datetime
import time
import json
import pandas as pd
import numpy as np
from deepBreaks.preprocessing import read_data

## <font color=#c51b8a>Amino Acid Property Analysis:</font> 
### Analysis of the best amino acid property combinations for training ML models on the different opsin datasets.

In [2]:
# Directory of the amino-acid-property combination run. The per-dataset result
# CSVs are read from here, and the combined summary CSV is written back to it.
report_dir = './vpod_1.2/aa_prop_combos_analysis_2024-12-16_16-57-42'

### <font color=#c994c7>Whole Dataset (WDS)</font>

In [3]:
# Load the whole-dataset (WDS) results table from the report directory and
# preview its first rows.
wds_file = f'{report_dir}/wds_aa_prop_combos_results.csv'
wds_df = pd.read_csv(wds_file)
wds_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,wds,H1_,xgb,0.964836,6.004033,0.013601,134.073611,11.315286
1,wds,H1_,gbr,0.963463,6.32831,0.014547,138.600034,11.554425
2,wds,H1_,rf,0.962085,5.862054,0.013251,143.891148,11.762819
3,wds,H1_,BayesianRidge,0.945354,9.771495,0.022451,207.01199,14.311313
4,wds,H2_,xgb,0.961466,5.89449,0.013388,146.704646,11.822667


In [4]:
# Rank all WDS results from highest to lowest R2, renumber the rows, and tag
# each row with the metric used for the ranking. `wds_df` itself is untouched.
wds_df_filtered = (
    wds_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
wds_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,5.815975,0.013326,118.476535,10.772182,R2
1,wds,H1_H2_H3_P2_V_SCT_PKA_,xgb,0.9687,5.584713,0.012641,119.005642,10.694938,R2
2,wds,H1_H2_H3_P1_P2_V_MASS_SASA_SCT_,gbr,0.968699,5.937826,0.013603,118.399741,10.72928,R2
3,wds,H1_H2_NCI_MASS_SASA_PKB_,gbr,0.968595,5.883918,0.013509,119.665746,10.794597,R2
4,wds,H1_H2_H3_MASS_SASA_SCT_PKB_,gbr,0.968479,5.736124,0.013116,119.719386,10.731194,R2


In [5]:
# Rank all WDS results from lowest to highest MAE, renumber the rows, and tag
# each row with the metric used for the ranking. `wds_df` itself is untouched.
wds_df_filtered2 = (
    wds_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
wds_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_P2_SCT_,rf,0.963549,5.525469,0.012328,137.938787,11.372965,MAE
1,wds,H1_SASA_SCT_,rf,0.963176,5.536853,0.012377,139.214654,11.436566,MAE
2,wds,H2_P2_V_SASA_,xgb,0.967122,5.539737,0.012535,125.168998,10.963956,MAE
3,wds,H1_H2_H3_P2_V_SASA_SCT_,xgb,0.9671,5.541833,0.012525,124.945831,10.868227,MAE
4,wds,H1_H2_H3_P2_V_SCT_,xgb,0.967439,5.546996,0.012525,123.751668,10.785951,MAE


In [6]:
# Start the cross-dataset summary with the five best WDS rows under each
# ranking (R2 first, then MAE), renumbered 0..9.
wds_best_by_r2 = wds_df_filtered.head(5)
wds_best_by_mae = wds_df_filtered2.head(5)
top_aa_prop_df = pd.concat([wds_best_by_r2, wds_best_by_mae], ignore_index=True)
top_aa_prop_df

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,5.815975,0.013326,118.476535,10.772182,R2
1,wds,H1_H2_H3_P2_V_SCT_PKA_,xgb,0.9687,5.584713,0.012641,119.005642,10.694938,R2
2,wds,H1_H2_H3_P1_P2_V_MASS_SASA_SCT_,gbr,0.968699,5.937826,0.013603,118.399741,10.72928,R2
3,wds,H1_H2_NCI_MASS_SASA_PKB_,gbr,0.968595,5.883918,0.013509,119.665746,10.794597,R2
4,wds,H1_H2_H3_MASS_SASA_SCT_PKB_,gbr,0.968479,5.736124,0.013116,119.719386,10.731194,R2
5,wds,H1_P2_SCT_,rf,0.963549,5.525469,0.012328,137.938787,11.372965,MAE
6,wds,H1_SASA_SCT_,rf,0.963176,5.536853,0.012377,139.214654,11.436566,MAE
7,wds,H2_P2_V_SASA_,xgb,0.967122,5.539737,0.012535,125.168998,10.963956,MAE
8,wds,H1_H2_H3_P2_V_SASA_SCT_,xgb,0.9671,5.541833,0.012525,124.945831,10.868227,MAE
9,wds,H1_H2_H3_P2_V_SCT_,xgb,0.967439,5.546996,0.012525,123.751668,10.785951,MAE


### <font color=#c994c7>Wild-Type Dataset (WT)</font>

In [7]:
# Load the wild-type (WT) results table from the report directory and preview
# its first rows.
wt_file = f'{report_dir}/wt_aa_prop_combos_results.csv'
wt_df = pd.read_csv(wt_file)
wt_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,wt,H1_,rf,0.92291,8.601873,0.01892,234.212181,14.770771
1,wt,H1_,gbr,0.920426,8.475219,0.018695,240.593545,14.968405
2,wt,H1_,BayesianRidge,0.910199,11.172247,0.024392,261.090311,16.063695
3,wt,H1_,xgb,0.89717,9.556674,0.020886,311.024303,17.181301
4,wt,H2_,rf,0.915391,9.198966,0.020216,258.077694,15.423515


In [8]:
# Rank all WT results from highest to lowest R2, renumber the rows, and tag
# each row with the metric used for the ranking. `wt_df` itself is untouched.
wt_df_filtered = (
    wt_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
wt_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt,H1_H3_P1_NCI_PKA_,gbr,0.939306,7.988286,0.017367,176.983955,13.128133,R2
1,wt,H2_H3_P1_NCI_PKA_,gbr,0.939214,8.014395,0.017276,180.659277,13.04588,R2
2,wt,H1_P1_P2_NCI_MASS_PKA_PKB_,gbr,0.938709,8.195275,0.017629,181.608472,13.255662,R2
3,wt,H1_H2_H3_P1_NCI_SCT_PKB_,gbr,0.938494,7.95559,0.017264,179.5034,13.233635,R2
4,wt,H1_H2_H3_P1_MASS_SASA_PKA_PKB_,gbr,0.938227,7.901994,0.016949,185.298178,13.198679,R2


In [9]:
# Rank all WT results from lowest to highest MAE, renumber the rows, and tag
# each row with the metric used for the ranking. `wt_df` itself is untouched.
wt_df_filtered2 = (
    wt_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
wt_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt,H1_H3_P2_NCI_SASA_SCT_,gbr,0.935655,7.836841,0.017132,191.196628,13.546977,MAE
1,wt,H1_V_PKA_PKB_,gbr,0.937621,7.89032,0.01714,181.888654,13.237769,MAE
2,wt,H1_H2_H3_P1_MASS_SASA_PKA_PKB_,gbr,0.938227,7.901994,0.016949,185.298178,13.198679,MAE
3,wt,H3_PKA_,gbr,0.931147,7.925474,0.017177,208.020001,13.936483,MAE
4,wt,H1_P1_P2_V_NCI_PKA_,gbr,0.933588,7.939524,0.017235,198.462461,13.866247,MAE


In [10]:
# Add the five best WT rows under each ranking (R2 first, then MAE) to the
# cross-dataset summary.
wt_top_rows = pd.concat([wt_df_filtered[0:5], wt_df_filtered2[0:5]], ignore_index=True)
# This cell rebuilds `top_aa_prop_df` from itself, so a second execution would
# append the same rows again. Rows already present for this dataset are dropped
# first, which makes re-running the cell replace them instead of duplicating
# them. On a fresh top-to-bottom run nothing is dropped.
wt_already_present = top_aa_prop_df['Dataset'].isin(wt_top_rows['Dataset'])
top_aa_prop_df = pd.concat(
    [top_aa_prop_df[~wt_already_present], wt_top_rows],
    ignore_index=True,
)
top_aa_prop_df

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,5.815975,0.013326,118.476535,10.772182,R2
1,wds,H1_H2_H3_P2_V_SCT_PKA_,xgb,0.9687,5.584713,0.012641,119.005642,10.694938,R2
2,wds,H1_H2_H3_P1_P2_V_MASS_SASA_SCT_,gbr,0.968699,5.937826,0.013603,118.399741,10.72928,R2
3,wds,H1_H2_NCI_MASS_SASA_PKB_,gbr,0.968595,5.883918,0.013509,119.665746,10.794597,R2
4,wds,H1_H2_H3_MASS_SASA_SCT_PKB_,gbr,0.968479,5.736124,0.013116,119.719386,10.731194,R2
5,wds,H1_P2_SCT_,rf,0.963549,5.525469,0.012328,137.938787,11.372965,MAE
6,wds,H1_SASA_SCT_,rf,0.963176,5.536853,0.012377,139.214654,11.436566,MAE
7,wds,H2_P2_V_SASA_,xgb,0.967122,5.539737,0.012535,125.168998,10.963956,MAE
8,wds,H1_H2_H3_P2_V_SASA_SCT_,xgb,0.9671,5.541833,0.012525,124.945831,10.868227,MAE
9,wds,H1_H2_H3_P2_V_SCT_,xgb,0.967439,5.546996,0.012525,123.751668,10.785951,MAE


### <font color=#c994c7>Vertebrate Dataset (Vert)</font>

In [11]:
# Load the vertebrate (Vert) results table from the report directory and
# preview its first rows.
vert_file = f'{report_dir}/vert_aa_prop_combos_results.csv'
vert_df = pd.read_csv(vert_file)
vert_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,vert,H1_,rf,0.978668,5.282291,0.011782,80.496098,8.874717
1,vert,H1_,xgb,0.977837,5.27473,0.011761,83.660893,9.010396
2,vert,H1_,gbr,0.97576,5.359604,0.011978,91.426823,9.416843
3,vert,H1_,BayesianRidge,0.952567,9.536876,0.02194,178.354665,13.302946
4,vert,H2_,rf,0.97651,5.555795,0.01231,88.587994,9.283032


In [12]:
# Rank all Vert results from highest to lowest R2, renumber the rows, and tag
# each row with the metric used for the ranking. `vert_df` itself is untouched.
vert_df_filtered = (
    vert_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
vert_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,vert,H2_H3_NCI_SCT_PKB_,xgb,0.980808,4.935295,0.010916,73.004131,8.392039,R2
1,vert,H1_H2_H3_NCI_PKB_,gbr,0.980476,4.944569,0.010948,73.980611,8.479772,R2
2,vert,H2_H3_NCI_PKB_,xgb,0.980445,5.03715,0.011176,74.25391,8.484706,R2
3,vert,H1_H2_NCI_PKB_,gbr,0.980379,5.002461,0.011075,74.383624,8.490553,R2
4,vert,H1_H2_H3_P1_NCI_,rf,0.980372,5.116075,0.011414,74.191393,8.494845,R2


In [13]:
# Rank all Vert results from lowest to highest MAE, renumber the rows, and tag
# each row with the metric used for the ranking. `vert_df` itself is untouched.
vert_df_filtered2 = (
    vert_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
vert_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,vert,H2_H3_P2_NCI_SASA_SCT_,gbr,0.978035,4.914302,0.010886,82.078538,8.898818,MAE
1,vert,H1_P2_V_SASA_SCT_,gbr,0.979744,4.930994,0.010914,76.25379,8.595901,MAE
2,vert,H1_P1_V_SASA_PKA_,gbr,0.979817,4.93515,0.010924,76.142473,8.567762,MAE
3,vert,H2_H3_NCI_SCT_PKB_,xgb,0.980808,4.935295,0.010916,73.004131,8.392039,MAE
4,vert,H2_H3_P2_SASA_,gbr,0.97825,4.935346,0.010928,81.382856,8.862316,MAE


In [14]:
# Add the five best Vert rows under each ranking (R2 first, then MAE) to the
# cross-dataset summary.
vert_top_rows = pd.concat([vert_df_filtered[0:5], vert_df_filtered2[0:5]], ignore_index=True)
# This cell rebuilds `top_aa_prop_df` from itself, so a second execution would
# append the same rows again. Rows already present for this dataset are dropped
# first, which makes re-running the cell replace them instead of duplicating
# them. On a fresh top-to-bottom run nothing is dropped.
vert_already_present = top_aa_prop_df['Dataset'].isin(vert_top_rows['Dataset'])
top_aa_prop_df = pd.concat(
    [top_aa_prop_df[~vert_already_present], vert_top_rows],
    ignore_index=True,
)

In [15]:
# Show the summary after the Vert rows were appended in the previous cell.
top_aa_prop_df

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,5.815975,0.013326,118.476535,10.772182,R2
1,wds,H1_H2_H3_P2_V_SCT_PKA_,xgb,0.9687,5.584713,0.012641,119.005642,10.694938,R2
2,wds,H1_H2_H3_P1_P2_V_MASS_SASA_SCT_,gbr,0.968699,5.937826,0.013603,118.399741,10.72928,R2
3,wds,H1_H2_NCI_MASS_SASA_PKB_,gbr,0.968595,5.883918,0.013509,119.665746,10.794597,R2
4,wds,H1_H2_H3_MASS_SASA_SCT_PKB_,gbr,0.968479,5.736124,0.013116,119.719386,10.731194,R2
5,wds,H1_P2_SCT_,rf,0.963549,5.525469,0.012328,137.938787,11.372965,MAE
6,wds,H1_SASA_SCT_,rf,0.963176,5.536853,0.012377,139.214654,11.436566,MAE
7,wds,H2_P2_V_SASA_,xgb,0.967122,5.539737,0.012535,125.168998,10.963956,MAE
8,wds,H1_H2_H3_P2_V_SASA_SCT_,xgb,0.9671,5.541833,0.012525,124.945831,10.868227,MAE
9,wds,H1_H2_H3_P2_V_SCT_,xgb,0.967439,5.546996,0.012525,123.751668,10.785951,MAE


### <font color=#c994c7>Wild-Type Vertebrates Dataset (WT-Vert)</font>

In [16]:
# Load the wild-type vertebrate (WT-Vert) results table from the report
# directory and preview its first rows.
wt_vert_file = f'{report_dir}/wt_vert_aa_prop_combos_results.csv'
wt_vert_df = pd.read_csv(wt_vert_file)
wt_vert_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,wt_vert,H1_,gbr,0.974343,4.960323,0.010692,68.916935,7.95632
1,wt_vert,H1_,xgb,0.973985,4.904034,0.010533,69.370715,7.99595
2,wt_vert,H1_,rf,0.971004,5.172302,0.011083,72.533714,8.228527
3,wt_vert,H1_,BayesianRidge,0.9263,8.647717,0.019191,178.282027,13.013624
4,wt_vert,H2_,gbr,0.979033,4.665929,0.010004,54.863623,7.185818


In [17]:
# Rank all WT-Vert results from highest to lowest R2, renumber the rows, and tag
# each row with the metric used for the ranking. `wt_vert_df` itself is untouched.
wt_vert_df_filtered = (
    wt_vert_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
wt_vert_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt_vert,H2_P2_V_MASS_,gbr,0.97963,4.625349,0.009816,55.391988,7.065266,R2
1,wt_vert,H2_H3_V_,gbr,0.979554,4.609259,0.009793,55.358151,7.065029,R2
2,wt_vert,H2_P2_V_,gbr,0.979446,4.619346,0.009802,55.294406,7.085095,R2
3,wt_vert,H1_H2_H3_,gbr,0.979308,4.502663,0.00967,56.27558,7.135008,R2
4,wt_vert,H1_H2_P2_,gbr,0.979262,4.551335,0.009714,54.880108,7.143088,R2


In [18]:
# Rank all WT-Vert results from lowest to highest MAE, renumber the rows, and tag
# each row with the metric used for the ranking. `wt_vert_df` itself is untouched.
wt_vert_df_filtered2 = (
    wt_vert_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
wt_vert_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt_vert,H1_H2_H3_PKB_,gbr,0.978618,4.46223,0.009474,55.74116,7.222893,MAE
1,wt_vert,H1_H2_H3_,gbr,0.979308,4.502663,0.00967,56.27558,7.135008,MAE
2,wt_vert,H1_H2_H3_P2_,gbr,0.978995,4.507244,0.009621,55.712441,7.201111,MAE
3,wt_vert,H1_H2_SCT_PKB_,gbr,0.978773,4.548813,0.009648,55.543803,7.2119,MAE
4,wt_vert,H1_H2_PKB_,gbr,0.978494,4.549819,0.009698,56.872034,7.263243,MAE


In [19]:
# Add the five best WT-Vert rows under each ranking (R2 first, then MAE) to the
# cross-dataset summary.
wt_vert_top_rows = pd.concat(
    [wt_vert_df_filtered[0:5], wt_vert_df_filtered2[0:5]],
    ignore_index=True,
)
# This cell rebuilds `top_aa_prop_df` from itself, so a second execution would
# append the same rows again. Rows already present for this dataset are dropped
# first, which makes re-running the cell replace them instead of duplicating
# them. On a fresh top-to-bottom run nothing is dropped.
wt_vert_already_present = top_aa_prop_df['Dataset'].isin(wt_vert_top_rows['Dataset'])
top_aa_prop_df = pd.concat(
    [top_aa_prop_df[~wt_vert_already_present], wt_vert_top_rows],
    ignore_index=True,
)

In [20]:
# Show the summary after the WT-Vert rows were appended in the previous cell.
top_aa_prop_df

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,5.815975,0.013326,118.476535,10.772182,R2
1,wds,H1_H2_H3_P2_V_SCT_PKA_,xgb,0.9687,5.584713,0.012641,119.005642,10.694938,R2
2,wds,H1_H2_H3_P1_P2_V_MASS_SASA_SCT_,gbr,0.968699,5.937826,0.013603,118.399741,10.72928,R2
3,wds,H1_H2_NCI_MASS_SASA_PKB_,gbr,0.968595,5.883918,0.013509,119.665746,10.794597,R2
4,wds,H1_H2_H3_MASS_SASA_SCT_PKB_,gbr,0.968479,5.736124,0.013116,119.719386,10.731194,R2
5,wds,H1_P2_SCT_,rf,0.963549,5.525469,0.012328,137.938787,11.372965,MAE
6,wds,H1_SASA_SCT_,rf,0.963176,5.536853,0.012377,139.214654,11.436566,MAE
7,wds,H2_P2_V_SASA_,xgb,0.967122,5.539737,0.012535,125.168998,10.963956,MAE
8,wds,H1_H2_H3_P2_V_SASA_SCT_,xgb,0.9671,5.541833,0.012525,124.945831,10.868227,MAE
9,wds,H1_H2_H3_P2_V_SCT_,xgb,0.967439,5.546996,0.012525,123.751668,10.785951,MAE


### <font color=#c994c7>Invertebrate Dataset (Inv)</font>


In [21]:
# Load the invertebrate (Inv) results table from the report directory and
# preview its first rows.
inv_file = f'{report_dir}/inv_aa_prop_combos_results.csv'
inv_df = pd.read_csv(inv_file)
inv_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,inv,H1_,gbr,0.829782,13.470825,0.028957,523.195942,21.296935
1,inv,H1_,BayesianRidge,0.828731,14.474503,0.031647,566.778289,21.895207
2,inv,H1_,xgb,0.823843,14.480657,0.031294,549.296754,22.077985
3,inv,H1_,rf,0.805686,15.174023,0.033278,624.90579,23.597051
4,inv,H2_,rf,0.803825,15.290007,0.033529,631.113876,23.786247


In [22]:
# Rank all Inv results from highest to lowest R2, renumber the rows, and tag
# each row with the metric used for the ranking. `inv_df` itself is untouched.
inv_df_filtered = (
    inv_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
inv_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,inv,H1_H3_,gbr,0.839023,13.109937,0.028111,497.631891,20.6921,R2
1,inv,H1_H2_P1_P2_V_MASS_SASA_SCT_,gbr,0.838573,12.829127,0.027563,515.405708,20.989757,R2
2,inv,H2_H3_P2_MASS_SCT_,gbr,0.838251,12.759714,0.027545,521.720579,21.130178,R2
3,inv,H2_P1_P2_MASS_SCT_,gbr,0.838144,12.866207,0.027606,517.468957,21.012091,R2
4,inv,H1_P1_MASS_SCT_,gbr,0.838143,13.042821,0.028199,525.576706,20.965555,R2


In [23]:
# Rank all Inv results from lowest to highest MAE, renumber the rows, and tag
# each row with the metric used for the ranking. `inv_df` itself is untouched.
inv_df_filtered2 = (
    inv_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
inv_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,inv,H2_H3_P2_MASS_SCT_,gbr,0.838251,12.759714,0.027545,521.720579,21.130178,MAE
1,inv,H2_P2_MASS_,gbr,0.83518,12.783172,0.027487,527.024628,21.254557,MAE
2,inv,H1_H2_P2_V_MASS_,gbr,0.837207,12.798861,0.027551,517.761489,21.187476,MAE
3,inv,H2_P1_P2_MASS_SASA_,gbr,0.836175,12.801303,0.027539,521.256836,21.208294,MAE
4,inv,H1_H2_P1_P2_V_MASS_SASA_SCT_,gbr,0.838573,12.829127,0.027563,515.405708,20.989757,MAE


In [24]:
# Add the five best Inv rows under each ranking (R2 first, then MAE) to the
# cross-dataset summary.
inv_top_rows = pd.concat([inv_df_filtered[0:5], inv_df_filtered2[0:5]], ignore_index=True)
# This cell rebuilds `top_aa_prop_df` from itself, so a second execution would
# append the same rows again. Rows already present for this dataset are dropped
# first, which makes re-running the cell replace them instead of duplicating
# them. On a fresh top-to-bottom run nothing is dropped.
inv_already_present = top_aa_prop_df['Dataset'].isin(inv_top_rows['Dataset'])
top_aa_prop_df = pd.concat(
    [top_aa_prop_df[~inv_already_present], inv_top_rows],
    ignore_index=True,
)

In [25]:
# Show the summary after the Inv rows were appended in the previous cell.
top_aa_prop_df

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,5.815975,0.013326,118.476535,10.772182,R2
1,wds,H1_H2_H3_P2_V_SCT_PKA_,xgb,0.9687,5.584713,0.012641,119.005642,10.694938,R2
2,wds,H1_H2_H3_P1_P2_V_MASS_SASA_SCT_,gbr,0.968699,5.937826,0.013603,118.399741,10.72928,R2
3,wds,H1_H2_NCI_MASS_SASA_PKB_,gbr,0.968595,5.883918,0.013509,119.665746,10.794597,R2
4,wds,H1_H2_H3_MASS_SASA_SCT_PKB_,gbr,0.968479,5.736124,0.013116,119.719386,10.731194,R2
5,wds,H1_P2_SCT_,rf,0.963549,5.525469,0.012328,137.938787,11.372965,MAE
6,wds,H1_SASA_SCT_,rf,0.963176,5.536853,0.012377,139.214654,11.436566,MAE
7,wds,H2_P2_V_SASA_,xgb,0.967122,5.539737,0.012535,125.168998,10.963956,MAE
8,wds,H1_H2_H3_P2_V_SASA_SCT_,xgb,0.9671,5.541833,0.012525,124.945831,10.868227,MAE
9,wds,H1_H2_H3_P2_V_SCT_,xgb,0.967439,5.546996,0.012525,123.751668,10.785951,MAE


### <font color=#c994c7>Type-One Microbial Opsin Dataset (T1)</font>


In [26]:
# Load the type-one microbial opsin (T1) results table from the report
# directory and preview its first rows.
t1_file = f'{report_dir}/t1_aa_prop_combos_results.csv'
t1_df = pd.read_csv(t1_file)
t1_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,t1,H1_,xgb,0.837258,8.605884,0.016044,150.836218,12.230893
1,t1,H2_,xgb,0.841295,8.481036,0.015786,145.887429,12.031368
2,t1,H3_,xgb,0.719601,11.236318,0.020875,262.022685,16.105301
3,t1,P1_,xgb,0.849117,8.34554,0.015562,139.38211,11.756137
4,t1,P2_,xgb,0.803277,9.245236,0.017192,180.689286,13.3517


In [27]:
# Rank all T1 results from highest to lowest R2, renumber the rows, and tag
# each row with the metric used for the ranking. `t1_df` itself is untouched.
t1_df_filtered = (
    t1_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
t1_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,t1,H3_P1_PKB_,xgb,0.867098,7.711729,0.014381,122.902828,11.033555,R2
1,t1,P1_SASA_PKA_PKB_,xgb,0.866905,7.50819,0.014006,122.649584,11.012273,R2
2,t1,H1_H2_H3_SASA_SCT_PKB_,xgb,0.866602,7.602643,0.014163,123.214758,11.026521,R2
3,t1,H1_P1_SASA_PKA_PKB_,xgb,0.864982,7.612981,0.01419,124.540729,11.107244,R2
4,t1,H1_P1_V_SASA_PKB_,xgb,0.864908,7.594316,0.014159,124.163686,11.07084,R2


In [28]:
# Rank all T1 results from lowest to highest MAE, renumber the rows, and tag
# each row with the metric used for the ranking. `t1_df` itself is untouched.
t1_df_filtered2 = (
    t1_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
t1_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,t1,P1_SASA_PKA_PKB_,xgb,0.866905,7.50819,0.014006,122.649584,11.012273,MAE
1,t1,H1_H2_P1_NCI_MASS_SASA_PKB_,xgb,0.863582,7.536787,0.014065,125.593133,11.137883,MAE
2,t1,H1_H2_H3_V_MASS_PKB_,xgb,0.861579,7.549904,0.01408,127.383779,11.208814,MAE
3,t1,H2_P2_V_SASA_PKB_,xgb,0.85969,7.55145,0.014096,128.785731,11.253854,MAE
4,t1,P2_MASS_SASA_PKB_,xgb,0.860502,7.561929,0.014119,128.444377,11.258262,MAE


In [29]:
# Add the five best T1 rows under each ranking (R2 first, then MAE) to the
# cross-dataset summary.
t1_top_rows = pd.concat([t1_df_filtered[0:5], t1_df_filtered2[0:5]], ignore_index=True)
# This cell rebuilds `top_aa_prop_df` from itself, so a second execution would
# append the same rows again. Rows already present for this dataset are dropped
# first, which makes re-running the cell replace them instead of duplicating
# them. On a fresh top-to-bottom run nothing is dropped.
t1_already_present = top_aa_prop_df['Dataset'].isin(t1_top_rows['Dataset'])
top_aa_prop_df = pd.concat(
    [top_aa_prop_df[~t1_already_present], t1_top_rows],
    ignore_index=True,
)

In [30]:
# Show the summary after the T1 rows were appended in the previous cell.
top_aa_prop_df

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,5.815975,0.013326,118.476535,10.772182,R2
1,wds,H1_H2_H3_P2_V_SCT_PKA_,xgb,0.9687,5.584713,0.012641,119.005642,10.694938,R2
2,wds,H1_H2_H3_P1_P2_V_MASS_SASA_SCT_,gbr,0.968699,5.937826,0.013603,118.399741,10.72928,R2
3,wds,H1_H2_NCI_MASS_SASA_PKB_,gbr,0.968595,5.883918,0.013509,119.665746,10.794597,R2
4,wds,H1_H2_H3_MASS_SASA_SCT_PKB_,gbr,0.968479,5.736124,0.013116,119.719386,10.731194,R2
5,wds,H1_P2_SCT_,rf,0.963549,5.525469,0.012328,137.938787,11.372965,MAE
6,wds,H1_SASA_SCT_,rf,0.963176,5.536853,0.012377,139.214654,11.436566,MAE
7,wds,H2_P2_V_SASA_,xgb,0.967122,5.539737,0.012535,125.168998,10.963956,MAE
8,wds,H1_H2_H3_P2_V_SASA_SCT_,xgb,0.9671,5.541833,0.012525,124.945831,10.868227,MAE
9,wds,H1_H2_H3_P2_V_SCT_,xgb,0.967439,5.546996,0.012525,123.751668,10.785951,MAE


In [32]:
# Write the combined summary next to the per-dataset result files.
# NOTE(review): `index` is left at its pandas default, so the row index is
# written as an unnamed leading column; confirm downstream readers expect that
# before switching to `index=False`.
top_aa_prop_df.to_csv(f'{report_dir}/top_aa_props_all_datasets.csv')