In [1]:
import os
import re
import datetime
import time
import json
import pandas as pd
import numpy as np

## <font color=#c51b8a>Amino Acid Property Analysis: (Part 1)</font> 
### A pre-grid-search analysis of the best amino acid property combinations for training ML models on different opsin datasets...  

In [4]:
# Directory the per-dataset result CSVs are read from and the combined table is written to.
report_dir = './vpod_1.2/aa_prop_combos_mnm_analysis_2025-03-10_12-48-09'

### <font color=#c994c7>Whole Dataset (WDS)</font>

In [7]:
# Load the whole-dataset (WDS) results CSV from `report_dir` and preview the first rows.
wds_file = f'{report_dir}/wds_mnm_aa_prop_combos_results.csv'
wds_df = pd.read_csv(wds_file)
wds_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,wds_mnm,H1_,rf,0.966493,5.828356,0.012763,123.170673,10.969838
1,wds_mnm,H1_,xgb,0.959684,6.161644,0.013453,148.045319,11.971938
2,wds_mnm,H1_,gbr,0.957265,7.848281,0.017372,155.368967,12.372931
3,wds_mnm,H1_,BayesianRidge,0.940711,10.345268,0.023133,216.258492,14.635953
4,wds_mnm,H2_,rf,0.965514,6.069676,0.013257,126.451072,11.107524


In [8]:
# Rank the rows of `wds_df` by R2 (highest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
wds_df_filtered = (
    wds_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
wds_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds_mnm,H2_NCI_MASS_,rf,0.967861,5.724482,0.012495,117.674194,10.69238,R2
1,wds_mnm,P1_NCI_MASS_,rf,0.967849,5.801479,0.012678,117.860171,10.696164,R2
2,wds_mnm,H2_H3_P2_MASS_,xgb,0.967848,5.68774,0.012404,118.482447,10.757487,R2
3,wds_mnm,H2_H3_NCI_MASS_,rf,0.967847,5.729064,0.012512,117.668,10.698528,R2
4,wds_mnm,H2_P2_V_MASS_SCT_,xgb,0.967829,5.627071,0.012279,118.526506,10.707115,R2


In [9]:
# Rank the rows of `wds_df` by MAE (lowest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
wds_df_filtered2 = (
    wds_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
wds_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wds_mnm,H2_NCI_MASS_SASA_,xgb,0.967609,5.608074,0.012201,119.014093,10.717896,MAE
1,wds_mnm,H2_P2_V_MASS_SCT_,xgb,0.967829,5.627071,0.012279,118.526506,10.707115,MAE
2,wds_mnm,H2_NCI_MASS_,xgb,0.966168,5.671012,0.012348,124.943568,10.988778,MAE
3,wds_mnm,H2_P2_V_SCT_,xgb,0.965375,5.6727,0.012375,127.710301,11.133988,MAE
4,wds_mnm,H2_P2_NCI_MASS_SASA_,xgb,0.966899,5.676726,0.012362,121.483123,10.836245,MAE


In [10]:
# Start the combined table: first five rows of each WDS ranking (R2, then MAE).
top_mnm_aa_prop_df = pd.concat(
    [wds_df_filtered[:5], wds_df_filtered2[:5]],
    ignore_index=True,
)

### <font color=#c994c7>Wild-Type Dataset (WT)</font>

In [11]:
# Load the wild-type (WT) results CSV from `report_dir` and preview the first rows.
wt_file = f'{report_dir}/wt_mnm_aa_prop_combos_results.csv'
wt_df = pd.read_csv(wt_file)
wt_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,wt_mnm,H1_,rf,0.950737,7.327905,0.015531,148.573726,11.992238
1,wt_mnm,H1_,gbr,0.947569,8.277377,0.017533,158.201857,12.403676
2,wt_mnm,H1_,xgb,0.944907,7.537731,0.016003,167.415062,12.659047
3,wt_mnm,H1_,BayesianRidge,0.929397,10.435866,0.022261,212.134591,14.48542
4,wt_mnm,H2_,rf,0.951839,7.47451,0.015815,147.115153,11.864408


In [12]:
# Rank the rows of `wt_df` by R2 (highest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
wt_df_filtered = (
    wt_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
wt_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt_mnm,H2_P1_NCI_SASA_PKB_,rf,0.956613,7.091059,0.014978,131.245268,11.254194,R2
1,wt_mnm,H1_H2_H3_NCI_MASS_PKA_,rf,0.956481,7.049049,0.014873,131.446296,11.294314,R2
2,wt_mnm,H1_P2_V_MASS_SASA_PKA_,xgb,0.956462,6.755903,0.014269,130.956569,11.286445,R2
3,wt_mnm,H1_H2_H3_PKA_,rf,0.956444,7.101091,0.015026,131.303899,11.299985,R2
4,wt_mnm,H2_P1_NCI_MASS_SASA_PKB_,rf,0.956406,7.035764,0.014881,131.889177,11.280034,R2


In [13]:
# Rank the rows of `wt_df` by MAE (lowest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
wt_df_filtered2 = (
    wt_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
wt_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt_mnm,H1_P2_V_MASS_SASA_PKA_,xgb,0.956462,6.755903,0.014269,130.956569,11.286445,MAE
1,wt_mnm,H1_H2_SASA_PKA_,xgb,0.956036,6.870606,0.014399,131.888735,11.316416,MAE
2,wt_mnm,H1_P2_V_SASA_PKA_,xgb,0.954161,6.877228,0.014469,137.151787,11.539039,MAE
3,wt_mnm,H1_MASS_SASA_PKA_,xgb,0.955788,6.885814,0.01449,133.654676,11.374865,MAE
4,wt_mnm,H2_V_MASS_PKA_,xgb,0.954254,6.906498,0.014518,137.623605,11.489862,MAE


In [14]:
# Combined table so far: first five rows of each ranking for WDS and WT.
# Built directly from the ranked frames instead of appending to the previous
# value of `top_mnm_aa_prop_df`, so re-running this cell (or running it out of
# order) yields the same rows rather than duplicating them.
top_mnm_aa_prop_df = pd.concat(
    [
        wds_df_filtered[0:5], wds_df_filtered2[0:5],
        wt_df_filtered[0:5], wt_df_filtered2[0:5],
    ],
    ignore_index=True,
)

### <font color=#c994c7>Vertebrate Dataset (Vert)</font>

In [15]:
# Load the vertebrate (Vert) results CSV from `report_dir` and preview the first rows.
vert_file = f'{report_dir}/vert_mnm_aa_prop_combos_results.csv'
vert_df = pd.read_csv(vert_file)
vert_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,vert_mnm,H1_,xgb,0.977057,5.350207,0.011794,84.828715,9.029259
1,vert_mnm,H1_,gbr,0.976692,5.535381,0.012191,86.454283,9.120135
2,vert_mnm,H2_,xgb,0.979854,5.328838,0.011716,74.591598,8.529833
3,vert_mnm,H2_,gbr,0.979263,5.547366,0.012196,77.09119,8.630836
4,vert_mnm,H3_,xgb,0.969877,6.353459,0.014569,111.789472,10.472795


In [16]:
# Rank the rows of `vert_df` by R2 (highest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
vert_df_filtered = (
    vert_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
vert_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,vert_mnm,H3_P2_MASS_SCT_PKB_,xgb,0.98487,4.268384,0.009462,56.232083,7.314086,R2
1,vert_mnm,H3_P2_SCT_PKA_PKB_,xgb,0.984764,4.290904,0.009522,56.302639,7.314899,R2
2,vert_mnm,SASA_SCT_PKB_,xgb,0.984726,4.285697,0.009495,56.382285,7.346054,R2
3,vert_mnm,V_SCT_PKB_,gbr,0.984715,4.536325,0.010052,56.689946,7.393931,R2
4,vert_mnm,P2_MASS_SCT_PKB_,xgb,0.984708,4.341446,0.009593,56.660535,7.36409,R2


In [17]:
# Rank the rows of `vert_df` by MAE (lowest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
vert_df_filtered2 = (
    vert_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
vert_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,vert_mnm,H3_P2_MASS_SCT_PKB_,xgb,0.98487,4.268384,0.009462,56.232083,7.314086,MAE
1,vert_mnm,H2_H3_P2_V_SASA_SCT_,xgb,0.983913,4.271485,0.009511,59.663129,7.549439,MAE
2,vert_mnm,H3_V_SCT_PKB_,xgb,0.984601,4.280832,0.009524,57.162374,7.372168,MAE
3,vert_mnm,SASA_SCT_PKB_,xgb,0.984726,4.285697,0.009495,56.382285,7.346054,MAE
4,vert_mnm,H3_V_SASA_SCT_PKA_,xgb,0.984701,4.289048,0.009479,56.420776,7.338084,MAE


In [18]:
# Combined table so far: first five rows of each ranking for WDS, WT and Vert.
# Built directly from the ranked frames instead of appending to the previous
# value of `top_mnm_aa_prop_df`, so re-running this cell (or running it out of
# order) yields the same rows rather than duplicating them.
top_mnm_aa_prop_df = pd.concat(
    [
        wds_df_filtered[0:5], wds_df_filtered2[0:5],
        wt_df_filtered[0:5], wt_df_filtered2[0:5],
        vert_df_filtered[0:5], vert_df_filtered2[0:5],
    ],
    ignore_index=True,
)

### <font color=#c994c7>Wild-Type Vertebrates Dataset (WT-Vert)</font>

In [19]:
# Load the wild-type vertebrate (WT-Vert) results CSV from `report_dir` and preview the first rows.
wt_vert_file = f'{report_dir}/wt_vert_mnm_aa_prop_combos_results.csv'
wt_vert_df = pd.read_csv(wt_vert_file)
wt_vert_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,wt_vert_mnm,H1_,gbr,0.979125,4.80867,0.010251,54.171985,7.205791
1,wt_vert_mnm,H1_,xgb,0.977287,4.659491,0.009755,58.608814,7.373147
2,wt_vert_mnm,H2_,xgb,0.982559,4.404909,0.009251,46.019757,6.675893
3,wt_vert_mnm,H2_,gbr,0.980642,4.828224,0.010226,50.402237,6.975627
4,wt_vert_mnm,H3_,gbr,0.974844,5.635713,0.012067,66.41448,8.082827


In [20]:
# Rank the rows of `wt_vert_df` by R2 (highest first) on a new frame, renumber
# the index from 0, and tag every row with the metric used for the ranking.
wt_vert_df_filtered = (
    wt_vert_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
wt_vert_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt_vert_mnm,H2_H3_PKB_,xgb,0.984778,4.206906,0.008792,39.682863,6.178714,R2
1,wt_vert_mnm,H2_P2_PKB_,xgb,0.984748,4.28757,0.00897,40.00957,6.232934,R2
2,wt_vert_mnm,P1_NCI_PKB_,xgb,0.98471,4.281993,0.009041,39.972267,6.280174,R2
3,wt_vert_mnm,H2_MASS_PKB_,xgb,0.984495,4.254941,0.008905,40.365974,6.289299,R2
4,wt_vert_mnm,H2_H3_P2_PKB_,xgb,0.984428,4.271105,0.00894,40.835878,6.290369,R2


In [21]:
# Rank the rows of `wt_vert_df` by MAE (lowest first) on a new frame, renumber
# the index from 0, and tag every row with the metric used for the ranking.
wt_vert_df_filtered2 = (
    wt_vert_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
wt_vert_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,wt_vert_mnm,H2_H3_V_PKB_,xgb,0.984176,4.205598,0.008798,41.328718,6.305069,MAE
1,wt_vert_mnm,H2_H3_PKB_,xgb,0.984778,4.206906,0.008792,39.682863,6.178714,MAE
2,wt_vert_mnm,H1_H3_PKB_,xgb,0.984044,4.219248,0.008837,40.949851,6.280616,MAE
3,wt_vert_mnm,H1_PKB_,xgb,0.984232,4.225036,0.008859,40.528238,6.251466,MAE
4,wt_vert_mnm,H2_H3_SASA_PKB_,xgb,0.984282,4.234096,0.0089,41.265994,6.313336,MAE


In [22]:
# Combined table so far: first five rows of each ranking for WDS, WT, Vert and
# WT-Vert. Built directly from the ranked frames instead of appending to the
# previous value of `top_mnm_aa_prop_df`, so re-running this cell (or running
# it out of order) yields the same rows rather than duplicating them.
top_mnm_aa_prop_df = pd.concat(
    [
        wds_df_filtered[0:5], wds_df_filtered2[0:5],
        wt_df_filtered[0:5], wt_df_filtered2[0:5],
        vert_df_filtered[0:5], vert_df_filtered2[0:5],
        wt_vert_df_filtered[0:5], wt_vert_df_filtered2[0:5],
    ],
    ignore_index=True,
)

### <font color=#c994c7>Invertebrate Dataset (Inv)</font>


In [23]:
# Load the invertebrate (Inv) results CSV from `report_dir` and preview the first rows.
inv_file = f'{report_dir}/inv_mnm_aa_prop_combos_results.csv'
inv_df = pd.read_csv(inv_file)
inv_df.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE
0,inv_mnm,H1_,gbr,0.900928,11.287742,0.024187,352.513999,17.717048
1,inv_mnm,H1_,xgb,0.891524,11.89696,0.025657,385.310237,18.671503
2,inv_mnm,H2_,gbr,0.897268,11.49193,0.024614,365.605094,18.013855
3,inv_mnm,H2_,xgb,0.890657,11.889349,0.025646,393.329365,18.919184
4,inv_mnm,H3_,gbr,0.878471,12.842779,0.027423,416.578199,19.365671


In [24]:
# Rank the rows of `inv_df` by R2 (highest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
inv_df_filtered = (
    inv_df
    .sort_values(by='R2', ascending=False)
    .reset_index(drop=True)
    .assign(Ranked_By='R2')
)
inv_df_filtered.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,inv_mnm,H1_P1_SCT_,gbr,0.904808,10.944023,0.023393,337.223431,17.259654,R2
1,inv_mnm,H1_P1_V_SCT_PKA_,gbr,0.904612,10.891637,0.023254,337.504969,17.339714,R2
2,inv_mnm,H1_H3_P1_P2_MASS_SCT_,gbr,0.904512,10.97721,0.023453,337.772588,17.29443,R2
3,inv_mnm,H1_P1_MASS_SCT_,gbr,0.904242,11.015463,0.02353,339.177428,17.351121,R2
4,inv_mnm,H1_H3_P1_MASS_,gbr,0.904212,11.017102,0.023515,340.254432,17.356303,R2


In [25]:
# Rank the rows of `inv_df` by MAE (lowest first) on a new frame, renumber the
# index from 0, and tag every row with the metric used for the ranking.
inv_df_filtered2 = (
    inv_df
    .sort_values(by='MAE', ascending=True)
    .reset_index(drop=True)
    .assign(Ranked_By='MAE')
)
inv_df_filtered2.head()

Unnamed: 0,Dataset,Props_Used,Model,R2,MAE,MAPE,MSE,RMSE,Ranked_By
0,inv_mnm,H1_P1_V_SCT_PKA_,gbr,0.904612,10.891637,0.023254,337.504969,17.339714,MAE
1,inv_mnm,H1_H2_P1_V_MASS_,gbr,0.903494,10.915856,0.023263,339.554343,17.378695,MAE
2,inv_mnm,H1_P1_SCT_,gbr,0.904808,10.944023,0.023393,337.223431,17.259654,MAE
3,inv_mnm,H1_H2_P1_SASA_,gbr,0.903633,10.948423,0.023416,343.570413,17.38548,MAE
4,inv_mnm,H1_H2_P1_V_SCT_,gbr,0.902383,10.962536,0.023384,345.742301,17.516577,MAE


In [26]:
# Final combined table: first five rows of each ranking (R2, then MAE) for all
# five datasets, in the order WDS, WT, Vert, WT-Vert, Inv. Built directly from
# the ranked frames instead of appending to the previous value of
# `top_mnm_aa_prop_df`, so re-running this cell (or running it out of order)
# yields the same rows rather than duplicating them.
top_mnm_aa_prop_df = pd.concat(
    [
        wds_df_filtered[0:5], wds_df_filtered2[0:5],
        wt_df_filtered[0:5], wt_df_filtered2[0:5],
        vert_df_filtered[0:5], vert_df_filtered2[0:5],
        wt_vert_df_filtered[0:5], wt_vert_df_filtered2[0:5],
        inv_df_filtered[0:5], inv_df_filtered2[0:5],
    ],
    ignore_index=True,
)

In [28]:
# Write the combined table into `report_dir`; with to_csv's default settings the
# DataFrame index is written as the first (unnamed) column.
top_mnm_aa_prop_df.to_csv(f'{report_dir}/top_mnm_aa_props_all_datasets.csv')

## <font color=#c51b8a>Amino Acid Property Analysis: (Part 2)</font> 
### A post-grid-search analysis of the best amino acid property combinations for training ML models on different opsin datasets...  

In [None]:
import os
import re
import datetime
import time
import json
import pandas as pd
import numpy as np

### This is actually less of an analysis, and more just a way to see the final results of the aa_prop grid-search.
- Note - only one group of aa-properties, one model algorithm, and one set of model parameters are chosen for each dataset in the end.

In [None]:
# Directory the Part 2 table below is read from.
# NOTE(review): this rebinds `report_dir`, which the Part 1 cells also read; re-running
# those cells after this one would make them look in this directory — confirm intended.
report_dir = './vpod_1.2/aa_prop_combos_analysis_2024-12-16_16-57-42'

In [None]:
# Load `filtered_top_mnm_aa_props_all_datasets_gs.csv` from `report_dir`, using its
# first column as the index, and display the whole table.
# NOTE(review): this reuses the name `top_mnm_aa_prop_df` from Part 1 for a different table.
top_mnm_aa_prop_file = f'{report_dir}/filtered_top_mnm_aa_props_all_datasets_gs.csv'
top_mnm_aa_prop_df = pd.read_csv(top_mnm_aa_prop_file, index_col=0)
top_mnm_aa_prop_df

Unnamed: 0,Dataset,Props_Used,best_gs_model,best_gs_r2,best_gs_params
0,wds,H1_H2_H3_P1_P2_V_SASA_SCT_PKA_PKB_,gbr,0.968836,"{'gbr__learning_rate': 0.2, 'gbr__max_depth': ..."
1,wt,H1_H3_P1_NCI_PKA_,gbr,0.939306,"{'gbr__learning_rate': 0.1, 'gbr__max_depth': ..."
2,vert,H2_H3_NCI_SCT_PKB_,xgb,0.981047,"{'xgb__colsample_bytree': 1.0, 'xgb__gamma': 0..."
3,wt_vert,H2_P2_V_MASS_,gbr,0.979911,"{'gbr__learning_rate': 0.1, 'gbr__max_depth': ..."
4,inv,H1_H3_,gbr,0.839023,"{'gbr__learning_rate': 0.01, 'gbr__max_depth':..."
