In [None]:
# Load IPython's autoreload extension; the reload mode itself is selected by a later `%autoreload 2` line.
%load_ext autoreload

In [None]:
# ML utils
# The wildcard import stays first so that the explicit imports below take
# precedence over any same-named objects it may export.
from ML.machine_learning_models import *

# Standard library
import os
from itertools import combinations

# Third-party
import pandas as pd
import scipy.stats as st
import seaborn as sns

# Project helpers
from ML.ml_utils_reg import create_directory, potency_classes, metrics_potency_classes, metric_potency_classes_ub
from ML.ml_utils_reg import plot_regression_models_cat, plot_regression_models_cat_mod, plot_heatmap_stat_analysis
%autoreload 2

In [None]:
# --- Analysis parameters ---
# Regression algorithms compared in this notebook
model_list = ['1-NN', 'kNN', 'SVR', 'RFR', 'MR']
# Number of trials (data splits)
trial_splits = 10
# Molecular fingerprint; also a component of the result paths
fingerprint = 'ECFP4'
# Approach; also a component of the result paths
approach = 'Unbalanced'

# --- Plot configuration ---
# White-grid seaborn style with the axes grid switched on
sns.set_style("whitegrid", rc={'axes.grid': True})
# Figure output location, as returned by create_directory
figure_path = create_directory('./figures/')

# **Load data** Fig3/S2/S3

In [None]:
# Folder holding the stored results for the selected fingerprint and approach
main_folder = 'regression_models_50_50'
result_path = f'./{main_folder}/{fingerprint}/{approach}/'

# Test-set performance table (index_col=False: do not use the first column as index)
performance_test_df = pd.read_csv(
    os.path.join(result_path, 'performance_test.csv'),
    index_col=False,
)
performance_test_df

In [None]:
# Prepare the overall test performance for concatenation with the
# per-potency-range metrics: keep the listed columns, rename the size column
# and label every row with the full potency range '5 - 11'.
performance_test_df_mod = (
    performance_test_df
    .loc[:, ['Target ID', 'Algorithm', 'Dataset size', 'Metric', 'Value', 'trial']]
    .rename(columns={'Dataset size': 'Test size'})
    .assign(potency_class='5 - 11')
)
performance_test_df_mod

# Predictions Fig3/S2/S3

In [None]:
# Test-set predictions, annotated with a potency class derived from the
# experimental values (bin edges 5, 7, 9, 11) and a constant 'Test' label.
predictions_test_df = (
    pd.read_csv(os.path.join(result_path, 'predictions_test.csv'), index_col=False)
    .assign(
        potency_class=lambda frame: potency_classes(frame.Experimental.values, [5, 7, 9, 11]),
        dataset='Test',
    )
)
predictions_test_df

# Calculate metrics for each potency range

In [None]:
# Per-potency-class metrics computed from the test-set predictions.
# `trials` and `algorithms` reuse the notebook-level parameters (`trial_splits`,
# `model_list`) instead of repeating their values, so the parameter cell stays
# the single place to change them. A copy of the list is passed so the helper
# cannot alter `model_list`.
# NOTE(review): `potency_classess` is spelled as in the original call --
# presumably the helper's actual parameter name; confirm before renaming.
results_pc_un = metric_potency_classes_ub(
    predictions_test_df,
    targets=[333, 268, 280, 203, 279, 2409, 260, 286],
    potency_classess=[5, 7, 9],
    trials=trial_splits,
    algorithms=list(model_list),
)

In [None]:
# Stack the per-potency-class metrics and the overall ('5 - 11') metrics into
# one table; the index is rebuilt and columns are not sorted.
results_pc_tid_final = pd.concat(
    [results_pc_un, performance_test_df_mod],
    axis=0,
    ignore_index=True,
    sort=False,
)

# Plot Fig S2/S3

In [None]:
# Figures S2 (MAE) and S3 (RMSE): one panel per target, bars grouped by potency class.
# rename() returns a new frame, so results_pc_tid_final keeps its column names.
results_pc_tid = results_pc_tid_final.rename(columns={'potency_class': 'Potency classes'})

# Figure numbering starts at 2, giving the file names 'S2' and 'S3'.
for fig_number, metric_ind in enumerate(["MAE", "RMSE"], start=2):
    plot_regression_models_cat(
        results_pc_tid, [metric_ind],
        # data mapping
        x='Potency classes', y='Value',
        order=['5 - 11', '5 - 7', '7 - 9', '9 - 11'],
        hue='Algorithm', hue_order=['1-NN', 'kNN', 'SVR', 'RFR', 'MR'],
        # facet layout
        col='Target ID', row=None, col_wrap=2,
        aspect=1.2, height=7,
        sharey='row',
        # styling
        palette='tab10',
        y_labels=metric_ind,
        font_size=22,
        yticks=[0.25, 0.5, 0.75, 1.0, 1.25, 1.5, 1.75, 2.0, 2.25, 2.5],
        bbox_to_anchor=(-0.0, -0.35),
        legend_title='Potency ranges',
        # output
        results_path=figure_path, filename=f'S{fig_number}',
    )

# Plot Fig3

In [None]:
# Figure 3 shows three targets only.
# The row subset is renamed through a non-inplace rename(), which returns a new
# DataFrame. Calling rename(..., inplace=True) on a .loc slice of
# results_pc_tid_final can raise SettingWithCopyWarning.
results_pc_tids = (
    results_pc_tid_final
    .loc[results_pc_tid_final['Target ID'].isin([203, 280, 2409])]
    .rename(columns={'potency_class': 'Potency class'})
)

In [None]:
# Figure 3: rows = selected targets, columns = metric (MAE / RMSE)
plot_regression_models_cat_mod(
    results_pc_tids, ['MAE', 'RMSE'],
    # data mapping
    x='Potency class', y='Value',
    order=['5 - 11', '5 - 7', '7 - 9', '9 - 11'],
    hue='Algorithm', hue_order=['1-NN', 'kNN', 'SVR', 'RFR', 'MR'],
    # facet layout
    row='Target ID', col='Metric', col_nr=2,
    aspect=1.5, height=7,
    sharey=False,
    # styling
    palette='tab10',
    font_size=22,
    ymax=2.3,
    yticks=[0.2, 0.4, 0.6, 0.8, 1.0, 1.2, 1.4, 1.6, 1.8, 2, 2.2],
    bbox_to_anchor=(-0.08, -0.37), legend_title='Potency ranges',
    # output
    results_path=figure_path, filename='F3',
)

# Load data: training sets of increasing size

In [None]:
# Result locations for the runs with training sets of increasing size

# Balanced set-up
approach_b = 'Balanced'
main_folder_b = 'regression_models_increase_tr_sizes_3_bins_330'
result_path_b = f'./{main_folder_b}/{fingerprint}/{approach_b}/'

# Unbalanced set-up
approach_utut = 'Unbalanced'
main_folder_utut = 'regression_models_increase_tr_sizes_3_bins_unbalanced'
result_path_utut = f'./{main_folder_utut}/{fingerprint}/{approach_utut}/'

# Performance Test

In [None]:
# Test performance of the balanced runs, tagged 'BTR-BTE'
performance_test_df_b = (
    pd.read_csv(os.path.join(result_path_b, 'performance_test.csv'), index_col=False)
    .assign(Approach='BTR-BTE')
)
# Test performance of the unbalanced runs, tagged 'UTR-UTE'
performance_test_df_utut = (
    pd.read_csv(os.path.join(result_path_utut, 'performance_test.csv'), index_col=False)
    .assign(Approach='UTR-UTE')
)


In [None]:
# Balanced and unbalanced performance in one table (row indices of the two
# inputs are kept as they are).
performance_test_df_final = pd.concat(
    [performance_test_df_b, performance_test_df_utut]
)
performance_test_df_final

# Predictions Test

In [None]:
# Balanced runs: test predictions with a potency class derived from the
# experimental values (bin edges 5, 7, 9, 11) and a constant 'Test' label.
predictions_test_df_btr_bte = (
    pd.read_csv(os.path.join(result_path_b, 'predictions_test.csv'), index_col=False)
    .assign(
        potency_class=lambda frame: potency_classes(frame.Experimental.values, [5, 7, 9, 11]),
        dataset='Test',
    )
)

In [None]:
# Unbalanced runs: test predictions with a potency class derived from the
# experimental values (bin edges 5, 7, 9, 11) and a constant 'Test' label.
predictions_test_df_utr_ute = (
    pd.read_csv(os.path.join(result_path_utut, 'predictions_test.csv'), index_col=False)
    .assign(
        potency_class=lambda frame: potency_classes(frame.Experimental.values, [5, 7, 9, 11]),
        dataset='Test',
    )
)

# Balanced TR / Balanced TE

In [None]:
# Metrics per potency class for the balanced predictions
results_pc = metrics_potency_classes(predictions_test_df_btr_bte)
# Map the numeric class keys to range labels and assign the result back
# explicitly. `results_pc.potency_class.replace(..., inplace=True)` is a chained
# in-place update on a Series taken from the frame; pandas deprecates this and,
# under Copy-on-Write, it leaves `results_pc` unchanged.
results_pc['potency_class'] = results_pc['potency_class'].replace({5: '5 - 7', 7: '7 - 9', 9: '9 - 11'})

# Unbalanced TR / Unbalanced TE

In [None]:
# Metrics per potency class for the unbalanced predictions
results_pc_ut_ut = metrics_potency_classes(predictions_test_df_utr_ute)
# Map the numeric class keys to range labels and assign the result back
# explicitly. `results_pc_ut_ut.potency_class.replace(..., inplace=True)` is a
# chained in-place update on a Series taken from the frame; pandas deprecates
# this and, under Copy-on-Write, it leaves `results_pc_ut_ut` unchanged.
results_pc_ut_ut['potency_class'] = results_pc_ut_ut['potency_class'].replace({5: '5 - 7', 7: '7 - 9', 9: '9 - 11'})

# Plot Fig4

In [None]:
# Figure 4: MAE against training size for the balanced runs
# (rows = targets, columns = potency classes).
# rename() returns a new frame, so results_pc keeps its column names.
results_pc_tid = results_pc.rename(columns={'potency_class': 'Potency class'})
metric_ind = 'MAE'
# NOTE(review): legend_title is 'Training size' although hue is 'Algorithm' --
# confirm how the plotting helper uses legend_title.
plot_regression_models_cat(
    results_pc_tid, [metric_ind],
    # data mapping
    x='Training size', y='Value',
    hue='Algorithm', hue_order=['1-NN', 'kNN', 'SVR', 'RFR', 'MR'],
    # facet layout
    row="Target ID", col='Potency class', col_nr=3,
    aspect=1, height=10, width=0.8,
    sharey='row',
    # styling
    palette='tab10',
    y_labels=metric_ind,
    font_size=25,
    yticks=[0.2, 0.4, 0.6, 0.8, 1, 1.2, 1.4, 1.6, 1.8, 2.0, 2.2, 2.4, 2.6, 2.8],
    bbox_to_anchor=(-0.55, -0.3), title=True,
    legend_title='Training size',
    # output
    results_path=figure_path, filename='F4',
)

# Plot Fig S4

In [None]:
# Figure S4: MAE against training size for the unbalanced runs
# (rows = targets, columns = potency classes).
# rename() returns a new frame, so results_pc_ut_ut keeps its column names.
results_pc_tid_ut_ut = results_pc_ut_ut.rename(columns={'potency_class': 'Potency class'})
metric_ind = 'MAE'
# No explicit `yticks` and no `sub_fig_title` are passed for this figure.
# NOTE(review): legend_title is 'Training size' although hue is 'Algorithm' --
# confirm how the plotting helper uses legend_title.
plot_regression_models_cat(
    results_pc_tid_ut_ut, [metric_ind],
    # data mapping
    x='Training size', y='Value',
    hue='Algorithm', hue_order=['1-NN', 'kNN', 'SVR', 'RFR', 'MR'],
    # facet layout
    row="Target ID", col='Potency class', col_nr=3,
    aspect=1, height=10, width=0.8,
    sharey='row',
    # styling
    palette='tab10',
    y_labels=metric_ind,
    font_size=25,
    bbox_to_anchor=(-0.55, -0.3), title=True,
    legend_title='Training size',
    # output
    results_path=figure_path, filename='S4',
)

# Statistical Analysis Fig S5

In [None]:
# Pairwise Wilcoxon signed-rank tests between algorithms, run separately for
# every target, metric and potency range. One record per comparison is collected
# and the records are turned into the `mut_result` table at the end.
#
# `st` (scipy.stats) and `combinations` (itertools) are imported in the
# notebook's import cell.
#
# NOTE(review): the test is paired, and the two samples are paired purely by row
# order within each (target, metric, potency range, algorithm) subset --
# presumably both algorithms list their trials in the same order; confirm.
potency_ranges = ['5 - 11', '5 - 7', '7 - 9', '9 - 11']

mut_records = []
for target in results_pc_tid_final['Target ID'].unique():
    for metric in ['MAE', 'RMSE']:
        df_metric = results_pc_tid_final.loc[
            (results_pc_tid_final['Target ID'] == target)
            & (results_pc_tid_final['Metric'] == metric)
        ]
        for range_idx, pc in enumerate(potency_ranges):
            df_range = df_metric.loc[df_metric.potency_class == pc]
            # Algorithm pairs come from the notebook-level `model_list`.
            for alg_a, alg_b in combinations(model_list, 2):
                values_a = df_range.loc[df_range['Algorithm'] == alg_a, 'Value']
                values_b = df_range.loc[df_range['Algorithm'] == alg_b, 'Value']

                # Only the p-value is reported; the test statistic is discarded.
                _, p_value = st.wilcoxon(list(values_a), list(values_b))
                mut_records.append({"Algorithms": f'{alg_a} / {alg_b}',
                                    "Potency range": pc,
                                    "Metric": metric,
                                    "Target ID": target,
                                    "p_value": round(p_value, 3),
                                    'figure': range_idx,
                                    'approach': approach})
mut_result = pd.DataFrame(mut_records)
display(mut_result)

In [None]:
# Figure S5: one p-value heat map per metric (sub-figure 'a' = MAE, 'b' = RMSE)
for panel_label, metric_name in (('a', 'MAE'), ('b', 'RMSE')):
    metric_p_values = mut_result.loc[mut_result.Metric == metric_name]
    plot_heatmap_stat_analysis(
        metric_p_values,
        x='Target ID', y='Algorithms', value='p_value',
        col='Potency range', col_wrap=2,
        height=5, aspect=1.5, square=False,
        results_path=figure_path, filename=f'S5_{metric_name}',
        sub_fig_title=panel_label,
    )