In [1]:
%load_ext autoreload

In [2]:
from collections import defaultdict
import joblib
from tqdm.notebook import tqdm
from ML.data_preprocessing import Dataset
from ML.machine_learning_models import *
# ML utils
from ML.machine_learning_models import Model_Evaluation as ml_evaluation
from ML.ml_utils_reg import create_directory, ECFP4, set_global_determinism, potency_classes, \
    select_train_subsets_unbalanced
%autoreload 2

## Parameters

In [3]:
# Experiment configuration (read by the cells below)

# Molecular fingerprint and sampling approach; both are used as folder names
# in the results path and `approach` is also stored in the result tables.
fingerprint = 'ECFP4'
approach = 'Unbalanced'

# Identifiers of the regression models trained for every trial / set size
model_list = ['1-NN', 'kNN', 'SVR', 'RFR', 'MR']

# Training set sizes evaluated in each trial
set_sizes = [6, 12, 18, 30, 48, 78, 126, 204, 330]

# Number of trials (data splits); the trial index doubles as the random seed
trial_splits = 10

## Load Dataset

In [4]:
# Folder that holds the ChEMBL-derived CSV files (note the trailing slash)
db_path = "./dataset/Datasets Chembl/"
# Read the IC50 compound table covering all targets
regression_db_all = pd.read_csv(f"{db_path}chembl_30_IC50_500_CPDs.csv")
# Derive a potency-class label for every compound from its pPot value.
# NOTE(review): [5, 7, 9, 11] look like class boundaries (the displayed labels
# are the lower bounds, e.g. 5 and 9) - confirm against potency_classes.
regression_db_all['potency_class'] = potency_classes(regression_db_all['pPot'].values, [5, 7, 9, 11])
regression_db_all

Unnamed: 0,nonstereo_aromatic_smiles,standard_type,pPot,chembl_cid,chembl_tid,Potency class,potency_class
0,COc1cccc2c1CCN(S(=O)(=O)c1cccc(C(=O)Nc3ccc(Cl)...,IC50,5.300000,CHEMBL520827,CHEMBL235,5 - 6,5
1,Cc1nsc(C)c1CCC1CCN(S(=O)(=O)CC(C)(CC(C)c2ncc(F...,IC50,5.619789,CHEMBL1683460,CHEMBL333,5 - 6,5
2,Cc1nonc1NS(=O)(=O)c1ccc(Oc2ccc(Cl)cc2-c2ccnn2C...,IC50,5.000000,CHEMBL2325553,CHEMBL4296,5 - 6,5
3,CCN(CCCCCCCCc1cccc(OC)c1)Cc1ccccc1OC,IC50,5.247952,CHEMBL3752227,CHEMBL220,5 - 6,5
4,CCN(CCCCCCCOc1ccc2c(=O)c3ccccc3oc2c1)Cc1ccccc1OC,IC50,5.501689,CHEMBL224553,CHEMBL220,5 - 6,5
...,...,...,...,...,...,...,...
40435,CC1NC(=O)c2cc(-c3c(F)ccc4c(=O)n(C)c(NC(C)(C)C)...,IC50,10.000000,CHEMBL3902148,CHEMBL2147,10 - 11,9
40436,CCOc1ccccc1-c1cc2c(NC(CCO)c3ccccc3)ncnc2[nH]1,IC50,10.000000,CHEMBL4760328,CHEMBL203,10 - 11,9
40437,COCc1ccc(COCc2csc3nc(C(=O)NCc4cccc(OC)c4)[nH]c...,IC50,10.096910,CHEMBL3337890,CHEMBL280,10 - 11,9
40438,CN1CCN(c2cccc(CCNC(=O)c3cnc(C#N)nc3NCC(C)(C)C)...,IC50,10.958607,CHEMBL414530,CHEMBL268,10 - 11,9


## Select target datasets

In [5]:
# ChEMBL target IDs kept for this experiment
regression_tids = ['CHEMBL280', 'CHEMBL203', 'CHEMBL2409']
# Keep only the compounds measured against one of the selected targets
regression_db = regression_db_all[regression_db_all['chembl_tid'].isin(regression_tids)]
regression_db

Unnamed: 0,nonstereo_aromatic_smiles,standard_type,pPot,chembl_cid,chembl_tid,Potency class,potency_class
20,Cc1nsc(C)c1CCC1CCN(S(=O)(=O)CC(C)(CC(C)c2ncc(F...,IC50,5.337242,CHEMBL1683460,CHEMBL280,5 - 6,5
75,Cc1noc(C)c1CCC1CCN(S(=O)(=O)CC2(N(O)C=O)CCN(S(...,IC50,5.337242,CHEMBL1784342,CHEMBL280,5 - 6,5
96,CCN(CC)CCCNc1ncc2cc(-c3c(Cl)cccc3Cl)c(=O)n(C)c2n1,IC50,5.869666,CHEMBL50470,CHEMBL203,5 - 6,5
138,CCN(CC)CCNC(=O)c1c(C)[nH]c(C=C2C(=O)Nc3ccc(Br)...,IC50,5.000000,CHEMBL13629,CHEMBL203,5 - 6,5
166,Clc1cc(Nc2ncnc3cccc(OC4CCOCC4)c23)ccc1OCc1ccccn1,IC50,5.619789,CHEMBL193578,CHEMBL203,5 - 6,5
...,...,...,...,...,...,...,...
40431,COc1cccc(CNC(=O)c2nc3scc(NC(=O)Cc4ccccc4)c3c(=...,IC50,10.275724,CHEMBL3337902,CHEMBL280,10 - 11,9
40432,CC(C)C(CS(=O)(=O)c1ccc(-c2cccc(CNC(=O)c3nc4ccc...,IC50,10.292430,CHEMBL3889936,CHEMBL280,10 - 11,9
40433,COc1cccc(CNC(=O)c2nc3scc(NC(=O)Cc4ccc(C(=O)O)c...,IC50,10.677781,CHEMBL3337903,CHEMBL280,10 - 11,9
40436,CCOc1ccccc1-c1cc2c(NC(CCO)c3ccccc3)ncnc2[nH]1,IC50,10.000000,CHEMBL4760328,CHEMBL203,10 - 11,9


## Results folder path

In [6]:
# Root folder for all outputs of this experiment
main_folder = 'regression_models_increase_tr_sizes_3_bins_unbalanced'
# Results are grouped as ./<main_folder>/<fingerprint>/<approach>/ (trailing slash kept)
result_path = './' + '/'.join((main_folder, fingerprint, approach)) + '/'

In [7]:
# Target IDs actually present in the filtered table (array of unique values)
regression_tids = regression_db['chembl_tid'].unique()
regression_tids

array(['CHEMBL280', 'CHEMBL203', 'CHEMBL2409'], dtype=object)

In [None]:
# Performance/prediction datasets
dict_storage = defaultdict(pd.DataFrame)
parameter_resume = []

# Evaluation frames are collected in plain lists and concatenated once after
# the loops. Calling pd.concat on a growing DataFrame inside the innermost
# loop re-copies every previously stored row on each iteration.
performance_train_frames = []
predictions_train_frames = []
performance_test_frames = []
predictions_test_frames = []


def _build_dataset(compounds_df, with_potency_classes=False):
    """Wrap a compound table in a `Dataset`.

    The output of `ECFP4` on the SMILES column and the pPot values are passed
    as the two constructor arguments; target, SMILES and compound IDs (and,
    optionally, the potency classes) are attached via `add_instance`.
    """
    dataset = Dataset(np.array(ECFP4(compounds_df.nonstereo_aromatic_smiles.values)),
                      np.array(compounds_df.pPot.values))
    dataset.add_instance("target", compounds_df.chembl_tid.values)
    dataset.add_instance("smiles", compounds_df.nonstereo_aromatic_smiles.values)
    dataset.add_instance("cid", compounds_df.chembl_cid.values)
    if with_potency_classes:
        dataset.add_instance("potency_classes", compounds_df.potency_class.values)
    return dataset


def _tag_run(frame, trial, size):
    """Add the run metadata columns to an evaluation frame (in place) and return it."""
    frame["trial"] = trial
    frame["Approach"] = approach
    frame["Training size"] = size
    return frame


def _concat_frames(frames):
    """Concatenate the collected frames; return an empty DataFrame if none were collected."""
    return pd.concat(frames) if frames else pd.DataFrame()


# The test set of a trial is everything outside the largest training subset.
# max() keeps this correct even if `set_sizes` is not sorted ascending.
largest_size = max(set_sizes)

for target in tqdm(regression_tids):

    # `result_path` ends with '/', so this is ./<main_folder>/<fingerprint>/<approach>/<target>/
    target_path = create_directory(f'{result_path}{target}/')

    print(f'Training on {target}')

    # Select Target Database
    regression_db_tid = regression_db.loc[regression_db.chembl_tid == target]

    for trial in range(trial_splits):
        print(f'Starting Trial {trial}')
        # Indexed below by training size to obtain the compound IDs of each subset
        df_regression_train_cids = select_train_subsets_unbalanced(regression_db_tid, sizes=set_sizes, seed=trial)

        #Set seed
        set_global_determinism(seed=trial)

        # TEST set: shared by every training size of this trial.
        # NOTE(review): this only rules out train/test overlap if the smaller
        # subsets are contained in the largest one - confirm in
        # select_train_subsets_unbalanced.
        df_regression_test = regression_db_tid.loc[~regression_db_tid.chembl_cid.isin(df_regression_train_cids[largest_size])]
        test_set = _build_dataset(df_regression_test)

        for size in set_sizes:
            print(size)

            # TRAIN set
            df_regression_train = regression_db_tid.loc[regression_db_tid.chembl_cid.isin(df_regression_train_cids[size])]
            train_set = _build_dataset(df_regression_train, with_potency_classes=True)

            for model in model_list:

                print(f'Training {model}')

                # Create saving directory
                model_fpath = create_directory(f"{result_path}{target}/{model}/{size}", verbose=False)

                ml_model = MLModel(train_set, model, data_type='unbalanced')
                joblib.dump(ml_model.model, os.path.join(model_fpath, f"{model}_{trial}.pkl"))

                #Best model parameters
                opt_parameters_dict = {'model': model,
                                       'trial': trial,
                                       'Target ID': target,
                                       'Training size': size,
                                       **ml_model.best_params}
                parameter_resume.append(opt_parameters_dict)

                #Model Evaluation (on the training data itself and on the held-out test set)
                model_eval_train = ml_evaluation(ml_model, train_set, train_set, model_id=model)
                model_eval_test = ml_evaluation(ml_model, test_set, train_set, model_id=model)

                # TRAIN performance / predictions
                performance_train = _tag_run(model_eval_train.pred_performance, trial, size)
                performance_train_frames.append(performance_train)
                predictions_train = _tag_run(model_eval_train.predictions, trial, size)
                predictions_train_frames.append(predictions_train)

                # TEST performance / predictions
                performance_test = _tag_run(model_eval_test.pred_performance, trial, size)
                performance_test_frames.append(performance_test)
                predictions_test = _tag_run(model_eval_test.predictions, trial, size)
                predictions_test_frames.append(predictions_test)

# Build the final tables with a single concat each
performance_train_df = _concat_frames(performance_train_frames)
predictions_train_df = _concat_frames(predictions_train_frames)
performance_test_df = _concat_frames(performance_test_frames)
predictions_test_df = _concat_frames(predictions_test_frames)
parameter_df = pd.DataFrame(parameter_resume)

# Save results
# Make sure the output folder exists even if no per-target directory was
# created above (e.g. an empty `regression_tids`).
os.makedirs(result_path, exist_ok=True)
performance_train_df.to_csv(os.path.join(result_path, 'performance_train.csv'))
predictions_train_df.to_csv(os.path.join(result_path, 'predictions_train.csv'))
performance_test_df.to_csv(os.path.join(result_path, 'performance_test.csv'))
predictions_test_df.to_csv(os.path.join(result_path, 'predictions_test.csv'))
parameter_df.to_csv(os.path.join(result_path, 'model_best_parameters.csv'))