In [1]:
%load_ext autoreload

In [2]:
from collections import defaultdict
import joblib
from tqdm.notebook import tqdm
from ML.data_preprocessing import Dataset
from ML.machine_learning_models import *
# ML utils
from ML.machine_learning_models import Model_Evaluation as ml_evaluation
from ML.ml_utils_reg import create_directory, ECFP4, set_global_determinism, potency_classes
%autoreload 2

In [14]:
# ---- Experiment configuration ----

# Identifiers of the regression models to train; each entry is handed to
# MLModel in the training loop.
model_list = [
    '1-NN',
    'kNN',
    'SVR',
    'RFR',
    'MR',
]

# How many independent train/test splits (trials) are drawn per target.
trial_splits = 10

# Molecular representation; also used as a folder name in the results path.
fingerprint = 'ECFP4'

# Label of the training-set strategy; stored in every results table and used
# as a folder name in the results path.
approach = 'Unbalanced'

In [4]:
# Folder holding the ChEMBL-derived CSV files.
db_path = "./dataset/Datasets Chembl/"

# Read the compound table and append a `potency_class` column obtained by
# passing the pPot values and the bin edges [5, 7, 9, 11] to potency_classes.
regression_db_all = (
    pd.read_csv(db_path + 'chembl_30_IC50_500_CPDs.csv')
    .assign(potency_class=lambda table: potency_classes(table.pPot.values, [5, 7, 9, 11]))
)
regression_db_all

Unnamed: 0,nonstereo_aromatic_smiles,standard_type,pPot,chembl_cid,chembl_tid,Potency class,potency_class
0,COc1cccc2c1CCN(S(=O)(=O)c1cccc(C(=O)Nc3ccc(Cl)...,IC50,5.300000,CHEMBL520827,CHEMBL235,5 - 6,5
1,Cc1nsc(C)c1CCC1CCN(S(=O)(=O)CC(C)(CC(C)c2ncc(F...,IC50,5.619789,CHEMBL1683460,CHEMBL333,5 - 6,5
2,Cc1nonc1NS(=O)(=O)c1ccc(Oc2ccc(Cl)cc2-c2ccnn2C...,IC50,5.000000,CHEMBL2325553,CHEMBL4296,5 - 6,5
3,CCN(CCCCCCCCc1cccc(OC)c1)Cc1ccccc1OC,IC50,5.247952,CHEMBL3752227,CHEMBL220,5 - 6,5
4,CCN(CCCCCCCOc1ccc2c(=O)c3ccccc3oc2c1)Cc1ccccc1OC,IC50,5.501689,CHEMBL224553,CHEMBL220,5 - 6,5
...,...,...,...,...,...,...,...
40435,CC1NC(=O)c2cc(-c3c(F)ccc4c(=O)n(C)c(NC(C)(C)C)...,IC50,10.000000,CHEMBL3902148,CHEMBL2147,10 - 11,9
40436,CCOc1ccccc1-c1cc2c(NC(CCO)c3ccccc3)ncnc2[nH]1,IC50,10.000000,CHEMBL4760328,CHEMBL203,10 - 11,9
40437,COCc1ccc(COCc2csc3nc(C(=O)NCc4cccc(OC)c4)[nH]c...,IC50,10.096910,CHEMBL3337890,CHEMBL280,10 - 11,9
40438,CN1CCN(c2cccc(CCNC(=O)c3cnc(C#N)nc3NCC(C)(C)C)...,IC50,10.958607,CHEMBL414530,CHEMBL268,10 - 11,9


## Selecting Targets

In [5]:
# Select the targets that have compounds in exactly the potency classes
# 5, 7 and 9, with strictly more than `n` compounds in each of them.
tid_list = []
n = 75  # minimum (exclusive) number of compounds required per potency class
required_classes = {5, 7, 9}
for tid in regression_db_all.chembl_tid.unique():
    df_reg_tid = regression_db_all.loc[regression_db_all.chembl_tid == tid]
    # dropna=False keeps a missing class visible, so a target with unassigned
    # compounds is still rejected by the set comparison below.
    class_counts = df_reg_tid['potency_class'].value_counts(dropna=False)
    # Compare as a set: Series.unique() reports values in order of appearance,
    # so a list comparison against [5, 7, 9] silently depends on row order.
    if set(class_counts.index) == required_classes and (class_counts > n).all():
        tid_list.append(tid)
print(tid_list)

['CHEMBL333', 'CHEMBL268', 'CHEMBL280', 'CHEMBL203', 'CHEMBL279', 'CHEMBL2409', 'CHEMBL260', 'CHEMBL286']


In [6]:
# Restrict the compound table to the targets selected above.
regression_tids = tid_list
regression_db = regression_db_all[regression_db_all['chembl_tid'].isin(regression_tids)]
regression_db

Unnamed: 0,nonstereo_aromatic_smiles,standard_type,pPot,chembl_cid,chembl_tid,Potency class,potency_class
1,Cc1nsc(C)c1CCC1CCN(S(=O)(=O)CC(C)(CC(C)c2ncc(F...,IC50,5.619789,CHEMBL1683460,CHEMBL333,5 - 6,5
17,Cc1noc(CN2CCC(CCOc3ccc(-c4cc5c(ncn5C)c(C#N)n4)...,IC50,5.940058,CHEMBL1669280,CHEMBL268,5 - 6,5
20,Cc1nsc(C)c1CCC1CCN(S(=O)(=O)CC(C)(CC(C)c2ncc(F...,IC50,5.337242,CHEMBL1683460,CHEMBL280,5 - 6,5
75,Cc1noc(C)c1CCC1CCN(S(=O)(=O)CC2(N(O)C=O)CCN(S(...,IC50,5.337242,CHEMBL1784342,CHEMBL280,5 - 6,5
96,CCN(CC)CCCNc1ncc2cc(-c3c(Cl)cccc3Cl)c(=O)n(C)c2n1,IC50,5.869666,CHEMBL50470,CHEMBL203,5 - 6,5
...,...,...,...,...,...,...,...
40433,COc1cccc(CNC(=O)c2nc3scc(NC(=O)Cc4ccc(C(=O)O)c...,IC50,10.677781,CHEMBL3337903,CHEMBL280,10 - 11,9
40434,CCCc1ccc(CCC2C(=O)NC(C(=O)NC)Cc3ccc(cc3)OCCCCC...,IC50,10.000000,CHEMBL418291,CHEMBL333,10 - 11,9
40436,CCOc1ccccc1-c1cc2c(NC(CCO)c3ccccc3)ncnc2[nH]1,IC50,10.000000,CHEMBL4760328,CHEMBL203,10 - 11,9
40437,COCc1ccc(COCc2csc3nc(C(=O)NCc4cccc(OC)c4)[nH]c...,IC50,10.096910,CHEMBL3337890,CHEMBL280,10 - 11,9


In [7]:
# Root folder for all outputs of this experiment.
main_folder = 'regression_models_50_50_R2'
# Results are grouped by fingerprint and approach below the root folder.
result_path = './{}/{}/{}/'.format(main_folder, fingerprint, approach)

In [8]:
# Target IDs present in the filtered table, in order of first appearance.
regression_tids = regression_db['chembl_tid'].unique()

In [16]:
# Train, evaluate and store every model in `model_list` for every selected
# target over `trial_splits` random train/test splits, then write the
# collected tables to `result_path`.

# Performance/prediction datasets
dict_storage = defaultdict(pd.DataFrame)
parameter_resume = []

# Per-iteration frames are collected in lists and concatenated once after the
# loops. Concatenating onto a growing DataFrame inside the loop re-copies all
# previously accumulated rows on every iteration (quadratic cost). If the run
# is interrupted, the partial results are still available in these lists.
performance_train_frames = []
predictions_train_frames = []
performance_test_frames = []
predictions_test_frames = []
dataset_frames = []

# Fraction of each target's compounds held out for testing in every trial.
# The percentages recorded in the dataset overview are derived from it so
# they cannot drift apart from the actual split.
test_fraction = 0.5
train_size_pct = round((1 - test_fraction) * 100)
test_size_pct = round(test_fraction * 100)


def _split_overview(subset, trial, split_name, size_column, size_pct):
    """Tabulate the members of one split together with bookkeeping columns.

    The 'Compound ID', 'Potency' and 'Target ID' columns are filled from the
    subset's `cid`, `labels` and `target` attributes; the trial index, the
    global `approach`, the split name and the split size are appended.
    """
    overview = pd.DataFrame(
        list(zip(subset.cid, subset.labels, subset.target)),
        columns=['Compound ID', 'Potency', 'Target ID'],
    )
    overview['trial'] = trial
    overview['Approach'] = approach
    overview['dataset'] = split_name
    overview[size_column] = size_pct
    return overview


def _concat_frames(frames):
    """Concatenate the collected frames; an empty list yields an empty DataFrame."""
    return pd.concat(frames) if frames else pd.DataFrame()


for target in tqdm(regression_tids):

    target_path = create_directory(f'{result_path}{target}/')

    print(f'Training on {target}')

    # Select Target Database
    regression_db_tid = regression_db.loc[regression_db.chembl_tid == target]

    # Compound potency and mol smiles
    potency = regression_db_tid.pPot.values
    smiles = regression_db_tid.nonstereo_aromatic_smiles.values

    # Generate ECFP4
    fp_matrix = ECFP4(smiles)

    # Constructing Dataset: fingerprints as features, potency as labels, plus
    # per-compound metadata that is carried along when the dataset is indexed.
    dataset = Dataset(np.array(fp_matrix), np.array(potency))
    dataset.add_instance("target", regression_db_tid.chembl_tid.values)
    dataset.add_instance("smiles", smiles)
    dataset.add_instance("cid", regression_db_tid.chembl_cid.values)
    dataset.add_instance("fingerprint", np.array(fp_matrix))
    dataset.add_instance("potency_classes", regression_db_tid.potency_class.values)

    # Fixed random_state: every target is split with the same seed.
    data_splitter = ShuffleSplit(n_splits=trial_splits, random_state=42, test_size=test_fraction)
    for trial, (train_idx, test_idx) in enumerate(data_splitter.split(dataset.features, dataset.target)):

        print(f'Starting Trial {trial}')

        # Training dataset
        training_set = dataset[train_idx]
        # Test dataset
        test_set = dataset[test_idx]

        # Record which compounds ended up in which split for this trial.
        dataset_frames.append(
            _split_overview(training_set, trial, 'Train', "Training size (%)", train_size_pct))
        dataset_frames.append(
            _split_overview(test_set, trial, 'Test', "Test size (%)", test_size_pct))

        # Set seed (one seed per trial, shared by all models of that trial)
        set_global_determinism(seed=trial)

        for model in model_list:

            print(f'Training {model}')

            # Create saving directory
            model_fpath = create_directory(f"{result_path}{target}/{model}/", verbose=False)

            # NOTE(review): MLModel presumably tunes and fits on construction
            # (its `model` and `best_params` are read right away) - confirm.
            ml_model = MLModel(training_set, model, data_type='unbalanced', opt_metric_name='R2')
            joblib.dump(ml_model.model, os.path.join(model_fpath, f"{model}_{trial}.pkl"))

            # Best model parameters
            opt_parameters_dict = {
                'model': model,
                'trial': trial,
                'Target ID': target,
                **ml_model.best_params,
            }
            parameter_resume.append(opt_parameters_dict)

            # Model evaluation on the training set and on the held-out test
            # set; the training set is passed as third argument in both calls.
            model_eval_train = ml_evaluation(ml_model, training_set, training_set, model_id=model)
            model_eval_test = ml_evaluation(ml_model, test_set, training_set, model_id=model)

            # Tag the performance and prediction tables of both evaluations
            # with the trial and approach, then queue them for concatenation.
            for evaluation, performance_frames, prediction_frames in (
                (model_eval_train, performance_train_frames, predictions_train_frames),
                (model_eval_test, performance_test_frames, predictions_test_frames),
            ):
                # Performance df
                performance = evaluation.pred_performance
                performance["trial"] = trial
                performance["Approach"] = approach
                performance_frames.append(performance)

                # Prediction df
                predictions = evaluation.predictions
                predictions["trial"] = trial
                predictions["Approach"] = approach
                prediction_frames.append(predictions)

# Assemble the final tables with a single concatenation each.
performance_train_df = _concat_frames(performance_train_frames)
predictions_train_df = _concat_frames(predictions_train_frames)
performance_test_df = _concat_frames(performance_test_frames)
predictions_test_df = _concat_frames(predictions_test_frames)
datasets_final = _concat_frames(dataset_frames)
parameter_df = pd.DataFrame(parameter_resume)

if not datasets_final.empty:
    # Store the target as its numeric ChEMBL ID. The prefix is removed with an
    # anchored pattern because str.lstrip("CHEMBL") strips a *set of
    # characters*, not the literal prefix.
    datasets_final["Target ID"] = (
        datasets_final["Target ID"].str.replace(r"^CHEMBL", "", regex=True).astype(int)
    )

# Save results
os.makedirs(result_path, exist_ok=True)  # make sure the output folder exists
performance_train_df.to_csv(os.path.join(result_path, 'performance_train.csv'))
predictions_train_df.to_csv(os.path.join(result_path, 'predictions_train.csv'))
performance_test_df.to_csv(os.path.join(result_path, 'performance_test.csv'))
predictions_test_df.to_csv(os.path.join(result_path, 'predictions_test.csv'))
parameter_df.to_csv(os.path.join(result_path, 'model_best_parameters.csv'))
datasets_final.to_csv(os.path.join(result_path, 'dataset_training_sizes.csv'))

  0%|          | 0/8 [00:00<?, ?it/s]

Training on CHEMBL333
Starting Trial 0
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 1
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 2
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 3
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 4
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 5
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 6
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 7
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 8
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 9
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Created new directory './regression_models_50_50_R2/ECFP4/Unbalanced/CHEMBL268/'
Training on CHEMBL268
Starting Trial 0
Training 1-NN
Training kNN
Training SV