In [1]:
# Load IPython's autoreload extension; the reload mode itself is selected
# later with `%autoreload 2`.
%load_ext autoreload

In [2]:
import os
from collections import defaultdict

import joblib
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

from ML.data_preprocessing import Dataset
from ML.machine_learning_models import *
# ML utils
from ML.machine_learning_models import Model_Evaluation as ml_evaluation
from ML.ml_utils_reg import create_directory, ECFP4, set_global_determinism, potency_classes
from ML.utils_dataset import dataset_train_test
# Autoreload mode 2: reload all modules before executing code, so edits to the
# imported project modules are picked up without restarting the kernel.
%autoreload 2

In [3]:
# --- Experiment configuration ---

# Regression models to benchmark
model_list = [
    '1-NN',
    'kNN',
    'SVR',
    'RFR',
    'MR',
]

# Number of independent data splits (trials)
trial_splits = 10

# Molecular fingerprint label (used when building output paths)
fingerprint = 'ECFP4'

# Approach label (used in output paths and stored in the result tables)
approach = 'Balanced'

# Training set sizes to evaluate
set_sizes = [6, 12, 18, 30, 48, 78, 126, 204, 330]

In [None]:
# Folder that holds the ChEMBL dataset files
db_path = "./dataset/Datasets Chembl/"

# Read the compound table, then attach a `potency_class` column computed by
# `potency_classes` from the pPot values and the thresholds [5, 7, 9, 11].
# NOTE(review): the exact binning rule lives in ML.ml_utils_reg — confirm there.
regression_db_all = pd.read_csv(db_path + 'chembl_30_IC50_500_CPDs.csv')
regression_db_all = regression_db_all.assign(
    potency_class=potency_classes(regression_db_all.pPot.values, [5, 7, 9, 11])
)
regression_db_all

# Select Activity classes

In [9]:
# Select the targets (chembl_tid) whose compounds fall in exactly the potency
# classes 5, 7 and 9, with more than `n` compounds in each of those classes.
tid_list = []
n = 110  # strict lower bound on the number of compounds per potency class
required_classes = {5, 7, 9}

# sort=False keeps the targets in order of first appearance in the table,
# the same order `chembl_tid.unique()` would produce.
for tid, df_reg_tid in regression_db_all.groupby('chembl_tid', sort=False):
    # dropna=False so that a target with missing potency classes is rejected
    # by the set comparison below rather than silently accepted.
    class_counts = df_reg_tid['potency_class'].value_counts(dropna=False)

    # Compare as a set: `Series.unique()` returns values in order of first
    # appearance, so comparing its list form against [5, 7, 9] would reject a
    # valid target merely because its rows are stored in a different order.
    if set(class_counts.index) == required_classes and (class_counts > n).all():
        tid_list.append(tid)

print(tid_list)

['CHEMBL280', 'CHEMBL203', 'CHEMBL2409']


# Selected dataset

In [None]:
# Keep only the rows that belong to one of the selected targets
regression_tids = tid_list
is_selected_target = regression_db_all.chembl_tid.isin(regression_tids)
regression_db = regression_db_all.loc[is_selected_target]
regression_db

# Results path

In [11]:
# Root output folder; results are grouped below it by fingerprint and approach
main_folder = 'regression_models_increase_tr_sizes_3_bins_330'
result_path = './' + '/'.join((main_folder, fingerprint, approach)) + '/'

# Target IDs

In [13]:
# Distinct target IDs present in the selected dataset (order of first appearance)
regression_tids = regression_db['chembl_tid'].unique()
regression_tids

array(['CHEMBL280', 'CHEMBL203', 'CHEMBL2409'], dtype=object)

In [14]:
# Train, save and evaluate every model for each target / trial / training-set
# size, then write the collected performance, prediction and parameter tables
# to CSV files under `result_path`.

# Performance/prediction datasets
dict_storage = defaultdict(pd.DataFrame)
parameter_resume = []

# Per-model result tables are collected in lists and concatenated once after
# the loops. Growing a DataFrame with pd.concat on every iteration re-copies
# all previously collected rows each time.
performance_train_frames = []
predictions_train_frames = []
performance_test_frames = []
predictions_test_frames = []


def _build_dataset(df_split, with_potency_classes=False):
    """Wrap one split of the regression table into a `Dataset`.

    The `Dataset` is constructed from the ECFP4 output for the
    `nonstereo_aromatic_smiles` column and from the `pPot` values, and is
    annotated with the target ID, SMILES and compound ID of each row.

    Parameters
    ----------
    df_split : pd.DataFrame
        Rows of one train or test split.
    with_potency_classes : bool, default False
        Also attach the `potency_class` column as "potency_classes".

    Returns
    -------
    Dataset
    """
    smiles = df_split.nonstereo_aromatic_smiles.values
    dataset = Dataset(np.array(ECFP4(smiles)), np.array(df_split.pPot.values))
    dataset.add_instance("target", df_split.chembl_tid.values)
    dataset.add_instance("smiles", smiles)
    dataset.add_instance("cid", df_split.chembl_cid.values)
    if with_potency_classes:
        dataset.add_instance("potency_classes", df_split.potency_class.values)
    return dataset


def _tag_results(results, trial, size):
    """Add the trial, approach and training-size columns to `results` in place and return it."""
    results["trial"] = trial
    results["Approach"] = approach
    results["Training size"] = size
    return results


def _concat_results(frames):
    """Concatenate the collected frames; an empty list yields an empty DataFrame."""
    return pd.concat(frames) if frames else pd.DataFrame()


for target in tqdm(regression_tids):

    # Built from `result_path` so that models and result tables always share
    # the same root folder.
    target_path = create_directory(f'{result_path}{target}/')

    print(f'Training on {target}')

    # Select Target Database
    regression_db_tid = regression_db.loc[regression_db.chembl_tid == target]
    df_regression = dataset_train_test(regression_db_tid, pot_bins=[5,7,9], tr_set_sizes=set_sizes, balance_test_set=True, n_trials=trial_splits, plot=False)

    for trial in range(trial_splits):
        print(f'Starting Trial {trial}')

        df_regression_trial = df_regression.loc[df_regression.trial == trial]

        # Seed with the trial number so each trial is reproducible
        set_global_determinism(seed=trial)

        # TEST set: one per trial, shared by all training-set sizes
        df_regression_test = df_regression_trial.loc[df_regression_trial.dataset == "test"]
        test_set = _build_dataset(df_regression_test)

        for size in set_sizes:
            print(size)

            # TRAIN set for this trial and training-set size
            df_regression_train = df_regression_trial.loc[(df_regression_trial.dataset == "train") & (df_regression_trial.set_size == size)]
            train_set = _build_dataset(df_regression_train, with_potency_classes=True)

            for model in model_list:

                print(f'Training {model}')

                # Create saving directory
                model_fpath = create_directory(f"{result_path}{target}/{model}/{size}", verbose=False)

                # NOTE(review): MLModel presumably fits/optimises the model on
                # construction (it exposes `.model` and `.best_params`
                # immediately afterwards) — confirm in ML.machine_learning_models.
                ml_model = MLModel(train_set, model, data_type='balanced')
                joblib.dump(ml_model.model, os.path.join(model_fpath, f"{model}_{trial}.pkl"))

                # Best model parameters, keyed by the run identifiers
                parameter_resume.append({'model': model,
                                         'trial': trial,
                                         'Target ID': target,
                                         'Training size': size,
                                         **ml_model.best_params})

                # Model evaluation on the training set and on the test set
                model_eval_train = ml_evaluation(ml_model, train_set, train_set, model_id=model)
                model_eval_test = ml_evaluation(ml_model, test_set, train_set, model_id=model)

                # Training-set performance and predictions
                performance_train_frames.append(_tag_results(model_eval_train.pred_performance, trial, size))
                predictions_train_frames.append(_tag_results(model_eval_train.predictions, trial, size))

                # Test-set performance and predictions
                performance_test_frames.append(_tag_results(model_eval_test.pred_performance, trial, size))
                predictions_test_frames.append(_tag_results(model_eval_test.predictions, trial, size))

performance_train_df = _concat_results(performance_train_frames)
predictions_train_df = _concat_results(predictions_train_frames)
performance_test_df = _concat_results(performance_test_frames)
predictions_test_df = _concat_results(predictions_test_frames)
parameter_df = pd.DataFrame(parameter_resume)

# Save results (make sure the folder exists even if no target was processed)
os.makedirs(result_path, exist_ok=True)
performance_train_df.to_csv(os.path.join(result_path, 'performance_train.csv'))
predictions_train_df.to_csv(os.path.join(result_path, 'predictions_train.csv'))
performance_test_df.to_csv(os.path.join(result_path, 'performance_test.csv'))
predictions_test_df.to_csv(os.path.join(result_path, 'predictions_test.csv'))
parameter_df.to_csv(os.path.join(result_path, 'model_best_parameters.csv'))

  0%|          | 0/3 [00:00<?, ?it/s]

Created new directory './pred_models_increase_tr_sizes_3_bins_330/ECFP4/Balanced/CHEMBL280/'
Training on CHEMBL280
Starting Trial 0
6
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
12
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
18
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
30
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
48
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
78
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
126
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
204
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
330
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 1
6
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
12
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
18
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
30
Training 1-NN
Training kNN
Trai