In [5]:
# Utils: project helper modules plus notebook display / progress-bar helpers.
# NOTE(review): `os` and `tf` are used below but never imported explicitly in
# this cell; presumably one of the star-imports re-exports them — confirm.
from ml_utils import *
from machine_learning_models import *
from fingerprints import *
from IPython.core.display_functions import display
from tqdm.notebook import tqdm
# Sklearn (no explicit scikit-learn imports are made in this cell)

# Quieten TensorFlow: '3' is the most restrictive level of the C++ backend
# logger, and the Python-side logger is limited to ERROR messages.
# NOTE(review): the environment variable is normally read when TensorFlow is
# first imported; `tf` is already in scope here, so verify it still has an effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Automatically reload edited modules before each cell is executed.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Models Parameters
### Select the desired parameters to be used by regression models
<p>
<li> <b>model_list</b>: ML/DL models for regression (kNN: k-nearest neighbors, SVR: Support Vector Regression, RFR: Random Forest Regression, DNN: Deep Neural Network, MR: Median Regression)</li>
</p>
<p>
<li> <b>cv_folds</b>: Number of data splits (trials) to be performed</li>
</p>
<p>
<li> <b>opt_metric</b>: Metric to be used for model optimization (MAE: 'neg_mean_absolute_error', MSE: 'neg_mean_squared_error')</li>
</p>
<p>
<li> <b>compound_sets</b>: Compound sets to be generated ('Cluster set': largest analogue series, 'Potent set': most potent compounds)</li>
</p>
<p>
<li> <b>potent_size</b>: Size of the potent set as a fraction of the original set (0.1 = 10% of the original set)</li>
</p>



In [6]:
# Regression algorithms to run (MR is the median-regression baseline).
model_list = ["kNN", "SVR", "RFR", "DNN", "MR"]

# Number of data splits (trials).
cv_folds = 10

# Name of the metric used for model optimization (negated mean absolute error).
opt_metric = 'neg_mean_absolute_error'

# Hold-out strategies: largest analogue series / most potent compounds.
compound_sets = ["Cluster set", "Potent set"]

# Fraction of a target's compounds placed in the potent set (0.1 = 10%).
potent_size = 0.1

'./regression_results/'


# Loading Data

In [7]:
# Directory that holds the per-target CCR result files
ccr_path = "./ccr_results/"
# Directory that holds the compound/potency dataset
db_path = './dataset/'
# Read the compound table used for regression
regression_db = pd.read_csv(
    os.path.join(db_path, 'chembl_30_IC50_10_tids_1000_CPDs.csv')
)
# Target IDs to model: the first 10 distinct values, in order of appearance
regression_tids = regression_db["chembl_tid"].unique()[:10]

# Models

In [None]:

# Train and evaluate every model in `model_list` on each target in
# `regression_tids`, once per hold-out strategy listed in `compound_sets`:
#   * 'Cluster set': the test set is the analogue series (CCR 'Core') with the
#     trial-th highest compound count; every other compound is used for training.
#   * 'Potent set' : the test set is the top `potent_size` fraction of compounds
#     ranked by 'pPot'; every other compound is used for training.
# Train/test performance, test predictions and the best hyper-parameters are
# collected and written to ./regression_results/cluster_potent/.
#
# NOTE(review): `cv_folds` and `opt_metric` from the parameter cell are not
# passed to any model here, and the number of trials is fixed to 1 below —
# confirm this is intended.

# Per-iteration frames are gathered in lists and concatenated once after the
# loops; concatenating inside the loop re-copies all previous rows each time.
performance_train_frames = []
performance_test_frames = []
predictions_test_frames = []
parameter_resume = []

# Kept for compatibility with the original cell: training-set predictions are
# not collected, so this frame stays empty.
predictions_train_df = pd.DataFrame()

# Molecular Fingerprints (fitted once on all SMILES, reused for every split)
morgan_radius2 = FoldedMorganFingerprint(radius=2)
morgan_radius2.fit_smiles(regression_db.nonstereo_aromatic_smiles.tolist())

for target in tqdm(regression_tids):
    # Select Target Database (does not depend on approach or trial)
    regression_db_tid = regression_db.loc[regression_db.chembl_tid == target]

    # Honour the configured compound sets instead of a second hard-coded list.
    for approach in compound_sets:
        # File-system friendly tag; keeps the models of the different
        # approaches from overwriting each other on disk.
        approach_tag = approach.strip().replace(' ', '_')

        for trial in range(1):
            print(f'Training on {target}')

            if approach == 'Cluster set':
                ccr_df = pd.read_csv(os.path.join(ccr_path, f'CCR_C30_IC50_HT_single_5_0.666_13_{target}.csv'))

                # Compound IDs of the core with the trial-th highest count
                # (value_counts sorts by frequency, most frequent first).
                selected_core = ccr_df['Core'].value_counts().index[trial]
                ccr_df_AS = ccr_df.loc[ccr_df['Core'] == selected_core].chembl_id.values

                in_series = regression_db_tid['chembl_cid'].isin(ccr_df_AS)
                df_TR = regression_db_tid.loc[~in_series]
                df_TE = regression_db_tid.loc[in_series]

            elif approach == 'Potent set':
                n_potent = int(round(len(regression_db_tid.index) * potent_size))
                df_TE = regression_db_tid.nlargest(n_potent, 'pPot')
                df_TR = regression_db_tid.loc[~regression_db_tid['chembl_cid'].isin(df_TE['chembl_cid'])]

            else:
                # Fail loudly rather than silently reusing the split of the
                # previous iteration.
                raise ValueError(f"Unknown compound set: {approach!r}")

            # Constructing ChEMBL Dataset
            fp_matrix_tr = morgan_radius2.transform_smiles(df_TR.nonstereo_aromatic_smiles.tolist())
            fp_matrix_te = morgan_radius2.transform_smiles(df_TE.nonstereo_aromatic_smiles.tolist())

            # Potency values
            potency_tr = df_TR.pPot.values
            potency_te = df_TE.pPot.values

            # Constructing Dataset
            training_set = Dataset(fp_matrix_tr, np.array(potency_tr))
            training_set.add_instance("target", df_TR.chembl_tid.values)
            training_set.add_instance("smiles", df_TR.nonstereo_aromatic_smiles.values)

            test_set = Dataset(fp_matrix_te, np.array(potency_te))
            test_set.add_instance("target", df_TE.chembl_tid.values)
            test_set.add_instance("smiles", df_TE.nonstereo_aromatic_smiles.values)

            for model in model_list:
                print(f'Training {model}')

                model_fpath = create_directory(
                    f"./trained_models/{model}/{target}_{approach_tag}_{trial}", verbose=False
                )

                if model == 'DNN':
                    ml_model = DNN(training_set, model, training_set.features.shape[1], seed=trial)
                    model_fpath += ".h5"
                    ml_model.model.save(model_fpath)
                else:
                    ml_model = MLModel(training_set, model)
                    # NOTE(review): the path is built, but non-DNN models are
                    # not written to disk anywhere in this cell.
                    model_fpath += ".p.gz"

                # Best model parameters (the approach is recorded so rows of
                # the two compound sets can be told apart)
                opt_parameters_dict = {'model': model,
                                       'trial': trial,
                                       'Target ID': target,
                                       'Approach': approach}
                opt_parameters_dict.update(ml_model.best_params)
                parameter_resume.append(opt_parameters_dict)

                # TRAIN
                model_eval_train = Model_Evaluation(ml_model, training_set)

                performance_train = model_eval_train.pred_performance
                performance_train["trial"] = trial
                performance_train["Approach"] = approach
                performance_train["Approach_trial"] = trial
                performance_train_frames.append(performance_train)

                # TEST
                model_eval_test = Model_Evaluation(ml_model, test_set)

                performance_test = model_eval_test.pred_performance
                performance_test["Approach"] = approach
                performance_test["trial"] = trial
                performance_test_frames.append(performance_test)

                predictions_test = model_eval_test.predictions
                predictions_test["Approach"] = approach
                predictions_test["trial"] = trial
                predictions_test_frames.append(predictions_test)

# Single concatenation per result table; fall back to an empty frame when
# nothing was produced (pd.concat raises on an empty list).
performance_train_df = pd.concat(performance_train_frames) if performance_train_frames else pd.DataFrame()
performance_test_df = pd.concat(performance_test_frames) if performance_test_frames else pd.DataFrame()
predictions_test_df = pd.concat(predictions_test_frames) if predictions_test_frames else pd.DataFrame()

parameter_df = pd.DataFrame(parameter_resume)
display(parameter_df)
display(performance_test_df)

# Save results
result_path = create_directory('./regression_results/cluster_potent/')
performance_train_df.to_csv(os.path.join(result_path, 'performance_train_cluster_potent.csv'))
performance_test_df.to_csv(os.path.join(result_path, 'performance_test_cluster_potent.csv'))
parameter_df.to_csv(os.path.join(result_path, 'model_best_parameters_cluster_potent.csv'))
predictions_test_df.to_csv(os.path.join(result_path, 'predictions_test_cluster_potent.csv'))