In [36]:
#Utils
from tqdm.notebook import tqdm
from ml_utils import *
from machine_learning_models import *
from fingerprints import *
from sklearn.model_selection import ShuffleSplit
import random
#deepchem
import deepchem as dc
from deepchem.models import GraphConvModel
from IPython.core.display_functions import display
import warnings
warnings.filterwarnings('ignore')
tf.get_logger().setLevel('ERROR')
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Models Parameters
### Select the desired parameters to be used by GCN models
<p>
<li> <b>model_list</b>: ML/DL models for regression (GCN: Graph Convolutional Network)</li>
</p>
<p>
<li> <b>cv_folds</b>: Number of data splits (trials) to be performed</li>
</p>
<p>
<li> <b>data_order</b>: Different data orders ('regular': Normal potency (y) order, 'y_rand': Randomized potency values) </li>
</p>
<p>
<li> <b>compound_sets</b>: Compound sets to be generated ('Complete set': 100% of the compounds, 'Random set': randomly selected subset of compounds, 'Diverse set': chemically diverse subset of compounds) </li>
</p>
<p>
<li> <b>compound_sets_size</b>: Fraction of the respective 'Complete set' used to build the 'Random set' and the 'Diverse set' (0.25 = 25% of the compounds; the 'Complete set' always contains 100% of the compounds) </li>
</p>
<p>
<li> <b>params_dict</b>: GCN hyperparameter grid (nb_epoch: number of epochs, learning_rate, n_tasks, graph_conv_layers, dense_layer_size, dropout, mode, number_atom_features: number of atom features) </li>
</p>


In [None]:
# Model families to train; each entry is run once per train/test split
model_list = ['GCN']

# Number of ShuffleSplit train/test partitions (trials)
cv_folds = 10

# 'regular' keeps the measured potencies, 'y_rand' shuffles them (negative control)
data_order = ['regular', 'y_rand']

# Compound subsets evaluated for every target
compound_sets = ['Complete set', 'Random set', 'Diverse set']

# Fraction of the complete set kept for the 'Random set' and 'Diverse set'
compound_sets_size = 0.25

# Hyperparameter grid handed to the DeepChem grid search.
# Key order and value order are kept stable because they define the order in
# which the grid combinations are enumerated.
params_dict = {
    "nb_epoch": [100, 200],
    "learning_rate": [0.01, 0.001],
    "n_tasks": [1],
    # Two graph-convolution layers of equal width
    "graph_conv_layers": [[width, width] for width in (64, 256, 512, 1024)],
    "dense_layer_size": [64, 256, 512, 1024],
    "dropout": [0.0],
    "mode": ["regression"],
    "number_atom_features": [75],
}

# Load Data
### Load compound database to be used for the regression models

<p>
<li> <b>db_path</b>: path to the directory containing the dataset</li>
</p>

In [40]:
# Directory that holds the input CSV
db_path = './dataset/'

# Read the compound table used by all regression models
regression_db = pd.read_csv(
    os.path.join(db_path, 'chembl_30_IC50_10_tids_1000_CPDs.csv')
)

# Target classes: the first ten distinct values of the `chembl_tid` column
regression_tids = regression_db['chembl_tid'].unique()[:10]

# GCN Models
### The following code generates potency predictions using GCN models

In [None]:
# Create saving path
create_directory('./regression_results/')

# Number of independent draws for the sub-sampled sets ('Random set' / 'Diverse set').
# The 'Complete set' is only evaluated once because it has nothing to re-draw.
N_SUBSET_DRAWS = 3

# Fixed seed for the y-randomization control so that a re-run shuffles identically
Y_RAND_SEED = 20021997


def final_gcn(data, best_params):
    """Train a fresh GraphConvModel on `data` with the selected hyperparameters.

    Parameters
    ----------
    data : DeepChem dataset
        Dataset the final model is fitted on.
    best_params : dict
        Hyperparameters selected by the grid search. Must contain the keys
        "n_tasks", "graph_conv_layers", "dense_layer_size", "dropout", "mode",
        "learning_rate" and "nb_epoch"; "number_atom_features" is optional
        (defaults to 75).

    Returns
    -------
    GraphConvModel
        The fitted model.
    """
    gcn = GraphConvModel(n_tasks=best_params["n_tasks"],
                         graph_conv_layers=best_params["graph_conv_layers"],
                         # Same keyword the grid search uses to build its candidates;
                         # any other name would leave the dense layer at its default size.
                         dense_layer_size=best_params["dense_layer_size"],
                         dropout=best_params["dropout"],
                         mode=best_params["mode"],
                         number_atom_features=best_params.get("number_atom_features", 75),
                         learning_rate=best_params["learning_rate"],
                         )

    gcn.fit(data, nb_epoch=best_params["nb_epoch"])

    return gcn


def _concat_frames(frames):
    """Concatenate the collected result frames; return an empty frame if none were collected."""
    return pd.concat(frames) if frames else pd.DataFrame()


for data_ord in data_order:

    # Per-fold frames are gathered in lists and concatenated once at the end
    # (growing a DataFrame with pd.concat inside the loop is quadratic).
    performance_train_frames = []
    performance_test_frames = []
    predictions_test_frames = []
    parameter_resume = []

    for target in tqdm(regression_tids):

        # Select Target Database
        regression_db_tid = regression_db.loc[regression_db.chembl_tid == target]
        smiles = regression_db_tid.nonstereo_aromatic_smiles.tolist()

        # Compound potency
        potency = regression_db_tid.pPot.values.tolist()

        # Randomized potency (negative control); a private, seeded RNG keeps the
        # shuffle reproducible and independent of the global random state
        if data_ord == 'y_rand':
            random.Random(Y_RAND_SEED).shuffle(potency)

        # Parsing, featurization and dataset construction only depend on the target
        # and the potency order, so they are done once here rather than once per
        # approach / repeat.
        mols = [Chem.MolFromSmiles(smi) for smi in smiles]
        featurizer = dc.feat.ConvMolFeaturizer()
        mol_graphs = featurizer.featurize(mols)
        # Every row carries the target id as its dataset id
        full_dataset = dc.data.NumpyDataset(X=mol_graphs, y=np.array(potency), ids=np.array(regression_db_tid.chembl_tid.values))

        for approach in compound_sets:
            for i in range(N_SUBSET_DRAWS):
                print(f'Training on {target} - {approach} - {data_ord}')

                # Data Sampling Approaches (`select` returns a new dataset,
                # `full_dataset` itself is never modified)
                dataset = full_dataset

                if approach == 'Random set':
                    # Private RNG seeded with i+1: draws the same indices as
                    # random.seed(i+1) + random.sample without touching global state
                    sampler = random.Random(i+1)
                    mol_idx = sampler.sample(list(range(len(dataset))), int(compound_sets_size*len(dataset)))
                    dataset = dataset.select(mol_idx)

                elif approach == 'Diverse set':
                    fp_bit_vec = ECFP4(smiles)
                    mol_idx = maxminpicker(fp_bit_vec, compound_sets_size, seed=i+1)
                    dataset = dataset.select(mol_idx)

                # Split dataset into TR and TE
                data_splitter = ShuffleSplit(n_splits=cv_folds, random_state=20021997, test_size=0.2)
                for trial, (train_idx, test_idx) in enumerate(data_splitter.split(dataset.X)):

                    # Defining Training and Test sets
                    training_set = dataset.select(train_idx)
                    test_set = dataset.select(test_idx)

                    # Fit the y-normalization on the training compounds only, then apply
                    # the same transformer to both sets. Fitting on the full dataset
                    # would leak test-set statistics into training.
                    transformers_train = dc.trans.NormalizationTransformer(transform_y=True, dataset=training_set)
                    training_set = transformers_train.transform(training_set)
                    test_set = transformers_train.transform(test_set)

                    # Split dataset into TR and internal Validation
                    splitter = dc.splits.RandomSplitter()
                    train_set, valid_set = splitter.train_test_split(training_set, seed=trial)

                    for model in model_list:

                        # Define random seed (reproducibility)
                        tf.keras.utils.set_random_seed(trial)

                        # Initialize GridSearch optimizer
                        optimizer = dc.hyper.GridHyperparamOpt(dc.models.GraphConvModel)

                        # Select optimization metric (MAE, lower is better -> use_max=False)
                        metric = dc.metrics.Metric(dc.metrics.mae_score)

                        # NOTE(review): DeepChem's grid search appears to take the number of
                        # training epochs from the `nb_epoch` argument of hyperparam_search
                        # rather than from a key in params_dict - confirm that the "nb_epoch"
                        # grid entry is actually honoured during the search.
                        # Best GCN model, parameters and final results
                        best_model, best_params, all_results = optimizer.hyperparam_search(params_dict=params_dict,
                                                                                           train_dataset=train_set,
                                                                                           valid_dataset=valid_set,
                                                                                           metric=metric,
                                                                                           use_max=False,
                                                                                           output_transformers=[transformers_train],
                                                                                           #logdir=r'C:\\GCN\\'
                                                                                           )

                        # Best GCN model parameters
                        opt_parameters_dict = {'model': model,
                                               'trial': trial,
                                               'Target ID': target,
                                               'Approach':approach}
                        opt_parameters_dict.update(best_params)
                        parameter_resume.append(opt_parameters_dict)

                        # Generate final Model: refit on the whole training set
                        # (train + internal validation) with the selected parameters
                        ml_model = final_gcn(training_set, best_params)

                        # TRAIN
                        # Model Evaluation
                        model_eval_train = Model_Evaluation(ml_model, training_set, transformers_train, model_id=model)

                        # Performance df
                        performance_train = model_eval_train.pred_performance
                        performance_train["trial"] = trial
                        performance_train["Approach"] = approach
                        performance_train["Approach_trial"] = i
                        performance_train["data_order"] = data_ord
                        performance_train_frames.append(performance_train)

                        # TEST
                        # Model Evaluation
                        model_eval_test = Model_Evaluation(ml_model, test_set, transformers_train, model_id=model)

                        # Performance df
                        performance_test = model_eval_test.pred_performance
                        performance_test["trial"] = trial
                        performance_test["Approach"] = approach
                        performance_test["Approach_trial"] = i
                        performance_test["data_order"] = data_ord
                        performance_test_frames.append(performance_test)

                        # Prediction df
                        predictions_test = model_eval_test.predictions
                        predictions_test["trial"] = trial
                        predictions_test["Approach"] = approach
                        predictions_test["Approach_trial"] = i
                        predictions_test["data_order"] = data_ord
                        predictions_test_frames.append(predictions_test)

                        # Drop references to the fitted models before the next iteration
                        del best_model, best_params, all_results, ml_model

                # The complete set is deterministic, one pass is enough
                if approach == 'Complete set':
                    break

    performance_train_df = _concat_frames(performance_train_frames)
    performance_test_df = _concat_frames(performance_test_frames)
    predictions_test_df = _concat_frames(predictions_test_frames)

    display(performance_test_df)
    parameter_df = pd.DataFrame(parameter_resume)

    # Save results. Every file carries the `_gcn` tag; the y-randomized run
    # additionally gets a `_y_rand` suffix and its own sub-directory.
    if data_ord == 'y_rand':
        result_path = create_directory('./regression_results/y_rand/')
        file_suffix = '_y_rand'
    else:
        result_path = create_directory('./regression_results/regular/')
        file_suffix = ''

    performance_train_df.to_csv(os.path.join(result_path, f'performance_train_gcn{file_suffix}.csv'))
    performance_test_df.to_csv(os.path.join(result_path, f'performance_test_gcn{file_suffix}.csv'))
    parameter_df.to_csv(os.path.join(result_path, f'model_best_parameters_gcn{file_suffix}.csv'))
    predictions_test_df.to_csv(os.path.join(result_path, f'predictions_test_gcn{file_suffix}.csv'))