# Load Libraries

In [5]:
#Utils
from ml_utils import *
from machine_learning_models import *
from fingerprints import *
import random
from IPython.core.display_functions import display
from tqdm.notebook import tqdm
#Sklearn
from sklearn.model_selection import ShuffleSplit

# Quiet TensorFlow's native (C++) logging; '3' is presumably the most restrictive filter level.
# NOTE(review): `os` and `tf` are not imported explicitly in this cell and appear to arrive via
# the star imports above. TF_CPP_MIN_LOG_LEVEL is normally read when TensorFlow is first
# imported, so confirm that setting it after those imports still has the intended effect.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# Restrict TensorFlow's Python-side (compat.v1) logger to ERROR-level messages.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# IPython magics: load the autoreload extension and select mode 2
# (presumably "reload all modules before running code" - see the IPython autoreload docs).
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Models Parameters
### Select the desired parameters to be used by regression models
<p>
<li> <b>model_list</b>: ML/DL models for regression (kNN: k-nearest neighbors, SVR: Support Vector Regression, RFR: Random Forest Regression, DNN: Deep Neural Network, MR: Median regression)</li>
</p>
<p>
<li> <b>cv_folds</b>: Number of data splits (trials) to be performed</li>
</p>
<p>
<li> <b>opt_metric</b>: Optimization metric to be used for model optimization (MAE: 'neg_mean_absolute_error', MSE: 'neg_mean_squared_error')</li>
</p>
<p>
<li> <b>data_order</b>: Different data orders ('regular': Normal potency (y) order, 'y_rand': Randomized potency values) </li>
</p>
<p>
<li> <b>compound_sets</b>: Compound sets to be generated ('Complete set': 100% compounds, 'Random set': Random set of compounds, 'Diverse set': Chemically diverse set of compounds) </li>
</p>
<p>
<li> <b>compound_sets_size</b>: Fraction of the respective 'Complete set' used to build the 'Random' and 'Diverse' sets (e.g. 0.25 gives 'Complete set': 100% of compounds, 'Random set': 25%, 'Diverse set': 25%) </li>
</p>




In [3]:
# --- Experiment configuration -------------------------------------------------

# Regression models to train and evaluate.
model_list = ["kNN", "SVR", "RFR", "DNN", "MR"]

# Number of train/test splits (trials).
cv_folds = 10

# Scoring metric name for model optimization.
opt_metric = "neg_mean_absolute_error"

# Potency orderings to run: original values and y-randomized values.
data_order = ["regular", "y_rand"]

# Compound-set sampling approaches to evaluate.
compound_sets = ["Complete set", "Random set", "Diverse set"]

# Fraction of the complete set used for the sampled compound sets.
compound_sets_size = 0.25


# Load Data
### Load compound database to be used for the regression models

<p>
<li> <b>db_path</b>: dataset full path</li>
</p>

In [None]:
# Directory that holds the compound data set
db_path = './dataset/'

# Read the compound table (CSV) into a DataFrame
regression_db = pd.read_csv(os.path.join(db_path, 'chembl_30_IC50_10_tids_1000_CPDs.csv'))

# Targets to model: the first ten distinct chembl_tid values
regression_tids = regression_db['chembl_tid'].unique()[:10]

# Models

In [None]:
# Result containers filled by the training loop
performance_train_df, performance_test_df = pd.DataFrame(), pd.DataFrame()
predictions_train_df, predictions_test_df = pd.DataFrame(), pd.DataFrame()
parameter_resume = list()

# Folded Morgan fingerprint generator (radius 2), fitted on the SMILES of every compound in the database
morgan_radius2 = FoldedMorganFingerprint(radius=2)
morgan_radius2.fit_smiles(regression_db['nonstereo_aromatic_smiles'].tolist())

# Train and evaluate every model for each data order, target, compound-set approach and trial,
# then write one set of result CSVs per data order.
for data_ord in data_order:
    # Start each data order with empty accumulators so that the CSV files written at the end
    # of this iteration contain only this data order's results. (Without the reset, the
    # '*_y_rand.csv' files would also contain every 'regular' row, with no column to tell
    # the two apart.)
    performance_train_frames = []
    performance_test_frames = []
    predictions_test_frames = []
    parameter_resume = []

    for target in tqdm(regression_tids):
        for approach in compound_sets:
            # Up to three sampling repetitions per approach; the 'Complete set' is
            # deterministic, so it leaves the loop after the first pass (see the break below).
            for i in range(3):
                print(f'Training on {target}')

                # Select Target Database
                regression_db_tid = regression_db.loc[regression_db.chembl_tid == target]

                # Constructing ChEMBL Dataset
                fp_matrix = morgan_radius2.transform_smiles(regression_db_tid.nonstereo_aromatic_smiles.tolist())

                # Potency values as an independent array. Shuffling this copy (instead of
                # shuffling `.values` of the DataFrame slice in place) does not depend on
                # `.values` being a writable view, which pandas Copy-on-Write forbids.
                potency = np.array(regression_db_tid.pPot.values)

                # Randomized Class potency (y-randomization control)
                if data_ord == "y_rand":
                    random.shuffle(potency)

                # Constructing Dataset
                dataset = Dataset(fp_matrix, potency)
                dataset.add_instance("target", regression_db_tid.chembl_tid.values)
                dataset.add_instance("smiles", regression_db_tid.nonstereo_aromatic_smiles.values)

                # Data Sampling Approaches
                if approach == 'Diverse set':
                    fp_bit_vec = ECFP4(regression_db_tid.nonstereo_aromatic_smiles.tolist())
                    mol_idx = maxminpicker(fp_bit_vec, compound_sets_size, seed=i+1)
                    dataset = dataset[mol_idx]

                elif approach == 'Random set':
                    random.seed(i+1)
                    n_compounds = dataset.features.shape[0]
                    # Sampling from a range draws the same indices as sampling from the
                    # equivalent list, without materializing it.
                    mol_idx = random.sample(range(n_compounds), int(compound_sets_size * n_compounds))
                    dataset = dataset[mol_idx]

                # Split dataset into TR and TE
                data_splitter = ShuffleSplit(n_splits=cv_folds, random_state=20021997, test_size=0.2)
                for trial, (train_idx, test_idx) in enumerate(data_splitter.split(dataset.features, dataset.target)):
                    print(f'Starting Trial {trial}')

                    # Defining Training and Test sets
                    training_set = dataset[train_idx]
                    test_set = dataset[test_idx]

                    for model in model_list:
                        print(f'Training {model}')

                        # Save ML models
                        # NOTE(review): the save path is the value returned by create_directory
                        # plus an extension, so it does not vary with target/approach/trial;
                        # presumably each DNN save overwrites the previous one, and non-DNN
                        # models get a '.p.gz' path but are never written here. Confirm intent.
                        model_fpath = create_directory(f"./regression_results/trained_models/{model}/", verbose=False)
                        if model == 'DNN':
                            ml_model = DNN(training_set, model, training_set.features.shape[1], seed=trial)
                            model_fpath += ".h5"
                            ml_model.model.save(model_fpath)
                        else:
                            ml_model = MLModel(training_set, model)
                            model_fpath += ".p.gz"

                        # Best model parameters dictionary
                        opt_parameters_dict = {'model': model,
                                               'trial': trial,
                                               'Target ID': target}
                        opt_parameters_dict.update(ml_model.best_params)
                        parameter_resume.append(opt_parameters_dict)

                        # TRAIN
                        # Model Evaluation
                        model_eval_train = Model_Evaluation(ml_model, training_set)

                        # Performance df
                        performance_train = model_eval_train.pred_performance
                        performance_train["trial"] = trial
                        performance_train["Approach"] = approach
                        performance_train["Approach_trial"] = i
                        performance_train_frames.append(performance_train)

                        # TEST
                        # Model Evaluation
                        model_eval_test = Model_Evaluation(ml_model, test_set)

                        # Performance df
                        performance_test = model_eval_test.pred_performance
                        performance_test["trial"] = trial
                        performance_test["Approach"] = approach
                        performance_test["Approach_trial"] = i
                        performance_test_frames.append(performance_test)

                        # Prediction df
                        predictions_test = model_eval_test.predictions
                        predictions_test["trial"] = trial
                        predictions_test["Approach"] = approach
                        predictions_test["Approach_trial"] = i
                        predictions_test_frames.append(predictions_test)

                if approach == 'Complete set':
                    break

    # All Dataframes
    # Concatenate once per data order: calling pd.concat inside the innermost loop re-copies
    # the whole accumulated frame on every call. An empty list (no targets/models) yields an
    # empty frame, because pd.concat raises on an empty sequence.
    performance_train_df = pd.concat(performance_train_frames) if performance_train_frames else pd.DataFrame()
    performance_test_df = pd.concat(performance_test_frames) if performance_test_frames else pd.DataFrame()
    predictions_test_df = pd.concat(predictions_test_frames) if predictions_test_frames else pd.DataFrame()
    parameter_df = pd.DataFrame(parameter_resume)
    display(performance_test_df)

    # Save results
    result_path = './regression_results/'
    # Make sure the output folder exists even if no model directory was created above.
    os.makedirs(result_path, exist_ok=True)
    # File names are '<name>.csv' for the regular order and '<name>_y_rand.csv' for y-randomization.
    file_suffix = '_y_rand' if data_ord == 'y_rand' else ''
    performance_train_df.to_csv(os.path.join(result_path, f'performance_train{file_suffix}.csv'))
    performance_test_df.to_csv(os.path.join(result_path, f'performance_test{file_suffix}.csv'))
    parameter_df.to_csv(os.path.join(result_path, f'model_best_parameters{file_suffix}.csv'))
    predictions_test_df.to_csv(os.path.join(result_path, f'predictions_test{file_suffix}.csv'))