In [1]:
%load_ext autoreload

In [2]:
from collections import defaultdict
import joblib
from tqdm.notebook import tqdm
from ML.data_preprocessing import Dataset
from ML.machine_learning_models import *
# ML utils
from ML.machine_learning_models import Model_Evaluation as ml_evaluation
from ML.ml_utils_reg import create_directory, ECFP4, set_global_determinism, potency_classes
%autoreload 2

In [3]:
# Experiment configuration
# Molecular fingerprint label; becomes part of the results path
fingerprint = 'ECFP4'
# Approach label; becomes part of the results path and is written to the 'Approach' column
approach = 'Unbalanced'
# Number of random train/test splits (trials) evaluated per target
trial_splits = 10
# Identifiers of the regression models trained in every trial
model_list = ['1-NN', 'kNN', 'SVR', 'RFR', 'MR']

In [None]:
# Folder holding the input tables
db_path = "./dataset/Datasets Chembl/"

# Read the compound table for all targets
regression_db_all = pd.read_csv(f'{db_path}chembl_30_IC50_500_CPDs.csv')

# Attach a potency class to every compound, computed from its pPot value.
# NOTE(review): [5, 7, 9, 11] are presumably the class boundaries used by
# potency_classes -- confirm against ML.ml_utils_reg.
potency_class_values = potency_classes(regression_db_all['pPot'].values, [5, 7, 9, 11])
regression_db_all['potency_class'] = potency_class_values

# Show the loaded table as the cell output
regression_db_all

## Selecting Targets

In [6]:
# Select the targets whose compounds fall into exactly the potency classes
# 5, 7 and 9, with more than `n` compounds in each of those classes.
tid_list = []
n = 75  # exclusive lower bound on the number of compounds per potency class
required_classes = [5, 7, 9]
for tid in regression_db_all.chembl_tid.unique():
    df_reg_tid = regression_db_all.loc[regression_db_all.chembl_tid == tid]
    # Series.unique() returns values in order of first appearance, so the labels
    # are sorted before comparing; otherwise a target would only qualify when its
    # rows happen to be ordered 5 -> 7 -> 9.
    # NOTE(review): if the table is not already ordered that way, this can select
    # more targets than the previously recorded run.
    present_classes = sorted(df_reg_tid['potency_class'].unique().tolist())
    if present_classes != required_classes:
        continue
    # Every required class must hold more than `n` compounds
    if all(len(df_reg_tid.loc[df_reg_tid['potency_class'] == cls]) > n for cls in required_classes):
        tid_list.append(tid)
print(tid_list)

['CHEMBL333', 'CHEMBL268', 'CHEMBL280', 'CHEMBL203', 'CHEMBL279', 'CHEMBL2409', 'CHEMBL260', 'CHEMBL286']


In [None]:
# Keep only the rows that belong to the selected targets
regression_tids = tid_list
is_selected_target = regression_db_all['chembl_tid'].isin(regression_tids)
regression_db = regression_db_all.loc[is_selected_target]

# Show the filtered table as the cell output
regression_db

In [8]:
# Root folder for everything this notebook writes
main_folder = 'regression_models_50_50'
# Results are grouped by fingerprint and then by approach below the root folder
result_path = './{}/{}/{}/'.format(main_folder, fingerprint, approach)

In [None]:
# Rebuild the target list from the filtered table (unique target IDs, in order of first appearance)
regression_tids = regression_db['chembl_tid'].unique()

In [10]:
# Train and evaluate every model in `model_list` on every target in
# `regression_tids` over `trial_splits` random 50/50 train/test splits, then
# write the collected tables as CSV files into `result_path`.


def _concat_frames(frames):
    """Concatenate a list of DataFrames in a single call.

    Returns an empty DataFrame for an empty list, because pd.concat raises
    ValueError when it is given nothing to concatenate.
    """
    return pd.concat(frames) if frames else pd.DataFrame()


def _tag_run(frame, trial):
    """Add the 'trial' and 'Approach' columns to `frame` in place and return it."""
    frame["trial"] = trial
    frame["Approach"] = approach
    return frame


def _split_membership(subset, trial, subset_name, size_column):
    """Build the table recording which compounds ended up in one side of a split.

    `subset_name` is stored in the 'dataset' column; `size_column` is the name
    of the column that receives the split size in percent (50).
    """
    frame = pd.DataFrame(list(zip(subset.cid, subset.labels, subset.target)),
                         columns=['Compound ID', 'Potency', 'Target ID'])
    frame['trial'] = trial
    frame['Approach'] = approach
    frame['dataset'] = subset_name
    frame[size_column] = 50
    return frame


def _strip_chembl_prefix(target_id, prefix="CHEMBL"):
    """Remove a leading `prefix` from `target_id` if present.

    str.lstrip("CHEMBL") would strip any leading run of the characters
    C/H/E/M/B/L rather than the literal prefix, so the prefix is cut explicitly.
    """
    return target_id[len(prefix):] if target_id.startswith(prefix) else target_id


# Performance/prediction datasets
dict_storage = defaultdict(pd.DataFrame)
parameter_resume = []

# Per-iteration frames are collected in lists and concatenated once after the
# loops; concatenating inside the loops re-copies all previous rows each time.
performance_train_frames = []
predictions_train_frames = []
performance_test_frames = []
predictions_test_frames = []
dataset_frames = []

for target in tqdm(regression_tids):

    # Directories are derived from `result_path` so that models and result
    # tables always end up under the same root.
    target_path = create_directory(f'{result_path}{target}/')

    print(f'Training on {target}')

    # Select Target Database
    regression_db_tid = regression_db.loc[regression_db.chembl_tid == target]

    # Compound potency and mol smiles
    potency = regression_db_tid.pPot.values
    smiles = regression_db_tid.nonstereo_aromatic_smiles.values

    # Generate ECFP4
    fp_matrix = ECFP4(smiles)

    # Constructing Dataset
    dataset = Dataset(np.array(fp_matrix), np.array(potency))
    dataset.add_instance("target", regression_db_tid.chembl_tid.values)
    dataset.add_instance("smiles", smiles)
    dataset.add_instance("cid", regression_db_tid.chembl_cid.values)
    dataset.add_instance("fingerprint", np.array(fp_matrix))
    dataset.add_instance("potency_classes", regression_db_tid.potency_class.values)

    # Fixed random_state: the same sequence of 50/50 splits is drawn on every run
    data_splitter = ShuffleSplit(n_splits=trial_splits, random_state=42, test_size=0.5)
    for trial, (train_idx, test_idx) in enumerate(data_splitter.split(dataset.features, dataset.target)):

        print(f'Starting Trial {trial}')

        # Training dataset
        training_set = dataset[train_idx]
        # Test dataset
        test_set = dataset[test_idx]

        # Record split membership.
        # NOTE(review): train rows fill 'Training size (%)' and test rows fill
        # 'Test size (%)', so each column is NaN for the other subset after
        # concatenation -- confirm this is intended.
        datasets_train_size_df = _split_membership(training_set, trial, 'Train', "Training size (%)")
        dataset_frames.append(datasets_train_size_df)

        datasets_test_size_df = _split_membership(test_set, trial, 'Test', "Test size (%)")
        dataset_frames.append(datasets_test_size_df)

        # Set seed (one seed per trial)
        set_global_determinism(seed=trial)

        for model in model_list:

            print(f'Training {model}')

            # Create saving directory
            model_fpath = create_directory(f"{result_path}{target}/{model}/", verbose=False)

            ml_model = MLModel(training_set, model, data_type='unbalanced')
            joblib.dump(ml_model.model, os.path.join(model_fpath, f"{model}_{trial}.pkl"))

            # Best model parameters
            opt_parameters_dict = {**{'model': model,
                                      'trial': trial,
                                      'Target ID': target,
                                      }, **ml_model.best_params}
            parameter_resume.append(opt_parameters_dict)

            # Model evaluation on the training set and on the held-out test set
            model_eval_train = ml_evaluation(ml_model, training_set, training_set, model_id=model)
            model_eval_test = ml_evaluation(ml_model, test_set, training_set, model_id=model)

            # Training performance
            performance_train = _tag_run(model_eval_train.pred_performance, trial)
            performance_train_frames.append(performance_train)

            # Training predictions
            predictions_train = _tag_run(model_eval_train.predictions, trial)
            predictions_train_frames.append(predictions_train)

            # Test performance
            performance_test = _tag_run(model_eval_test.pred_performance, trial)
            performance_test_frames.append(performance_test)

            # Test predictions
            predictions_test = _tag_run(model_eval_test.predictions, trial)
            predictions_test_frames.append(predictions_test)

# Assemble the final tables with one concatenation each
parameter_df = pd.DataFrame(parameter_resume)
performance_train_df = _concat_frames(performance_train_frames)
predictions_train_df = _concat_frames(predictions_train_frames)
performance_test_df = _concat_frames(performance_test_frames)
predictions_test_df = _concat_frames(predictions_test_frames)
datasets_final = _concat_frames(dataset_frames)

# Store target IDs as integers ('CHEMBL333' -> 333); skipped when nothing was trained
if not datasets_final.empty:
    datasets_final["Target ID"] = datasets_final["Target ID"].map(_strip_chembl_prefix).astype(int)

# Save results (make sure the output folder exists even if no target was processed)
os.makedirs(result_path, exist_ok=True)
performance_train_df.to_csv(os.path.join(result_path, 'performance_train.csv'))
predictions_train_df.to_csv(os.path.join(result_path, 'predictions_train.csv'))
performance_test_df.to_csv(os.path.join(result_path, 'performance_test.csv'))
predictions_test_df.to_csv(os.path.join(result_path, 'predictions_test.csv'))
parameter_df.to_csv(os.path.join(result_path, 'model_best_parameters.csv'))
datasets_final.to_csv(os.path.join(result_path, 'dataset_training_sizes.csv'))

  0%|          | 0/8 [00:00<?, ?it/s]

Created new directory './pred_models_standart_50_50/ECFP4/Unbalanced/CHEMBL333/'
Training on CHEMBL333
Starting Trial 0
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 1
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 2
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 3
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 4
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 5
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 6
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 7
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 8
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Starting Trial 9
Training 1-NN
Training kNN
Training SVR
Training RFR
Training MR
Created new directory './pred_models_standart_50_50/ECFP4/Unbalanced/CHEMBL26