### BACE dataset

An example of a classification task.

#### 1. Generate descriptors

In [None]:
# Import necessary packages
# -------------------------
import os
import sys
current_path = os.getcwd()
sys.path.append(current_path)
sys.path.append(os.path.join(sys.path[0], ".."))
from spoc.process import generate_descriptors

# Parameters
# ----------
# Task name handed to the descriptor generator.
task_name = "bace"

# URL from which the BACE dataset can be downloaded
url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/bace.csv"

# File locations
# data_file: path of the dataset CSV; feature_file: path of the descriptor set.
# Both are passed to load_data() below.
data_file = "../data/sample_data/bace.csv"
feature_file = "./features/all_descriptor_set--bace.pkl.zip"

# Name of the column that holds the SMILES strings
smiles_col = "mol"

# If test_mode="test", only a small amount of data is used (for testing).
test_mode = "production"

# Descriptor generation
# ---------------------
# The generated descriptors are stored in *.pkl.zip format in ./features/
generate_descriptors.load_data(
    task_name, url, data_file, feature_file, smiles_col, test_mode)

#### 2. Descriptor screening
Evaluate the performance of each descriptor using a random forest.

In [None]:
# Import necessary packages
# -------------------------
import pandas as pd
import numpy as np
import os
import sys
current_path = os.getcwd()
sys.path.append(current_path)
sys.path.append(os.path.join(sys.path[0], ".."))
from spoc.process import generate_dataset
from spoc.model import rf

import importlib
importlib.reload(rf)

# Parameters
# ----------
# Data folder
data_file = "../data/sample_data/bace.csv"
feature_file = "./features/all_descriptor_set--bace.pkl.zip"
output_file = "./output/rf_SF_opt--bace.csv"

# The data column of target y
task_col = "Class"

# The data column of SMILES
smiles_col = "mol"

# Data splitting mode, one of:
# RandomSplitter, ScaffoldSplitter, SingletaskStratifiedSplitter
# The recommended splitting mode can be found in the published paper.
split_mode = "ScaffoldSplitter"

# Fraction of the data used for training
frac_train = 0.9

# Random seeds for data splitting.
# The test is repeated 5 times, once per seed.
rnds = range(0, 500, 100)

# Random seed for model training.
model_rnd = 42

# Load descriptor set & dataset
# -----------------------------
data_df, desc_set_df = generate_dataset.load_dataset(data_file, feature_file)
print(f"data_df.shape: {data_df.shape}")
print(f"desc_set_df.shape: {desc_set_df.shape}")

# Model training
# Screening the performance of all descriptors
# --------------------------------------------
# Feature list
feature_types = desc_set_df.columns
print(f"feature_types: {feature_types}")

# Random forest
# Hyper parameters of classification task.
# NOTE(review): scikit-learn >= 1.3 no longer accepts max_features='auto';
# if rf.rf_cls forwards this value unchanged, switch to 'sqrt' (the
# classifier equivalent) - confirm against rf.rf_cls.
n_estimators, criterion, max_features, max_depth = 100, 'gini', 'auto', None

result = []
for i, feat_type in enumerate(feature_types):

    print(f"{'-'*30}")
    print(f"{i}th task: {feat_type}")

    roc_auc_test_list = []
    for rnd in rnds:

        # Build the train/test split with the configured split mode and the
        # seed of this repeat. Previously the mode and seed were hard-coded,
        # so every repeat evaluated exactly the same split.
        # NOTE(review): if the chosen splitter ignores its seed, the repeats
        # still coincide because model_rnd is fixed - verify.
        X_train, X_test, y_train, y_test = generate_dataset.single_descriptor(
            data_df, desc_set_df, smiles_col, task_col, feat_type,
            split_mode=split_mode, frac_train=frac_train, rnd=rnd)

        # Model training
        criteria = rf.rf_cls(
            X_train, X_test, y_train, y_test,
            n_estimators, criterion, max_features, max_depth, rnd=model_rnd)

        roc_auc_test_list.append(criteria['roc_auc_test'])

    # Mean and standard deviation of the test ROC-AUC over all repeats
    roc_auc_test_ave = round(np.average(roc_auc_test_list), 3)
    roc_auc_test_std = round(np.std(roc_auc_test_list), 3)

    result.append([feat_type, n_estimators, criterion, max_features, max_depth, str(
        roc_auc_test_list), roc_auc_test_ave, roc_auc_test_std])

df = pd.DataFrame(result, columns=['feat_type', 'n_estimators', 'criterion', 'max_features',
                  'max_depth', 'roc_auc_test_list', 'roc_auc_test_ave', 'roc_auc_test_std'])

# Save the results (create the output folder first; to_csv does not create it)
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(output_file)

#### 3. SPOC screening
Evaluate the performance of various S+POC combinations using a random forest.

In [None]:
# Import necessary packages
# -------------------------
import pandas as pd
import numpy as np
import os
import sys
current_path = os.getcwd()
sys.path.append(current_path)
sys.path.append(os.path.join(sys.path[0], ".."))
from spoc.process import generate_dataset
from spoc.model import rf

import importlib
importlib.reload(generate_dataset)
importlib.reload(rf)

# Parameters
# ----------
# Data folder
data_file = "../data/sample_data/bace.csv"
feature_file = "./features/all_descriptor_set--bace.pkl.zip"
rf_SF_opt_file = "./output/rf_SF_opt--bace.csv"
output_file = "./output/rf_SPOC_opt--bace.csv"

# target y value
task_col = "Class"

# SMILES column
smiles_col = "mol"

# Data splitting mode
split_mode = "ScaffoldSplitter"

# Fraction of the data used for training
frac_train = 0.9

# The criteria used for descriptor evaluation
# For classification, roc_auc serves as the criteria
criterion_col = "roc_auc_test_ave"

# Performance sort order, bigger is better for roc_auc, so ascending=False
ascending = False

# Random seeds for data splitting (one per repeated test)
rnds = range(0, 500, 100)

# Model random seed
model_rnd = 42

# Load descriptor set & dataset
# -----------------------------
data_df, desc_set_df = generate_dataset.load_dataset(data_file, feature_file)
print(f"data_df.shape: {data_df.shape}")
print(f"desc_set_df.shape: {desc_set_df.shape}")

# 20 best S+POC combinations
# --------------------------
# The POC descriptors are combined with every selected S descriptor, so they
# are excluded from the S candidates.
feature_type_POCs = ['Mordred', 'RDKitDescriptors']

# Read the single-descriptor screening results from disk instead of relying on
# a `df` variable left over from another cell.
df = pd.read_csv(rf_SF_opt_file)
df = df[~df['feat_type'].isin(feature_type_POCs)]

# ROC_AUC: bigger is better
df = df.sort_values(by=[criterion_col], axis=0, ascending=ascending)

# Choose the best 20 fingerprints
df = df.iloc[:20, :]
feature_type_Ss = df['feat_type'].values
print(f"20 best feature_Ss: {feature_type_Ss}")
print(f"feature_POCs: {feature_type_POCs}")

# Model training
# Screening the performance of all S+POC combinations
# ---------------------------------------------------
# Feature list
feature_types = desc_set_df.columns
print(f"feature_types: {feature_types}")

# Random forest
# Hyper parameters of classification
# NOTE(review): scikit-learn >= 1.3 no longer accepts max_features='auto';
# if rf.rf_cls forwards this value unchanged, switch to 'sqrt' - confirm.
n_estimators, criterion, max_features, max_depth = 100, 'gini', 'auto', None

result = []
for i, feat_type_S in enumerate(feature_type_Ss):
    for j, feat_type_POC in enumerate(feature_type_POCs):
        print(f"{'-'*30}")
        # Report the combination actually being evaluated.
        print(f"{i}-{j}th task: {feat_type_S} + {feat_type_POC}")

        roc_auc_test_list = []
        for rnd in rnds:

            # Build the train/test split with the configured split mode and
            # the seed of this repeat (previously both were hard-coded, which
            # made all repeats identical).
            # NOTE(review): if the chosen splitter ignores its seed, the
            # repeats still coincide because model_rnd is fixed - verify.
            X_train, X_test, y_train, y_test = generate_dataset.SPOC_descriptor(
                data_df, desc_set_df, smiles_col, task_col, feat_type_S, feat_type_POC,
                split_mode=split_mode, frac_train=frac_train, rnd=rnd)

            # Model training
            criteria = rf.rf_cls(
                X_train, X_test, y_train, y_test,
                n_estimators, criterion, max_features, max_depth, rnd=model_rnd)
            roc_auc_test_list.append(criteria["roc_auc_test"])

        # Mean and standard deviation of the test ROC-AUC over all repeats
        roc_auc_test_ave = round(np.average(roc_auc_test_list), 3)
        roc_auc_test_std = round(np.std(roc_auc_test_list), 3)

        result.append([feat_type_S, feat_type_POC, n_estimators, criterion, max_features,
                      max_depth, str(roc_auc_test_list), roc_auc_test_ave, roc_auc_test_std])

df = pd.DataFrame(result, columns=['feat_type_S', 'feat_type_POC', 'n_estimators', 'criterion',
                  'max_features', 'max_depth', 'roc_auc_test_list', 'roc_auc_test_ave', 'roc_auc_test_std'])

# Save the results
df.to_csv(output_file)


#### 4. Bayesian optimization using LightGBM

In [None]:
# Import necessary packages
# -------------------------
import pandas as pd
import numpy as np
import os
import sys
current_path = os.getcwd()
sys.path.append(current_path)
sys.path.append(os.path.join(sys.path[0], ".."))
from spoc.model import rf, lightgbm
from spoc.process import generate_dataset

import importlib
importlib.reload(generate_dataset)
importlib.reload(lightgbm)

# Parameters
# ----------
# Data folder
data_file = "../data/sample_data/bace.csv"
feature_file = "./features/all_descriptor_set--bace.pkl.zip"
rf_SPOC_opt_file = "./output/rf_SPOC_opt--bace.csv"
output_file = "./output/lgb_SPOC_opt--bace.csv"

# target y value
task_col = "Class"

# SMILES column
smiles_col = "mol"

# Data splitting mode
split_mode = "ScaffoldSplitter"

# The criteria used for descriptor evaluation
# In this case (classification), roc_auc serves as the criteria
criterion_col = "roc_auc_test_ave"

# Performance sort order, bigger is better for ROC_AUC, so ascending=False
# (re-assigned below from lightgbm.feval_value)
ascending = False

# Random seeds, one per repeated optimization run
rnds = range(0, 500, 100)

# Model random seed
model_rnd = 42

# Bayes Optimization
# Initial iteration, more is better
init_iter = 3

# Optimization times, more is better
n_iters = 5

# Task type: classification or regression
task_type = "binary_classification"

# LightGBM score function and the matching sort order
feval, ascending = lightgbm.feval_value(criterion_col)

# Load descriptor set & dataset
# -----------------------------
data_df, desc_set_df = generate_dataset.load_dataset(data_file, feature_file)
print(f"data_df.shape: {data_df.shape}")
print(f"desc_set_df.shape: {desc_set_df.shape}")

# Best S+POC combination
# --------------------------
# Take the top-ranked row of the random-forest S+POC screening results.
df = pd.read_csv(rf_SPOC_opt_file)
df = df.sort_values(by=[criterion_col], axis=0, ascending=ascending)
feat_type_S = df['feat_type_S'].values[0]
feat_type_POC = df['feat_type_POC'].values[0]
print(f"Best S+POCs combination: {feat_type_S} + {feat_type_POC}")

# Start hyper opt:
# ------------------------------------------
roc_auc_train_list, roc_auc_test_list = [], []
result = []
for seed in rnds:
    # Load data
    # NOTE(review): the split seed is fixed at 42 for every repeat; only
    # `seed` (optimization / training) varies - confirm this is intended.
    X_train, X_test, y_train, y_test = generate_dataset.SPOC_descriptor(
        data_df, desc_set_df, smiles_col, task_col, feat_type_S, feat_type_POC,
        split_mode, frac_train=0.9, rnd=42)

    print(f"X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}")

    # Optimization
    # Use the configured task_type (as lgb_train does below) rather than a
    # second hard-coded copy of the task name, so the two calls cannot diverge.
    best_params = lightgbm.bayesopt_lgb(
        X_train, y_train, init_iter, n_iters, feval, criterion_col,
        pds='default', random_state=model_rnd, seed=seed, task=task_type)
    print(best_params)

    # Train with the optimized parameters and collect the ROC-AUC scores
    criteria = lightgbm.lgb_train(
        X_train, X_test, y_train, y_test, best_params, feval, seed=seed, task=task_type)
    roc_auc_train = criteria["roc_auc_train"]
    roc_auc_test = criteria["roc_auc_test"]
    roc_auc_train_list.append(roc_auc_train)
    roc_auc_test_list.append(roc_auc_test)

    result.append([seed, roc_auc_train, roc_auc_test, best_params])

# Summary: mean and standard deviation over all seeds
roc_auc_train_ave = np.average(roc_auc_train_list)
roc_auc_test_ave = np.average(roc_auc_test_list)
roc_auc_train_std = np.std(roc_auc_train_list)
roc_auc_test_std = np.std(roc_auc_test_list)

result.append(["ave", roc_auc_train_ave, roc_auc_test_ave, "--"])
result.append(["std", roc_auc_train_std, roc_auc_test_std, "--"])

# Save results
# --------------
# Rows are one per seed, followed by the "ave" and "std" summary rows.
df = pd.DataFrame(result, columns=['entry', 'roc_auc_train_ave', 'roc_auc_test_ave', 'best-params'])
df.to_csv(output_file)


#### 5. Bayesian optimization using XGBoost

In [None]:
# Import necessary packages
# -------------------------
import pandas as pd
import numpy as np
import os
import sys
current_path = os.getcwd()
sys.path.append(current_path)
sys.path.append(os.path.join(sys.path[0], ".."))
from spoc.model import xgboost
from spoc.process import generate_dataset

import importlib
importlib.reload(xgboost)
importlib.reload(generate_dataset)

# Parameters
# ----------
# Data folder
data_file = "../data/sample_data/bace.csv"
feature_file = "./features/all_descriptor_set--bace.pkl.zip"
rf_SPOC_opt_file = "./output/rf_SPOC_opt--bace.csv"
output_file = "./output/xgb_SPOC_opt--bace.csv"

# target y value
task_col = "Class"

# SMILES column
smiles_col = "mol"

# Data splitting mode
split_mode = "ScaffoldSplitter"

# The criteria used for descriptor evaluation
# In this case (classification), roc_auc serves as the criteria
criterion_col = "roc_auc_test_ave"

# Performance sort order, bigger is better for ROC_AUC, so ascending=False
# (re-assigned further below from xgboost.feval_value)
ascending = False

# Random seeds, one per repeated optimization run
rnds = range(0, 500, 100)

# Model random seed
model_rnd = 42

# Bayes Optimization
# Initial iteration, more is better
# NOTE(review): a single initial iteration and a single optimization step
# look like quick-test settings - confirm before using the results.
init_iter = 1

# Optimization times, more is better
n_iters = 1

# Task type: classification or regression
task_type = "binary_classification"

# Load descriptor set & dataset
# -----------------------------
data_df, desc_set_df = generate_dataset.load_dataset(data_file, feature_file)
# NOTE(review): only the first 600 rows of data_df are kept, while desc_set_df
# is not truncated - confirm this is intended and not a debugging leftover.
data_df = data_df[:600]
print(f"data_df.shape: {data_df.shape}")
print(f"desc_set_df.shape: {desc_set_df.shape}")

# Best S+POC combination
# --------------------------
# Take the top-ranked row of the random-forest S+POC screening results.
df = pd.read_csv(rf_SPOC_opt_file)
df = df.sort_values(by=[criterion_col], axis=0, ascending=ascending)
feat_type_S = df['feat_type_S'].values[0]
feat_type_POC = df['feat_type_POC'].values[0]
print(f"Best S+POCs combination: {feat_type_S} + {feat_type_POC}")

# XGBoost score function and the matching sort order
feval, ascending = xgboost.feval_value(criterion_col)

# Start hyper opt:
#------------------------------------------
roc_auc_train_list, roc_auc_test_list = [], []
result = []
for seed in rnds:
    # Load data
    # NOTE(review): the split seed is fixed at 42 for every repeat; only
    # `seed` (passed to the optimizer) varies - confirm this is intended.
    X_train, X_test, y_train, y_test = generate_dataset.SPOC_descriptor(data_df, desc_set_df, smiles_col, task_col, feat_type_S, feat_type_POC, split_mode, frac_train=0.9, rnd=42)

    print(f"X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}")

    # Optimization
    best_params = xgboost.bayesion_opt_xgb(X_train, y_train, task_type, init_iter, n_iters, feval, criterion_col, pds='default', random_state=model_rnd, seed=seed)
    print(best_params)

    # Train with the optimized parameters and collect the ROC-AUC scores
    criteria = xgboost.xgb_train(X_train, X_test, y_train, y_test, task_type, best_params, feval)
    roc_auc_train =  criteria["roc_auc_train"]
    roc_auc_test =  criteria["roc_auc_test"]
    roc_auc_train_list.append(roc_auc_train)
    roc_auc_test_list.append(roc_auc_test)

    temp_result = [seed, roc_auc_train, roc_auc_test, best_params]
    result.append(temp_result)

# Summary: mean and standard deviation over all seeds
roc_auc_train_ave = np.average(roc_auc_train_list)
roc_auc_test_ave = np.average(roc_auc_test_list)
roc_auc_train_std = np.std(roc_auc_train_list)
roc_auc_test_std = np.std(roc_auc_test_list)

result.append(["ave", roc_auc_train_ave, roc_auc_test_ave, "--"])
result.append(["std", roc_auc_train_std, roc_auc_test_std, "--"])

# Save results
# ------------
# Rows are one per seed, followed by the "ave" and "std" summary rows.
df = pd.DataFrame(result, columns=['entry', 'roc_auc_train_ave', 'roc_auc_test_ave', 'best-params'])
df.to_csv(output_file)