# Index

## 1. Environment Preparation
#### &nbsp;&nbsp;&nbsp;&nbsp;1.1. Import Libraries
#### &nbsp;&nbsp;&nbsp;&nbsp;1.2. Define Error and Loss Function
#### &nbsp;&nbsp;&nbsp;&nbsp;1.3. Define Data Preprocessing Function
#### &nbsp;&nbsp;&nbsp;&nbsp;1.4. Define General Algorithms and able-to-be-looped Objects

## 2. Data Preparation
#### &nbsp;&nbsp;&nbsp;&nbsp;2.1. Import Data for Universal Usage
#### &nbsp;&nbsp;&nbsp;&nbsp;2.2. Screening for Suitable Data Preprocessing Techniques

## 3. Optimization of Hyperparameters
#### &nbsp;&nbsp;&nbsp;&nbsp;3.1. Find the Best n_estimators/n_neighbors for Preliminary Search Space Reduction
#### &nbsp;&nbsp;&nbsp;&nbsp;3.2. Use GridSearchCV to Look Up Hyperparameters except NN and MLP
#### &nbsp;&nbsp;&nbsp;&nbsp;3.3. (Neural Network Intelligence is Implemented in Another File)
## 4. 10-Fold CV

In [None]:
### Separate Line ###

# 1. Environment Preparation
## 1.1 Import Libraries

In [None]:
# sklearn
from sklearn.preprocessing import normalize, power_transform, binarize, maxabs_scale, minmax_scale, quantile_transform
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn import ensemble
from sklearn import svm
from sklearn import neighbors
from sklearn import neural_network

# Tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import TensorBoard

# XGBoost
from xgboost import XGBRegressor

# Matplotlib & Seaborn
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.lines as mlines


# Data structure and Math
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import gaussian_kde
import math

## 1.2 Define Error and Loss Function

In [None]:
# Root-mean-square error between two equally sized collections of values
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments are converted to float NumPy arrays before subtracting, so
    plain lists, tuples, NumPy arrays and pandas Series are all accepted and
    are always compared element by element in positional order (two Series
    are therefore never re-aligned by index label).

    Parameters
    ----------
    predictions, targets : array-like of numbers
        The two value collections to compare.

    Returns
    -------
    numpy.float64
        sqrt(mean((predictions - targets) ** 2)).
    """
    residuals = np.asarray(predictions, dtype=float) - np.asarray(targets, dtype=float)
    return np.sqrt(np.mean(residuals ** 2))


# RMSE written with TensorFlow ops
def rmse_loss(y_test, y_pred):
    """Return sqrt(mean((y_test - y_pred) ** 2)) computed with TensorFlow ops."""
    squared_error = tf.square(y_test - y_pred)
    return tf.sqrt(tf.reduce_mean(squared_error))

## 1.3. Define Data Preprocessing Function

In [None]:
# Names of the preprocessing techniques that prepare_data() understands.
data_regularization = ['power_transform', 'quantile_transform',
                       'maxabs_scale', 'minmax_scale',
                       'binarize', 'l2']


def prepare_data(X, prepro):
    """Apply one scikit-learn preprocessing function to the feature matrix.

    Parameters
    ----------
    X : array-like
        Feature matrix, handed unchanged to the selected scikit-learn function.
    prepro : str
        Technique name: 'l2', 'power_transform', 'binarize', 'maxabs_scale',
        'minmax_scale' or 'quantile_transform'.

    Returns
    -------
    The transformed data exactly as returned by the scikit-learn function.

    Raises
    ------
    ValueError
        If ``prepro`` is not one of the names above.  Without this check a
        typo would make the function return ``None`` and the failure would
        only surface later, far away from its cause.
    """
    if prepro == 'l2':
        return normalize(X, norm='l2')
    if prepro == 'power_transform':
        return power_transform(X)
    if prepro == 'binarize':
        return binarize(X)
    if prepro == 'maxabs_scale':
        return maxabs_scale(X)
    if prepro == 'minmax_scale':
        return minmax_scale(X)
    if prepro == 'quantile_transform':
        return quantile_transform(X)
    raise ValueError(
        "Unknown preprocessing technique {!r}; expected one of 'l2', "
        "'power_transform', 'binarize', 'maxabs_scale', 'minmax_scale', "
        "'quantile_transform'".format(prepro))

## 1.4. Define General Algorithms and able-to-be-looped Objects

In [None]:
# Default-configured regressors, keyed by the short name used in the result
# tables.  Every entry exposes fit()/predict(), so the same evaluation code can
# loop over all of them.
regressor_objects = {
    # Ensemble regressors
    'RF': ensemble.RandomForestRegressor(n_jobs=-1),
    'ET': ensemble.ExtraTreesRegressor(n_jobs=-1),
    'Bagging': ensemble.BaggingRegressor(n_jobs=-1),
    # Support-vector and nearest-neighbour regressors
    'SVR': svm.SVR(),
    'kNN': neighbors.KNeighborsRegressor(n_jobs=-1),
    # Gradient boosting (XGBoost)
    'XGB': XGBRegressor(n_jobs=-1),
    # Plain multi-layer perceptron with three hidden layers of 256 units
    'MLP': neural_network.MLPRegressor(hidden_layer_sizes=(256, 256, 256)),
}

# Generic fit-and-evaluate routine shared by every fit()/predict() regressor

def ml_model(splited_data, regressor_object):
    """Fit a regressor on one train/test split and score both partitions.

    Parameters
    ----------
    splited_data : sequence
        Indexed as [X_train, X_test, y_train, y_test].
    regressor_object : object
        Estimator providing ``fit(X, y)`` and ``predict(X)``; it is fitted in
        place on the training partition.

    Returns
    -------
    list
        [R2_train, RMSE_train, R2_test, RMSE_test, test_prediction], where each
        R2 is the squared Pearson correlation between true and predicted
        values and each RMSE comes from rmse().
    """
    X_train, X_test, y_train, y_test = (splited_data[i] for i in range(4))
    regressor_object.fit(X_train, y_train)

    def _evaluate(features, truth):
        # Predict once, then derive both metrics from the same prediction
        predicted = regressor_object.predict(features)
        pearson_r = pearsonr(truth, predicted)[0]
        return np.square(pearson_r), rmse(truth, predicted), predicted

    R2_train, RMSE_train, _ = _evaluate(X_train, y_train)
    R2_test, RMSE_test, test_prediction = _evaluate(X_test, y_test)

    return [R2_train, RMSE_train, R2_test, RMSE_test, test_prediction]


# 2. Data Preparation
## 2.1. Import Data for Universal Usage

In [None]:
# Load the data set shared by the later cells.  'input_csv_path' looks like a
# placeholder to be replaced with the real CSV location.  Later cells read the
# 'IE' column as the regression target and the columns from position 5 onward
# as the feature matrix.
data = pd.read_csv(r'input_csv_path')

## 2.2 Screening for the Best Data Preparation Technique

In [None]:
# Screen every preprocessing technique against every default regressor with
# 10-fold cross-validation and keep one summary row per combination.
# NOTE: the 'Pearson R' / 'Sigma R' columns hold the mean and standard
# deviation of the *squared* Pearson correlation returned by ml_model().
columns = ['Regressor', 'Pearson R', 'Sigma R', 'RMSE', 'Sigma RMSE', 'Fold', 'Preprocessing Technique']

# The target and the splitter do not depend on the technique, so build them once
y = data['IE']
kf_num = 10
kf = KFold(n_splits=kf_num)

# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: DataFrame.append was removed in pandas 2.0.
metric_rows = []
for technique in data_regularization:
    X = prepare_data(data.iloc[:, 5:], prepro=technique)

    print('Now Examing {}'.format(technique))
    for regressor, regressor_object in regressor_objects.items():
        r2_list = []
        rmse_list = []
        fold_number = 0
        for fold_number, (train_index, test_index) in enumerate(kf.split(X), start=1):
            # KFold yields positional indices, hence .iloc on the target Series
            splited_data = [X[train_index], X[test_index],
                            y.iloc[train_index], y.iloc[test_index]]
            score = ml_model(splited_data, regressor_object)
            r2_list.append(score[2])    # test-set R2
            rmse_list.append(score[3])  # test-set RMSE

        metric_rows.append([regressor,
                            np.mean(r2_list), np.std(r2_list),
                            np.mean(rmse_list), np.std(rmse_list),
                            fold_number, technique])

metrics = pd.DataFrame(metric_rows, columns=columns)

In [None]:
# Plot the mean test RMSE of every regressor, one line per preprocessing technique
plt.figure(figsize=(8, 6))
colors = ['b', 'y', 'g', 'r', 'cyan', 'brown']
for technique, color in zip(data_regularization, colors):
    technique_rows = metrics[metrics['Preprocessing Technique'] == technique]
    # The colour is given only through the keyword.  The earlier 'm.-' format
    # string also named a colour, which was redundant with the keyword.
    plt.plot(technique_rows['Regressor'], technique_rows['RMSE'], '.-', color=color)

plt.ylim([0.4, 0.9])
plt.legend(data_regularization)
plt.tight_layout()
plt.savefig(r'output_png_path', dpi=328)
plt.show()
# Bare expression: shows the summary table when run as a notebook cell
metrics

# 3. Optimization of Hyperparameters


#### Define Plotting Function

In [None]:
def plot_bar(metric, metric_name, n_param, n_param_name, algorithms_name):
    """Show a bar chart of a metric against the values of one hyperparameter.

    ``n_param`` supplies the bar positions and ``metric`` the bar heights.
    The x-axis is padded by one unit beyond the first and last entries of
    ``n_param``; the two name arguments label the axes and, together with
    ``algorithms_name``, the title.
    """
    heading = 'The {} as the Increasing of {} for {}'.format(
        metric_name, n_param_name, algorithms_name)

    plt.bar(x=n_param, height=metric, width=0.7, color='b')
    plt.ylabel(metric_name)
    plt.xlabel(n_param_name)

    left_edge = n_param[0] - 1
    right_edge = n_param[-1] + 1
    plt.xlim([left_edge, right_edge])

    plt.title(heading)
    plt.show()

## 3.1. N_estimators / N_neighbors Selections for Full Dataset

In [None]:
# Sweep n_estimators (tree ensembles, XGB) or n_neighbors (kNN) with 10-fold
# cross-validation and report the value giving the lowest mean test RMSE.
X = prepare_data(data.iloc[:, 5:], prepro='quantile_transform')
y = data['IE']
kf_num = 10
kf = KFold(n_splits=kf_num)
cv_df = pd.DataFrame(columns=['Regressor', 'R^2', 'RMSE', 'Fold Number'])

n_estrs = np.arange(1, 101, 1).tolist()
n_neibs = np.arange(1, 21, 1).tolist()

estimators_type_algorithms = ['RF', 'ET', 'Bagging', 'XGB', 'kNN']

# One factory per algorithm: given the swept value, build a fresh regressor
regressor_factories = {
    'RF': lambda n: ensemble.RandomForestRegressor(n_estimators=n, n_jobs=-1),
    'ET': lambda n: ensemble.ExtraTreesRegressor(n_estimators=n, n_jobs=-1),
    'Bagging': lambda n: ensemble.BaggingRegressor(n_estimators=n, n_jobs=-1),
    'kNN': lambda n: neighbors.KNeighborsRegressor(n_neighbors=n, n_jobs=-1),
    'XGB': lambda n: XGBRegressor(n_estimators=n, n_jobs=-1),
}

for regressor in estimators_type_algorithms:
    # Choose the grid per algorithm instead of rebinding n_estrs, so the
    # result no longer depends on 'kNN' being the last entry of the list.
    if regressor == 'kNN':
        param_values, x_lable = n_neibs, 'n_neighbors'
    else:
        param_values, x_lable = n_estrs, 'n_estimators'
    build_regressor = regressor_factories[regressor]

    rmse_list = []
    r2_list = []
    for n_est in param_values:
        fold_r2 = []
        fold_rmse = []
        for train_index, test_index in kf.split(X):
            # KFold yields positional indices, hence .iloc on the target Series
            splited_data = [X[train_index], X[test_index],
                            y.iloc[train_index], y.iloc[test_index]]
            # Kept in a separate name so the `metrics` summary table built by
            # the screening cell is not overwritten.
            fold_scores = ml_model(splited_data, build_regressor(n_est))
            fold_r2.append(fold_scores[2])
            fold_rmse.append(fold_scores[3])
        r2_list.append(sum(fold_r2) / len(fold_r2))
        rmse_list.append(sum(fold_rmse) / len(fold_rmse))

    # Report the R2 that belongs to the best-RMSE setting; the smallest R2 of
    # the whole sweep generally belongs to a different setting.
    best_index = rmse_list.index(min(rmse_list))
    n_est_rmse_min = param_values[best_index]
    rmse_min = rmse_list[best_index]
    r2_at_best = r2_list[best_index]
    print('The Best {} for {} is {} with R2={} and RMSE={}.'.format(x_lable, regressor, n_est_rmse_min, r2_at_best, rmse_min))
    plot_bar(r2_list, 'R2', param_values, x_lable, regressor)
    plot_bar(rmse_list, 'RMSE', param_values, x_lable, regressor)

## 3.2. Use GridSearchCV to Look Up Hyperparameters except NN and MLP

In [None]:
# Regressors with n_estimators / n_neighbors fixed to specific values; these
# are the base estimators handed to the grid searches below.  MLP is left out
# on purpose (its entry was commented out in this dictionary).
updated_regressor_objects = {
    # Ensemble regressors
    'RF': ensemble.RandomForestRegressor(n_jobs=-1, n_estimators=80),
    'ET': ensemble.ExtraTreesRegressor(n_jobs=-1, n_estimators=90),
    'Bagging': ensemble.BaggingRegressor(n_jobs=-1, n_estimators=95),
    # Support-vector and nearest-neighbour regressors
    'SVR': svm.SVR(),
    'kNN': neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=8),
    # Gradient boosting (XGBoost)
    'XGB': XGBRegressor(n_jobs=-1, n_estimators=100),
}

# Target first, then the quantile-transformed feature matrix for every grid search
y = data['IE']
X = prepare_data(data.iloc[:, 5:], prepro='quantile_transform')

In [None]:
# Grid search over the split/leaf/feature settings of the random forest.
# max_features=1.0 (use every feature) stands in for the former 'auto'
# option, which meant the same thing for forest regressors and is rejected by
# scikit-learn >= 1.3.
RF_param = {'min_samples_split': [2, 3, 4, 5],
            'min_samples_leaf': [1, 2, 3, 4],
            'max_features': [1.0, 'sqrt', 'log2'],
            }

# cv=10 -> 10-fold cross-validation; no scoring argument is given, so the
# estimator's own score method is used.
RF_grid = GridSearchCV(updated_regressor_objects['RF'],
                       param_grid=RF_param,
                       cv=10)
RF_grid.fit(X, y)

print('Best Parameter of RF is ', RF_grid.best_params_)
print('Best Score of RF is ', RF_grid.best_score_)

In [None]:
# Grid search over the split/leaf/feature settings of the extra-trees model.
# max_features=1.0 (use every feature) stands in for the former 'auto'
# option, which meant the same thing for forest regressors and is rejected by
# scikit-learn >= 1.3.
ET_param = {'min_samples_split': [2, 3, 4, 5],
            'min_samples_leaf': [1, 2, 3, 4],
            'max_features': [1.0, 'sqrt', 'log2'],
            }

# cv=10 -> 10-fold cross-validation; no scoring argument is given, so the
# estimator's own score method is used.
ET_grid = GridSearchCV(updated_regressor_objects['ET'],
                       param_grid=ET_param,
                       cv=10)
ET_grid.fit(X, y)

print('Best Parameter of ET is ', ET_grid.best_params_)
print('Best Score of ET is ', ET_grid.best_score_)

In [None]:
# Candidate sample / feature fractions drawn for each bagged estimator
Bagging_param = dict(
    max_samples=[0.8, 0.9, 1.0],
    max_features=[0.8, 0.9, 1.0],
)

# Exhaustive search over the grid with 10-fold cross-validation
Bagging_grid = GridSearchCV(
    estimator=updated_regressor_objects['Bagging'],
    param_grid=Bagging_param,
    cv=10,
)
Bagging_grid.fit(X, y)

print('Best Parameter of Bagging is ', Bagging_grid.best_params_)
print('Best Score of Bagging is ', Bagging_grid.best_score_)


In [None]:
# Candidate kernel-coefficient modes, regularisation strengths and epsilon values
SVR_param = dict(
    gamma=['scale', 'auto'],
    C=[18, 20, 22],
    epsilon=[0.01, 0.05],
)

# Exhaustive search over the grid with 10-fold cross-validation
SVR_grid = GridSearchCV(
    estimator=updated_regressor_objects['SVR'],
    param_grid=SVR_param,
    cv=10,
)
SVR_grid.fit(X, y)

print('Best Parameter of SVR is ', SVR_grid.best_params_)
print('Best Score of SVR is ', SVR_grid.best_score_)

In [None]:
# Candidate weighting schemes, neighbour-search algorithms and Minkowski powers
kNN_param = dict(
    weights=['uniform', 'distance'],
    algorithm=['ball_tree', 'kd_tree', 'brute'],
    p=[1, 2, 3, 4],
)

# Exhaustive search over the grid with 10-fold cross-validation
kNN_grid = GridSearchCV(
    estimator=updated_regressor_objects['kNN'],
    param_grid=kNN_param,
    cv=10,
)
kNN_grid.fit(X, y)

print('Best Parameter of kNN is ', kNN_grid.best_params_)
print('Best Score of kNN is ', kNN_grid.best_score_)

In [None]:
# Only gamma is searched.  Further candidates are kept here as disabled notes
# (the disabled keys originally carried a trailing space, which would have to
# be removed before enabling them):
#   'learning_rate': [0.01, 0.1, 0.3]
#   'max_depth': [3, 6, 9]
#   'min_child_weight': [0, 1, 10]
XGB_param = dict(
    gamma=[0, 1, 3, 5, 7, 9],
)

# Exhaustive search over the grid with 10-fold cross-validation
XGB_grid = GridSearchCV(
    estimator=updated_regressor_objects['XGB'],
    param_grid=XGB_param,
    cv=10,
)
XGB_grid.fit(X, y)

print('Best Parameter of XGB is ', XGB_grid.best_params_)
print('Best Score of XGB is ', XGB_grid.best_score_)

## 3.3. (Neural Network Intelligence is Implemented in Another File)

## 4. 10-Fold CV
#### Define Plotting Function

In [None]:
def plot_scatter(predict, experiment, regressor, line_split, property, outdir):
    """Draw, save and show a density-coloured predicted-vs-experimental scatter.

    Parameters
    ----------
    predict, experiment : sequence of numbers
        Predicted values (y-axis) and experimental values (x-axis).
    regressor : str
        Model name; used in the title and as the PNG file name.
    line_split : number
        Offset of the two red guide lines drawn parallel to the identity line.
    property : str
        Name of the plotted quantity, used in the title and axis labels.
    outdir : str
        Directory prefix; the figure is written to ``outdir/<regressor>.png``.
    """
    # Colour every point by the Gaussian kernel density estimated at that point
    stacked = np.vstack([experiment, predict])
    density = gaussian_kde(stacked)(stacked)

    # Integer axis bounds enclosing every predicted and experimental value
    all_values = [*predict, *experiment]
    lower = math.floor(min(all_values))
    upper = math.ceil(max(all_values))

    figure = plt.figure(figsize=(12, 10))
    axes = figure.add_subplot()
    plt.scatter(experiment, predict, s=20, c=density, cmap='Spectral')

    # Identity line plus two parallels shifted by +line_split and -line_split
    guide_segments = (
        ([lower, upper], [lower, upper]),
        ([lower, upper - line_split], [lower + line_split, upper]),
        ([lower + line_split, upper], [lower, upper - line_split]),
    )
    for x_ends, y_ends in guide_segments:
        axes.add_line(mlines.Line2D(x_ends, y_ends, color='red'))

    plt.title("Experimental by Predicted {} using ".format(property) + regressor + '\nin 10 Fold Validation')
    plt.xlabel("Experimental {}".format(property))
    plt.ylabel("Predicted {}".format(property))
    plt.xlim(lower, upper)
    plt.ylim(lower, upper)
    plt.colorbar()
    plt.tight_layout()

    outdir = outdir + '/{}.png'.format(regressor)
    plt.savefig(outdir, dpi=328)
    plt.show()




#### Define Algorithms

In [None]:
# Order in which the models are cross-validated.  'NN' has no entry in
# regressor_objects: the cross-validation loop builds it through nn() instead.
algorithms = ['RF', 'ET', 'Bagging', 'SVR', 'kNN', 'XGB', 'MLP', 'NN']

# Regressors with explicitly chosen hyperparameters, keyed by the short name
# used in the result tables.  Every entry exposes fit()/predict().
regressor_objects = {
                     # Ensemble regressors
                     # max_features=1.0 (use every feature) stands in for the
                     # former 'auto' option, which meant the same thing for
                     # forest regressors and is rejected by scikit-learn >= 1.3.
                     'RF': ensemble.RandomForestRegressor(n_estimators=80, n_jobs=-1,
                                                          max_features=1.0,
                                                          min_samples_leaf=1,
                                                          min_samples_split=2),

                     'ET': ensemble.ExtraTreesRegressor(n_estimators=90, n_jobs=-1,
                                                        max_features=1.0,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2),

                     # NOTE(review): the grid-search cell used n_estimators=95
                     # for Bagging; confirm that 100 here is intentional.
                     'Bagging': ensemble.BaggingRegressor(n_estimators=100, n_jobs=-1,
                                                          max_features=0.9,
                                                          max_samples=1.0),
                     # SVR and k-NN
                     'SVR': svm.SVR(gamma='scale',
                                    C=20,
                                    epsilon=0.01),

                     'kNN': neighbors.KNeighborsRegressor(n_neighbors=8, n_jobs=-1,
                                                          algorithm='ball_tree',
                                                          p=3,
                                                          weights='distance'),

                     # XGB
                     'XGB': XGBRegressor(n_estimators=100, n_jobs=-1),

                     # simple NN
                     'MLP': neural_network.MLPRegressor((256, 256, 256))
                     }


# Keras feed-forward regressor
def nn(input_shape):
    """Build and compile a three-hidden-layer dense regression network.

    Each hidden stage is Dense(256, relu) -> BatchNormalization ->
    Dropout(0.24); the output is a single linear unit.  ``input_shape`` is
    handed to the first Dense layer.  The model is compiled with the 'mse'
    loss, an Adam optimizer created with 0.0001, and 'mae'/'mse' metrics.
    """
    hidden_units = 256
    dropout_rate = 0.24
    hidden_stages = 3

    stack = []
    for stage in range(hidden_stages):
        # Only the first Dense layer is told the input shape
        first_layer_kwargs = {'input_shape': input_shape} if stage == 0 else {}
        stack.append(layers.Dense(hidden_units, activation='relu', **first_layer_kwargs))
        stack.append(layers.BatchNormalization())
        stack.append(layers.Dropout(rate=dropout_rate))

    # Output layer
    stack.append(layers.Dense(1))
    model = keras.Sequential(stack)

    # 'mse' is the active loss; rmse_loss is the alternative noted in this file
    model.compile(
        loss='mse',
        optimizer=tf.keras.optimizers.Adam(0.0001),
        metrics=['mae', 'mse'],
    )
    return model

#### 10 Fold-CV

In [None]:
# 10-fold cross-validation of every model in `algorithms`.  For each model the
# per-fold train/test metrics are collected, the test predictions of all folds
# are written to one Excel sheet, and a predicted-vs-experimental scatter plot
# is saved.  The path strings look like placeholders to be replaced.
data = pd.read_csv(r'input_csv_path')
outdir = r'output_path'

excel_dir = outdir + 'output_outliers_xlsx_path'

# number of descriptors for covariance: 20, 90, 160, 360, 640
X = prepare_data(data.iloc[:, 5:], prepro='quantile_transform')
y = data['IE']
kf = KFold(n_splits=10)

cv_columns = ['Regressor', 'R^2', 'RMSE', 'Fold Number']
prediction_columns = ['CAS Name', 'InChI', 'CAS Link', 'smiles', 'IE', 'Predicted IE']


def _nn_fold_metrics(splited_data):
    """Train a fresh Keras network on one fold; return metrics like ml_model().

    Returns [R2_train, RMSE_train, R2_test, RMSE_test, test_prediction].
    """
    X_train, X_test, y_train, y_test = splited_data
    model = nn([X_train.shape[1]])

    # Stop once the validation loss has not improved for 30 epochs
    early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)
    model.fit(
        X_train, y_train,
        batch_size=32,  # batch size is a hyper-opt
        epochs=1000, validation_split=0.1, verbose=0,
        callbacks=[early_stop])

    train_prediction = model.predict(X_train).flatten()
    test_prediction = model.predict(X_test).flatten()
    return [np.square(pearsonr(y_train, train_prediction)[0]),
            rmse(y_train, train_prediction),
            np.square(pearsonr(y_test, test_prediction)[0]),
            rmse(y_test, test_prediction),
            test_prediction]


# Rows are gathered in plain lists and converted once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every step.
train_rows = []
test_rows = []

# The context manager writes and closes the workbook on exit;
# ExcelWriter.save() was removed in pandas 2.0.
with pd.ExcelWriter(excel_dir) as writer:
    for algorithm in algorithms:
        y_predicted_list = []
        y_experimental_list = []
        fold_frames = []
        for fold_number, (train_index, test_index) in enumerate(kf.split(X), start=1):
            # KFold yields positional indices, hence .iloc on the target Series
            splited_data = [X[train_index], X[test_index],
                            y.iloc[train_index], y.iloc[test_index]]

            # 'NN' is the Keras model; everything else is looked up in regressor_objects
            if algorithm == 'NN':
                metrics = _nn_fold_metrics(splited_data)
            else:
                metrics = ml_model(splited_data, regressor_objects[algorithm])
            # metrics = [R2_train, RMSE_train, R2_test, RMSE_test, test_prediction]
            train_rows.append([algorithm, metrics[0], metrics[1], fold_number])
            test_rows.append([algorithm, metrics[2], metrics[3], fold_number])

            y_predicted_list.extend(metrics[4])
            y_experimental_list.extend(splited_data[3])
            print(algorithm, 'at fold ', fold_number)

            # First five columns of the test rows plus the prediction for each row
            fold_frame = data.iloc[test_index, :5].copy()
            fold_frame.columns = prediction_columns[:5]
            fold_frame['Predicted IE'] = metrics[4]
            fold_frames.append(fold_frame)

        outliers = pd.concat(fold_frames, ignore_index=True)
        outliers.to_excel(writer, sheet_name=algorithm)
        plot_scatter(y_predicted_list, y_experimental_list, algorithm, 1, 'IE', outdir)

# For data storage
train_cv_df = pd.DataFrame(train_rows, columns=cv_columns)
test_cv_df = pd.DataFrame(test_rows, columns=cv_columns)

In [None]:
# Write the per-fold training metrics to an Excel file.  The file name is
# appended to outdir without a path separator, and 'output_train_excel_path'
# looks like a placeholder -- TODO confirm the intended location.
train_dir = outdir + 'output_train_excel_path'
train_cv_df.to_excel(train_dir)
# Bare expression: shows the table when run as a notebook cell
train_cv_df

In [None]:
# Write the per-fold test metrics to an Excel file.  The file name is
# appended to outdir without a path separator, and 'output_test_excel_path'
# looks like a placeholder -- TODO confirm the intended location.
test_dir = outdir + 'output_test_excel_path'
test_cv_df.to_excel(test_dir)
# Bare expression: shows the table when run as a notebook cell
test_cv_df
