In [1]:
# Utils imports
import pandas as pd
import numpy as np
import os
import joblib
import random

#Optimization imports
import optuna as opt

#Evaluation imports
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
#Model imports
from xgboost import XGBRegressor
from pytorch_tabnet.tab_model import TabNetRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor

In [9]:
# Preparing data

# PARAMS
# Number of Optuna trials run per model (consumed as `n_trials` by the tuning cell).
NUM_TRIALS = 15

# Load the dataset and split it into features, target and the `isSyn` flag column.
df = pd.read_csv("../dataset/smogn_syn_data.csv")
tar_col = "LC50 [-LOG(mol/L)]"  # regression target column
Syn = "isSyn"  # 0/1 flag column; presumably marks synthetic rows -- TODO confirm
isSyn = df[Syn]
X = df.drop(columns=[tar_col, Syn])  # every remaining column is used as a feature
Y = df[tar_col]

# Kept as the cell's last expression so the notebook actually renders it.
isSyn

0       0
1       0
2       0
3       0
4       0
       ..
1069    1
1070    1
1071    1
1072    1
1073    1
Name: isSyn, Length: 1074, dtype: int64

In [15]:
# Registry of models to tune, keyed by a display name.
#   "model": the estimator class (not an instance).
#   "param": maps each constructor keyword to a *string* holding an Optuna
#            `trial.suggest_*` expression. The strings are not evaluated here;
#            they must be evaluated against a live `trial` object before the
#            result is passed to the estimator constructor.
# The first argument of each `suggest_*` call is the Optuna parameter name and
# must be unique within a study; all of them share the "xgb_" prefix.
models = {
    "XGBRegressor": {
        "model": XGBRegressor,
        "param": {
            "n_estimators": 'trial.suggest_categorical("xgb_est",[4500,5000])',
            "learning_rate": 'trial.suggest_categorical("xgb_lr",[0.01,3e-4,0.1])',
            "booster": 'trial.suggest_categorical("xgb_booster",["gbtree","gblinear","dart"])',
            # Single-choice categoricals: these pin the GPU settings while still
            # recording them alongside the other trial parameters.
            "tree_method": 'trial.suggest_categorical("xgb_treemethod",["gpu_hist"])',
            "predictor": 'trial.suggest_categorical("xgb_predictor",["gpu_predictor"])',
        },
    },
}

In [36]:
# Optuna hyper-parameter search + hold-out evaluation for every configured model
def _resolve_trial_params(trial, param_space):
    """Turn a search-space spec into concrete constructor kwargs for one trial.

    Each value in ``param_space`` may be:
      * a callable taking the trial and returning the value,
      * a string holding a Python expression that references ``trial``
        (e.g. ``'trial.suggest_categorical("xgb_lr",[0.01,0.1])'``),
      * any other object, which is passed through unchanged.
    """
    resolved = {}
    for kwarg_name, spec in param_space.items():
        if callable(spec):
            resolved[kwarg_name] = spec(trial)
        elif isinstance(spec, str):
            # SECURITY: eval() runs arbitrary code. Only notebook-authored
            # search-space strings may reach this point, never external input.
            resolved[kwarg_name] = eval(spec, {"trial": trial})
        else:
            resolved[kwarg_name] = spec
    return resolved


def _fit_with_eval(estimator, X_fit, Y_fit, X_eval, Y_eval):
    """Fit ``estimator`` while reporting RMSE on the evaluation split."""
    # NOTE(review): passing `eval_metric` to fit() is the older xgboost / TabNet
    # calling convention; confirm it is still accepted by the installed version.
    estimator.fit(X_fit, Y_fit,
                  eval_set=[(X_eval, Y_eval)],
                  eval_metric=["rmse"])
    return estimator


def train_main(X, Y, isSyn, train_fold, models, fold=0):
    """Tune each model in ``models`` with Optuna and score its best configuration.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally with ``.iloc``.
    Y : pandas.Series
        Target values, selected with the same positional indices as ``X``.
    isSyn : pandas.Series
        Accepted for interface compatibility; not used by this function.
    train_fold : indexable or None
        ``train_fold[fold]`` must be a mapping with ``"train"`` and ``"test"``
        index collections. When ``None`` the folds are loaded from
        ``../exports/train_test_fold_data.z``.
    models : dict
        ``{name: {"model": estimator_class, "param": search_space}}`` where
        ``search_space`` maps constructor keywords to trial expressions
        (strings or callables).
    fold : int, default 0
        Which entry of ``train_fold`` to use.

    Returns
    -------
    dict
        ``{name: {"best_params", "best_model_kwargs", "trial_data",
        "error_metric_all"}}``. ``best_params`` is keyed by Optuna parameter
        names; ``best_model_kwargs`` is keyed by constructor keywords.
    """
    if train_fold is None:
        # joblib.load unpickles the file: only load artefacts you trust.
        train_fold = joblib.load("../exports/train_test_fold_data.z")

    train_index = train_fold[fold]["train"]
    test_index = train_fold[fold]["test"]

    # Use positional indexing for both X and Y so the two always stay aligned.
    X_main = X.iloc[train_index, :].to_numpy(dtype=np.float64)
    Y_main = Y.iloc[train_index].to_numpy(dtype=np.float64)
    X_test = X.iloc[test_index, :].to_numpy(dtype=np.float64)
    Y_test = Y.iloc[test_index].to_numpy(dtype=np.float64)

    out_data = {}
    for model_name, model_item in models.items():
        model_cls = model_item["model"]
        param_space = model_item.get("param", {})

        def objective(trial, model_cls=model_cls, param_space=param_space):
            model_kwargs = _resolve_trial_params(trial, param_space)
            # Remember the constructor kwargs: study.best_params is keyed by
            # Optuna parameter names, which need not match the keywords.
            trial.set_user_attr("model_kwargs", model_kwargs)
            clf = _fit_with_eval(model_cls(**model_kwargs),
                                 X_main, Y_main, X_test, Y_test)
            Y_pred = clf.predict(X_test)
            # RMSE via sqrt(MSE) works regardless of the scikit-learn version.
            return float(np.sqrt(mean_squared_error(Y_test, Y_pred)))

        study = opt.create_study(direction='minimize')
        study.optimize(objective, n_trials=NUM_TRIALS)

        best_params = study.best_params
        best_model_kwargs = study.best_trial.user_attrs["model_kwargs"]
        trial_data = study.get_trials()

        # Refit once with the winning configuration and compute all metrics.
        clf_main = _fit_with_eval(model_cls(**best_model_kwargs),
                                  X_main, Y_main, X_test, Y_test)
        Y_pred_main = clf_main.predict(X_test)

        mse = mean_squared_error(Y_test, Y_pred_main)
        error_metrics_all = {
            "mse_error": mse,
            "mae_error": mean_absolute_error(Y_test, Y_pred_main),
            "rmse_error": float(np.sqrt(mse)),
            # r2_score is not symmetric: ground truth must come first.
            "r2_score": r2_score(Y_test, Y_pred_main),
        }

        out_data[model_name] = {
            "best_params": best_params,
            "best_model_kwargs": best_model_kwargs,
            "trial_data": trial_data,
            "error_metric_all": error_metrics_all,
        }

    return out_data
                                
                                 
        
        

(1002, 6) (72, 6) (1002,) (72,)
