In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy.stats import uniform, randint

from sklearn.datasets import load_breast_cancer, load_diabetes, load_wine
from sklearn.metrics import auc, accuracy_score, balanced_accuracy_score, confusion_matrix, mean_squared_error
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, StratifiedKFold, RandomizedSearchCV, train_test_split
import catboost as cat

In [2]:
# Read the descriptor CSV into `df`; the bare name on the last line makes the
# frame the cell's rendered output.
df = pd.read_csv(
    filepath_or_buffer=r"C:\Users\19189\Desktop\mordred_descriptors.csv",
)
df

Unnamed: 0,CASRN,CATMoS_LD50_mgkg,Canonical_QSARr,LogLD50,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,68523-18-2,460.00,CC1(C)C(C1C=C(Cl)Cl)C(=O)OC(C#N)C1C=CC=C(N=1)O...,2.662758,0.0,0.0,0,0,34.992970,2.613976,...,10.344674,84.588084,416.069448,9.044988,2244,39,146.0,172.0,10.201389,6.166667
1,88-04-0,3830.00,CC1C=C(O)C=C(C)C=1Cl,3.583199,0.0,0.0,0,0,11.643052,2.307250,...,9.078065,39.748909,156.034193,8.212326,110,13,48.0,54.0,4.944444,2.222222
2,603-50-9,4305.00,CC(=O)OC1C=CC(=CC=1)C(C1C=CC=CN=1)C1C=CC(=CC=1...,3.633973,0.0,0.0,0,0,34.387058,2.415622,...,10.045811,61.772145,361.131408,7.850683,1954,39,136.0,155.0,8.638889,6.000000
3,120-36-5,689.00,CC(OC1C=CC(Cl)=CC=1Cl)C(O)=O,2.838219,0.0,0.0,1,0,16.532794,2.301365,...,9.254262,45.062973,233.985049,10.635684,316,18,66.0,73.0,6.555556,3.138889
4,103-26-4,2610.00,COC(=O)C=CC1C=CC=CC=1,3.416641,0.0,0.0,0,0,15.354433,2.172565,...,8.688622,41.277218,162.068080,7.366731,226,12,52.0,55.0,4.222222,2.916667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11982,996-04-3,1.78,CP(=O)(SCCC)SCCC,0.250420,0.0,0.0,0,0,12.592066,2.188901,...,8.586906,39.549123,212.045844,7.573066,180,10,44.0,44.0,5.562500,2.750000
11983,996-05-4,18.00,CP(=S)(SCCC)SCCC,1.255273,0.0,0.0,0,0,12.592066,2.188901,...,8.586906,39.549123,228.023000,8.143679,180,10,44.0,44.0,5.562500,2.750000
11984,99784-08-4,7500.00,COC1C=CC(=CC=1O)C1COC2=CC=CC=C2O1,3.875061,0.0,0.0,0,0,25.276144,2.403626,...,9.883948,52.530255,258.089209,7.820885,720,30,100.0,118.0,5.416667,4.250000
11985,99874-01-8,5220.00,CC(C)(S)C(=O)NC(CS)C(O)=O,3.717671,0.0,0.0,1,0,13.983995,2.272120,...,9.131838,43.455545,223.033685,8.578219,256,16,58.0,62.0,7.895833,2.944444


In [3]:
import graphviz
from sklearn.metrics import mean_squared_error as MSE
# Feature matrix: the positional column slice 4:1831 of `df`, cast to float.
# NOTE(review): presumably these are the descriptor columns that follow the
# identifier/target columns -- confirm against df.columns. The slice bounds
# are positional, so they silently shift if the CSV layout changes.
x = df.iloc[:, 4:1831].astype(float)

# Regression target: the LogLD50 column as a plain array.
y = df["LogLD50"].values

In [None]:
import optuna
def objective(trial):
    """Optuna objective: mean 5-fold cross-validated R2 of a CatBoost regressor.

    One hyper-parameter configuration is drawn from ``trial``, a
    ``CatBoostRegressor`` is built from it, and the model is scored with
    ``cross_val_score`` on the notebook-level ``x`` (features) and ``y``
    (target).

    Parameters
    ----------
    trial
        The Optuna trial used to sample the hyper-parameters.

    Returns
    -------
    The mean of the per-fold ``r2`` scores (higher is better).
    """
    # Draw the search-space values one by one (same order as they are
    # registered with the trial).
    n_iterations = trial.suggest_int("iterations", 500, 1000, step=100)
    step_size = trial.suggest_float("learning_rate", 0.0001, 0.1, log=True)
    tree_depth = trial.suggest_int("depth", 4, 12, step=2)
    leaf_l2 = trial.suggest_float("l2_leaf_reg", 1e-6, 1.0, log=True)
    boosting = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
    bootstrap = trial.suggest_categorical("bootstrap_type", ["Bayesian", "Bernoulli", "MVS"])
    split_noise = trial.suggest_float("random_strength", 1e-6, 5.0, log=True)
    split_score = trial.suggest_categorical("score_function", ["L2", "Cosine"])
    feature_fraction = trial.suggest_float("rsm", 0.1, 1.0, log=True)

    settings = dict(
        iterations=n_iterations,
        learning_rate=step_size,
        depth=tree_depth,
        l2_leaf_reg=leaf_l2,
        boosting_type=boosting,
        bootstrap_type=bootstrap,
        random_strength=split_noise,
        leaf_estimation_method="Gradient",
        score_function=split_score,
        # NOTE(review): cross_val_score passes no eval_set to fit(), so this
        # early-stopping setting likely never triggers -- confirm.
        early_stopping_rounds=20,
        eval_metric="R2",
        rsm=feature_fraction,
    )

    # Sampling-strength parameter is only tuned for the bootstrap types that
    # are handled here; "MVS" gets no extra parameter.
    if bootstrap == "Bayesian":
        settings["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif bootstrap == "Bernoulli":
        settings["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    regressor = cat.CatBoostRegressor(**settings)

    # Folds are evaluated in parallel (n_jobs=-1); the trial value is their mean.
    fold_scores = cross_val_score(regressor, x, y, n_jobs=-1, cv=5, scoring="r2")
    return fold_scores.mean()

# Hyper-parameter search for the CatBoost regressor.
#
# Where the trial table is written (rewritten after every trial, see below).
TRIALS_CSV = r'C:\Users\19189\Desktop\MD_cat_trial1.csv'
# Seed for the sampler so that a re-run proposes the same configurations.
SAMPLER_SEED = 42


def _save_trials(study, trial):
    """Optuna callback: rewrite the trials CSV after each finished trial.

    The study is held in memory and single trials can run for a long time, so
    persisting after every trial keeps the results if the run is interrupted.
    ``trial`` is unused; it is part of the callback signature Optuna calls.
    """
    study.trials_dataframe().to_csv(TRIALS_CSV, index=False)


study = optuna.create_study(
    direction="maximize",
    study_name="catboost_parameter_opt",
    # TPE is Optuna's default sampler; it is spelled out only to fix its seed.
    sampler=optuna.samplers.TPESampler(seed=SAMPLER_SEED),
    # NOTE: a pruner only acts on intermediate values reported through
    # trial.report(); `objective` returns a single final score, so this
    # pruner currently never stops a trial.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)

study.optimize(objective, n_trials=100, callbacks=[_save_trials])

# Use a dedicated name for the trial table: reusing `df` would overwrite the
# descriptor DataFrame that the feature-extraction cell reads, breaking any
# re-run of that cell.
trials_df = study.trials_dataframe()
trials_df.to_csv(TRIALS_CSV, index=False)

# Last expression of the cell, so the trial table is actually rendered.
trials_df

[I 2024-04-21 11:47:57,392] A new study created in memory with name: catboost_parameter_opt
[I 2024-04-21 11:55:56,325] Trial 0 finished with value: 0.3696670438408053 and parameters: {'iterations': 700, 'learning_rate': 0.03007414396676305, 'depth': 6, 'l2_leaf_reg': 0.0006242035821544579, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'random_strength': 0.004547337743564287, 'score_function': 'L2', 'rsm': 0.93752479191974, 'bagging_temperature': 6.218668238190835}. Best is trial 0 with value: 0.3696670438408053.
[I 2024-04-21 12:57:18,716] Trial 1 finished with value: -0.05074410850361546 and parameters: {'iterations': 600, 'learning_rate': 0.00015116762301032596, 'depth': 12, 'l2_leaf_reg': 0.0016595459096589117, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'random_strength': 0.0015839232347978093, 'score_function': 'Cosine', 'rsm': 0.1688054224027743, 'bagging_temperature': 3.2271508881335755}. Best is trial 0 with value: 0.3696670438408053.
[I 2024-04-21 12:59: