In [1]:
'''
Optimizing multiple ML models using Optuna with hyperparameter tuning
'''
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import optuna
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pima Indians Diabetes dataset, read from a GitHub-hosted CSV.
# Column names are supplied here and passed to read_csv via `names`.
columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
    'DiabetesPedigreeFunction', 'Age', 'Outcome',
]
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# header=None treats every line of the file as data and labels it with `columns`
df = pd.read_csv(url, header=None, names=columns)

# Preview the first rows
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Zero is not a valid value in these columns, so treat zeros as missing (NaN)
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

# Impute the missing values with the mean of the respective column.
# The result is rebound to `df` rather than using inplace=True, so the step
# reads as an explicit transformation and stays chainable.
# NOTE(review): the means are computed over the full dataset, before the
# train/test split performed later in the notebook, so held-out rows
# influence the imputed values. Consider imputing after the split.
df = df.fillna(df.mean())

# Check if there are any remaining missing values
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [4]:
# Separate the target column (y) from the predictor columns (X)
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same fitted transform is then applied to both partitions
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Report the resulting array shapes
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')


Training set shape: (537, 8)
Test set shape: (231, 8)


In [5]:
def objective(trial):
    """Optuna objective: sample a classifier with hyperparameters and score it.

    The classifier family ('RandomForest', 'GradientBoosting' or 'SVC') is
    itself a categorical hyperparameter, so one study searches across all
    three model types.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample the classifier name and its hyperparameters through
        ``suggest_categorical`` / ``suggest_int`` / ``suggest_float``.

    Returns
    -------
    float
        Mean accuracy of a 3-fold cross-validation on the module-level
        ``X_train`` / ``y_train``.

    Raises
    ------
    ValueError
        If the sampled classifier name has no matching branch (for example
        after a new name is added to the choices without a corresponding
        model definition).
    """
    # Choose the model type
    classifier_name = trial.suggest_categorical('classifier', ['RandomForest', 'GradientBoosting', 'SVC'])

    if classifier_name == 'SVC':
        c = trial.suggest_float('C', 0.1, 100, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly'])
        # NOTE(review): per scikit-learn's SVC docs gamma applies to the
        # 'rbf'/'poly'/'sigmoid' kernels, so it presumably has no effect on
        # 'linear' trials; it is still sampled so that every SVC trial
        # reports the same parameter keys.
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)
    elif classifier_name == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_leaf=min_samples_leaf,
            min_samples_split=min_samples_split,
            bootstrap=bootstrap,
            random_state=42
        )
    elif classifier_name == 'GradientBoosting':
        # The tree-shape parameters reuse the RandomForest names and ranges
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )
    else:
        # Fail with a clear message instead of an unbound `model` further down
        raise ValueError(f'Unsupported classifier: {classifier_name!r}')

    # Perform cross-validation and return the mean accuracy
    score = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy').mean()
    return score

In [6]:
# Seed the sampler so the same sequence of trials is proposed on every run;
# without an explicit sampler the study is not reproducible across kernel restarts.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Run 50 trials of the objective, maximizing its returned score
study.optimize(objective, n_trials=50)

[32m[I 2026-01-29 00:31:25,657][0m A new study created in memory with name: no-name-bf620968-fc06-47d4-b127-8990c20c3815[0m
[32m[I 2026-01-29 00:31:25,687][0m Trial 0 finished with value: 0.7821229050279329 and parameters: {'classifier': 'SVC', 'C': 0.3758324874451357, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 0 with value: 0.7821229050279329.[0m
[32m[I 2026-01-29 00:31:25,716][0m Trial 1 finished with value: 0.7839851024208566 and parameters: {'classifier': 'SVC', 'C': 1.057891999026675, 'kernel': 'linear', 'gamma': 'auto'}. Best is trial 1 with value: 0.7839851024208566.[0m
[32m[I 2026-01-29 00:31:26,387][0m Trial 2 finished with value: 0.7616387337057727 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 109, 'learning_rate': 0.19098471767523448, 'max_depth': 4, 'min_samples_split': 5, 'min_samples_leaf': 5}. Best is trial 1 with value: 0.7839851024208566.[0m
[32m[I 2026-01-29 00:31:28,072][0m Trial 3 finished with value: 0.7560521415270017 an

In [7]:
# Report the best score found by the study and the parameters that produced it
best_trial = study.best_trial
print(
    f'Best trial accuracy: {best_trial.value}',
    f'Best hyperparameters: {best_trial.params}',
    sep='\n',
)

Best trial accuracy: 0.7895716945996275
Best hyperparameters: {'classifier': 'SVC', 'C': 0.121894146323621, 'kernel': 'linear', 'gamma': 'auto'}


In [8]:
# Tabulate every trial (value, timings, sampled parameters, state) as a DataFrame
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.782123,2026-01-29 00:31:25.660190,2026-01-29 00:31:25.687026,0 days 00:00:00.026836,0.375832,,SVC,auto,linear,,,,,,COMPLETE
1,1,0.783985,2026-01-29 00:31:25.688191,2026-01-29 00:31:25.716190,0 days 00:00:00.027999,1.057892,,SVC,auto,linear,,,,,,COMPLETE
2,2,0.761639,2026-01-29 00:31:25.717413,2026-01-29 00:31:26.387664,0 days 00:00:00.670251,,,GradientBoosting,,,0.190985,4.0,5.0,5.0,109.0,COMPLETE
3,3,0.756052,2026-01-29 00:31:26.389161,2026-01-29 00:31:28.072227,0 days 00:00:01.683066,,True,RandomForest,,,,10.0,7.0,8.0,289.0,COMPLETE
4,4,0.767225,2026-01-29 00:31:28.073665,2026-01-29 00:31:29.418708,0 days 00:00:01.345043,,True,RandomForest,,,,18.0,5.0,8.0,220.0,COMPLETE
5,5,0.769088,2026-01-29 00:31:29.419987,2026-01-29 00:31:30.916922,0 days 00:00:01.496935,,False,RandomForest,,,,7.0,9.0,8.0,290.0,COMPLETE
6,6,0.752328,2026-01-29 00:31:30.918235,2026-01-29 00:31:33.823577,0 days 00:00:02.905342,,,GradientBoosting,,,0.027768,8.0,4.0,5.0,226.0,COMPLETE
7,7,0.763501,2026-01-29 00:31:33.825230,2026-01-29 00:31:33.857672,0 days 00:00:00.032442,0.277605,,SVC,auto,rbf,,,,,,COMPLETE
8,8,0.731844,2026-01-29 00:31:33.859246,2026-01-29 00:31:36.681789,0 days 00:00:02.822543,,,GradientBoosting,,,0.053321,15.0,7.0,7.0,204.0,COMPLETE
9,9,0.787709,2026-01-29 00:31:36.683152,2026-01-29 00:31:36.712259,0 days 00:00:00.029107,0.164479,,SVC,auto,linear,,,,,,COMPLETE


In [9]:
# Count how many trials were run for each classifier type
study.trials_dataframe()['params_classifier'].value_counts()

params_classifier
SVC                 36
GradientBoosting     7
RandomForest         7
Name: count, dtype: int64

In [10]:
# Mean objective value (cross-validated accuracy) of the trials for each classifier type
study.trials_dataframe().groupby('params_classifier')['value'].mean()

params_classifier
GradientBoosting    0.743549
RandomForest        0.765097
SVC                 0.776226
Name: value, dtype: float64

In [11]:
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [12]:
# 1. Optimization History: build the history figure for the study and display it
plot_optimization_history(study).show()