In [3]:
import optuna
import pandas as pd


In [4]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']


df = pd.read_csv(url, names=columns)

In [56]:
df.sample(5)
# df.isnull().sum()
df['Insulin']

0        0
1        0
2        0
3       94
4      168
      ... 
763    180
764      0
765    112
766      0
767      0
Name: Insulin, Length: 768, dtype: int64

In [5]:
import numpy as np

# Replace zero values with NaN in columns where zero is not a valid value
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

# Impute the missing values with the mean of the respective column
df.fillna(df.mean(), inplace=True)

# Check if there are any remaining missing values
print(df.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [58]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier




def get_para(xtr,ytr,xtn,ytn):
    lr = LogisticRegression()
    svc = SVC()
    rfc = RandomForestClassifier()
    xgc = XGBClassifier()
    algo  = [lr,svc,rfc,xgc]
    ss = StandardScaler()
    ss.fit(xtr)
    xtn = ss.transform(xtn)
    xtr = ss.transform(xtr)
    for al in algo:
        al.fit(xtr,ytr)
        ypr = al.predict(xtn)
        a =accuracy_score(ytn,ypr)
        print(al," -> ",a)

In [6]:
y = df.iloc[:,-1]
x = df.iloc[:,:-2]

In [7]:
from sklearn.model_selection import train_test_split


xtr,xtn,ytr,ytn = train_test_split(x,y,random_state=42,test_size=.3)
ytr

334    0
139    0
485    1
547    0
18     0
      ..
71     0
106    0
270    1
435    1
102    0
Name: Outcome, Length: 537, dtype: int64

In [11]:
get_para(xtr,ytr,xtn,ytn)

NameError: name 'get_para' is not defined

In [62]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Define the objective function
def objective(trial):
    # Suggest values for the hyperparameters
    n_estimators = trial.suggest_int('n_estimators', 50, 200)
    max_depth = trial.suggest_int('max_depth', 3, 20)

    # Create the RandomForestClassifier with suggested hyperparameters
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=42
    )

    # Perform 3-fold cross-validation and calculate accuracy
    score = cross_val_score(model, x, y, cv=3, scoring='accuracy').mean()

    return score  # Return the accuracy score for Optuna to maximize


In [63]:
# Create a study object and optimize the objective function
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler())  # We aim to maximize accuracy
study.optimize(objective, n_trials=50)  # Run 50 trials to find the best hyperparameters


[I 2025-12-29 21:00:38,166] A new study created in memory with name: no-name-21f0aa2b-e383-4445-b657-be7961ea295d
[I 2025-12-29 21:00:38,899] Trial 0 finished with value: 0.7526041666666666 and parameters: {'n_estimators': 195, 'max_depth': 10}. Best is trial 0 with value: 0.7526041666666666.
[I 2025-12-29 21:00:39,740] Trial 1 finished with value: 0.7591145833333334 and parameters: {'n_estimators': 164, 'max_depth': 12}. Best is trial 1 with value: 0.7591145833333334.
[I 2025-12-29 21:00:40,085] Trial 2 finished with value: 0.7486979166666666 and parameters: {'n_estimators': 70, 'max_depth': 13}. Best is trial 1 with value: 0.7591145833333334.
[I 2025-12-29 21:00:41,059] Trial 3 finished with value: 0.7526041666666666 and parameters: {'n_estimators': 175, 'max_depth': 19}. Best is trial 1 with value: 0.7591145833333334.
[I 2025-12-29 21:00:41,718] Trial 4 finished with value: 0.7591145833333334 and parameters: {'n_estimators': 138, 'max_depth': 16}. Best is trial 1 with value: 0.75911

In [64]:
# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7630208333333334
Best hyperparameters: {'n_estimators': 148, 'max_depth': 16}


In [65]:
from sklearn.metrics import accuracy_score

# Train a RandomForestClassifier using the best hyperparameters from Optuna
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

# Fit the model to the training data
best_model.fit(xtr, ytr)

# Make predictions on the test set
y_pred = best_model.predict(xtn)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(ytn, y_pred)

# Print the test accuracy
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.74


In [66]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler())  # We aim to maximize accuracy
study.optimize(objective, n_trials=50)  # Run 50 trials to find the best hyperparameters

[I 2025-12-29 21:01:13,847] A new study created in memory with name: no-name-579594ba-3b06-4d4f-9914-deb396dfcac5
[I 2025-12-29 21:01:14,297] Trial 0 finished with value: 0.75 and parameters: {'n_estimators': 89, 'max_depth': 11}. Best is trial 0 with value: 0.75.
[I 2025-12-29 21:01:15,203] Trial 1 finished with value: 0.7526041666666666 and parameters: {'n_estimators': 169, 'max_depth': 16}. Best is trial 1 with value: 0.7526041666666666.
[I 2025-12-29 21:01:16,027] Trial 2 finished with value: 0.7526041666666666 and parameters: {'n_estimators': 173, 'max_depth': 17}. Best is trial 1 with value: 0.7526041666666666.
[I 2025-12-29 21:01:16,503] Trial 3 finished with value: 0.7513020833333334 and parameters: {'n_estimators': 102, 'max_depth': 19}. Best is trial 1 with value: 0.7526041666666666.
[I 2025-12-29 21:01:17,347] Trial 4 finished with value: 0.7434895833333334 and parameters: {'n_estimators': 177, 'max_depth': 7}. Best is trial 1 with value: 0.7526041666666666.
[I 2025-12-29 21

In [67]:

# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7578125
Best hyperparameters: {'n_estimators': 187, 'max_depth': 15}


In [68]:
from sklearn.metrics import accuracy_score

# Train a RandomForestClassifier using the best hyperparameters from Optuna
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

# Fit the model to the training data
best_model.fit(xtr, ytr)

# Make predictions on the test set
y_pred = best_model.predict(xtn)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(ytn, y_pred)

# Print the test accuracy
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.74


In [70]:
search_space = {
    'n_estimators': [50, 100, 150, 200],
    'max_depth': [5, 10, 15, 20]
}

In [71]:
# Create a study and optimize it using GridSampler
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.GridSampler(search_space))
study.optimize(objective)

[I 2025-12-29 21:03:05,044] A new study created in memory with name: no-name-d2ee73c1-f6d6-4b98-828b-3299d3b9307d
[I 2025-12-29 21:03:05,435] Trial 0 finished with value: 0.7473958333333334 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7473958333333334.
[I 2025-12-29 21:03:06,020] Trial 1 finished with value: 0.75390625 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 1 with value: 0.75390625.
[I 2025-12-29 21:03:06,236] Trial 2 finished with value: 0.7447916666666666 and parameters: {'n_estimators': 50, 'max_depth': 15}. Best is trial 1 with value: 0.75390625.
[I 2025-12-29 21:03:06,698] Trial 3 finished with value: 0.7447916666666666 and parameters: {'n_estimators': 100, 'max_depth': 15}. Best is trial 1 with value: 0.75390625.
[I 2025-12-29 21:03:07,267] Trial 4 finished with value: 0.7526041666666666 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 1 with value: 0.75390625.
[I 2025-12-29 21:03:07,514] 

In [72]:

# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7604166666666666
Best hyperparameters: {'n_estimators': 150, 'max_depth': 20}


In [73]:
from sklearn.metrics import accuracy_score

# Train a RandomForestClassifier using the best hyperparameters from Optuna
best_model = RandomForestClassifier(**study.best_trial.params, random_state=42)

# Fit the model to the training data
best_model.fit(xtr, ytr)

# Make predictions on the test set
y_pred = best_model.predict(xtn)

# Calculate the accuracy on the test set
test_accuracy = accuracy_score(ytn, y_pred)

# Print the test accuracy
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.74


In [18]:
# For visualizations
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [75]:
# 1. Optimization History
plot_optimization_history(study).show()

In [77]:
plot_parallel_coordinate(study)

In [78]:
plot_slice(study).show()

In [79]:
plot_contour(study).show()

In [80]:
plot_param_importances(study).show()

In [9]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

In [10]:
# Define the objective function for Optuna
def objective(trial):
    # Choose the algorithm to tune
    classifier_name = trial.suggest_categorical('classifier', ['SVM', 'RandomForest', 'GradientBoosting'])

    if classifier_name == 'SVM':
        # SVM hyperparameters
        c = trial.suggest_float('C', 0.1, 100, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)

    elif classifier_name == 'RandomForest':
        # Random Forest hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42
        )

    elif classifier_name == 'GradientBoosting':
        # Gradient Boosting hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

    # Perform cross-validation and return the mean accuracy
    score = cross_val_score(model, x, y, cv=3, scoring='accuracy').mean()
    return score

In [12]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10) 

[I 2025-12-29 21:15:49,550] A new study created in memory with name: no-name-38926348-6902-4115-a3e0-719ed1d5121b
[I 2025-12-29 21:15:50,753] Trial 0 finished with value: 0.7565104166666666 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 243, 'learning_rate': 0.2668957711823138, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 0 with value: 0.7565104166666666.
[I 2025-12-29 21:15:52,492] Trial 1 finished with value: 0.7408854166666666 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 281, 'learning_rate': 0.010664946228163523, 'max_depth': 4, 'min_samples_split': 10, 'min_samples_leaf': 8}. Best is trial 0 with value: 0.7565104166666666.
[I 2025-12-29 21:15:56,068] Trial 2 finished with value: 0.7408854166666666 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 201, 'learning_rate': 0.06700870141917684, 'max_depth': 20, 'min_samples_split': 8, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.75651041

In [13]:
# Retrieve the best trial
best_trial = study.best_trial
print("Best trial parameters:", best_trial.params)
print("Best trial accuracy:", best_trial.value)

Best trial parameters: {'classifier': 'GradientBoosting', 'n_estimators': 243, 'learning_rate': 0.2668957711823138, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 2}
Best trial accuracy: 0.7565104166666666


In [14]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.75651,2025-12-29 21:15:49.550855,2025-12-29 21:15:50.753303,0 days 00:00:01.202448,,,GradientBoosting,,,0.266896,4.0,2.0,4.0,243.0,COMPLETE
1,1,0.740885,2025-12-29 21:15:50.753303,2025-12-29 21:15:52.492086,0 days 00:00:01.738783,,,GradientBoosting,,,0.010665,4.0,8.0,10.0,281.0,COMPLETE
2,2,0.740885,2025-12-29 21:15:52.492712,2025-12-29 21:15:56.068949,0 days 00:00:03.576237,,,GradientBoosting,,,0.067009,20.0,6.0,8.0,201.0,COMPLETE
3,3,0.744792,2025-12-29 21:15:56.068949,2025-12-29 21:15:56.857216,0 days 00:00:00.788267,,True,RandomForest,,,,5.0,10.0,5.0,235.0,COMPLETE
4,4,0.74349,2025-12-29 21:15:56.857216,2025-12-29 21:15:58.707164,0 days 00:00:01.849948,,,GradientBoosting,,,0.022648,5.0,1.0,6.0,240.0,COMPLETE
5,5,0.753906,2025-12-29 21:15:58.707960,2025-12-29 21:15:59.503364,0 days 00:00:00.795404,,False,RandomForest,,,,9.0,6.0,4.0,203.0,COMPLETE
6,6,0.533854,2025-12-29 21:15:59.505241,2025-12-29 21:15:59.553309,0 days 00:00:00.048068,0.289419,,SVM,scale,sigmoid,,,,,,COMPLETE
7,7,0.734375,2025-12-29 21:15:59.553309,2025-12-29 21:16:03.017164,0 days 00:00:03.463855,,,GradientBoosting,,,0.01879,10.0,6.0,9.0,261.0,COMPLETE
8,8,0.710938,2025-12-29 21:16:03.018306,2025-12-29 21:16:07.947821,0 days 00:00:04.929515,,,GradientBoosting,,,0.023726,13.0,1.0,5.0,196.0,COMPLETE
9,9,0.74349,2025-12-29 21:16:07.953143,2025-12-29 21:16:08.238837,0 days 00:00:00.285694,,False,RandomForest,,,,12.0,7.0,3.0,67.0,COMPLETE


In [15]:
study.trials_dataframe()['params_classifier'].value_counts()

params_classifier
GradientBoosting    6
RandomForest        3
SVM                 1
Name: count, dtype: int64

In [16]:
study.trials_dataframe().groupby('params_classifier')['value'].mean()

params_classifier
GradientBoosting    0.737847
RandomForest        0.747396
SVM                 0.533854
Name: value, dtype: float64

In [19]:
plot_optimization_history(study).show()