In [5]:
from sklearn.datasets import load_diabetes
from sklearn.preprocessing import StandardScaler

In [6]:
import optuna 

# Load the Pima Indians Diabetes dataset.
# Note: scikit-learn's built-in `load_diabetes` is a regression dataset, not the Pima data,
# so the classification dataset is downloaded from an external source in the next cell.


In [7]:
import pandas as pd

# Column labels handed to read_csv through `names`; the last one, 'Outcome', is the target.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Remote location of the CSV file.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Read the file straight from the URL and preview the first rows.
df = pd.read_csv(url, names=columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
import numpy as np

# Missing values in this dataset are recorded as 0 rather than NaN,
# so this NaN count does not reveal them yet.
df.isna().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [9]:
# Zeros in these measurement columns are placeholders for missing readings,
# so convert them to NaN before imputing.
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

# Impute each affected column with its own median. Only the columns that can hold
# NaN are touched (the target `Outcome` is left out of the median computation), and
# the result is assigned back instead of relying on `inplace=True`.
# NOTE(review): the medians are computed on the full dataset before the train/test
# split, so test rows influence the imputed values; consider imputing after the
# split using statistics from the training rows only.
column_medians = df[cols_with_missing_vals].median()
df[cols_with_missing_vals] = df[cols_with_missing_vals].fillna(column_medians)

# Confirm that no NaN values remain.
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [10]:
# Separate the target column from the predictors, then hold out 20% of the rows for testing.
from sklearn.model_selection import train_test_split

y = df['Outcome']
X = df.drop(columns='Outcome')

# random_state pins the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Learn the scaling statistics from the training split only,
# then apply the same transformation to both splits.
scale = StandardScaler()
scale.fit(X_train)
X_train = scale.transform(X_train)
X_test = scale.transform(X_test)

# Sanity check: row counts of features and labels should match.
print(X_train.shape, y_train.shape)

(614, 8) (614,)


In [12]:
# Inspect the scaled training features (the scaler's transform output replaced the original X_train).
X_train

array([[-0.52639686, -1.25688146, -0.01899526, ..., -0.00745016,
        -0.49073479, -1.03594038],
       [ 1.58804586, -0.32605067,  0.8081742 , ..., -0.59909194,
         2.41502991,  1.48710085],
       [-0.82846011,  0.57153617, -2.16963585, ..., -0.52694051,
         0.54916055, -0.94893896],
       ...,
       [ 1.8901091 , -0.69173419,  1.13904198, ...,  1.91177805,
         1.981245  ,  0.44308379],
       [-1.13052335,  0.63802409, -0.01899526, ...,  1.45000885,
        -0.78487662, -0.33992901],
       [-1.13052335,  0.10612077,  1.96621144, ..., -1.42161832,
        -0.61552223, -1.03594038]], shape=(614, 8))

Define the Optuna objective and search for the best hyperparameters

In [13]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [14]:
# Define the objective function

def objective(trial):
    """Return the mean 3-fold cross-validated accuracy of a random forest for one trial.

    The trial proposes `n_estimators` (50-300) and `max_depth` (2-32); the forest
    is scored on the notebook-level `X_train` / `y_train`.
    """
    # Hyperparameters sampled for this trial (n_estimators is requested first)
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 32),
    }

    # Same random_state for every trial's forest
    forest = RandomForestClassifier(random_state=42, **params)

    # Average the accuracy over the three cross-validation folds
    fold_scores = cross_val_score(forest, X_train, y_train, cv=3, scoring='accuracy')
    return fold_scores.mean()

Create the study and run the optimization

In [15]:
# Seed the TPE sampler so the sequence of suggested hyperparameters (and therefore
# the reported best trial) is reproducible when the notebook is re-run from scratch.
study = optuna.create_study(
    direction='maximize',  # the objective returns accuracy, so higher is better
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Evaluate 50 hyperparameter combinations.
study.optimize(objective, n_trials=50)

[I 2025-09-17 10:38:13,724] A new study created in memory with name: no-name-edf7b272-52e9-46c9-89f8-04f5fe5345c8
[I 2025-09-17 10:38:14,239] Trial 0 finished with value: 0.7768691216323927 and parameters: {'n_estimators': 179, 'max_depth': 21}. Best is trial 0 with value: 0.7768691216323927.
[I 2025-09-17 10:38:14,752] Trial 1 finished with value: 0.7785031085604973 and parameters: {'n_estimators': 175, 'max_depth': 17}. Best is trial 1 with value: 0.7785031085604973.
[I 2025-09-17 10:38:15,167] Trial 2 finished with value: 0.7850071736011478 and parameters: {'n_estimators': 146, 'max_depth': 17}. Best is trial 2 with value: 0.7850071736011478.
[I 2025-09-17 10:38:15,706] Trial 3 finished with value: 0.7784791965566714 and parameters: {'n_estimators': 195, 'max_depth': 9}. Best is trial 2 with value: 0.7850071736011478.
[I 2025-09-17 10:38:16,331] Trial 4 finished with value: 0.7752351347042882 and parameters: {'n_estimators': 214, 'max_depth': 20}. Best is trial 2 with value: 0.78500

In [16]:
# Report the winning hyperparameters and the objective value they achieved.
best_trial = study.best_trial
print('Best trial:', best_trial.params)
print('Best value:', best_trial.value)

Best trial: {'n_estimators': 106, 'max_depth': 13}
Best value: 0.7899171050534036


In [17]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Refit a random forest on the training split with the study's best
# hyperparameters, then score it on the held-out test split.
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_pred)

# Full precision first, then rounded to two decimals.
print('Accuracy:', holdout_accuracy)
print(f"Test accuracy: {holdout_accuracy:.2f}")




Accuracy: 0.7402597402597403
Test accuracy: 0.74


In [18]:
def objective(trial):
    """Score one Optuna trial: mean accuracy of a random forest under 3-fold CV.

    NOTE(review): this body matches the `objective` already defined in an earlier
    cell, so the redefinition is redundant and this cell could be removed.
    """
    # Ask the trial for both hyperparameters (trees: 50-300, depth: 2-32)
    n_trees = trial.suggest_int('n_estimators', 50, 300)
    depth_limit = trial.suggest_int('max_depth', 2, 32)

    # Build the classifier with a fixed random_state
    clf = RandomForestClassifier(n_estimators=n_trees, max_depth=depth_limit, random_state=42)

    # Cross-validate on the notebook-level training data and average the fold accuracies
    return cross_val_score(clf, X_train, y_train, cv=3, scoring='accuracy').mean()

In [19]:
# Seed the random sampler so the 50 sampled hyperparameter combinations (and the
# resulting best trial) are the same every time the notebook is re-run.
study = optuna.create_study(
    direction='maximize',  # we aim to maximize accuracy
    sampler=optuna.samplers.RandomSampler(seed=42),
)

# Run 50 trials to find the best hyperparameters.
study.optimize(objective, n_trials=50)

[I 2025-09-17 10:38:40,155] A new study created in memory with name: no-name-d5fed2b0-ff58-491f-838b-67fca1db2a8e
[I 2025-09-17 10:38:40,743] Trial 0 finished with value: 0.7686991869918699 and parameters: {'n_estimators': 222, 'max_depth': 6}. Best is trial 0 with value: 0.7686991869918699.
[I 2025-09-17 10:38:41,259] Trial 1 finished with value: 0.7768611509644509 and parameters: {'n_estimators': 178, 'max_depth': 10}. Best is trial 1 with value: 0.7768611509644509.
[I 2025-09-17 10:38:42,048] Trial 2 finished with value: 0.7752351347042882 and parameters: {'n_estimators': 269, 'max_depth': 31}. Best is trial 1 with value: 0.7768611509644509.
[I 2025-09-17 10:38:42,640] Trial 3 finished with value: 0.7768611509644509 and parameters: {'n_estimators': 208, 'max_depth': 17}. Best is trial 1 with value: 0.7768611509644509.
[I 2025-09-17 10:38:42,841] Trial 4 finished with value: 0.744285031085605 and parameters: {'n_estimators': 84, 'max_depth': 3}. Best is trial 1 with value: 0.77686115

In [20]:
# Best hyperparameters found by the current study and the objective value they achieved.
print('Best trial:', study.best_params)
print('Best value:', study.best_value)

Best trial: {'n_estimators': 142, 'max_depth': 31}
Best value: 0.7850151442690897


In [21]:
from sklearn.metrics import accuracy_score

# Rebuild the forest from the best trial's hyperparameters and fit it on the training split.
best_model = RandomForestClassifier(random_state=42, **study.best_trial.params).fit(X_train, y_train)

# Predict the held-out rows and measure the fraction classified correctly.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.77


In [22]:
# Candidate values per hyperparameter for the grid search (4 x 4 combinations).
search_space = dict(
    n_estimators=[50, 100, 150, 200],
    max_depth=[5, 10, 15, 20],
)

In [23]:
# Create a study that walks the grid defined in `search_space`.
# GridSampler shuffles the order in which grid points are visited; fixing the seed
# keeps the trial order (and which of several tied trials is reported as best)
# the same on every run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.GridSampler(search_space, seed=42),
)

# No n_trials is given: the grid sampler stops the study once every combination
# in the search space has been evaluated.
study.optimize(objective)

[I 2025-09-17 10:39:04,013] A new study created in memory with name: no-name-5561a44f-3bae-4a3d-867f-897460742b81
[I 2025-09-17 10:39:04,263] Trial 0 finished with value: 0.7621791806153356 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 0 with value: 0.7621791806153356.
[I 2025-09-17 10:39:04,687] Trial 1 finished with value: 0.7801052128168341 and parameters: {'n_estimators': 150, 'max_depth': 10}. Best is trial 1 with value: 0.7801052128168341.
[I 2025-09-17 10:39:04,837] Trial 2 finished with value: 0.78012912482066 and parameters: {'n_estimators': 50, 'max_depth': 15}. Best is trial 2 with value: 0.78012912482066.
[I 2025-09-17 10:39:05,127] Trial 3 finished with value: 0.7833811573409851 and parameters: {'n_estimators': 100, 'max_depth': 15}. Best is trial 3 with value: 0.7833811573409851.
[I 2025-09-17 10:39:05,419] Trial 4 finished with value: 0.7833811573409851 and parameters: {'n_estimators': 100, 'max_depth': 20}. Best is trial 3 with value: 0.7833811573

In [24]:
from sklearn.metrics import accuracy_score

# Take the current study's best hyperparameters for the final forest.
best_params = study.best_trial.params
best_model = RandomForestClassifier(**best_params, random_state=42)
best_model.fit(X_train, y_train)

# Accuracy on the held-out test split, reported to two decimals.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.76


In [25]:
# %pip install plotly

In [26]:
# %pip install "optuna[visualization]"

Optuna Visualization

In [27]:
import plotly

In [28]:
# For visualizations
from optuna.visualization import plot_optimization_history, plot_parallel_coordinate, plot_slice, plot_contour, plot_param_importances

In [29]:
# 1. Optimization History Plot
history_fig = plot_optimization_history(study)
history_fig.show()

In [30]:
# 2. Parallel Coordinate Plot
parallel_fig = plot_parallel_coordinate(study)
parallel_fig.show()

In [31]:
# 3. Slice Plot
slice_fig = plot_slice(study)
slice_fig.show()

In [32]:
# 4. Contour Plot
contour_fig = plot_contour(study)
contour_fig.show()

In [33]:
# 5. Parameter Importance Plot
importance_fig = plot_param_importances(study)
importance_fig.show()

In [35]:
# 6. Hyperparameter importances drawn as a custom bar chart
from optuna.importance import get_param_importances
import plotly.express as px

# Mapping of hyperparameter name -> importance score for the current study
importances = get_param_importances(study)
param_names = list(importances.keys())
param_scores = list(importances.values())

fig = px.bar(
    x=param_names,
    y=param_scores,
    labels={'x': 'Hyperparameter', 'y': 'Importance'},
    title='Hyperparameter Importances',
)
fig.show()