In [12]:
import torch
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Pima Indian Diabetes dataset from sklearn
# Note: Scikit-learn's built-in 'load_diabetes' is a regression dataset.
# We will load the actual diabetes dataset from an external source
import pandas as pd
# Load the Pima Indian Diabetes dataset (from UCI repository)
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI',
           'DiabetesPedigreeFunction', 'Age', 'Outcome']
dataset = pd.read_csv(url, names=columns)
dataset.head()



Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
import numpy as np 

# Replace zero values with NaN in columns where zero is not a valid value
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
dataset[cols_with_missing_vals] = dataset[cols_with_missing_vals].replace(0, np.nan)

# Impute missing values with the median of each column
dataset[cols_with_missing_vals] = dataset[cols_with_missing_vals].fillna(dataset[cols_with_missing_vals].median())

dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [19]:
X=dataset.drop('Outcome', axis=1)
y=dataset['Outcome']

X

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33
...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.0,180.0,32.9,0.171,63
764,2,122.0,70.0,27.0,125.0,36.8,0.340,27
765,5,121.0,72.0,23.0,112.0,26.2,0.245,30
766,1,126.0,60.0,29.0,125.0,30.1,0.349,47


In [22]:
X_train ,X_test ,y_train ,y_test = train_test_split(X,y,test_size=0.2,random_state=42)

#Optional Scale the data for better model performance


scaler=StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train

array([[-0.52639686, -1.25688146, -0.01899526, ..., -0.00745016,
        -0.49073479, -1.03594038],
       [ 1.58804586, -0.32605067,  0.8081742 , ..., -0.59909194,
         2.41502991,  1.48710085],
       [-0.82846011,  0.57153617, -2.16963585, ..., -0.52694051,
         0.54916055, -0.94893896],
       ...,
       [ 1.8901091 , -0.69173419,  1.13904198, ...,  1.91177805,
         1.981245  ,  0.44308379],
       [-1.13052335,  0.63802409, -0.01899526, ...,  1.45000885,
        -0.78487662, -0.33992901],
       [-1.13052335,  0.10612077,  1.96621144, ..., -1.42161832,
        -0.61552223, -1.03594038]])

In [23]:
X_train.shape

(614, 8)

In [24]:
X_test.shape

(154, 8)

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score



#Define the objective function

def objective(trial):
    #suggest hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 50, 200)
    max_depth = trial.suggest_int("max_depth", 3, 20)

    #create the Random Forest classifier
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)

    #evaluate the model using cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=3,scoring="accuracy")
    return scores.mean()


In [26]:
#create a study object and optimize the function

study = optuna.create_study(direction="maximize",sampler=optuna.samplers.TPESampler())
#we aim to maximize the accuracy of the model

study.optimize(objective, n_trials=50)



[I 2025-09-14 06:26:38,365] A new study created in memory with name: no-name-83bcd2ab-a484-4ba6-9cd1-bbcbfab838fb
[I 2025-09-14 06:26:38,949] Trial 0 finished with value: 0.7784951378925554 and parameters: {'n_estimators': 185, 'max_depth': 13}. Best is trial 0 with value: 0.7784951378925554.
[I 2025-09-14 06:26:39,518] Trial 1 finished with value: 0.7752431053722302 and parameters: {'n_estimators': 199, 'max_depth': 13}. Best is trial 0 with value: 0.7784951378925554.
[I 2025-09-14 06:26:39,713] Trial 2 finished with value: 0.7785031085604973 and parameters: {'n_estimators': 67, 'max_depth': 19}. Best is trial 2 with value: 0.7785031085604973.
[I 2025-09-14 06:26:40,226] Trial 3 finished with value: 0.7833811573409851 and parameters: {'n_estimators': 173, 'max_depth': 12}. Best is trial 3 with value: 0.7833811573409851.
[I 2025-09-14 06:26:40,493] Trial 4 finished with value: 0.7703491152558585 and parameters: {'n_estimators': 92, 'max_depth': 6}. Best is trial 3 with value: 0.7833811

In [27]:

# Print the best result
print(f'Best trial accuracy: {study.best_trial.value}')
print(f'Best hyperparameters: {study.best_trial.params}')

Best trial accuracy: 0.7899011637175195
Best hyperparameters: {'n_estimators': 117, 'max_depth': 15}
