In [None]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from skopt import gp_minimize
from skopt.space import Integer, Categorical
from skopt.utils import use_named_args
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Load the dataset
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = [
    'Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness',
    'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'
]
df = pd.read_csv(url, header=None, names=columns)
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Data Preprocessing
missing_columns = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
for column in missing_columns:
    df[column] = df[column].replace(0, np.nan)
df.sample(5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
610,3,106.0,54.0,21.0,158.0,30.9,0.292,24,0
328,2,102.0,86.0,36.0,120.0,45.5,0.127,23,1
418,1,83.0,68.0,,,18.2,0.624,27,0
133,8,84.0,74.0,31.0,,38.3,0.457,39,0
374,2,122.0,52.0,43.0,158.0,36.2,0.816,28,0


In [8]:
df.fillna(df.median(), inplace=True)
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# Define hyperparameter space
param_space = [
    Integer(50, 300, name='n_estimators'), # from 50 to 300 inclusive
    Integer(2, 20, name='max_depth'),
    Integer(2, 20, name='min_samples_split'),
    Integer(1, 20, name='min_samples_leaf'),
    Categorical(['sqrt', 'log2', None], name='max_features'),
    Categorical([True, False], name='bootstrap') # Boolean also considered into categorical
]

In [11]:
# Define the objective function
@use_named_args(param_space)
def objective(**params):
    model = RandomForestClassifier(
        random_state=42,
        n_jobs=-1,
        **params
    )
    cv_scores = cross_val_score(
        model, X_train_full, y_train_full, cv=5, scoring='accuracy', n_jobs=-1
    )
    score = -np.mean(cv_scores)
    return score

In [12]:
# Run the optimizer
res = gp_minimize(
    func=objective,
    dimensions=param_space,
    n_calls=10, # Loop of acquization and surrogate function
    verbose=True
)

Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 4.2715
Function value obtained: -0.7589
Current minimum: -0.7589
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 2.6360
Function value obtained: -0.7655
Current minimum: -0.7655
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.5533
Function value obtained: -0.7801
Current minimum: -0.7801
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.2357
Function value obtained: -0.7687
Current minimum: -0.7801
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.8055
Function value obtained: -0.7704
Current minimum: -0.7801
Iteration No: 6 started. 

In [13]:
res

          fun: -0.7801012928162068
            x: [np.int64(171), np.int64(20), np.int64(15), np.int64(14), 'sqrt', True]
    func_vals: [-7.589e-01 -7.655e-01 -7.801e-01 -7.687e-01 -7.704e-01
                -7.525e-01 -7.606e-01 -7.687e-01 -7.736e-01 -7.720e-01]
      x_iters: [[np.int64(241), np.int64(7), np.int64(13), np.int64(10), 'sqrt', False], [np.int64(264), np.int64(13), np.int64(8), np.int64(6), 'log2', False], [np.int64(171), np.int64(20), np.int64(15), np.int64(14), 'sqrt', True], [np.int64(62), np.int64(8), np.int64(8), np.int64(3), 'sqrt', True], [np.int64(235), np.int64(14), np.int64(5), np.int64(6), 'sqrt', True], [np.int64(166), np.int64(3), np.int64(3), np.int64(12), 'log2', False], [np.int64(259), np.int64(18), np.int64(15), np.int64(15), 'log2', True], [np.int64(264), np.int64(15), np.int64(16), np.int64(17), 'sqrt', True], [np.int64(96), np.int64(11), np.int64(15), np.int64(4), 'log2', True], [np.int64(135), np.int64(8), np.int64(7), np.int64(6), 'sqrt', True]]
  

In [14]:
# Get the best hyperparameters
best_params = {
    'n_estimators': res.x[0],
    'max_depth': res.x[1],
    'min_samples_split': res.x[2],
    'min_samples_leaf': res.x[3],
    'max_features': res.x[4],
    'bootstrap': res.x[5]
}

print("Best Hyperparameters:")
for param, value in best_params.items():
    print(f"{param}: {value}")

Best Hyperparameters:
n_estimators: 171
max_depth: 20
min_samples_split: 15
min_samples_leaf: 14
max_features: sqrt
bootstrap: True


In [15]:
# Evaluate the tuned model
best_model = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    **best_params
)
best_model.fit(X_train_full, y_train_full)
y_pred = best_model.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print(f"\nTest Accuracy: {acc:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Test Accuracy: 0.7403

Classification Report:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81       100
           1       0.68      0.50      0.57        54

    accuracy                           0.74       154
   macro avg       0.72      0.69      0.69       154
weighted avg       0.73      0.74      0.73       154

Confusion Matrix:
[[87 13]
 [27 27]]
