In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m386.6/386.6 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.1-py3-none-any.whl (242 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m242.5/242.5 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.16.1 colorlog-6.9.0 optuna-4.3.0


In [2]:
import optuna
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
# Location of the raw CSV holding the Pima Indians diabetes data.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"

# Column labels are supplied explicitly through `names=` when reading the file.
columns = [
    'Pregnancies',
    'Glucose',
    'BloodPressure',
    'SkinThickness',
    'Insulin',
    'BMI',
    'DiabetesPedigreeFunction',
    'Age',
    'Outcome',
]

# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(url, names=columns)

# Preview the first rows as the cell output.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
import numpy as np

# Columns in which a literal 0 is treated as a missing-value placeholder.
cols_with_missing_vals = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Mark those placeholder zeros as NaN so they can be imputed.
df[cols_with_missing_vals] = df[cols_with_missing_vals].replace(0, np.nan)

# Fill every NaN with its column median. Plain reassignment is used instead of
# `inplace=True` so the step reads as an explicit "new frame from old frame".
# NOTE(review): the medians are computed over the whole frame, i.e. before the
# train/test split performed later, so test rows influence the imputed values.
# Consider moving imputation into a step that is fitted on the training rows only.
df = df.fillna(df.median())

# Sanity check: every column should now report zero nulls.
print(df.isnull().sum())


Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [6]:
# The 'Outcome' column is the target; every remaining column is a feature.
y = df['Outcome']
X = df.drop(columns='Outcome')

# Hold out 30% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training rows only and
# the same fitted scaler is then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Report the shape of each split.
print(f'Training set shape: {X_train.shape}')
print(f'Test set shape: {X_test.shape}')


Training set shape: (537, 8)
Test set shape: (231, 8)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Score one random-forest configuration for an Optuna trial.

    Asks ``trial`` for ``n_estimators`` (50-200) and then ``max_depth``
    (3-20), builds a ``RandomForestClassifier`` with ``random_state=42`` and
    returns the mean of its 5-fold cross-validated accuracy on
    ``X_train`` / ``y_train``.
    """
    # Keyword arguments are evaluated left to right, so the two suggestions
    # are requested in the order written here.
    forest = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 200),
        max_depth=trial.suggest_int('max_depth', 3, 20),
        random_state=42,
    )
    fold_scores = cross_val_score(
        forest, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()

In [8]:
# Seed the TPE sampler so the sequence of suggested hyperparameters, and
# therefore the reported best trial, is the same on every fresh run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

[I 2025-06-12 11:35:39,158] A new study created in memory with name: no-name-993dcdfc-2683-41f1-b3b7-fedbbb1ea2d7
[I 2025-06-12 11:35:47,208] Trial 0 finished with value: 0.754153686396677 and parameters: {'n_estimators': 98, 'max_depth': 4}. Best is trial 0 with value: 0.754153686396677.
[I 2025-06-12 11:35:48,115] Trial 1 finished with value: 0.7652647975077882 and parameters: {'n_estimators': 96, 'max_depth': 11}. Best is trial 1 with value: 0.7652647975077882.
[I 2025-06-12 11:35:48,930] Trial 2 finished with value: 0.7634302526825891 and parameters: {'n_estimators': 97, 'max_depth': 7}. Best is trial 1 with value: 0.7652647975077882.
[I 2025-06-12 11:35:49,606] Trial 3 finished with value: 0.7578573901003807 and parameters: {'n_estimators': 77, 'max_depth': 18}. Best is trial 1 with value: 0.7652647975077882.
[I 2025-06-12 11:35:51,079] Trial 4 finished with value: 0.7522672204915196 and parameters: {'n_estimators': 144, 'max_depth': 3}. Best is trial 1 with value: 0.7652647975077

In [9]:
# Summarize the winning trial: its objective value, then its hyperparameters.
print(f'Best trial accuracy: {study.best_value}')
print(f'Best trial parameters: {study.best_params}')

Best trial accuracy: 0.7746278989269643
Best trial parameters: {'n_estimators': 181, 'max_depth': 9}


In [10]:
from sklearn.metrics import accuracy_score
# Rebuild the forest from the best trial's hyperparameters, with the same
# random_state that was used inside the objective, and fit it on the training split.
best_model = RandomForestClassifier(random_state=42, **study.best_params)
best_model.fit(X_train, y_train)

# Predict on the held-out test split and measure accuracy there.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print(f'Test Accuracy with best hyperparameters: {test_accuracy:.2f}')


Test Accuracy with best hyperparameters: 0.76


In [11]:
from optuna.visualization import plot_optimization_history,plot_parallel_coordinate,plot_slice,plot_contour,plot_param_importances

In [12]:
# Optimization-history figure for the current study; shown as the cell's output.
plot_optimization_history(study)

In [13]:
# Parallel-coordinate figure for the current study; shown as the cell's output.
plot_parallel_coordinate(study)

In [14]:
# Slice figure for the current study, displayed explicitly via .show().
plot_slice(study).show()

In [15]:
# Contour figure for the current study, displayed explicitly via .show().
plot_contour(study).show()

In [16]:
# Parameter-importance figure for the current study, displayed explicitly via .show().
plot_param_importances(study).show()

In [17]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [18]:
# Objective for the multi-model Optuna study
def objective(trial):
    """Score one sampled classifier configuration for an Optuna trial.

    The trial first picks a model family ('SVM', 'RandomForest' or
    'GradientBoosting'); only that family's hyperparameters are then sampled.
    Keyword arguments are evaluated left to right, so the ``trial.suggest_*``
    calls below run in the order they are written.

    Returns the mean 3-fold cross-validated accuracy of the resulting model
    on ``X_train`` / ``y_train``.
    """
    family = trial.suggest_categorical('classifier', ['SVM', 'RandomForest', 'GradientBoosting'])

    if family == 'SVM':
        # Support vector machine: regularization strength, kernel and gamma mode.
        model = SVC(
            C=trial.suggest_float('C', 0.1, 100, log=True),
            kernel=trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid']),
            gamma=trial.suggest_categorical('gamma', ['scale', 'auto']),
            random_state=42,
        )
    elif family == 'RandomForest':
        # Random forest: ensemble size, tree shape limits and bootstrapping.
        model = RandomForestClassifier(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            max_depth=trial.suggest_int('max_depth', 3, 20),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
            random_state=42,
            n_jobs=-1,
        )
    elif family == 'GradientBoosting':
        # Gradient boosting: ensemble size, learning rate and tree shape limits.
        model = GradientBoostingClassifier(
            n_estimators=trial.suggest_int('n_estimators', 50, 300),
            learning_rate=trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
            max_depth=trial.suggest_int('max_depth', 3, 20),
            min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
            min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 10),
            random_state=42,
        )

    # Mean accuracy across the cross-validation folds is the value to maximize.
    fold_scores = cross_val_score(model, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1)
    return fold_scores.mean()

In [19]:
# Pass an explicitly seeded TPE sampler so the sequence of sampled
# configurations, and therefore the reported best trial, is the same on
# every fresh run of the notebook.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)

[I 2025-06-12 12:06:07,489] A new study created in memory with name: no-name-31b5ce0d-07cb-4e08-93e5-021a809bd9a1
[I 2025-06-12 12:06:12,092] Trial 0 finished with value: 0.7057728119180634 and parameters: {'classifier': 'SVM', 'C': 20.9337216417056, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: 0.7057728119180634.
[I 2025-06-12 12:06:12,373] Trial 1 finished with value: 0.6722532588454376 and parameters: {'classifier': 'SVM', 'C': 89.50596958774604, 'kernel': 'poly', 'gamma': 'scale'}. Best is trial 0 with value: 0.7057728119180634.
[I 2025-06-12 12:06:15,204] Trial 2 finished with value: 0.7541899441340782 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 273, 'learning_rate': 0.047133052106739896, 'max_depth': 7, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.7541899441340782.
[I 2025-06-12 12:06:15,253] Trial 3 finished with value: 0.7635009310986964 and parameters: {'classifier': 'SVM', 'C': 6.130004694316687, 'ker

In [20]:
# Summarize the winning trial: its objective value, then the chosen
# classifier together with its hyperparameters.
print(f'Best trial accuracy: {study.best_value}')
print(f'Best trial parameters: {study.best_params}')

Best trial accuracy: 0.7895716945996275
Best trial parameters: {'classifier': 'SVM', 'C': 0.11589874809297444, 'kernel': 'linear', 'gamma': 'scale'}


In [21]:
! pip install optuna-integration[xgboost]

Collecting optuna-integration[xgboost]
  Downloading optuna_integration-4.3.0-py3-none-any.whl.metadata (12 kB)
Downloading optuna_integration-4.3.0-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: optuna-integration
Successfully installed optuna-integration-4.3.0
