<a href="https://colab.research.google.com/github/Satish-Kumar-1/Personal-Cancer-Diagnosis/blob/main/optuna_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip -q install optuna

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/364.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m276.5/364.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import optuna

import numpy as np
import pandas as pd

from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
columns = ['pregnancies', 'glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi',
           'diabetespedigreefunction', 'age', 'outcome']

In [4]:
data = pd.read_csv(url, names = columns)

In [5]:
data.sample(3)

Unnamed: 0,pregnancies,glucose,bloodpressure,skinthickness,insulin,bmi,diabetespedigreefunction,age,outcome
523,9,130,70,0,0,34.2,0.652,45,1
627,0,132,78,0,0,32.4,0.393,21,0
215,12,151,70,40,271,41.8,0.742,38,1


In [6]:
#check null values
#We convert 0 to np.nan then handle missing values

def handle_missing_values(data, columns):
    if 0 in data.columns:
        data[columns] = data[compile].replace(0, np.nan)
        data[columns] = data[columns].fillna(data[columns].mean())

    else:
        print(f"{data[columns]} has no missing value ")

    return data[columns]




In [7]:
handle_missing_values(data, 'glucose')

0      148
1       85
2      183
3       89
4      137
      ... 
763    101
764    122
765    121
766    126
767     93
Name: glucose, Length: 768, dtype: int64 has no missing value 


Unnamed: 0,glucose
0,148
1,85
2,183
3,89
4,137
...,...
763,101
764,122
765,121
766,126


In [8]:
data.glucose.value_counts()

Unnamed: 0_level_0,count
glucose,Unnamed: 1_level_1
99,17
100,17
111,14
129,14
125,14
...,...
191,1
177,1
44,1
62,1


In [9]:
columns = ['glucose', 'bloodpressure', 'skinthickness', 'insulin', 'bmi']

for col in data[columns]:
    data[col] = handle_missing_values(data, col)

0      148
1       85
2      183
3       89
4      137
      ... 
763    101
764    122
765    121
766    126
767     93
Name: glucose, Length: 768, dtype: int64 has no missing value 
0      72
1      66
2      64
3      66
4      40
       ..
763    76
764    70
765    72
766    60
767    70
Name: bloodpressure, Length: 768, dtype: int64 has no missing value 
0      35
1      29
2       0
3      23
4      35
       ..
763    48
764    27
765    23
766     0
767    31
Name: skinthickness, Length: 768, dtype: int64 has no missing value 
0        0
1        0
2        0
3       94
4      168
      ... 
763    180
764      0
765    112
766      0
767      0
Name: insulin, Length: 768, dtype: int64 has no missing value 
0      33.6
1      26.6
2      23.3
3      28.1
4      43.1
       ... 
763    32.9
764    36.8
765    26.2
766    30.1
767    30.4
Name: bmi, Length: 768, dtype: float64 has no missing value 


In [10]:
print(data.isnull().sum())

pregnancies                 0
glucose                     0
bloodpressure               0
skinthickness               0
insulin                     0
bmi                         0
diabetespedigreefunction    0
age                         0
outcome                     0
dtype: int64


In [11]:
x = data.drop(columns = ['outcome'])
y = data.outcome

In [12]:
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state =42, test_size = 0.2)

In [13]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [14]:
print(f"Training set shape: {x_train.shape}")
print(f"Test set shape: {x_test.shape}")

Training set shape: (614, 8)
Test set shape: (154, 8)


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

In [16]:
# define objective function
def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 100)
    max_depth = trial.suggest_int('max_depth', 3, 20)

    model = RandomForestClassifier(
        n_estimators = n_estimators,
        max_depth = max_depth,
        random_state = 42
    )

    score = cross_val_score(model, x_train, y_train, scoring= 'accuracy').mean()

    return score

In [17]:
study = optuna.create_study(direction = 'maximize', sampler = optuna.samplers.TPESampler())
study.optimize(objective, n_trials = 50)

[I 2024-12-20 13:56:33,166] A new study created in memory with name: no-name-66624c74-5c1f-4397-b3bd-58be4de3cc09
[I 2024-12-20 13:56:34,453] Trial 0 finished with value: 0.7719978675196588 and parameters: {'n_estimators': 78, 'max_depth': 10}. Best is trial 0 with value: 0.7719978675196588.
[I 2024-12-20 13:56:36,438] Trial 1 finished with value: 0.7541116886578703 and parameters: {'n_estimators': 67, 'max_depth': 3}. Best is trial 0 with value: 0.7719978675196588.
[I 2024-12-20 13:56:40,293] Trial 2 finished with value: 0.7785285885645742 and parameters: {'n_estimators': 98, 'max_depth': 7}. Best is trial 2 with value: 0.7785285885645742.
[I 2024-12-20 13:56:42,701] Trial 3 finished with value: 0.7703851792616286 and parameters: {'n_estimators': 98, 'max_depth': 10}. Best is trial 2 with value: 0.7785285885645742.
[I 2024-12-20 13:56:45,745] Trial 4 finished with value: 0.7736771957883514 and parameters: {'n_estimators': 95, 'max_depth': 18}. Best is trial 2 with value: 0.77852858856

In [18]:
print(f"Best trial accuracy: {study.best_trial.value}")
print(f"Best hyperparameters: {study.best_trial.params}")

Best trial accuracy: 0.7915767026522724
Best hyperparameters: {'n_estimators': 58, 'max_depth': 13}


In [19]:
from sklearn.metrics import accuracy_score

best_model = RandomForestClassifier(**study.best_trial.params, random_state = 42)

best_model.fit(x_train, y_train)

y_pred = best_model.predict(x_test)

test_accuracy = accuracy_score(y_test, y_test)

print(f"Test accuracy with best parameters: {test_accuracy*100:.2f}")


Test accuracy with best parameters: 100.00


In [22]:
# Optimize multiple ML models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC


In [33]:
# define objective function

def objective(trial):
    classifier_name = trial.suggest_categorical('classifier', ['SVM', 'RandomForest', 'GradientBoostingClassifier'])

    if classifier_name == 'SVM':
        c = trial.suggest_float('C', 0.1, 100, log = True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(C =c,
                    kernel = kernel,
                    gamma = gamma,
                    random_state = 42
        )

    elif classifier_name == 'RandomForest':
       n_estimators = trial.suggest_int('n_estimators', 50, 300)
    #    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log = True)
       max_depth = trial.suggest_int('max_depth', 3, 20)
       min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
       min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
       bootstrap = trial.suggest_categorical('bootstrap', [True, False])

       model = RandomForestClassifier(
           n_estimators = n_estimators,
        #    learning_rate = learning_rate,
           max_depth = max_depth,
           min_samples_leaf = min_samples_leaf,
           min_samples_split = min_samples_split,
           bootstrap = bootstrap,
           random_state =42

       )

    elif classifier_name == 'GradientBoosting':
       n_estimators = trial.suggest_int('n_estimators', 50, 300)
       learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log = True)
       max_depth = trial.suggest_int('max_depth', 3, 20)
       min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
       min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

       model = GradientBoostingClassifier(
           n_estimators = n_estimators,
           learning_rate = learning_rate,
           max_depth = max_depth,
           min_samples_leaf = min_samples_leaf,
           min_samples_split = min_samples_split,
           random_state =42

       )

       score = cross_val_score(model, x_train, y_train, cv = 3, scoring = 'accuracy').mean()
       return score

In [37]:
study = optuna.create_study(direction = 'maximize')
study.optimize(objective, n_trials= 50)

[I 2024-12-20 14:21:22,900] A new study created in memory with name: no-name-46e49cf9-a758-4307-9898-cf32a7824750
[I 2024-12-20 14:21:24,038] Trial 0 finished with value: 0.7556910569105691 and parameters: {'classifier': 'RandomForest', 'n_estimators': 200, 'max_depth': 4, 'min_samples_split': 4, 'min_samples_leaf': 9, 'bootstrap': True}. Best is trial 0 with value: 0.7556910569105691.
[I 2024-12-20 14:21:29,122] Trial 1 finished with value: 0.7540730113183485 and parameters: {'classifier': 'GradientBoosting', 'n_estimators': 278, 'learning_rate': 0.1194273292916394, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 0 with value: 0.7556910569105691.
[I 2024-12-20 14:21:29,245] Trial 2 finished with value: 0.7149689143950263 and parameters: {'classifier': 'SVM', 'C': 87.00843908561433, 'kernel': 'rbf', 'gamma': 'scale'}. Best is trial 0 with value: 0.7556910569105691.
[I 2024-12-20 14:21:30,396] Trial 3 finished with value: 0.7800892714809501 and parameters:

In [36]:
def objective(trial):
    # Choose the algorithm to tune
    classifier_name = trial.suggest_categorical('classifier', ['SVM', 'RandomForest', 'GradientBoosting'])

    if classifier_name == 'SVM':
        # SVM hyperparameters
        c = trial.suggest_float('C', 0.1, 100, log=True)
        kernel = trial.suggest_categorical('kernel', ['linear', 'rbf', 'poly', 'sigmoid'])
        gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])

        model = SVC(C=c, kernel=kernel, gamma=gamma, random_state=42)

    elif classifier_name == 'RandomForest':
        # Random Forest hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)
        bootstrap = trial.suggest_categorical('bootstrap', [True, False])

        model = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            bootstrap=bootstrap,
            random_state=42
        )

    elif classifier_name == 'GradientBoosting':
        # Gradient Boosting hyperparameters
        n_estimators = trial.suggest_int('n_estimators', 50, 300)
        learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
        max_depth = trial.suggest_int('max_depth', 3, 20)
        min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 10)

        model = GradientBoostingClassifier(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=42
        )

    # Perform cross-validation and return the mean accuracy
    score = cross_val_score(model, x_train, y_train, cv=3, scoring='accuracy').mean()
    return score

In [40]:
study.best_trial.params

{'classifier': 'RandomForest',
 'n_estimators': 79,
 'max_depth': 10,
 'min_samples_split': 4,
 'min_samples_leaf': 3,
 'bootstrap': False}

In [41]:
study.best_trial.value

0.7882193527817631

In [42]:
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_C,params_bootstrap,params_classifier,params_gamma,params_kernel,params_learning_rate,params_max_depth,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.755691,2024-12-20 14:21:22.903250,2024-12-20 14:21:24.037953,0 days 00:00:01.134703,,True,RandomForest,,,,4.0,9.0,4.0,200.0,COMPLETE
1,1,0.754073,2024-12-20 14:21:24.039857,2024-12-20 14:21:29.121668,0 days 00:00:05.081811,,,GradientBoosting,,,0.119427,10.0,6.0,5.0,278.0,COMPLETE
2,2,0.714969,2024-12-20 14:21:29.124777,2024-12-20 14:21:29.244820,0 days 00:00:00.120043,87.008439,,SVM,scale,rbf,,,,,,COMPLETE
3,3,0.780089,2024-12-20 14:21:29.247849,2024-12-20 14:21:30.396398,0 days 00:00:01.148549,,False,RandomForest,,,,15.0,10.0,2.0,170.0,COMPLETE
4,4,0.757285,2024-12-20 14:21:30.398603,2024-12-20 14:21:30.441363,0 days 00:00:00.042760,0.192517,,SVM,auto,rbf,,,,,,COMPLETE
5,5,0.768723,2024-12-20 14:21:30.443152,2024-12-20 14:21:32.366697,0 days 00:00:01.923545,,,GradientBoosting,,,0.275757,11.0,3.0,8.0,99.0,COMPLETE
6,6,0.786609,2024-12-20 14:21:32.368728,2024-12-20 14:21:32.907486,0 days 00:00:00.538758,,False,RandomForest,,,,14.0,3.0,9.0,98.0,COMPLETE
7,7,0.767081,2024-12-20 14:21:32.909339,2024-12-20 14:21:34.358854,0 days 00:00:01.449515,,True,RandomForest,,,,18.0,8.0,8.0,251.0,COMPLETE
8,8,0.762195,2024-12-20 14:21:34.361150,2024-12-20 14:21:37.531845,0 days 00:00:03.170695,,,GradientBoosting,,,0.230855,18.0,7.0,7.0,220.0,COMPLETE
9,9,0.742651,2024-12-20 14:21:37.533717,2024-12-20 14:21:39.476035,0 days 00:00:01.942318,,,GradientBoosting,,,0.033327,15.0,2.0,3.0,64.0,COMPLETE
