In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
df = pd.read_csv('C:\diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [4]:
import numpy as np
df['Glucose'] = np.where(df['Glucose']==0,df['Glucose'].median(),df['Glucose'])
df['Insulin'] = np.where(df['Insulin']==0,df['Insulin'].median(),df['Insulin'])
df['SkinThickness'] = np.where(df['SkinThickness']==0,df['SkinThickness'].median(),df['SkinThickness'])

df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72,35.0,30.5,33.6,0.627,50,1
1,1,85.0,66,29.0,30.5,26.6,0.351,31,0
2,8,183.0,64,23.0,30.5,23.3,0.672,32,1
3,1,89.0,66,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40,35.0,168.0,43.1,2.288,33,1


In [5]:
X = df.drop('Outcome', axis=1)
y = df['Outcome']

In [6]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72,35.0,30.5,33.6,0.627,50
1,1,85.0,66,29.0,30.5,26.6,0.351,31
2,8,183.0,64,23.0,30.5,23.3,0.672,32
3,1,89.0,66,23.0,94.0,28.1,0.167,21
4,0,137.0,40,35.0,168.0,43.1,2.288,33


In [7]:
y.head()

0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.20,random_state=0)

In [9]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10).fit(X_train,y_train)
prediction = classifier.predict(X_test)

In [10]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score
print(confusion_matrix(y_test,prediction))
print(accuracy_score(y_test,prediction))
print(classification_report(y_test,prediction))

[[94 13]
 [20 27]]
0.7857142857142857
              precision    recall  f1-score   support

           0       0.82      0.88      0.85       107
           1       0.68      0.57      0.62        47

    accuracy                           0.79       154
   macro avg       0.75      0.73      0.74       154
weighted avg       0.78      0.79      0.78       154



# Optuna optimization

1. Handle a wide variety of tasks with a simple installation that has few requirements.
2. Define search spaces using familiar Python syntax including conditionals and loops.
3. Adopt state-of-the-art algorithms for sampling hyper parameters and efficiently pruning unpromising trials.
4. Scale studies to tens or hundreds or workers with little or no changes to the code.
5. Inspect optimization histories from a variety of plotting functions.


In [12]:
import optuna
import sklearn.svm
def objective(trial):

    classifier = trial.suggest_categorical('classifier', ['RandomForest', 'SVC'])
    
    if classifier == 'RandomForest':
        n_estimators = trial.suggest_int('n_estimators', 300, 3000,10)
        max_depth = int(trial.suggest_float('max_depth', 10, 100, log=True))

        clf = sklearn.ensemble.RandomForestClassifier(
            n_estimators=n_estimators, max_depth=max_depth)
    else:
        c = trial.suggest_float('svc_c', 1e-10, 1e10, log=True)
        
        clf = sklearn.svm.SVC(C=c, gamma='auto')

    return sklearn.model_selection.cross_val_score(
        clf,X_train,y_train, n_jobs=-1, cv=3).mean()

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2020-10-11 10:55:09,367][0m A new study created in memory with name: no-name-4a3e4a7d-8b05-4839-a96c-1729af3c8a87[0m
[32m[I 2020-10-11 10:55:30,183][0m Trial 0 finished with value: 0.7540650406504065 and parameters: {'classifier': 'RandomForest', 'n_estimators': 2680, 'max_depth': 84.94990868401386}. Best is trial 0 with value: 0.7540650406504065.[0m
[32m[I 2020-10-11 10:55:35,845][0m Trial 1 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 0.0019138619595667132}. Best is trial 0 with value: 0.7540650406504065.[0m
[32m[I 2020-10-11 10:55:40,803][0m Trial 2 finished with value: 0.640068547744301 and parameters: {'classifier': 'SVC', 'svc_c': 3.082011414595752e-07}. Best is trial 0 with value: 0.7540650406504065.[0m
[32m[I 2020-10-11 10:55:49,309][0m Trial 3 finished with value: 0.7491790212019768 and parameters: {'classifier': 'RandomForest', 'n_estimators': 2130, 'max_depth': 11.014985357402937}. Best is trial 0 with value: 0.75

Accuracy: 0.755699027578511
Best hyperparameters: {'classifier': 'RandomForest', 'n_estimators': 2330, 'max_depth': 12.864237181394243}


In [14]:
trial


FrozenTrial(number=9, value=0.755699027578511, datetime_start=datetime.datetime(2020, 10, 11, 10, 55, 56, 800963), datetime_complete=datetime.datetime(2020, 10, 11, 10, 56, 5, 505622), params={'classifier': 'RandomForest', 'n_estimators': 2330, 'max_depth': 12.864237181394243}, distributions={'classifier': CategoricalDistribution(choices=('RandomForest', 'SVC')), 'n_estimators': IntUniformDistribution(high=3000, low=300, step=10), 'max_depth': LogUniformDistribution(high=100, low=10)}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=9, state=TrialState.COMPLETE)

In [15]:
study.best_params

{'classifier': 'RandomForest',
 'n_estimators': 2330,
 'max_depth': 12.864237181394243}

In [16]:
rf=RandomForestClassifier(n_estimators=2330,max_depth=12)
rf.fit(X_train,y_train)

RandomForestClassifier(max_depth=12, n_estimators=2330)

In [17]:
y_pred=rf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[95 12]
 [16 31]]
0.8181818181818182
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       107
           1       0.72      0.66      0.69        47

    accuracy                           0.82       154
   macro avg       0.79      0.77      0.78       154
weighted avg       0.81      0.82      0.82       154

