# Lecture 7

diabetes.csv dataset

Pima Indians Diabetes Database
- The datasets consist of several medical predictor (independent) variables and one target (dependent) variable, Outcome.
- Independent variables include the number of pregnancies the patient has had, their BMI, insulin level, age, and so on.

In [4]:
!pip install scikeras

import pandas as pd
import numpy as np
import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout # dense is a fully connected layer

from scikeras.wrappers import KerasClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import  LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, roc_curve
from sklearn.model_selection import train_test_split,  StratifiedKFold, GridSearchCV



In [5]:
data = pd.read_csv('diabetes.csv')
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
data.isnull().sum()

Unnamed: 0,0
Pregnancies,0
Glucose,0
BloodPressure,0
SkinThickness,0
Insulin,0
BMI,0
DiabetesPedigreeFunction,0
Age,0
Outcome,0


In [7]:
data.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [8]:
y = data['Outcome']
print(f'Number of 0: {(y == 0).sum()}')
print(f'Number of 1: {(y == 1).sum()}')
data.drop(['Outcome'], axis=1, inplace=True)
colnames = data.columns.values.tolist()
colnames

Number of 0: 500
Number of 1: 268


['Pregnancies',
 'Glucose',
 'BloodPressure',
 'SkinThickness',
 'Insulin',
 'BMI',
 'DiabetesPedigreeFunction',
 'Age']

In [9]:
sscaler = StandardScaler()
mmscaler = MinMaxScaler()
diabet_tf = ColumnTransformer(
    transformers=[
        ('other', sscaler, colnames[:-1]),
        ('Age', mmscaler, ['Age'])
    ],
    verbose_feature_names_out=False
)
diabet_tf.fit_transform(data)

array([[ 0.63994726,  0.84832379,  0.14964075, ...,  0.20401277,
         0.46849198,  0.48333333],
       [-0.84488505, -1.12339636, -0.16054575, ..., -0.68442195,
        -0.36506078,  0.16666667],
       [ 1.23388019,  1.94372388, -0.26394125, ..., -1.10325546,
         0.60439732,  0.18333333],
       ...,
       [ 0.3429808 ,  0.00330087,  0.14964075, ..., -0.73518964,
        -0.68519336,  0.15      ],
       [-0.84488505,  0.1597866 , -0.47073225, ..., -0.24020459,
        -0.37110101,  0.43333333],
       [-0.84488505, -0.8730192 ,  0.04624525, ..., -0.20212881,
        -0.47378505,  0.03333333]])

In [10]:
new_data = pd.DataFrame(diabet_tf.transform(data), columns=diabet_tf.get_feature_names_out())
new_data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.90727,-0.692891,0.204013,0.468492,0.483333
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,0.166667
2,1.23388,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,0.183333
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,0.0
4,-1.141852,0.504055,-1.504687,0.90727,0.765836,1.409746,5.484909,0.2


We now evaluate a Logistic Regression model trained on such data.

In [11]:
parameters={
    'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0],
    'penalty': ['l1', 'l2']
}

skf = StratifiedKFold(n_splits=3)
test_f1 = []
test_acc = []

for train_index, test_index in skf.split(data, y):
    x_train, y_train = data.iloc[train_index,:], y[train_index]
    x_test, y_test = data.iloc[test_index,:], y[test_index]
    diabet_tf.fit(x_train)
    x_train = pd.DataFrame(diabet_tf.transform(x_train), columns=diabet_tf.get_feature_names_out())
    x_test = pd.DataFrame(diabet_tf.transform(x_test), columns=diabet_tf.get_feature_names_out())
    cls = LogisticRegression(solver='liblinear')
    GS = GridSearchCV(estimator=cls,
                      param_grid=parameters,
                      scoring='f1',
                      refit=True,
                      cv=3,
                      verbose=0)
    result = GS.fit(x_train, y_train)

    print(f'Best score got by the best estimator: {result.best_score_}')
    print(f'Configuration for the best estimator/classifier: {result.best_params_}')

    best_model = result.best_estimator_
    prediction = best_model.predict(x_test)
    test_f1.append(f1_score(y_test, prediction))
    test_acc.append(accuracy_score(y_test, prediction))

print(f"Mean f1 test:{np.mean(test_f1)} +/-:{np.std(test_f1)}")
print(f"Mean Accuracy test:{np.mean(test_acc)} +/-:{np.std(test_acc)}")



Best score got by the best estimator: 0.6565275957477792
Configuration for the best estimator/classifier: {'C': 0.0001, 'penalty': 'l2'}
Best score got by the best estimator: 0.6566685531031881
Configuration for the best estimator/classifier: {'C': 0.001, 'penalty': 'l2'}
Best score got by the best estimator: 0.6331841010187976
Configuration for the best estimator/classifier: {'C': 0.1, 'penalty': 'l2'}
Mean f1 test:0.6359184352403596 +/-:0.018128061310017263
Mean Accuracy test:0.7669270833333334 +/-:0.021236336497786574


We now use a keras. model for model selection in scikit-learn.

Keras models can be used in scikit-learn by wrapping them with the KerasClassifier, KerasRegressor classes from module SciKeras.

We need to define a method (e.g., create_model) to create our model. Then we feed it as an argument model of KerasClassifier constructor. (KerasClassifier(model=create_model)

We define our create_model() method.

- Argument which does not have defalut value must be specified in the text KerasClassifier() call as individual argument
- Optionally, also in the grid for grid search
- Kernel_regularizer performs the weight regularization.

In [12]:
np.random.seed(42) # for reproducibility
tf.random.set_seed(43)
input_shape = data.shape[1]
print(f"input_shape:{input_shape}, num_classes:{len(np.unique(y))}")

input_shape:8, num_classes:2


In [13]:
def create_model(nhid1, nhid2, # number of hidden neurons in hidden layers 1, 2
                 learning_rate=10**-1, # learning rate to be used by the optimizer
                 loss='BinaryCrossentropy', # loss function
                 hid_act='relu', # activation function for hidden layers
                 out_act='sigmoid', # activation function for output layer
                 dropout_rate=0, # the rate of dropout to be used
                 weight_reg=None):
    model = Sequential()
    model.add(Dense(nhid1,
                    input_shape=(input_shape,),
                    activation=hid_act,
                    kernel_regularizer=weight_reg))
    # set dropout regularization
    model.add(Dropout(dropout_rate))
    model.add(Dense(nhid2,
                    activation=hid_act,
                    kernel_regularizer=weight_reg))
    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation=out_act))
    model.compile(loss=loss,
                  optimizer=tf.keras.optimizers.SGD(learning_rate=learning_rate),
                  metrics=['accuracy'])
    return model


### Instantiating the model

In [14]:
# wrapper class for scikit_learn API
model = KerasClassifier(model=create_model,
                      nhid1=50,
                      nhid2=50,
                      epochs=20)
model.fit(new_data, y) # learning the model on all data
prediction = model.predict(new_data)
print(f"Training accuracy: {accuracy_score(y, prediction)}")
print(f"Training F: {f1_score(y, prediction)}")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.5276 - loss: 0.6896
Epoch 2/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7367 - loss: 0.5556
Epoch 3/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7448 - loss: 0.5213
Epoch 4/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7579 - loss: 0.5130
Epoch 5/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7603 - loss: 0.5082
Epoch 6/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7668 - loss: 0.5035
Epoch 7/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7668 - loss: 0.4995
Epoch 8/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7698 - loss: 0.4959
Epoch 9/20
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

### The optimizer

So far we have used the stochastic gradient descent.

In [15]:
'''keras.optimizers.SGD(learning_rate=0.01,
                     momentum=0.0,
                     nesterov=False,
                     weight_decay=None,
                     clipnorm=None,
                     clipvalue=None,
                     global_clipnorm=None,
                     use_ema=False,
                     ema_momentum=0.99,
                     ema_overwrite_frequency=None,
                     name='SGD',
                     **kwargs)'''

"keras.optimizers.SGD(learning_rate=0.01,\n                     momentum=0.0,\n                     nesterov=False,\n                     weight_decay=None,\n                     clipnorm=None,\n                     clipvalue=None,\n                     global_clipnorm=None,\n                     use_ema=False,\n                     ema_momentum=0.99,\n                     ema_overwrite_frequency=None,\n                     name='SGD',\n                     **kwargs)"

- learning_rate: a float, a keras.optimizers.schedules.LearningRateSchedule instance, or acallable that takes no arguments and returns the actualvalue to use. The learnign rate. Deafult is 0.01
- momentum: float hyperparameter >= 0that acceleratesgradient desccent in the relevant direction and dampens oscillations. 0 is vanilla gradient descent. Default is 0.0
- nesterov: boolean. Whether to apply Nesterov momentum or not. Default is False.

### Model Selection (hyperparameter optimization) using scikit-learn

* We can leverage for instance the ```GridSearchCV```  method!

* We want to tune the number of hidden neurons in the hidden layers

 * However we need to specify their value when calling the KerasClassifier constructor, since they dont have a default value
* The *learning rate*
* The *weight regularizer*
* The *batch_size*
* The *Activation function*
* The dropout level  
* Pay attention to the dictionary definition
 * To tune model arguments, the dictionary name of that argument must start with **"model__"**
 * The arguments to Keras `fit` method, like batch size and weight regularization, do not need that prefix in the dictionary name

In [16]:
model = KerasClassifier(model=create_model,
                        nhid1=100,
                        nhid2=50,
                        epochs=15)
# define the grid search parameters for the number of neurons in the hidden layers
nhid1 = [75, 100]
nhid2 = [50, 75]
lr = [10**-4, 10**-3]
weight_reg = [None, 'l2', 'l1']
hid_act = ['relu', 'sigmoid']
batch_size = [16, 32]
dropout = [0, 0.2]
loss = ['BinaryCrossentropy', 'mse']

# dictionary names must start with model__, if it is an argument of the model
# followed by the model argument name

params_grid = dict(model__nhid1=nhid1,
                   model__nhid2=nhid2,
                   model__learning_rate=lr,
                   model__weight_reg=weight_reg,
                   model__hid_act=hid_act,
                   batch_size=batch_size,
                   model__dropout_rate=dropout,
                   model__loss=loss)

### Model selection in holdout setting

In [17]:
X_train, X_test, y_train, y_test = train_test_split(new_data, y,
                                                    test_size=0.1,
                                                    shuffle=True,
                                                    stratify=y)
X_train = pd.DataFrame(diabet_tf.transform(X_train),
                       columns=diabet_tf.get_feature_names_out())
X_test = pd.DataFrame(diabet_tf.transform(X_test),
                      columns=diabet_tf.get_feature_names_out())
GS = GridSearchCV(estimator=model,
                  param_grid=params_grid,
                  n_jobs=-1,
                  scoring='f1',
                  cv=3,
                  verbose=1)
grid_result = GS.fit(X_train, y_train)

# best result
print("Best: %f using %s" % (grid_result.best_score_,
                             grid_result.best_params_))

Fitting 3 folds for each of 384 candidates, totalling 1152 fits


  pid = os.fork()
  pid = os.fork()
  _data = np.array(data, dtype=dtype, copy=copy,
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.6463 - loss: 0.2467
Epoch 2/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6463 - loss: 0.2466
Epoch 3/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6463 - loss: 0.2464
Epoch 4/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6463 - loss: 0.2462
Epoch 5/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6463 - loss: 0.2460
Epoch 6/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6463 - loss: 0.2458
Epoch 7/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6463 - loss: 0.2456
Epoch 8/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6463 - loss: 0.2454
Epoch 9/15
[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

In [19]:
means = grid_result.cv_results_['mean_test_score']
params = grid_result.cv_results_['params']

# printing results for all combinations
for mean, param in zip(means, params):
    print(f"{mean} \t with: {param}")

0.18864468864468864 	 with: {'batch_size': 16, 'model__dropout_rate': 0, 'model__hid_act': 'relu', 'model__learning_rate': 0.0001, 'model__loss': 'BinaryCrossentropy', 'model__nhid1': 75, 'model__nhid2': 50, 'model__weight_reg': None}
0.05017921146953405 	 with: {'batch_size': 16, 'model__dropout_rate': 0, 'model__hid_act': 'relu', 'model__learning_rate': 0.0001, 'model__loss': 'BinaryCrossentropy', 'model__nhid1': 75, 'model__nhid2': 50, 'model__weight_reg': 'l2'}
0.38217273954116066 	 with: {'batch_size': 16, 'model__dropout_rate': 0, 'model__hid_act': 'relu', 'model__learning_rate': 0.0001, 'model__loss': 'BinaryCrossentropy', 'model__nhid1': 75, 'model__nhid2': 50, 'model__weight_reg': 'l1'}
0.43284593134668103 	 with: {'batch_size': 16, 'model__dropout_rate': 0, 'model__hid_act': 'relu', 'model__learning_rate': 0.0001, 'model__loss': 'BinaryCrossentropy', 'model__nhid1': 75, 'model__nhid2': 75, 'model__weight_reg': None}
0.17049180327868851 	 with: {'batch_size': 16, 'model__dropo

In [20]:
print('Holdout test performance')
model = GS.best_estimator_
y_test_predicted = model.predict(X_test)
print(f"test  F1:{f1_score(y_test, y_test_predicted)}")
print(f"test  Accuracy:{accuracy_score(y_test, y_test_predicted)}")

Holdout test performance
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step
test  F1:0.0
test  Accuracy:0.6493506493506493


### Model selection in CV