In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier 
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

np.random.seed(1)

In [3]:
import os
os.chdir('C:\\Users\\simra\\Downloads')

In [2]:
X_train = pd.read_csv("hotel_res_train_X.csv")
X_test = pd.read_csv("hotel_res_test_X.csv")
y_train = pd.read_csv("hotel_res_train_y.csv")
y_test = pd.read_csv("hotel_res_test_y.csv")

In [11]:
X_train.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,reserved_Room_Type 3,reserved_Room_Type 4,reserved_Room_Type 5,reserved_Room_Type 6,reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online
0,-1.629881,-0.262925,0.216149,-0.143648,-0.181176,-0.619528,0.46661,-1.759539,-1.210875,-0.163232,...,0,0,0,0,0,0,0,0,0,1
1,-1.629881,-0.262925,0.216149,-0.143648,-0.181176,-0.818817,0.46661,-0.133387,-0.868052,-0.163232,...,0,0,0,0,0,0,0,0,0,1
2,0.294062,-0.262925,1.365321,-1.566784,-0.181176,-0.783649,0.46661,-0.458617,1.188886,-0.163232,...,0,0,0,0,0,0,0,0,0,1
3,0.294062,-0.262925,-0.933023,1.279489,-0.181176,2.475321,0.46661,0.517075,0.617514,-0.163232,...,0,0,0,0,0,0,0,0,1,0
4,-1.629881,-0.262925,-0.933023,-0.143648,-0.181176,1.209247,0.46661,-0.458617,-0.068132,-0.163232,...,0,0,0,0,0,0,0,0,1,0


### Modeling data
First, let's create a dataframe to load the model performance metrics into.

In [14]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

### Evaluation metric
In our dataset, for end goal, we are predicting the column 'booking_status_Not_Canceled' which is 1 when the reservation remains intact, and 0 when it is called-off. The selection of metric would depend on what parameter leads to maximum loss reduction to the business.
Interpretation for the confusion matrix:

TP= not canceled detected 1 correctly means the booking remained.

TN= not canceled detected 0 correctly means the booking got cancelled.

FP= not canceled detected 1 wrongly means detected that booking is there but it is actually cancelled.

FN= not canceled detected 0 wrongly means detected that booking is cancelled but instead it is actually intact.

Therefore FN and FP both need to be reduced because increase in both the cases would lead to loss in different ways.
Increased FP will lead to underutilization of rooms therefore revenue loss as there will be empty rooms available. 
Increased FN will lead to overbooking and therefore when customers arrive, it would lead to their dissatisfaction with the service as someone will have to adjust or compromize on either another room or another hotel. This would indirectly impact the ratings of the hotel and lead to loss in revenue. 

Precision metric takes care of the FP and Recall metric takes care of the FN, but we are interested to reduce both FP and FN.
Hence, for modeling data, we will use F1-score as it takes care of both metrics by taking their harmonic mean. 


#### Logistic Regression Random Search CV using default, L1, L2, Elastic, liblinear regularization
Conduct an initial random search across a wide range of possible parameters.

In [15]:
score_measure = "f1"
kfolds = 5

param_grid = {
    'max_iter':np.arange(90,110),
    'penalty': ['None','l1','l2','elasticnet'],
    'solver':['saga','liblinear']
}

log_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = log_reg, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
The best f1 score is 0.7794295844412918
... with parameters: {'solver': 'saga', 'penalty': 'l1', 'max_iter': 109}


#### Logistic Regression Grid Search CV using default, L1, L2, Elastic, liblinear regularization
Conduct an exhaustive search across a smaller range of parameters around the parameters found in the initial random search.

In [16]:
score_measure = "f1"
kfolds = 5
max_iter = rand_search.best_params_['max_iter']
penalty = rand_search.best_params_['penalty']
solver = rand_search.best_params_['solver']

param_grid = {
    'max_iter': np.arange(max_iter-5,max_iter+5),  
    'penalty': [penalty],
    'solver': [solver]
}

log_reg_model = LogisticRegression()
grid_search = GridSearchCV(estimator = log_reg_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestf1_logistic = grid_search.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The best f1 score is 0.7794295844412918
... with parameters: {'max_iter': 104, 'penalty': 'l1', 'solver': 'saga'}


In [17]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Logistic Regression", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357


#### SVM Random Search CV using linear, rbf and poly kernal
Conduct an initial random search across a wide range of possible parameters.

In [18]:
score_measure = "f1"
kfolds = 3

param_grid = {
    'C': np.arange(1,15),   
    'gamma': ['scale','auto'],
    'kernel':['linear','rbf','poly']
}

svm_m1 = SVC()
rand_search = RandomizedSearchCV(estimator = svm_m1, param_distributions=param_grid, cv=kfolds, n_iter=2,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 3 folds for each of 2 candidates, totalling 6 fits
The best f1 score is 0.8191377481759609
... with parameters: {'kernel': 'poly', 'gamma': 'scale', 'C': 5}


#### SVM Grid Search CV using linear, rbf and poly kernal
Conduct an initial random search across a wide range of possible parameters.

In [19]:
score_measure = "f1"
kfolds = 3

C = rand_search.best_params_['C']
gamma = rand_search.best_params_['gamma']
kernel = rand_search.best_params_['kernel']

param_grid = {
    'C': np.arange(C-1,C+1),  
    'gamma': [gamma],
    'kernel': [kernel]
    
}

svm_model = SVC()
grid_search = GridSearchCV(estimator = svm_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestf1_SVM = grid_search.best_estimator_

Fitting 3 folds for each of 2 candidates, totalling 6 fits
The best f1 score is 0.8195533072379352
... with parameters: {'C': 4, 'gamma': 'scale', 'kernel': 'poly'}


In [20]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"SVM", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357
0,SVM,0.823211,0.910836,0.818775,0.862355


#### DTree Random Search CV

In [21]:
score_measure = "f1"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,100),  
    'min_samples_leaf': np.arange(1,100),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 100), 
    'max_depth': np.arange(5,15), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=50,
                           scoring=score_measure, verbose=1, n_jobs=-1, 
                           return_train_score=True)

_ = rand_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

Fitting 5 folds for each of 50 candidates, totalling 250 fits
The best f1 score is 0.826500027995135
... with parameters: {'min_samples_split': 22, 'min_samples_leaf': 35, 'min_impurity_decrease': 0.0006000000000000001, 'max_leaf_nodes': 58, 'max_depth': 14, 'criterion': 'entropy'}


#### DTree Grid Search CV

In [22]:
score_measure = "f1"
kfolds = 3
min_samples_split = rand_search.best_params_['min_samples_split']
min_samples_leaf = rand_search.best_params_['min_samples_leaf']
min_impurity_decrease = rand_search.best_params_['min_impurity_decrease']
max_leaf_nodes = rand_search.best_params_['max_leaf_nodes']
max_depth = rand_search.best_params_['max_depth']
criterion = rand_search.best_params_['criterion']

param_grid = {
    'min_samples_split': np.arange(min_samples_split-2,min_samples_split+2),  
    'min_samples_leaf': np.arange(min_samples_leaf-2,min_samples_leaf+2),
    'min_impurity_decrease': np.arange(min_impurity_decrease-0.0001, min_impurity_decrease+0.0001, 0.00005),
    'max_leaf_nodes': np.arange(max_leaf_nodes-2,max_leaf_nodes+2), 
    'max_depth': np.arange(max_depth-2,max_depth+2), 
    'criterion': [criterion]
}

dtree = DecisionTreeClassifier()
grid_search = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestf1_dtree = grid_search.best_estimator_

Fitting 3 folds for each of 1280 candidates, totalling 3840 fits
The best f1 score is 0.8283334369927218
... with parameters: {'criterion': 'entropy', 'max_depth': 12, 'max_leaf_nodes': 59, 'min_impurity_decrease': 0.0005, 'min_samples_leaf': 33, 'min_samples_split': 20}


In [23]:
c_matrix = confusion_matrix(y_test, grid_search.predict(X_test))
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"Decision Tree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357
0,SVM,0.823211,0.910836,0.818775,0.862355
0,Decision Tree,0.842047,0.896542,0.866458,0.881244


#### Best Estimators for the 3 models

In [24]:
best= [bestf1_logistic,bestf1_SVM,bestf1_dtree]
best


[LogisticRegression(max_iter=105, penalty='l1', solver='saga'),
 SVC(C=8),
 DecisionTreeClassifier(criterion='entropy', max_depth=11, max_leaf_nodes=77,
                        min_impurity_decrease=0.0005, min_samples_leaf=25,
                        min_samples_split=22)]

### Summary
Arranged by f1 score, the best models are:

In [24]:
performance.sort_values(by=['F1'],ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision Tree,0.842047,0.896542,0.866458,0.881244
0,SVM,0.823211,0.910836,0.818775,0.862355
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357


Due to technical limitations, models with few iterations were run which gave the above described F1-score. It seems that for our model the best performing model out of the three is SVM Classifier. 

Predicting correct booking cancellations would help the business in two ways-

-->There would be no empty rooms i.e. underutilization of the property.

-->There would be no customer issues who could clash on not getting the rooms because of overbooking, which eventually will lead to good reviews.


### Neural Net

#### With Random Search

In [25]:
%%time

score_measure = "f1"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (70,),(50,30), (40,20)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .2, .5, .7, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1, 0.2, 0.5],
    'max_iter': [100]
}

ann = MLPClassifier()
grid_search = RandomizedSearchCV(estimator = ann, param_distributions=param_grid, cv=kfolds, n_iter=50,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

bestf1Tree = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
{'solver': 'sgd', 'max_iter': 100, 'learning_rate_init': 0.2, 'learning_rate': 'adaptive', 'hidden_layer_sizes': (70,), 'alpha': 0.2, 'activation': 'relu'}
Wall time: 1min 54s


In [27]:
%%time
y_pred = bestf1Tree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.70      0.84      0.77      3522
           1       0.92      0.83      0.87      7361

    accuracy                           0.83     10883
   macro avg       0.81      0.84      0.82     10883
weighted avg       0.85      0.83      0.84     10883

Wall time: 32.3 ms


In [28]:
%%time

score_measure = "f1"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (30,), (50,), (70,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [.5, .7, 1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.005, 0.01, 0.15],
    'max_iter': [100]
}

ann = MLPClassifier()
grid_search = GridSearchCV(estimator = ann, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, y_train)

bestf1Tree = grid_search.best_estimator_

print(grid_search.best_params_)


Fitting 5 folds for each of 108 candidates, totalling 540 fits
{'activation': 'relu', 'alpha': 0.5, 'hidden_layer_sizes': (70,), 'learning_rate': 'invscaling', 'learning_rate_init': 0.01, 'max_iter': 100, 'solver': 'adam'}
Wall time: 2min 10s


In [29]:
%%time
y_pred = bestf1Tree.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.82      0.74      3522
           1       0.90      0.81      0.85      7361

    accuracy                           0.81     10883
   macro avg       0.79      0.81      0.80     10883
weighted avg       0.83      0.81      0.82     10883

Wall time: 26.4 ms


In [30]:
c_matrix = confusion_matrix(y_test, y_pred)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"ANN", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357
0,SVM,0.823211,0.910836,0.818775,0.862355
0,Decision Tree,0.842047,0.896542,0.866458,0.881244
0,ANN,0.811817,0.90183,0.809944,0.853421


In [31]:
performance.sort_values(by=['F1'],ascending=False)

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Decision Tree,0.842047,0.896542,0.866458,0.881244
0,SVM,0.823211,0.910836,0.818775,0.862355
0,ANN,0.811817,0.90183,0.809944,0.853421
0,Logistic Regression,0.785721,0.879089,0.792148,0.833357


For this dataset, ANN doesn't seem to be the best model considering the parameters selected. It might turn better if we tune the model or use a larger dataset maybe. 

## Keras with sklearn search

In [70]:
import numpy as np
import tensorflow as tf
np.random.seed(1)
tf.random.set_seed(1)
from tensorflow import keras
from tensorflow.keras import layers

In [71]:
performance_nn = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

In [72]:
%%time

def build_clf(meta, hidden_layer_sizes, dropout):
    n_features_in_ = meta["n_features_in_"]
    n_classes_ = meta["n_classes_"]
    target_encoder_ = meta["target_encoder_"]
    
    model = tf.keras.models.Sequential()
    model.add(keras.layers.Input(shape=n_features_in_)),
    #for hidden_layer_size in hidden_layer_sizes:
    for hidden_layer_size in hidden_layer_sizes:
        model.add(keras.layers.Dense(hidden_layer_size, 
            kernel_initializer= tf.keras.initializers.GlorotUniform(), 
            bias_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None), 
            activation="relu"))
        model.add(keras.layers.Dropout(dropout))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    #though you could return a compiled model, it's not necessary, and would result in the loss of these
    # parameters in the tune process - as they would be 'hard coded'
    # model.compile(loss = 'sparse_categorical_crossentropy', metrics = ['accuracy']) 

    return model


CPU times: total: 0 ns
Wall time: 1.72 ms


In [73]:
%%time

# If you don't have the following installed, from command line '!pip install scikeras'
from scikeras.wrappers import KerasClassifier

keras_clf = KerasClassifier(
    model=build_clf,
    hidden_layer_sizes=20,
    dropout=0.5,
    optimizer=keras.optimizers.Adam,
    optimizer__learning_rate=0.0001
)
keras_clf.get_params()


CPU times: total: 0 ns
Wall time: 21.3 ms


{'model': <function __main__.build_clf(meta, hidden_layer_sizes, dropout)>,
 'build_fn': None,
 'warm_start': False,
 'random_state': None,
 'optimizer': keras.optimizers.optimizer_v2.adam.Adam,
 'loss': None,
 'metrics': None,
 'batch_size': None,
 'validation_batch_size': None,
 'verbose': 1,
 'callbacks': None,
 'validation_split': 0.0,
 'shuffle': True,
 'run_eagerly': False,
 'epochs': 1,
 'hidden_layer_sizes': 20,
 'dropout': 0.5,
 'optimizer__learning_rate': 0.0001,
 'class_weight': None}

In [74]:
%%time

params = {
    
    # the following are model parameters, and therefore must be defined as parameters in the KarasClassifier, and then in the build_clf function
    'model__hidden_layer_sizes': [(20,),(30, ), (35,), (40, 30)], # this will require KarasClassifier and build_clf to have hidden_layer_sizes parameter set
    'model__dropout': [0, 0.1], # this will require KarasClassifier and build_clf to have hidden_layer_sizes parameter set
    
    # the following are 'fit' parameters, the scikeras wrapper provides these parameters. These are passed to the 'model.fit' method for each fit of the model
    'batch_size':[200, 600, 500],
    'epochs':[5],
    'optimizer':['adam','sgd'],
    'loss':['binary_crossentropy'],
    
    # this is added to the optimizer 
    'optimizer__learning_rate': [0.0001, 0.001, 0.01]

}
keras_clf.get_params()

CPU times: total: 0 ns
Wall time: 0 ns


{'model': <function __main__.build_clf(meta, hidden_layer_sizes, dropout)>,
 'build_fn': None,
 'warm_start': False,
 'random_state': None,
 'optimizer': keras.optimizers.optimizer_v2.adam.Adam,
 'loss': None,
 'metrics': None,
 'batch_size': None,
 'validation_batch_size': None,
 'verbose': 1,
 'callbacks': None,
 'validation_split': 0.0,
 'shuffle': True,
 'run_eagerly': False,
 'epochs': 1,
 'hidden_layer_sizes': 20,
 'dropout': 0.5,
 'optimizer__learning_rate': 0.0001,
 'class_weight': None}

In [75]:
%%time

from sklearn.model_selection import RandomizedSearchCV
#from tensorflow.keras.callbacks import EarlyStopping

rnd_search_cv = RandomizedSearchCV(
    estimator=keras_clf, 
    param_distributions=params, 
    scoring='f1',  # we could use any appropriate sklearn metric here (i.e. accuracy, f1_micro, f1_macro)
    n_iter=5, 
    cv=3)

# In rare cases, you may find your model training results in exceeding python's default recursion limit.
# If needed, you can increase this excersion limit by using the following code.
#import sys
#sys.setrecursionlimit(10000) # note: the default is 3000 (python 3.9)

_ = rnd_search_cv.fit(X_train, y_train,  verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
CPU times: total: 49.7 s
Wall time: 20.2 s


In [76]:
rnd_search_cv.best_params_

{'optimizer__learning_rate': 0.001,
 'optimizer': 'adam',
 'model__hidden_layer_sizes': (40, 30),
 'model__dropout': 0.1,
 'loss': 'binary_crossentropy',
 'epochs': 5,
 'batch_size': 600}

In [77]:
best_model = rnd_search_cv.best_estimator_

In [78]:
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, best_model.predict(X_test))



In [79]:

TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
performance_nn = pd.concat([performance_nn, pd.DataFrame({'model':"Keras with sklearn search", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance_nn

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Keras with sklearn search,0.790131,0.879134,0.7946,0.834732


In [80]:
print(classification_report(y_test, best_model.predict(X_test), digits=4))

              precision    recall  f1-score   support

           0     0.6550    0.7812    0.7126      3624
           1     0.8791    0.7946    0.8347      7259

    accuracy                         0.7901     10883
   macro avg     0.7671    0.7879    0.7736     10883
weighted avg     0.8045    0.7901    0.7940     10883



The best model that reduces the loss function has 'model__hidden_layer_sizes': (40, 30) and 'batch_size': 600 for 5 epoch.

## Keras tuner

In [81]:
#!pip install -q -U keras-tuner

In [83]:
import keras_tuner
# If you don't have keras_tuner installed, run the following in your terminal (mac), or anaconda prompt (windows)
# conda install -c conda-forge keras-tuner



from tensorflow import keras
from tensorflow.keras import layers

In [85]:
import numpy as np
import tensorflow as tf
from keras import backend as K


def recall(y_true, y_pred):
    y_true = K.ones_like(y_true)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall_keras = true_positives / (possible_positives + K.epsilon())
    return recall_keras


def precision(y_true, y_pred):
    y_true = K.ones_like(y_true)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision_keras = true_positives / (predicted_positives + K.epsilon())
    return precision_keras


def specificity(y_true, y_pred):
    y_true = K.ones_like(y_true)
    tn = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    fp = K.sum(K.round(K.clip((1 - y_true) * y_pred, 0, 1)))
    return tn / (tn + fp + K.epsilon())


def negative_predictive_value(y_true, y_pred):
    tn = K.sum(K.round(K.clip((1 - y_true) * (1 - y_pred), 0, 1)))
    fn = K.sum(K.round(K.clip(y_true * (1 - y_pred), 0, 1)))
    return tn / (tn + fn + K.epsilon())


def f1(y_true, y_pred):
    p = precision(y_true, y_pred)
    r = recall(y_true, y_pred)
    return 2 * ((p * r) / (p + r + K.epsilon()))


def fbeta(y_true, y_pred, beta=2):
    y_pred = K.clip(y_pred, 0, 1)

    tp = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)), axis=1)
    fp = K.sum(K.round(K.clip(y_pred - y_true, 0, 1)), axis=1)
    fn = K.sum(K.round(K.clip(y_true - y_pred, 0, 1)), axis=1)

    p = tp / (tp + fp + K.epsilon())
    r = tp / (tp + fn + K.epsilon())

    num = (1 + beta ** 2) * (p * r)
    den = (beta ** 2 * p + r + K.epsilon())
    return K.mean(num / den)

In [86]:
def build_model(hp):
    model = keras.Sequential()

    # create input layer
    model.add(layers.Input(30))

    dropout = hp.Boolean("dropout") # generate a boolean variable called dropout whos value is randomly set to either True of False
    normalize = hp.Boolean("normalize") # generate a boolean variable called normalize whos value is randomly set to either True of False

    # create hidden layers
    for i in range(hp.Int(name='hidden_layer_count', min_value=1, max_value=5, step=1)):
        model.add(layers.Dense(units=hp.Int("units", min_value=32, max_value=1024, step=32),activation=hp.Choice("activation", ["selu", "elu", "relu", "tanh"]))) 
        if dropout:
            model.add(layers.Dropout(rate=hp.Float("dropout rate", min_value=0.01, max_value=0.1, step=.005)))
        if normalize:
            model.add(layers.Normalization())

    model.add(layers.Dense(units=2, activation="softmax"))

    # Define the optimizer learning rate as a hyperparameter.
    lr = hp.Float("lr", min_value=1e-4, max_value=1e-2, sampling="log")   
    choice = hp.Choice(name='optimizer', values=['adam', 'sgd'])
    if 'adam' == choice:
        optimizer = keras.optimizers.legacy.Adam(learning_rate=lr)  # for M1/M2 use optimizers.legacy.Adam, otherwise use optimizers.Adam
    else:
        optimizer = keras.optimizers.legacy.SGD(learning_rate=lr)  # for M1/M2 use optimizers.legacy.SGD, otherwise use optimizers.SGD

    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        # see here https://www.tensorflow.org/api_docs/python/tf/keras/losses
        
        # Though you can add a metric, this doesn't get used to train the model, it's only informative (see previous notebook for more detail).
        metrics=[f1],  # you need to set this in order for keras_tuner to have one of these objectives!
        # for metrics, see https://www.tensorflow.org/api_docs/python/tf/keras/metrics/
    )
    return model

build_model(keras_tuner.HyperParameters())

<keras.engine.sequential.Sequential at 0x1d0043c3550>

In [87]:
import datetime
datestring = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

In [88]:
tuner = keras_tuner.RandomSearch(
    hypermodel=build_model,
    objective=keras_tuner.Objective("val_f1", direction="max"),
# 
# You can use metrics other than accuracy, but you need to have this defined in the build model first
# For instance, if you have f1 defined in the build model, to have keras_tuner 'tune' on f1, you set objective to be 'val_f1'. 
# For precision you would have to have precision defined in the build model, and to tune on precision, you set the object to 
# be 'val_precision', etc.
# 
# Also, for custom objectives, you need to set a 'direction' for the tune... so if you want to maximize f1, you set direction to be 'max'.
# 
#    objective = keras_tuner.Objective("val_f1", direction="max"), 
  
    max_trials=10, # max_trials represents the number of hyperparameter combinations that will be tested by the tuner (like n_iter in sklearn random search)
    executions_per_trial=2, # max number of models to fit per set of set of hyperparameters combinations

    # the next three parameters about where the results from the training are stored
    directory=f'logs/{datestring:s}',
    project_name="keras_tuned",
    overwrite=True
)

In [89]:
%%time
import time
from tensorflow.keras.callbacks import LambdaCallback
    
epoch_callback = LambdaCallback(
    on_epoch_begin=lambda epoch,logs: print(f'Starting Epoch {epoch+1}!')
)

batch_loss_callback = LambdaCallback(
    on_batch_end=lambda batch,logs: print(f'\n After batch {batch}, the loss is {logs}.')
)
    
train_finish_callback = LambdaCallback(
    on_train_end=lambda logs: print('Training finished!')
)

tuner.search(
    X_train, 
    y_train, 
    epochs=5, 
    batch_size=500, 
    validation_data=(X_test, y_test), 
    callbacks=[epoch_callback, batch_loss_callback, train_finish_callback]
)

Trial 10 Complete [00h 00m 39s]
val_f1: 1.0

Best val_f1 So Far: 1.0
Total elapsed time: 00h 05m 27s
INFO:tensorflow:Oracle triggered exit
CPU times: total: 28min 39s
Wall time: 5min 27s


In [90]:
best_hps = tuner.get_best_hyperparameters(5)
best_hps[0].values

{'dropout': False,
 'normalize': True,
 'hidden_layer_count': 2,
 'units': 896,
 'activation': 'selu',
 'lr': 0.004041246752831233,
 'optimizer': 'sgd'}

In [91]:
# Get list of the top 2 models.
models = tuner.get_best_models(num_models=2)
# select the first one in the list (this is the best performing model)
best_model = models[0] # select the first one
# display summary of model training
best_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 896)               27776     
                                                                 
 normalization (Normalizatio  (None, 896)              1793      
 n)                                                              
                                                                 
 dense_1 (Dense)             (None, 896)               803712    
                                                                 
 normalization_1 (Normalizat  (None, 896)              1793      
 ion)                                                            
                                                                 
 dense_2 (Dense)             (None, 2)                 1794      
                                                                 
Total params: 836,868
Trainable params: 833,282
Non-trai

In [92]:
# display the confusion matrix
cm = confusion_matrix(y_test, best_model.predict(X_test).argmax(axis=1))
cm



array([[2791,  833],
       [1846, 5413]], dtype=int64)

In [93]:
TP = cm[1][1]
TN = cm[0][0]
FP = cm[0][1]
FN = cm[1][0]
performance_nn = pd.concat([performance_nn, pd.DataFrame({'model':"Keras tuner", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance_nn

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,Keras with sklearn search,0.790131,0.879134,0.7946,0.834732
0,Keras tuner,0.753836,0.866635,0.745695,0.801629


In [94]:
print(classification_report(y_test,best_model.predict(X_test).argmax(axis=1), digits=4))

              precision    recall  f1-score   support

           0     0.6019    0.7701    0.6757      3624
           1     0.8666    0.7457    0.8016      7259

    accuracy                         0.7538     10883
   macro avg     0.7343    0.7579    0.7387     10883
weighted avg     0.7785    0.7538    0.7597     10883



The keras search with keras tuner doesn't seem to be improving on f1 alot. This might be due to small range of tested parameters which were selected because of time and processing limitations. Although, keras layers with sklearn search seem be be equivalent to SVM while considering f1. Also, wanted to mention that the keras model wasn't predicting the 1's in the binary classification therefore tried it as a multiclass variable with 2 classes 0 and 1. Without considering, metrics were not being calculated as they were 0's and NA's due to 0 predictions of category 1. Hence, used softmax for the output layer. 