### Best Models Results


In [1]:
import time
import math
notebook_start = time.time()

#### Init Libraries

In [2]:
import pandas as pd
import plotly.graph_objs as go
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
import os

In [3]:
#from keras.models import Sequential
#from keras.layers import Dense
#from keras.regularizers import l1, l2
#from keras.layers import Dropout, Flatten, BatchNormalization
from keras import models
from keras import layers
from keras import regularizers
from keras import optimizers
from keras import losses
from keras import metrics
from keras import callbacks
from keras import utils

from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold

import tensorflow as tf

#### Load Dataset and split into Features and Label

In [4]:
#load data
df=pd.read_csv("breast-cancer-wisconsin-data/data.csv")
#drop irelevent columns for the classification
df = df.drop(columns=['Unnamed: 32', 'id'])
# rearange the data for X - featuers and Y leabels 
X = df.iloc[:, 1:]
y = df.iloc[:, 0]

#### Encode Labels

In [5]:
le = LabelEncoder()
y = le.fit_transform(y)

#### The mapping of the labels

In [6]:
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print(le_name_mapping)

{'B': 0, 'M': 1}


#### Splitting the data into Train, Test and Valid datasets

In [7]:
#split the data
from sklearn.model_selection import train_test_split
XData_train_val, XData_test, ydata_train_val, ydata_test = train_test_split(X, y, test_size=0.2, random_state=0)
XData_train, XData_val, ydata_train, ydata_val = train_test_split(XData_train_val, ydata_train_val, test_size=0.2, random_state=0)

#### Scale the data

In [8]:
scaler = RobustScaler()
X_train = scaler.fit_transform(XData_train)
X_test = scaler.transform(XData_test)
X_val = scaler.transform(XData_val)

X_train_val = XData_train_val.values

y_train = ydata_train
y_test = ydata_test
y_val = ydata_val

y_train_val = ydata_train_val

#### Handling the imbalance between the classes

In [9]:
#handling imbalance data 
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
#the class weight is done only on the train data to impact the learning process and to evaluete beter the model proformence
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))
class_weight_dict

{0: 0.774468085106383, 1: 1.4108527131782946}

##### Function Helpers

In [10]:
def model_result(model, X, y, dataset_name):
    print('================================================================================')
    print(f'Evaluation Report for Model: {model.name} on {dataset_name}')

    # accuracy
    result = model.evaluate(X, y, verbose=0)        
    print(f'Loss Value: {result[0]:.3f}, Accuracy: {result[1]*100:.3f}%')

    # confusion matrix
    y_prob = model.predict(X, verbose=0)
    y_pred = np.round(y_prob).astype(int)    
    cm = confusion_matrix(y, y_pred)    
    #sns.heatmap(cm,annot=True)
    cm_disp = ConfusionMatrixDisplay(cm, display_labels=le.classes_)
    cm_disp.plot()
    plt.title(f'{model.name} Confusion Matrix on {dataset_name} Dataset')
    plt.show()

    # classification report
    report = classification_report(y, y_pred, target_names=le.classes_)
    print(report)

    print('Model Summary:')
    print(model.summary())
    print('================================================================================')

In [11]:
def model_history(history, param=None):
    plt.figure(figsize=(15,6))

    val_loss_min_pos = np.argmin(history.history['val_loss'])

    # Plotting the training and validation loss
    plt.subplot(1, 2, 1)
    plt.plot(history.history['loss'], label='Train Loss', color='#8502d1')
    plt.plot(history.history['val_loss'], label='Validation Loss')

    if param is None:
        plt.title('Train and Validation Loss')
    else:
        plt.title(f'Train and Validation Loss on {param}')

    plt.plot(val_loss_min_pos, history.history['val_loss'][val_loss_min_pos], 'r*', label='Min Validation Loss')
    plt.text(val_loss_min_pos, history.history['val_loss'][val_loss_min_pos], f'({val_loss_min_pos}, {history.history['val_loss'][val_loss_min_pos]:.3f})', va='bottom')
    plt.legend()

    # Plotting the training and validation accuracy
    plt.subplot(1, 2, 2)
    plt.plot(history.history['accuracy'], label='Train Accuracy', color='#8502d1')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.plot(val_loss_min_pos, history.history['val_accuracy'][val_loss_min_pos], 'r*', label='Validation Accuracy @ Min Validation Loss')
    plt.text(val_loss_min_pos, history.history['val_accuracy'][val_loss_min_pos], f'({val_loss_min_pos}, {history.history['val_accuracy'][val_loss_min_pos]:.3f})', va='bottom')

    if param is None:
        plt.title('Train and Validation Accuracy')
    else:
        plt.title(f'Train and Validation Accuracy on {param}')
        
    plt.legend()

    plt.show()    

In [12]:
def proc_history(history):

    val_loss_min_pos = np.argmin(history.history['val_loss'])

    train_acc = history.history['accuracy'][val_loss_min_pos]
    train_loss = history.history['loss'][val_loss_min_pos]

    val_acc = history.history['val_accuracy'][val_loss_min_pos]
    val_loss = history.history['val_loss'][val_loss_min_pos]    

    val_recall = history.history['val_recall'][val_loss_min_pos]

    return {'Train Accuracy': train_acc, 'Train Loss': train_loss, 'Validation Accuracy': val_acc, 'Validation Loss': val_loss, 'Validation Recall': val_recall}

In [13]:
def model_fit(model, optimizer=optimizers.RMSprop, learning_rate=0.001, epochNum=1000, batchSize=32, en_reduce_lr=False, en_early_stopping=True, pca=False, verbose="auto", Dataset=None):      
    
    
    early_stopping = callbacks.EarlyStopping(monitor='val_loss', patience=100, restore_best_weights=True, verbose=0)
    reduce_lr = callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=0.00001, verbose=0)
    checkpoint_filepath = f'model_checkpoints/{model.name}_checkpoint.model.keras'
    model_checkpoint_callback = callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        monitor='val_loss',
        mode='min',
        save_best_only=True,
        verbose=0)
    
    callbacks_list = [model_checkpoint_callback]

    if en_reduce_lr:
        callbacks_list.append(reduce_lr)

    if en_early_stopping:
        callbacks_list.append(early_stopping)

    model.compile(optimizer=optimizer(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy', metrics.Recall(name='recall')])
    if Dataset is None:
        history = model.fit(X_train, y_train, batch_size=batchSize, epochs=epochNum, validation_data=(X_val, y_val), class_weight=class_weight_dict, callbacks=callbacks_list, verbose=verbose)
    else:
        history = model.fit(Dataset['X_train'], Dataset['y_train'], batch_size=batchSize, epochs=epochNum, validation_data=(Dataset['X_val'], Dataset['y_val']), class_weight=class_weight_dict, callbacks=callbacks_list, verbose=verbose)

    model = models.load_model(checkpoint_filepath)

    return history, model

In [14]:
def model_kfold(xtrain, ytrain, model_base, optimizer=optimizers.RMSprop,learning_rate=0.001, epochNum=1000, batchSize=32, en_reduce_lr=False, en_early_stopping=True, verbose="auto"):
    model = models.clone_model(model_base)
    fold_k = StratifiedKFold(n_splits = 5).split(xtrain, ytrain)
    
    result_df = pd.DataFrame(columns=['k', 'Train Accuracy', 'Train Loss' , 'Validation Accuracy', 'Validation Loss',  'Validation Recall'])
    for k , (train, valid) in enumerate(fold_k):

        scaler = RobustScaler()

        X_train = scaler.fit_transform(xtrain[train])
        X_val = scaler.transform(xtrain[valid])
        
        dataset = {'X_train': X_train, 'y_train': ytrain[train], 'X_val': X_val, 'y_val': ytrain[valid]}
        model.set_weights(model_base.get_weights())

        

        history, model = model_fit(model, optimizer=optimizer, learning_rate=learning_rate, epochNum=epochNum, batchSize=batchSize, en_reduce_lr=en_reduce_lr, en_early_stopping=en_early_stopping, Dataset=dataset, verbose=verbose)    
        
        proc_data = proc_history(history)

        new_row = {'k': k, **proc_data}

        result_df.loc[len(result_df)] = new_row

    return result_df

### Best Models

In [15]:
BEST_MODELS = {}
BEST_MODELS_HISTORY = {}

BEST_OPTIMIZER = optimizers.SGD
BEST_LEARNING_RATE = 0.001
BEST_BATCH_SIZE = 2

#### SLP Model

In [None]:
# deterministic fitting   
os.environ['PYTHONHASHSEED']=str(0)
utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()  

BEST_MODEL = models.Sequential(name=f'SLP')

BEST_MODEL.add(layers.Input((30,)))
BEST_MODEL.add(layers.Dense(1, activation='sigmoid'))

BEST_MODELS['SLP'] = BEST_MODEL

#### MLP 30x30x1 

In [None]:
os.environ['PYTHONHASHSEED']=str(0)
utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()  

BEST_MODEL = models.Sequential(name=f'MLP_30x30x1')

BEST_MODEL.add(layers.Input((30,)))
BEST_MODEL.add(layers.Dense(30, activation='relu'))
BEST_MODEL.add(layers.Dense(1, activation='sigmoid'))

BEST_MODELS['MLP_30x30x1'] = BEST_MODEL

#### MLP 30x30x10x1 and at Second Layer: Dropout 0.1 & Activation Function Silu

In [None]:
os.environ['PYTHONHASHSEED']=str(0)
utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()  

BEST_MODEL = models.Sequential(name=f'MLP_30x10x10-D1-silu-x1')

BEST_MODEL.add(layers.Input((30,)))
BEST_MODEL.add(layers.Dense(10, activation='relu'))
BEST_MODEL.add(layers.Dense(10, activation='silu'))
BEST_MODEL.add(layers.Dropout(0.1))
BEST_MODEL.add(layers.Dense(1, activation='sigmoid'))

BEST_MODELS['MLP_30x10x10-D1-silu-x1'] = BEST_MODEL

#### Fit Best Models

In [None]:
nn_best_models_df = pd.DataFrame(columns=['Model Name', 'Train Accuracy', 'Train Loss', 'Validation Accuracy', 'Validation Loss', 'Validation Recall'])

for model_name, model in BEST_MODELS.items():

    print('----------------------------------------------------------------------------------------------------')
    print(f'Model Name = {model_name}')
    
    if model_name == 'SLP':
        history, model = model_fit(model, epochNum=2500, batchSize=32, en_early_stopping=True, en_reduce_lr=True, verbose=0)
        res_df = model_kfold(X_train_val, y_train_val, model, epochNum=2500, batchSize=32, en_early_stopping=True, en_reduce_lr=True, verbose=0)
    elif model_name == 'MLP_30x30x1':
        history, model = model_fit(model, epochNum=3500, optimizer=BEST_OPTIMIZER, learning_rate=BEST_LEARNING_RATE, batchSize=32, en_early_stopping=True, en_reduce_lr=True, verbose=0)
        res_df = model_kfold(X_train_val, y_train_val, model, optimizer=BEST_OPTIMIZER, learning_rate=BEST_LEARNING_RATE, epochNum=3500, batchSize=32, en_early_stopping=True, en_reduce_lr=True, verbose=0)        
    else:
        history, model = model_fit(model, epochNum=2500, optimizer=BEST_OPTIMIZER, learning_rate=BEST_LEARNING_RATE, batchSize=BEST_BATCH_SIZE, en_early_stopping=True, en_reduce_lr=True, verbose=0)
        res_df = model_kfold(X_train_val, y_train_val, model, epochNum=2500, batchSize=BEST_BATCH_SIZE, en_early_stopping=True, en_reduce_lr=True, verbose=0)

    BEST_MODELS_HISTORY[model_name] = history
    proc_data = proc_history(history)

    new_row = {'Model Name': model.name, **proc_data}
    nn_best_models_df.loc[len(nn_best_models_df)] = new_row

    
    print(f"Validation Accuracy Mean: {res_df['Validation Loss'].mean()}, Validation Accuracy STD: {res_df['Validation Loss'].std()}")    
    print(f"Validation Loss Mean: {res_df['Validation Loss'].mean()}, Validation Loss STD: {res_df['Validation Loss'].std()}")    
    print(res_df)

    model_history(history, f'Model Name = {model.name}')

In [None]:
nn_best_models_df

##### Validation Accuracy Learning Curve of all all Best Models

In [None]:
for model_name, history in BEST_MODELS_HISTORY.items():
    plt.plot(history.history['val_accuracy'][:1000], label=model_name)
plt.legend()
plt.title('Validation Accuracy Learning Curve of all Best Models')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.show()

##### Validation Loss Learning Curve of all all Best Models

In [None]:
for model_name, history in BEST_MODELS_HISTORY.items():
    plt.plot(history.history['val_loss'][:1000], label=model_name)
plt.legend()
plt.title('Validation Loss Learning Curve of all Best Models')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.show()

##### Train and Validation Accuracy of all Best Models

In [None]:
axes = nn_best_models_df.plot.bar(x='Model Name', y=['Train Accuracy', 'Validation Accuracy'], rot=15, figsize=(10, 5))
axes.legend(loc='lower right')

for p in axes.patches:
    axes.annotate(str(round(p.get_height(),3)), (p.get_x() * 1.005, p.get_height() * 1.005))

_ = axes.set_title('Train and Validation Accuracy of Best Models')

##### Train and Validation Loss of all Best Models

In [None]:
axes = nn_best_models_df.plot.bar(x='Model Name', y=['Train Loss', 'Validation Loss'], rot=15, figsize=(10, 5))
axes.legend(loc='lower right')

for p in axes.patches:
    axes.annotate(str(round(p.get_height(),3)), (p.get_x() * 1.005, p.get_height() * 1.005))

_ = axes.set_title('Train and Validation Loss of Best Models')

### Train+Validation Evaluation

#### Train Evaluation on Best Models

In [None]:
for model_name, model in BEST_MODELS.items():
    model_result(model, X_train, y_train, 'Train')

#### Valid Evaluation on Best Models

In [None]:
for model_name, model in BEST_MODELS.items():
    model_result(model, X_val, y_val, 'Valid')

#### F1_Score, Precision, Recall and ROC on Validation Dataset of Best Models

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

nn_best_models_scores_df = pd.DataFrame(columns=['Model Name', 'Validation F1_Score', 'Validation Precision', 'Validation Recall'])

for model_name, model in BEST_MODELS.items():

    y_prob = model.predict(X_val, verbose=0)
    y_pred = np.round(y_prob).astype(int)    

    f1_score_res = f1_score(y_val, y_pred, labels=le.classes_)
    precision_score_res = precision_score(y_val, y_pred, labels=le.classes_)
    recall_score_res = recall_score(y_val, y_pred, labels=le.classes_)

    new_row = {'Model Name': model.name, 'Validation F1_Score': f1_score_res, 'Validation Precision': precision_score_res, 'Validation Recall': recall_score_res}
    nn_best_models_scores_df.loc[len(nn_best_models_scores_df)] = new_row

In [None]:
nn_best_models_scores_df

In [None]:
axes = nn_best_models_scores_df.plot.bar(x='Model Name', y=['Validation F1_Score', 'Validation Precision', 'Validation Recall'], rot=15, figsize=(10, 5))
axes.legend(loc='lower right')

for p in axes.patches:
    axes.annotate(str(round(p.get_height(),3)), (p.get_x() * 1.005, p.get_height() * 1.005))

_ = axes.set_title('Validation F1_Score, Precision and Recall of Best Models')

In [None]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay

axes = plt.axes()
for model_name, model in BEST_MODELS.items():
    y_prob = model.predict(X_val, verbose=0)    
    fpr, tpr, thresholds = metrics.roc_curve(y_val, y_prob)
    print(f"Model Name = {model_name}")
    print(f'FPR: {fpr}')
    print(f'TPR: {tpr}')
    print(f'Thresholds: {thresholds}')
    roc_disp = RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name=model.name)

    roc_disp.plot(ax=axes) 

    y_pred = np.round(y_prob).astype(int)    
    pos10 = (y_val==1) & (y_pred==0).flatten()
    X_failure_10 = X_val[pos10][:2]
    print(f'Positions of the false prediction in the validation dataset: {np.where(pos10)[0]}')
    print(f'Probability of the false prediciton: {y_prob[pos10]}')    

plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay

axes = plt.axes()
for model_name, model in BEST_MODELS.items():
    y_prob = model.predict(X_val, verbose=0)    
    pos1 = (y_val==1)

    plt.plot(y_prob[pos1], '.-', label=model.name)   

plt.title('Prediction probability of True Malignant in Validation Dataset (Higher is better)')
plt.ylabel('Prediction probability')
plt.xlabel('Examples of True Malignant')
plt.legend()
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay

axes = plt.axes()
for model_name, model in BEST_MODELS.items():
    y_prob = model.predict(X_val, verbose=0)    
    pos0 = (y_val==0)

    plt.plot(y_prob[pos0], '.-', label=model.name)   

plt.title('Prediction probability of True Benign in Validation Dataset (Lower is better)')
plt.ylabel('Prediction probability')
plt.xlabel('Examples of True Benign')
plt.legend()
plt.show()

### Test Evaluation

In [None]:
nn_test_best_models_df = nn_best_models_df.drop(columns=['Train Accuracy', 'Train Loss'])
nn_test_best_models_df

In [None]:

res_df = pd.DataFrame(columns=['Test Accuracy', 'Test Loss', 'Test Recall'])


for model_name, model in BEST_MODELS.items():
    res_dict = model.evaluate(X_test, y_test, verbose=0, return_dict=True)
    res_df.loc[len(res_df)] = {'Test Accuracy': res_dict['accuracy'], 'Test Loss': res_dict['loss'], 'Test Recall': res_dict['recall']}    

nn_test_best_models_df = pd.concat([nn_test_best_models_df, res_df], axis=1, join='inner')    
nn_test_best_models_df

##### Validation and Test Accuracy of all Best Models

In [None]:
axes = nn_test_best_models_df.plot.bar(x='Model Name', y=['Test Accuracy', 'Validation Accuracy'], rot=15, figsize=(10, 5), color=['r', 'orange'])
axes.legend(loc='lower right')

for p in axes.patches:
    axes.annotate(str(round(p.get_height(),3)), (p.get_x() * 1.005, p.get_height() * 1.005))

_ = axes.set_title('Test and Validation Accuracy of Best Models')

##### Train and Validation Loss of all Best Models

In [None]:
axes = nn_test_best_models_df.plot.bar(x='Model Name', y=['Test Loss', 'Validation Loss'], rot=15, figsize=(10, 5), color=['r', 'orange'])
axes.legend(loc='lower right')

for p in axes.patches:
    axes.annotate(str(round(p.get_height(),3)), (p.get_x() * 1.005, p.get_height() * 1.005))

_ = axes.set_title('Test and Validation Loss of Best Models')

#### Test Evaluation on Best Models

In [None]:
for model_name, model in BEST_MODELS.items():
    model_result(model, X_test, y_test, 'Test')

#### F1_Score, Precision, Recall and ROC on Test Dataset of Best Models

In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score

nn_best_models_scores_df = pd.DataFrame(columns=['Model Name', 'Test F1_Score', 'Test Precision', 'Test Recall'])

for model_name, model in BEST_MODELS.items():

    y_prob = model.predict(X_test, verbose=0)
    y_pred = np.round(y_prob).astype(int)    

    f1_score_res = f1_score(y_test, y_pred, labels=le.classes_)
    precision_score_res = precision_score(y_test, y_pred, labels=le.classes_)
    recall_score_res = recall_score(y_test, y_pred, labels=le.classes_)

    new_row = {'Model Name': model.name, 'Test F1_Score': f1_score_res, 'Test Precision': precision_score_res, 'Test Recall': recall_score_res}
    nn_best_models_scores_df.loc[len(nn_best_models_scores_df)] = new_row

In [None]:
nn_best_models_scores_df

In [None]:
axes = nn_best_models_scores_df.plot.bar(x='Model Name', y=['Test F1_Score', 'Test Precision', 'Test Recall'], rot=15, figsize=(10, 5))
axes.legend(loc='lower right')

for p in axes.patches:
    axes.annotate(str(round(p.get_height(),3)), (p.get_x() * 1.005, p.get_height() * 1.005))

_ = axes.set_title('Test F1_Score, Precision and Recall on Test Dataset of Best Models')

In [None]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay

axes = plt.axes()
for model_name, model in BEST_MODELS.items():
    y_prob = model.predict(X_test, verbose=0)    
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)

    print(f'FPR: {fpr}')
    print(f'TPR: {tpr}')
    print(f'Thresholds: {thresholds}')
    roc_disp = RocCurveDisplay(fpr=fpr, tpr=tpr, estimator_name=model.name)

    roc_disp.plot(ax=axes) 

    y_pred = np.round(y_prob).astype(int)    
    pos10 = (y_test==1) & (y_pred==0).flatten()
    X_failure_10 = X_test[pos10][:2]
    print(f'Positions of the false prediction in the test dataset: {np.where(pos10)[0]}')
    print(f'Probability of the false prediciton: {y_prob[pos10]}')    

plt.title('ROC on Test Dataset of Best Models')
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay

axes = plt.axes()
for model_name, model in BEST_MODELS.items():
    y_prob = model.predict(X_test, verbose=0)    
    pos1 = (y_test==1)

    plt.plot(y_prob[pos1], '.-', label=model.name)   

plt.title('Prediction probability of True Malignant in Test Dataset (Higher is better)')
plt.ylabel('Prediction probability')
plt.xlabel('Examples of True Malignant')
plt.legend()
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import RocCurveDisplay

axes = plt.axes()
for model_name, model in BEST_MODELS.items():
    y_prob = model.predict(X_test, verbose=0)    
    pos0 = (y_test==0)

    plt.plot(y_prob[pos0], '.-', label=model.name)   

plt.title('Prediction probability of True Benign in Test Dataset (Lower is better)')
plt.ylabel('Prediction probability')
plt.xlabel('Examples of True Benign')
plt.legend()
plt.show()

### Cases of Success and Failure

In [None]:
model = BEST_MODELS['SLP']

y_prob = model.predict(X_test, verbose=0)
y_pred = np.round(y_prob).astype(int) 
y_pred = y_pred.flatten() 

#### Best Model Success

Truth Benign, Predicted Bengin

In [None]:
pos00 = (y_test==0) & (y_pred==0).flatten()
X_success_00 = X_test[pos00][[0]]
pd.DataFrame(X_success_00)

Truth Malignant, Predicted Malignant

In [None]:
pos11 = (y_test==1) & (y_pred==1).flatten()
X_success_11 = X_test[pos11][:2]
pd.DataFrame(X_success_11)

#### Best Model Failure

Truth Benign, Predicted Malignant

In [None]:
(y_test==0) & (y_pred==1)

In [None]:
pos01 = (y_test==0) & (y_pred==1).flatten()
X_failure_01 = X_test[pos01][:]
pd.DataFrame(X_failure_01)

Truth Malignant, Predicted Benign

In [None]:
pos10 = (y_test==1) & (y_pred==0).flatten()
X_failure_10 = X_test[pos10][:2]
pd.DataFrame(X_failure_10)

#### More Successful Classifications

##### Example 1 - (TN) - Benign classified as Benign

In [None]:
example_num = np.array([1])

data_B_mean = X[y==0].mean()
data_B_std = X[y==0].std()

wrong_classified_data = scaler.inverse_transform(X_test)[example_num].T.flatten()
wrong_classified_data = pd.Series(wrong_classified_data, index=data_B_mean.index)

wrong_classified_data_n_2 = (wrong_classified_data-data_B_mean)/data_B_std

res_df = pd.concat([wrong_classified_data, data_B_mean, data_B_std, wrong_classified_data_n_2], axis=1)
res_df = res_df.rename(columns={0: 'Example', 1: 'Feature Mean', 2: 'Feature STD', 3: 'Normalized Example'})
res_df.style.apply(lambda x: ["background: green" if idx==3 and abs(v)>abs(3*x.iloc[2]) else "" for idx,v in enumerate(x)], axis = 1)

##### Example 106 - (TP) - Malignant classified as Malignant on all the Best Classifiers

In [None]:
example_num = np.array([106])

data_M_mean = X[y==1].mean()
data_M_std = X[y==1].std()

wrong_classified_data = scaler.inverse_transform(X_test)[example_num].T.flatten()
wrong_classified_data = pd.Series(wrong_classified_data, index=data_M_mean.index)

wrong_classified_data_n_2 = (wrong_classified_data-data_M_mean)/data_M_std

res_df = pd.concat([wrong_classified_data, data_M_mean, data_M_std, wrong_classified_data_n_2], axis=1)
res_df = res_df.rename(columns={0: 'Example', 1: 'Feature Mean', 2: 'Feature STD', 3: 'Normalized Example'})
res_df.style.apply(lambda x: ["background: green" if idx==3 and abs(v)>abs(3*x.iloc[2]) else "" for idx,v in enumerate(x)], axis = 1)

##### Example 112 - (TP) - Malignant classified as Malignant on all the Best Classifiers

In [None]:
example_num = np.array([112])

data_M_mean = X[y==1].mean()
data_M_std = X[y==1].std()

wrong_classified_data = scaler.inverse_transform(X_test)[example_num].T.flatten()
wrong_classified_data = pd.Series(wrong_classified_data, index=data_M_mean.index)

wrong_classified_data_n_2 = (wrong_classified_data-data_M_mean)/data_M_std

res_df = pd.concat([wrong_classified_data, data_M_mean, data_M_std, wrong_classified_data_n_2], axis=1)
res_df = res_df.rename(columns={0: 'Example', 1: 'Feature Mean', 2: 'Feature STD', 3: 'Normalized Example'})
res_df.style.apply(lambda x: ["background: green" if idx==3 and abs(v)>abs(3*x.iloc[2]) else "" for idx,v in enumerate(x)], axis = 1)

#### More Failure Classifications

##### Example 108 - (FN) - Malignant classified as Benign on most of the Best Classifiers

In [None]:
example_num = np.array([108])

data_M_mean = X[y==1].mean()
data_M_std = X[y==1].std()

wrong_classified_data = scaler.inverse_transform(X_test)[example_num].T.flatten()
wrong_classified_data = pd.Series(wrong_classified_data, index=data_M_mean.index)

wrong_classified_data_n_2 = (wrong_classified_data-data_M_mean)/data_M_std

res_df = pd.concat([wrong_classified_data, data_M_mean, data_M_std, wrong_classified_data_n_2], axis=1)
res_df = res_df.rename(columns={0: 'Example', 1: 'Feature Mean', 2: 'Feature STD', 3: 'Normalized Example'})
res_df.style.apply(lambda x: ["background: green" if idx==3 and abs(v)>abs(3*x.iloc[2]) else "" for idx,v in enumerate(x)], axis = 1)

##### Example 109 - (FN) - Malignant classified as Benign on all the Best Classifiers

In [None]:
example_num = np.array([109])

data_M_mean = X[y==1].mean()
data_M_std = X[y==1].std()

wrong_classified_data = scaler.inverse_transform(X_test)[example_num].T.flatten()
wrong_classified_data = pd.Series(wrong_classified_data, index=data_M_mean.index)

wrong_classified_data_n_2 = (wrong_classified_data-data_M_mean)/data_M_std

res_df = pd.concat([wrong_classified_data, data_M_mean, data_M_std, wrong_classified_data_n_2], axis=1)
res_df = res_df.rename(columns={0: 'Example', 1: 'Feature Mean', 2: 'Feature STD', 3: 'Normalized Example'})
res_df.style.apply(lambda x: ["background: green" if idx==3 and abs(v)>abs(3*x.iloc[2]) else "" for idx,v in enumerate(x)], axis = 1)

##### Example 13 - (FP) - Benign classified as Malignant

In [None]:
example_num = np.array([13])

data_B_mean = X[y==0].mean()
data_B_std = X[y==0].std()

wrong_classified_data = scaler.inverse_transform(X_test)[example_num].T.flatten()
wrong_classified_data = pd.Series(wrong_classified_data, index=data_B_mean.index)

wrong_classified_data_n_2 = (wrong_classified_data-data_B_mean)/data_B_std

res_df = pd.concat([wrong_classified_data, data_B_mean, data_B_std, wrong_classified_data_n_2], axis=1)
res_df = res_df.rename(columns={0: 'Example', 1: 'Feature Mean', 2: 'Feature STD', 3: 'Normalized Example'})
res_df.style.apply(lambda x: ["background: green" if idx==3 and abs(v)>abs(3*x.iloc[2]) else "" for idx,v in enumerate(x)], axis = 1)

### K-Fold Assessment

#### Base Models Init

In [16]:
BEST_MODELS_BASE = {}

##### SLP Model

In [17]:
# deterministic fitting   
os.environ['PYTHONHASHSEED']=str(0)
utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()  

BEST_MODEL = models.Sequential(name=f'SLP')

BEST_MODEL.add(layers.Input((30,)))
BEST_MODEL.add(layers.Dense(1, activation='sigmoid'))

BEST_MODELS_BASE['SLP'] = BEST_MODEL

##### MLP 30x30x1 

In [18]:
os.environ['PYTHONHASHSEED']=str(0)
utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()  

BEST_MODEL = models.Sequential(name=f'MLP_30x30x1')

BEST_MODEL.add(layers.Input((30,)))
BEST_MODEL.add(layers.Dense(30, activation='relu'))
BEST_MODEL.add(layers.Dense(1, activation='sigmoid'))

BEST_MODELS_BASE['MLP_30x30x1'] = BEST_MODEL

##### MLP 30x30x10x1 and at Second Layer: Dropout 0.1 & Activation Function Silu

In [19]:
os.environ['PYTHONHASHSEED']=str(0)
utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()  

BEST_MODEL = models.Sequential(name=f'MLP_30x10x10-D1-silu-x1')

BEST_MODEL.add(layers.Input((30,)))
BEST_MODEL.add(layers.Dense(10, activation='relu'))
BEST_MODEL.add(layers.Dense(10, activation='silu'))
BEST_MODEL.add(layers.Dropout(0.1))
BEST_MODEL.add(layers.Dense(1, activation='sigmoid'))

BEST_MODELS_BASE['MLP_30x10x10-D1-silu-x1'] = BEST_MODEL

In [24]:
os.environ['PYTHONHASHSEED']=str(0)
utils.set_random_seed(0)
tf.config.experimental.enable_op_determinism()  
    
result_df = {}
for model_name, best_model in BEST_MODELS_BASE.items():

    fold_k = StratifiedKFold(n_splits = 5).split(X.values, y)
    
    result_df[model_name] = pd.DataFrame(columns=['k', 'Train Accuracy', 'Train Loss' , 'Validation Accuracy', 'Validation Loss',  'Validation Recall'])

    for k , (train, valid) in enumerate(fold_k):

        scaler = RobustScaler()

        X_train_f = scaler.fit_transform(X.values[train])
        X_val_f = scaler.transform(X.values[valid])
        
        dataset = {'X_train': X_train_f, 'y_train': y[train], 'X_val': X_val_f, 'y_val': y[valid]}
        model = models.clone_model(best_model)
        model.set_weights(BEST_MODELS_BASE[model_name].get_weights())

        history, model = model_fit(model, optimizer=BEST_OPTIMIZER, learning_rate=BEST_LEARNING_RATE, epochNum=2500, batchSize=BEST_BATCH_SIZE, en_reduce_lr=True, en_early_stopping=True, Dataset=dataset, verbose=0)    
        
        proc_data = proc_history(history)

        new_row = {'k': k, **proc_data}

        result_df[model_name].loc[len(result_df[model_name])] = new_row

In [25]:
result_df

{'SLP':    k  Train Accuracy  Train Loss  Validation Accuracy  Validation Loss  \
 0  0        0.978022    0.065383             0.982456         0.064810   
 1  1        0.984615    0.061824             0.956140         0.086625   
 2  2        0.978022    0.078468             0.973684         0.070427   
 3  3        0.982418    0.053811             0.973684         0.106260   
 4  4        0.978070    0.061502             0.982301         0.068523   
 
    Validation Recall  
 0           0.976744  
 1           0.953488  
 2           0.976190  
 3           0.952381  
 4           1.000000  ,
 'MLP_30x30x1':    k  Train Accuracy  Train Loss  Validation Accuracy  Validation Loss  \
 0  0        0.982418    0.035360             0.991228         0.055420   
 1  1        0.986813    0.036418             0.964912         0.081204   
 2  2        0.978022    0.088186             0.964912         0.096443   
 3  3        0.986813    0.038147             0.973684         0.106251   
 4  4 

### K-Fold Results on Best Models

#### SLP

In [30]:
print('SLP K-Fold:')
res_df = result_df['SLP']
print(f"Train Accuracy Mean: {res_df['Train Accuracy'].mean()}, Train Accuracy STD: {res_df['Train Accuracy'].std()}") 
print(f"Validation Accuracy Mean: {res_df['Validation Accuracy'].mean()}, Validation Accuracy STD: {res_df['Validation Accuracy'].std()}")    
print(f"Validation Loss Mean: {res_df['Validation Loss'].mean()}, Validation Loss STD: {res_df['Validation Loss'].std()}")    
print(f"Validation Recall Mean: {res_df['Validation Recall'].mean()}, Validation Recall STD: {res_df['Validation Recall'].std()}")    
res_df

SLP K-Fold:
Train Accuracy Mean: 0.9802294254302979, Train Accuracy STD: 0.003099700583179626
Validation Accuracy Mean: 0.9736531496047973, Validation Accuracy STD: 0.010711867522905585
Validation Loss Mean: 0.0793291449546814, Validation Loss STD: 0.017214117209590714
Validation Recall Mean: 0.9717607855796814, Validation Recall STD: 0.01969371116721688


Unnamed: 0,k,Train Accuracy,Train Loss,Validation Accuracy,Validation Loss,Validation Recall
0,0,0.978022,0.065383,0.982456,0.06481,0.976744
1,1,0.984615,0.061824,0.95614,0.086625,0.953488
2,2,0.978022,0.078468,0.973684,0.070427,0.97619
3,3,0.982418,0.053811,0.973684,0.10626,0.952381
4,4,0.97807,0.061502,0.982301,0.068523,1.0


#### MLP_30x30x1

In [31]:
print('MLP_30x30x1 K-Fold:')
res_df = result_df['MLP_30x30x1']
print(f"Train Accuracy Mean: {res_df['Train Accuracy'].mean()}, Train Accuracy STD: {res_df['Train Accuracy'].std()}") 
print(f"Validation Accuracy Mean: {res_df['Validation Accuracy'].mean()}, Validation Accuracy STD: {res_df['Validation Accuracy'].std()}")    
print(f"Validation Loss Mean: {res_df['Validation Loss'].mean()}, Validation Loss STD: {res_df['Validation Loss'].std()}")    
print(f"Validation Recall Mean: {res_df['Validation Recall'].mean()}, Validation Recall STD: {res_df['Validation Recall'].std()}")    
res_df

MLP_30x30x1 K-Fold:
Train Accuracy Mean: 0.9863745927810669, Train Accuracy STD: 0.007357128043880629
Validation Accuracy Mean: 0.9754075407981873, Validation Accuracy STD: 0.011413552262145516
Validation Loss Mean: 0.07828205525875091, Validation Loss STD: 0.024131153422157544
Validation Recall Mean: 0.9671096324920654, Validation Recall STD: 0.026617252368817138


Unnamed: 0,k,Train Accuracy,Train Loss,Validation Accuracy,Validation Loss,Validation Recall
0,0,0.982418,0.03536,0.991228,0.05542,0.976744
1,1,0.986813,0.036418,0.964912,0.081204,0.930233
2,2,0.978022,0.088186,0.964912,0.096443,0.97619
3,3,0.986813,0.038147,0.973684,0.106251,0.952381
4,4,0.997807,0.018307,0.982301,0.052092,1.0


#### MLP_30x10x10-D1-silu-x1

In [32]:
print('MLP_30x10x10-D1-silu-x1 K-Fold:')
res_df = result_df['MLP_30x10x10-D1-silu-x1']
print(f"Train Accuracy Mean: {res_df['Train Accuracy'].mean()}, Train Accuracy STD: {res_df['Train Accuracy'].std()}") 
print(f"Validation Accuracy Mean: {res_df['Validation Accuracy'].mean()}, Validation Accuracy STD: {res_df['Validation Accuracy'].std()}")    
print(f"Validation Loss Mean: {res_df['Validation Loss'].mean()}, Validation Loss STD: {res_df['Validation Loss'].std()}")    
print(f"Validation Recall Mean: {res_df['Validation Recall'].mean()}, Validation Recall STD: {res_df['Validation Recall'].std()}")    
res_df

MLP_30x10x10-D1-silu-x1 K-Fold:
Train Accuracy Mean: 0.9784692525863647, Train Accuracy STD: 0.004772750541060379
Validation Accuracy Mean: 0.9648812294006348, Validation Accuracy STD: 0.013820619544004176
Validation Loss Mean: 0.09037462025880813, Validation Loss STD: 0.025918232302843774
Validation Recall Mean: 0.9625692009925843, Validation Recall STD: 0.04524964219991074


Unnamed: 0,k,Train Accuracy,Train Loss,Validation Accuracy,Validation Loss,Validation Recall
0,0,0.982418,0.051955,0.95614,0.124006,0.976744
1,1,0.975824,0.070166,0.947368,0.106543,0.883721
2,2,0.971429,0.088107,0.973684,0.065959,0.97619
3,3,0.98022,0.050913,0.964912,0.091397,0.97619
4,4,0.982456,0.043729,0.982301,0.063967,1.0


### Notebook End

In [None]:
notebook_end = time.time()
notebook_elapsed = notebook_end-notebook_start
print('Finished Notebook Run!')
print(f'Elapsed Run Time: {math.floor(notebook_elapsed/3600)} (h), {math.floor(notebook_elapsed%3600/60)} (m), {math.floor(notebook_elapsed%60)} (s)')