Start

In [1]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import configparser

import xgboost as xgb
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score, balanced_accuracy_score

import tensorflow as tf
from keras.layers import Dense, Dropout, BatchNormalization, Input
from keras.optimizers import Adam
from keras import Model

config = configparser.ConfigParser()
config.read('config.ini')

pd.options.display.max_columns=None
pd.options.display.max_rows=None
pd.options.display.max_colwidth=None

In [2]:
PATH = config['Paths']['processed_data']
df_prepro = pd.read_csv(f'{PATH}train_test_relevant_features.csv', delimiter=",")
print(df_prepro.info())

df_prepro.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int64  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  radius_se                569 non-null    float64
 11  perimeter_se             569 non-null    float64
 12  area_se                  569 non-null    float64
 13  compactness_se           569 non-null    float64
 14  concavity_se             5

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,radius_se,perimeter_se,area_se,compactness_se,concavity_se,concave points_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,1.095,8.589,153.4,0.04904,0.05373,0.01587,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.5435,3.398,74.08,0.01308,0.0186,0.0134,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.7456,4.585,94.03,0.04006,0.03832,0.02058,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.4956,3.445,27.23,0.07458,0.05661,0.01867,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.7572,5.438,94.44,0.02461,0.05688,0.01885,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# define data & labels
df = df_prepro.copy()

X = df.iloc[:, 1:]
y = df.diagnosis

In [4]:
# split into train, test (80/20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)

# normalize data
scaler = StandardScaler()
scaler.fit(X_train)

scaler.transform(X_train)
scaler.transform(X_test)

array([[-0.46649743, -0.13728933, -0.44421138, ..., -0.19435087,
         0.17275669,  0.20372995],
       [ 1.36536344,  0.49866473,  1.30551088, ...,  0.99177862,
        -0.561211  , -1.00838949],
       [ 0.38006578,  0.06921974,  0.40410139, ...,  0.57035018,
        -0.10783139, -0.20629287],
       ...,
       [-0.73547237, -0.99852603, -0.74138839, ..., -0.27741059,
        -0.3820785 , -0.32408328],
       [ 0.02898271,  2.0334026 ,  0.0274851 , ..., -0.49027026,
        -1.60905688, -0.33137507],
       [ 1.87216885,  2.80077153,  1.80354992, ...,  0.7925579 ,
        -0.05868885, -0.09467243]])

## Models

In [5]:
# custom fn since we will be using multiple models
# models are trained on default params

def train_evaluate_model(model, X_train, y_train, X_test,y_test): 
    # model fit   
    model.fit(X_train, y_train)
    # model predict
    predictions = model.predict(X_test)

    # compute metrics for evaluation
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    balanced_accuracy = balanced_accuracy_score(y_test, predictions)

    # metrics into df
    eval_df = pd.DataFrame([[accuracy, f1, precision, recall, balanced_accuracy]], columns=['accuracy', 'f1_score', 'precision', 'recall', 'balanced_accuracy'])

    return eval_df

In [6]:
# new empty df
results = pd.DataFrame()
results

### Using relevant features only

1. Logistic Regression (LogReg)

In [7]:
LR = LogisticRegression(max_iter=2000) # before max_iter: 0.958074
logreg_results = train_evaluate_model(LR, X_train, y_train, X_test, y_test)
logreg_results.insert(0, 'model', 'LogisticRegression')

# concat & sort
results = pd.concat([results, logreg_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074


2. DecisionTree (DT)

In [8]:
DT = DecisionTreeClassifier()
decision_tree_results = train_evaluate_model(DT, X_train, y_train, X_test, y_test)
decision_tree_results.insert(0, 'model', 'DecisionTree')

# concat & sort
results = pd.concat([results, decision_tree_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


3. KNearestNeighbors (KNN)


In [9]:
KNN = KNeighborsClassifier(n_neighbors=12)
knn_results = train_evaluate_model(KNN, X_train, y_train, X_test, y_test)
knn_results.insert(0, 'model', 'KNearsNeighbors')

# concat & sort
results = pd.concat([results, knn_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


4. RandomForestClassifier (RFC)

In [10]:
RFC = RandomForestClassifier()
rfc_results = train_evaluate_model(RFC, X_train, y_train, X_test, y_test)
rfc_results.insert(0, 'model', 'RandomForest')

# concat & sort
results = pd.concat([results, rfc_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


5. XGBoost (XGB)

In [11]:
XGB = xgb.XGBClassifier()
xgb_results = train_evaluate_model(XGB, X_train, y_train, X_test, y_test)
xgb_results.insert(0, 'model', 'XGBoost')

# concat & sort
results = pd.concat([results, xgb_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


6. Naive Bayes (NB)

In [12]:
NB = GaussianNB()
nb_results = train_evaluate_model(NB, X_train, y_train, X_test, y_test)
nb_results.insert(0, 'model', 'NaiveBayes')

# concat & sort
results = pd.concat([results, nb_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


7. Neural Network (NN)

In [13]:
# using random normal distribution & seed for reprod
tf_init = tf.keras.initializers.RandomNormal(seed=0)

def scheduler(epoch, lr):
    '''
    This is a learning rate scheduler function. 
    It adjusts the learning rate during training based on the epoch number. 
    The scheduler starts with a constant learning rate until epoch 50, after which it exponentially decays the learning rate.
    '''
    if epoch < 50:
        return lr
    else:
        return lr * tf.math.exp(-0.1)

# set the callback to scheduler fn
callback = tf.keras.callbacks.LearningRateScheduler(scheduler) 

# define input layer for the nn with a shape based on no. of features in data
inputs = Input(shape=X.shape[-1])

# nn architecture
'''
The following lines create the neural network architecture using Keras Sequential API. 
The architecture consists of several fully connected (Dense) layers with ReLU activation functions and dropout layers to prevent overfitting. 
The network terminates with a single neuron output layer with a sigmoid activation function, which is often used for binary classification tasks.
'''
x = Dense(16, activation='relu', kernel_initializer=tf.keras.initializers.RandomNormal())(inputs)
x = Dropout(0.4)(x)
x = Dense(16,activation='relu',kernel_initializer=tf.keras.initializers.RandomNormal())(inputs)
x = Dropout(0.4)(x)
x = Dense(25,activation='relu',kernel_initializer=tf.keras.initializers.RandomNormal())(x)
x = Dense(1, activation='sigmoid',kernel_initializer=tf.keras.initializers.RandomNormal())(x)

# create keras model
model = Model(inputs=inputs, outputs=x)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train,epochs=140)
predictions = model.predict(X_test)

Epoch 1/140
Epoch 2/140
Epoch 3/140
Epoch 4/140
Epoch 5/140
Epoch 6/140
Epoch 7/140
Epoch 8/140
Epoch 9/140
Epoch 10/140
Epoch 11/140
Epoch 12/140
Epoch 13/140
Epoch 14/140
Epoch 15/140
Epoch 16/140
Epoch 17/140
Epoch 18/140
Epoch 19/140
Epoch 20/140
Epoch 21/140
Epoch 22/140
Epoch 23/140
Epoch 24/140
Epoch 25/140
Epoch 26/140
Epoch 27/140
Epoch 28/140
Epoch 29/140
Epoch 30/140
Epoch 31/140
Epoch 32/140
Epoch 33/140
Epoch 34/140
Epoch 35/140
Epoch 36/140
Epoch 37/140
Epoch 38/140
Epoch 39/140
Epoch 40/140
Epoch 41/140
Epoch 42/140
Epoch 43/140
Epoch 44/140
Epoch 45/140
Epoch 46/140
Epoch 47/140
Epoch 48/140
Epoch 49/140
Epoch 50/140
Epoch 51/140
Epoch 52/140
Epoch 53/140
Epoch 54/140
Epoch 55/140
Epoch 56/140
Epoch 57/140
Epoch 58/140
Epoch 59/140
Epoch 60/140
Epoch 61/140
Epoch 62/140
Epoch 63/140
Epoch 64/140
Epoch 65/140
Epoch 66/140
Epoch 67/140
Epoch 68/140
Epoch 69/140
Epoch 70/140
Epoch 71/140
Epoch 72/140
Epoch 73/140
Epoch 74/140
Epoch 75/140
Epoch 76/140
Epoch 77/140
Epoch 78

In [14]:
# predict using trained nn model
predictions = model.predict(X_test)

# convert predicted probabilities to binary predictions. Note that a threshold of 0.4 was used as recall is more important for medical diagnosis
predictions = (predictions > 0.4) 

# calculate evaluation metrics
accuracy = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
balanced_accuracy = balanced_accuracy_score(y_test, predictions)

# df and insert all evaluation metrics
nn_df = pd.DataFrame([[
    accuracy, 
    f1, 
    precision, 
    recall, 
    balanced_accuracy
]], 
columns=[
    'accuracy', 
    'f1_score', 
    'precision', 
    'recall', 
    'balanced_accuracy'
])

nn_df.insert(0, 'model', 'NeuralNetwork')

# concat & sort
results = pd.concat([results, nn_df], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)



Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


### Using all features

In [15]:
PATH = config['Paths']['processed_data']
df_prepro_all = pd.read_csv(f'{PATH}train_test_all_features.csv', delimiter=",")
print(df_prepro_all.info())

df_prepro_all.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   diagnosis                569 non-null    int64  
 1   radius_mean              569 non-null    float64
 2   texture_mean             569 non-null    float64
 3   perimeter_mean           569 non-null    float64
 4   area_mean                569 non-null    float64
 5   smoothness_mean          569 non-null    float64
 6   compactness_mean         569 non-null    float64
 7   concavity_mean           569 non-null    float64
 8   concave points_mean      569 non-null    float64
 9   symmetry_mean            569 non-null    float64
 10  fractal_dimension_mean   569 non-null    float64
 11  radius_se                569 non-null    float64
 12  texture_se               569 non-null    float64
 13  perimeter_se             569 non-null    float64
 14  area_se                  5

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,texture_se,perimeter_se,area_se,smoothness_se,compactness_se,concavity_se,concave points_se,symmetry_se,fractal_dimension_se,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,1.095,0.9053,8.589,153.4,0.006399,0.04904,0.05373,0.01587,0.03003,0.006193,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,0.5435,0.7339,3.398,74.08,0.005225,0.01308,0.0186,0.0134,0.01389,0.003532,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,0.7456,0.7869,4.585,94.03,0.00615,0.04006,0.03832,0.02058,0.0225,0.004571,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,0.4956,1.156,3.445,27.23,0.00911,0.07458,0.05661,0.01867,0.05963,0.009208,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,0.7572,0.7813,5.438,94.44,0.01149,0.02461,0.05688,0.01885,0.01756,0.005115,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [16]:
# define data & labels
df_all = df_prepro_all.copy()

X_ALL = df_all.iloc[:, 1:]
y_all = df_all.diagnosis

In [17]:
# split into train, test (80/20)
X_train_all_features, X_test_all_features, y_train, y_test = train_test_split(X_ALL, y_all, test_size = 0.2, random_state=42)

# normalize data
scaler = StandardScaler()
scaler.fit(X_train_all_features)

scaler.transform(X_train_all_features)
scaler.transform(X_test_all_features)

array([[-0.46649743, -0.13728933, -0.44421138, ..., -0.19435087,
         0.17275669,  0.20372995],
       [ 1.36536344,  0.49866473,  1.30551088, ...,  0.99177862,
        -0.561211  , -1.00838949],
       [ 0.38006578,  0.06921974,  0.40410139, ...,  0.57035018,
        -0.10783139, -0.20629287],
       ...,
       [-0.73547237, -0.99852603, -0.74138839, ..., -0.27741059,
        -0.3820785 , -0.32408328],
       [ 0.02898271,  2.0334026 ,  0.0274851 , ..., -0.49027026,
        -1.60905688, -0.33137507],
       [ 1.87216885,  2.80077153,  1.80354992, ...,  0.7925579 ,
        -0.05868885, -0.09467243]])

1. Logistic Regression (LogReg)

In [18]:
LR = LogisticRegression(max_iter=3000)
logreg_results = train_evaluate_model(LR, X_train_all_features, y_train, X_test_all_features, y_test)
logreg_results.insert(0, 'model', 'LogisticRegression_all_features')

# concat & sort
results = pd.concat([results, logreg_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
7,LogisticRegression_all_features,0.95614,0.939759,0.975,0.906977,0.946446
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


2. DecisionTree (DT)

In [19]:
DT = DecisionTreeClassifier()
decision_tree_results = train_evaluate_model(DT, X_train_all_features, y_train, X_test_all_features, y_test)
decision_tree_results.insert(0, 'model', 'DecisionTree_all_features')

# concat & sort
results = pd.concat([results, decision_tree_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
7,LogisticRegression_all_features,0.95614,0.939759,0.975,0.906977,0.946446
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277
8,DecisionTree_all_features,0.921053,0.896552,0.886364,0.906977,0.918277


3. KNearestNeighbors (KNN)

In [20]:
KNN = KNeighborsClassifier(n_neighbors=12)
knn_results = train_evaluate_model(KNN, X_train_all_features, y_train, X_test_all_features, y_test)
knn_results.insert(0, 'model', 'KNearsNeighbors_all_features')

# concat & sort
results = pd.concat([results, knn_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
9,KNearsNeighbors_all_features,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
7,LogisticRegression_all_features,0.95614,0.939759,0.975,0.906977,0.946446
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277
8,DecisionTree_all_features,0.921053,0.896552,0.886364,0.906977,0.918277


4. RandomForestClassifier (RFC)

In [21]:
RFC = RandomForestClassifier()
rfc_results = train_evaluate_model(RFC, X_train_all_features, y_train, X_test_all_features, y_test)
rfc_results.insert(0, 'model', 'RandomForest_all_features')

# concat & sort
results = pd.concat([results, rfc_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
9,KNearsNeighbors_all_features,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
10,RandomForest_all_features,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
7,LogisticRegression_all_features,0.95614,0.939759,0.975,0.906977,0.946446
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277


5. XGBoost (XGB)

In [22]:
XGB = xgb.XGBClassifier()
xgb_results = train_evaluate_model(XGB, X_train_all_features, y_train, X_test_all_features, y_test)
xgb_results.insert(0, 'model', 'XGBoost_all_features')

# concat & sort
results = pd.concat([results, xgb_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
9,KNearsNeighbors_all_features,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
10,RandomForest_all_features,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
11,XGBoost_all_features,0.95614,0.941176,0.952381,0.930233,0.951032
7,LogisticRegression_all_features,0.95614,0.939759,0.975,0.906977,0.946446


6. Naive Bayes (NB)

In [23]:
NB = GaussianNB()
nb_results = train_evaluate_model(NB, X_train_all_features, y_train, X_test_all_features, y_test)
nb_results.insert(0, 'model', 'NaiveBayes_all_features')

# concat & sort
results = pd.concat([results, nb_results], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
9,KNearsNeighbors_all_features,0.973684,0.963855,1.0,0.930233,0.965116
12,NaiveBayes_all_features,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
10,RandomForest_all_features,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
11,XGBoost_all_features,0.95614,0.941176,0.952381,0.930233,0.951032


7. Neural Network (NN)

In [24]:
# define input layer for the nn with a shape based on no. of features in data
inputs = Input(shape=X_ALL.shape[-1])

# nn architecture
'''
The following lines create the neural network architecture using Keras Sequential API. 
The architecture consists of several fully connected (Dense) layers with ReLU activation functions and dropout layers to prevent overfitting. 
The network terminates with a single neuron output layer with a sigmoid activation function, which is often used for binary classification tasks.
'''
x = Dense(16, activation='relu', kernel_initializer=tf.keras.initializers.RandomNormal())(inputs)
x = Dropout(0.4)(x)
x = Dense(16,activation='relu',kernel_initializer=tf.keras.initializers.RandomNormal())(inputs)
x = Dropout(0.4)(x)
x = Dense(25,activation='relu',kernel_initializer=tf.keras.initializers.RandomNormal())(x)
x = Dense(1, activation='sigmoid',kernel_initializer=tf.keras.initializers.RandomNormal())(x)

# create keras model
model_all = Model(inputs=inputs, outputs=x)
model_all.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_all.fit(X_train_all_features, y_train, epochs=140)
predictions = model_all.predict(X_test_all_features)

Epoch 1/140
Epoch 2/140
Epoch 3/140
Epoch 4/140
Epoch 5/140
Epoch 6/140
Epoch 7/140
Epoch 8/140
Epoch 9/140
Epoch 10/140
Epoch 11/140
Epoch 12/140
Epoch 13/140
Epoch 14/140
Epoch 15/140
Epoch 16/140
Epoch 17/140
Epoch 18/140
Epoch 19/140
Epoch 20/140
Epoch 21/140
Epoch 22/140
Epoch 23/140
Epoch 24/140
Epoch 25/140
Epoch 26/140
Epoch 27/140
Epoch 28/140
Epoch 29/140
Epoch 30/140
Epoch 31/140
Epoch 32/140
Epoch 33/140
Epoch 34/140
Epoch 35/140
Epoch 36/140
Epoch 37/140
Epoch 38/140
Epoch 39/140
Epoch 40/140
Epoch 41/140
Epoch 42/140
Epoch 43/140
Epoch 44/140
Epoch 45/140
Epoch 46/140
Epoch 47/140
Epoch 48/140
Epoch 49/140
Epoch 50/140
Epoch 51/140
Epoch 52/140
Epoch 53/140
Epoch 54/140
Epoch 55/140
Epoch 56/140
Epoch 57/140
Epoch 58/140
Epoch 59/140
Epoch 60/140
Epoch 61/140
Epoch 62/140
Epoch 63/140
Epoch 64/140
Epoch 65/140
Epoch 66/140
Epoch 67/140
Epoch 68/140
Epoch 69/140
Epoch 70/140
Epoch 71/140
Epoch 72/140
Epoch 73/140
Epoch 74/140
Epoch 75/140
Epoch 76/140
Epoch 77/140
Epoch 78

In [25]:
# predict using trained nn model
predictions_all = model_all.predict(X_test_all_features)

# convert predicted probabilities to binary predictions. Note that a threshold of 0.4 was used as recall is more important for medical diagnosis
predictions_all = (predictions_all > 0.4) 

# calculate evaluation metrics
accuracy = accuracy_score(y_test, predictions_all)
f1 = f1_score(y_test, predictions_all)
precision = precision_score(y_test, predictions_all)
recall = recall_score(y_test, predictions_all)
balanced_accuracy = balanced_accuracy_score(y_test, predictions_all)

# df and insert all evaluation metrics
nn_df = pd.DataFrame([[
    accuracy, 
    f1, 
    precision, 
    recall, 
    balanced_accuracy
]], 
columns=[
    'accuracy', 
    'f1_score', 
    'precision', 
    'recall', 
    'balanced_accuracy'
])

nn_df.insert(0, 'model', 'NeuralNetwork_all_features')

# concat & sort
results = pd.concat([results, nn_df], ignore_index=True)
results.sort_values(by='f1_score',ascending=False)



Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
9,KNearsNeighbors_all_features,0.973684,0.963855,1.0,0.930233,0.965116
12,NaiveBayes_all_features,0.973684,0.963855,1.0,0.930233,0.965116
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
10,RandomForest_all_features,0.964912,0.952381,0.97561,0.930233,0.958074
13,NeuralNetwork_all_features,0.964912,0.95122,1.0,0.906977,0.953488
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032


Prep evaluation df

In [26]:
results

Unnamed: 0,model,accuracy,f1_score,precision,recall,balanced_accuracy
0,LogisticRegression,0.964912,0.952381,0.97561,0.930233,0.958074
1,DecisionTree,0.921053,0.896552,0.886364,0.906977,0.918277
2,KNearsNeighbors,0.973684,0.963855,1.0,0.930233,0.965116
3,RandomForest,0.964912,0.952381,0.97561,0.930233,0.958074
4,XGBoost,0.95614,0.941176,0.952381,0.930233,0.951032
5,NaiveBayes,0.973684,0.963855,1.0,0.930233,0.965116
6,NeuralNetwork,0.964912,0.952381,0.97561,0.930233,0.958074
7,LogisticRegression_all_features,0.95614,0.939759,0.975,0.906977,0.946446
8,DecisionTree_all_features,0.921053,0.896552,0.886364,0.906977,0.918277
9,KNearsNeighbors_all_features,0.973684,0.963855,1.0,0.930233,0.965116


In [28]:
# to csv
OHTERS_PATH = config['Paths']['other_data']
results.to_csv(f'{OHTERS_PATH}evaluation_table.csv', index=False)

End