Import Libraries

In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as mp
import seaborn as sb
import sklearn
import xgboost
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, roc_curve, auc
from xgboost import XGBClassifier
import tensorflow
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import Callback, EarlyStopping
import os
import warnings
warnings.filterwarnings('ignore')

Data pre-processing

In [30]:
def read_and_split():
    """
    Read the data, split the data into Independent and target variables and train, test datasets on 70:30 ratio
    
    Returns: 
        dataframes: independent and target variables split into train and test datasets
        
    """
    data_path = "D:\Data Science\Machine Learning & Deep Learning ANN (Regression & Classification)\Classification Practicals\BreastCancerPrediction\data\Breast_cancer_data.csv"
    print(data_path)
    data = pd.read_csv(data_path)
    X = data.drop(columns='diagnosis')
    y = data['diagnosis']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
    print("Data has been split into train and test")
    
    return X_train, X_test, y_train, y_test

Logistic Regression, Random Forest and Gradient Boosting models

In [32]:
def model_and_evaluate(model_name):
    """
    Build model and evaluate the model against the error metrics

    Args:
        model_name (str): Runs the algorithm based on the model name
    
    Returns:
        prints the accuracy of the model
    """
    X_train, X_test, y_train, y_test = read_and_split()

    # print("Data has been received.")
    
    #choose the model (LogisticRegression, RandomForest, GradientBoosting)
    if model_name=='LogisticRegression':
        model = LogisticRegression()
    elif model_name=='RandomForest':
        model = RandomForestClassifier()
    elif model_name=='GradientBoosting':
        model = GradientBoostingClassifier()
            
    #fit the model on train data
    model.fit(X_train, y_train)

    #predictions on test data 
    predictions = model.predict(X_test)
    print("Predictions have been made.")
    
    #check performance of the model
    accuracy = round(accuracy_score(y_test, predictions),2)*100
    confusion_matrix_report = confusion_matrix(y_test, predictions)
    print("Evaluation metrics:")
    print(f'Accuracy:{accuracy}%')
    print(f'Confusion matrix:{confusion_matrix_report}')
    return 


In [33]:
model_and_evaluate(model_name='RandomForest')

D:\Data Science\Machine Learning & Deep Learning ANN (Regression & Classification)\Classification Practicals\BreastCancerPrediction\data\Breast_cancer_data.csv
Data has been split into train and test
Predictions have been made.
Evaluation metrics:
Accuracy:96.0%
Confusion matrix:[[ 62   1]
 [  5 103]]


Neural Network model

In [None]:
def neural_network_model(units, learning_rate, epochs):
    """
    Build model and evaluate the model against the error metrics

    Args:
        units (int): number of neurons to be built
        learning_rate (float): controls how much to change the model to minimise the error
        epochs (int): the number of times the algorithm sees and processes the entire training dataset during the learning process 
        
    Returns:
       prints the accuracy of the model

    """
    X_train, X_test, y_train, y_test = read_and_split()
    
    model = Sequential()
    model.add(Dense(units=units, activation='sigmoid'))
    model.add(Dense(units=1))
    
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    
    model.fit(x=X_train, y=y_train, batch_size=1, epochs=epochs, callbacks=EarlyStopping(monitor='val_accuracy', patience=15, mode='max'),verbose=1)

    probabilities = model.predict(X_test)
    predictions = (probabilities>0.5).astype(int)
    
    #check performance of the model
    accuracy = round(accuracy_score(y_test, predictions),2)*100
    confusion_matrix_report = confusion_matrix(y_test, predictions)
    print("Evaluation metrics:")
    print(f'Accuracy:{accuracy}%')
    print(f'Confusion matrix:{confusion_matrix_report}')
    return 

In [18]:
neural_network_model(units=40, learning_rate=0.05, epochs=60)

Data has been split into train and test
Epoch 1/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.6157 - loss: 5.8938
Epoch 2/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6197 - loss: 6.0632
Epoch 3/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5849 - loss: 6.6172
Epoch 4/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6409 - loss: 5.7256
Epoch 5/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5817 - loss: 6.6684
Epoch 6/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6311 - loss: 5.8817
Epoch 7/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6600 - loss: 5.4203
Epoch 8/60
[1m398/398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6284 - loss: 5.9245


Monte Carlo Simulation

In [None]:
n_simulations = 100
accuracy_list = []

for i in range(n_simulations):
    X, y = read_and_preprocess()
 
    #split the data into 70:30 ratio
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
    print("Data has been split into train and test")
    
    print(f'Iteration number:{i+1}')    
    #simulate using the best model so far
    model = RandomForestClassifier()
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    accuracy = round(accuracy_score(y_test, predictions),2)*100
    accuracy_list.append(accuracy)
    print(f'Accuracy:{accuracy}')
print(f'Predictions have been made for {n_simulations}')
print(f'Accuracy list for {n_simulations}:{accuracy_list}') 

#calculate mean and standard deviation of accuracy scores from the monte carlo simulation
mean_accuracy = sum(accuracy_list)/len(accuracy_list)
sd_accuracy = np.std(accuracy_list)
print(f'Mean of accuracy scores:{mean_accuracy} with a standard deviation of {sd_accuracy}')

Data has been split into train and test
Iteration number:1
Accuracy:94.0
Data has been split into train and test
Iteration number:2
Accuracy:89.0
Data has been split into train and test
Iteration number:3
Accuracy:94.0
Data has been split into train and test
Iteration number:4
Accuracy:92.0
Data has been split into train and test
Iteration number:5
Accuracy:92.0
Data has been split into train and test
Iteration number:6
Accuracy:92.0
Data has been split into train and test
Iteration number:7
Accuracy:89.0
Data has been split into train and test
Iteration number:8
Accuracy:91.0
Data has been split into train and test
Iteration number:9
Accuracy:96.0
Data has been split into train and test
Iteration number:10
Accuracy:94.0
Data has been split into train and test
Iteration number:11
Accuracy:93.0
Data has been split into train and test
Iteration number:12
Accuracy:89.0
Data has been split into train and test
Iteration number:13
Accuracy:90.0
Data has been split into train and test
Iterati