In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt # Library to help us with plots
from sklearn.preprocessing import MinMaxScaler # Library to help us scale
from sklearn.model_selection import train_test_split # Library to help us split the dataset
from sklearn.ensemble import RandomForestClassifier # Library to enable Random Forest
from sklearn.svm import SVC # Library to enable support vector machine
from sklearn.neighbors import KNeighborsClassifier # Library to enable KNN
from sklearn.tree import DecisionTreeClassifier # Library to enable decision tree
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_auc_score ,roc_curve
from sklearn.model_selection import cross_validate, cross_val_score, KFold, StratifiedKFold, GridSearchCV
import pickle # Library to help us save the model

# Single seed for the notebook; it is also passed as random_state to train_test_split further down
RANDOM_SEED = 1331
# Seed NumPy's global random number generator
np.random.seed(RANDOM_SEED)

Based on the original Cuore project by:
Frank Aiwuyor Ogiemwonyi, Rony Ventura, Tara de Groot,
Eric Vincent Rivas, Silvia Dubon, Dwi Aji Kurnia Putra, and Laureanne van Dijk

## 1. Import DataFrame

In [None]:
# Read the Cleveland file; header=None tells pandas the first line is data, not column names,
# so the columns carry integer labels until they are named in the next cell
df = pd.read_csv("processed.cleveland.csv",header=None) # Opening and reading the file of Cleveland
df.head() # View of the first 5 rows

In [None]:
# Attach readable column names.
# The names follow the attribute list of the original dataset:
# https://archive.ics.uci.edu/dataset/45/heart+disease
df = df.set_axis(
    [
        'Age', 'Sex', 'ChestPain', 'Trestbps', 'Chol', 'Fbs', 'Restecg',
        'Thalach', 'Exang', 'Oldpeak', 'Slope', 'Ca', 'Thal', 'Class',
    ],
    axis=1,
)

In [None]:
# Inspect null counts and the dtype of every column
df.info()

# The leading blank lines separate the info() report from the shape shown as the cell result
print("\n\n Data set shape is :")
df.shape

In [None]:
# Summary statistics of the columns, used to check for outliers (especially in the Age column)
df.describe()

## 2. Preprocess the Dataset

### 2.1 Delete rows with missing values

In [None]:
# "Ca" and "Thal" are of type object, so list the distinct values each one holds
for column in ("Ca", "Thal"):
    print(df[column].unique())

In [None]:
# Missing values are encoded as a question mark; count them per column
print(" \"?\" values in the dataframe ")
print(df.eq("?").sum())

In [None]:
# Keep only the rows where neither "Ca" nor "Thal" holds the "?" placeholder
has_ca = df["Ca"] != '?'
has_thal = df["Thal"] != '?'
df = df[has_ca & has_thal]
# If the gaps were NaN/null instead, they would be removed with: df.dropna(inplace = True)
print(" \"?\" values in the dataframe ")
# Count "?" per column again to verify that none are left
print((df == "?").sum(axis=0))


In [None]:
# Rows were filtered out above, so rebuild a gap-free 0..n-1 index (drop=True discards the old one)
df = df.reset_index(drop=True) # Reset the index to make sure the index matches the number of entries 
# Show the dtype of every column
df.dtypes

### 2.2 Transform Data

In [None]:
# Create a copy of the dataframe to use for the machine learning models.
# The transformation of the data is done in Python instead of Power BI to ease
# reproducibility and demonstrate the procedure.
dfmodel = df.copy()
# "Ca" and "Thal" are still object-dtype columns here (they held the "?" placeholder).
# Convert them to numbers now so the scaler and the classifiers receive numeric input
# instead of relying on an implicit string-to-float conversion.
for column in ("Ca", "Thal"):
    dfmodel[column] = pd.to_numeric(dfmodel[column])
dfmodel.head()

In [None]:
# Now that the data is clean, convert every column to its proper dtype and give the
# coded categorical values readable labels.

# Code -> label for each categorical column
CATEGORY_LABELS = {
    "Sex": {0: 'Female', 1: 'Male'},
    "ChestPain": {1: 'Typical angina', 2: 'Atypical angina', 3: 'Non-anginal pain', 4: 'Asymptomatic'},
    "Fbs": {0: 'Normal', 1: 'High'},
    "Restecg": {0: 'Normal', 1: 'Abnormal', 2: 'Probable hypertrophy'},
    "Exang": {0: 'No', 1: 'Yes'},
    "Slope": {1: 'Upsloping', 2: 'Flat', 3: 'Downsloping'},
    "Thal": {3.0: 'Normal', 6: 'Fixed defect', 7: 'Reversable defect'},
}

# Integer-valued measurements
df["Age"] = df["Age"].astype("int")
df["Thalach"] = df["Thalach"].astype("int")
# "Ca" and "Thal" were read as strings, so parse them before any further conversion
df["Ca"] = pd.to_numeric(df["Ca"]).astype("int")
df["Thal"] = pd.to_numeric(df["Thal"])

# Relabel through the categorical accessor: renaming categories with Series.replace
# is deprecated in pandas, whereas rename_categories is the supported way to do it.
# Codes that are missing from a mapping keep their original value.
for column, labels in CATEGORY_LABELS.items():
    df[column] = df[column].astype("category").cat.rename_categories(labels)

df["Class"] = df["Class"].astype("category")




In [None]:
# Preview the first rows after the dtype conversion and relabelling
df.head()

In [None]:
df.shape # Size after deletion of the rows with missing values
# After the cleaning phase 6 rows have been deleted in total, approximately 1.98%.

### 2.3 Check Class labels


Class = 0: Healthy \
Class >= 1: Not *healthy* \
The prediction outcome is binary (healthy or not healthy), so classes 1, 2, 3 and 4 are all considered not healthy and grouped together into one single class.

In [None]:
# Number of rows per original class label, before the labels are grouped into two classes
df["Class"].value_counts() # Count and show the number of values per class label


In [None]:
# Collapse labels 2-4 into 1, because every value >= 1 means heart disease (not healthy).
CLASS_TO_BINARY = {2: 1, 3: 1, 4: 1}

# Assign the result back instead of calling replace(..., inplace=True) on df["Class"]:
# the in-place call works on a temporary selection and does not update the dataframe
# when pandas Copy-on-Write is active.
# "Class" is categorical in df, so go through int to merge the labels and then
# restore the category dtype.
df["Class"] = df["Class"].astype(int).replace(CLASS_TO_BINARY).astype("category")
# Same grouping for the dataframe used by the models
dfmodel["Class"] = dfmodel["Class"].replace(CLASS_TO_BINARY)

In [None]:
# Number of rows per class label after the grouping step above
df["Class"].value_counts() # Count the amount of values for each class label 

In [None]:
# Give the two class codes readable names.
# rename_categories is used because renaming categories through Series.replace is
# deprecated; astype("category") makes the cell work whatever dtype "Class" currently has.
df['Class'] = df['Class'].astype("category").cat.rename_categories({0: 'Healthy', 1: 'Not Healthy'})
df.head()

### 2.4 Save the CSVs
For reproducibility

In [None]:
# Labelled dataframe (readable category names); index=False leaves the row index out of the file
df.to_csv('Cleveland_TransformedData.csv', index=False)
# Dataframe used by the models further down
dfmodel.to_csv('Cleveland_ModelData.csv', index=False)

## 3. Exploratory Data Analysis (EDA)

A brief data analysis is performed in Python to get a first impression of the data.
The dedicated data analysis is done in Power BI.

Comparison of women vs men 

In [None]:
# Plot the healthy / not healthy counts grouped by sex,
# so the outcome can be compared between women and men.
ylabels = ['Healthy', 'Not Healthy'] # Legend entries, in class order (0, 1)
labels = ("female", "male") # Tick labels for the x-axis, in sex-code order (0, 1)
positions = (0, 1) # Bar-group positions that receive the tick labels

Sex = dfmodel.groupby("Sex")['Class'].value_counts() # Count the classes within each sex
ax = Sex.unstack().plot(kind='bar', legend=False, rot=0) # One bar group per sex, one bar per class
# Replace the raw sex codes on the x-axis with readable labels
ax.set_xticks(positions)
ax.set_xticklabels(labels)
ax.set_xlabel("Sex")
ax.set_ylabel("Number of patients")
ax.set_title("Healthy vs. not healthy patients by sex")
ax.legend(labels=ylabels) # Legend based on who is healthy / not healthy
plt.show() # Show the plot
Sex # Show the exact counts behind the bars

## 4. Standardization

In [None]:
# Columns holding numerical (non-categorical) values
numerical = ["Age", "Trestbps", "Chol", "Thalach", "Oldpeak", "Ca"]
# Only the numerical features are scaled. Cast to float explicitly so a column that is
# still stored as strings is converted here rather than implicitly inside the scaler.
X = dfmodel[numerical].astype(float)
# We compared the evaluation metrics of MinMaxScaler() and StandardScaler();
# MinMaxScaler() scored slightly better, so it is used here.
# NOTE(review): the scaler is fitted on all rows, before the train/test split and the
# cross-validation below, so the held-out rows influence the scaling (data leakage).
# Consider fitting it on the training data only, e.g. inside a Pipeline.
scaler = MinMaxScaler()
# Keep X's index on the scaled frame so assigning it back to the dataframe aligns row by row
df_standard = pd.DataFrame(scaler.fit_transform(X), columns=numerical, index=X.index)

In [None]:
# Build the modelling frame: a copy of dfmodel in which every numerical column is
# swapped for its scaled counterpart (rows are matched on the index)
scaled_columns = {column: df_standard[column] for column in numerical}
standardized = dfmodel.assign(**scaled_columns)
standardized.head() # First 5 rows of the scaled data

## 5. Selecting the features

In [None]:
# Two feature sets are compared: every feature, and a hand-picked selection.
# All features: everything except the outcome column
X_all_features = standardized.drop(columns=['Class'])
# Selected features only
X_selection = pd.DataFrame(standardized[['Chol', 'Age', 'Fbs', 'Trestbps', 'Ca']])
# Class labels (the prediction target)
y = standardized['Class']

## 6. Splitting in training and test sets

In [None]:
# Both feature sets are split with identical settings: 20% test rows and the shared seed
split_options = {"test_size": 0.2, "random_state": RANDOM_SEED}
# Split 1: all features
X1_train, X1_test, y1_train, y1_test = train_test_split(X_all_features, y, **split_options)
# Split 2: selected features
X2_train, X2_test, y2_train, y2_test = train_test_split(X_selection, y, **split_options)

In [None]:
# Helper that reports the shapes of one train/test split
def size(X_train, X_test, y_train, y_test):
    """Print the shape of each of the four parts of a train/test split.

    Every argument only needs a ``.shape`` attribute; nothing is returned.
    """
    report = (
        ("The size of the training set is: ", X_train),
        ("The size of the test set is: ", X_test),
        ("The size of the training target set is: ", y_train),
        ("The size of the test target set is: ", y_test),
    )
    for message, part in report:
        print(message, part.shape)

In [None]:
# Check that the feature and target parts of the all-features split have matching row counts
size(X1_train, X1_test, y1_train, y1_test)

In [None]:
# Check that the feature and target parts of the selected-features split have matching row counts
size(X2_train, X2_test, y2_train, y2_test)

## 7. Choosing models: Random Forest, SVM, KNN, Decision Tree


In [None]:
# Define the classifiers to train and test.
# The randomized models get an explicit random_state so their scores do not depend on
# how much of NumPy's global random stream earlier cells have already consumed.
MODELS_TO_TEST = {
    "RF_10": RandomForestClassifier(n_estimators=10, max_depth=5, random_state=RANDOM_SEED),
    "SVM": SVC(kernel='linear'),
    "KNN": KNeighborsClassifier(n_neighbors=5),
    "DT": DecisionTreeClassifier(max_depth=3, random_state=RANDOM_SEED),
}

# Number of cross-validation folds
NUMBER_OF_SPLITS = 5

# Scoring metrics of interest
SCORING_METRICS = ["accuracy", "precision_macro", "recall_macro", "f1_macro", "roc_auc"]

# Columns of the result tables: the classifier name, the two timing values and one
# "test_<metric>" column per scoring metric, as returned by `cross_validate()`
RESULT_COLUMNS = ["classifier_name", "fit_time", "score_time"] + [
    f"test_{metric}" for metric in SCORING_METRICS
]

# Empty result tables, one per feature set
results_evaluation = pd.DataFrame({column: [] for column in RESULT_COLUMNS})
results_evaluation_selected_features = pd.DataFrame({column: [] for column in RESULT_COLUMNS})

## 8. Train, test and evaluation metrics

In [None]:
#### ITERATION FOR THE EXPERIMENT for Dataset with ALL features

# One row of averaged scores per classifier. The rows are collected in a list and the
# table is built once at the end: this avoids concatenating onto an empty frame inside
# the loop, and re-running the cell rebuilds the table instead of appending duplicates.
evaluation_rows = []

for name, classifier in MODELS_TO_TEST.items():

    print(f"Currently training the classifier {name}.")

    # Evaluation metrics per fold; every classifier receives the scaled feature table
    scores_cv = cross_validate(classifier, X_all_features, y, cv=NUMBER_OF_SPLITS, scoring=SCORING_METRICS)

    # Average every value returned by cross_validate (timings and test scores) over the folds
    dict_this_result = {"classifier_name": name}
    for metric_name, score_per_fold in scores_cv.items():
        dict_this_result[metric_name] = score_per_fold.mean()

    evaluation_rows.append(dict_this_result)

# Result table for the dataset with all features
results_evaluation = pd.DataFrame(evaluation_rows)

print("The experimental setup has finished")

In [None]:
# Show the fold-averaged evaluation metrics for the dataset with all features
results_evaluation

In [None]:
# Mean of every metric per classifier for the all-features dataset, indexed by classifier name
average_score_classifier = results_evaluation.groupby("classifier_name").mean()

In [None]:
# Bar chart of the average accuracy per classifier (all-features dataset)
accuracy_by_classifier = average_score_classifier["test_accuracy"]
ax = accuracy_by_classifier.plot.bar()
ax.set_title("Average accuracy per classifier among dataset")
ax.set_xlabel("Classifiers")
plt.show()
# Exact values behind the bars
accuracy_by_classifier

In [None]:
average_score_classifier["test_roc_auc"].plot.bar() # Plot the average ROC-AUC from the all features dataset
plt.title("Average roc-auc per classifier among dataset") # Set a title for the plot
plt.xlabel("Classifiers") # Set a label for the X-axis
plt.show() # Show the plot
# Exact values behind the bars
average_score_classifier["test_roc_auc"]

In [None]:
#### ITERATION FOR THE EXPERIMENT for Dataset with selected features

# One row of averaged scores per classifier. The rows are collected in a list and the
# table is built once at the end: this avoids concatenating onto an empty frame inside
# the loop, and re-running the cell rebuilds the table instead of appending duplicates.
evaluation_rows_selected = []

for name, classifier in MODELS_TO_TEST.items():

    print(f"Currently training the classifier {name}.")

    # Evaluation metrics per fold; every classifier receives the selected, scaled features
    scores_cv_selected = cross_validate(classifier, X_selection, y, cv=NUMBER_OF_SPLITS, scoring=SCORING_METRICS)

    # Average every value returned by cross_validate (timings and test scores) over the folds
    dict_this_result_selected = {"classifier_name": name}
    for metric_name, score_per_fold in scores_cv_selected.items():
        dict_this_result_selected[metric_name] = score_per_fold.mean()

    evaluation_rows_selected.append(dict_this_result_selected)

# Result table for the dataset with the selected features
results_evaluation_selected_features = pd.DataFrame(evaluation_rows_selected)

print("The experimental setup has finished")

In [None]:
# Show the fold-averaged evaluation metrics for the dataset with the selected features
results_evaluation_selected_features

## 9. Tune the models for better performance by optimizing the parameters




In [None]:
# Base estimator: an SVC with default settings; the grid below supplies the parameters
clf = SVC()
# Two separate grids are searched:
#   * polynomial kernel with degree 2, 3 or 4
#   * linear and RBF kernels with C = 1, 10, 100 or 1000
poly_grid = {'kernel': ['poly'], 'degree': [2, 3, 4]}
linear_rbf_grid = {'kernel': ['linear', 'rbf'], 'C': [1, 10, 100, 1000]}
param_grid = [poly_grid, linear_rbf_grid]
# Grid search with 5-fold cross-validation (fitted in the next cell)
gs = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)

In [None]:
# Run the grid search on the training part of the all-features split
gs.fit(X1_train, y1_train)
# Show the best estimator found by the search
gs.best_estimator_

In [None]:
# Cross-validated score of the best parameter combination found by the grid search
gs.best_score_

## 10. Saving the best model

In [None]:
# Evaluate the best model found by the grid search on the held-out test set
y1_predicted = gs.predict(X1_test)
print(classification_report(y1_test, y1_predicted))
# ROC AUC needs a continuous score per sample. Hard 0/1 predictions reduce the ROC
# curve to a single operating point, so use the SVC decision function instead.
y1_scores = gs.decision_function(X1_test)
print(f'model  AUC score: {roc_auc_score(y1_test, y1_scores)}')

In [None]:
# Method for plotting the AUC/ROC curve
def plot_roc_curve(true_y, y_prob):
    """Plot the ROC curve for the given labels and scores.

    Parameters
    ----------
    true_y : array-like
        True binary class labels.
    y_prob : array-like
        Score per sample for the positive class, passed to ``roc_curve`` as the score
        argument (probabilities or decision-function values).

    The curve is drawn on the current matplotlib axes; nothing is returned.
    """
    # Use the arguments rather than the notebook globals, so the function plots
    # whatever data the caller passes in
    fpr, tpr, thresholds = roc_curve(true_y, y_prob)
    plt.plot(fpr, tpr)
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')

In [None]:
# Continuous decision scores of the best model; hard class predictions would collapse
# the ROC curve to a single operating point
y1_scores = gs.decision_function(X1_test)
plot_roc_curve(y1_test, y1_scores) # Plot AUC/ROC curve
print(f'model  AUC score: {roc_auc_score(y1_test, y1_scores)}')

In [None]:
# The trained model is stored as trained_model_cuore.pickle.
# FOLDER_PATH is prepended as-is; it is empty here, so the file lands in the working directory.
FOLDER_PATH = ""
trained_model_filename = f"{FOLDER_PATH}trained_model_cuore.pickle"

In [None]:
# Where to write, and what to write: the best estimator found by the grid search
file_path = trained_model_filename
data_to_save = gs.best_estimator_


In [None]:
# Open the target file in binary write mode and serialize the model into it;
# the with-block closes the file afterwards
with open(file_path, "wb") as writeFile:
    pickle.dump(data_to_save, writeFile)

In [None]:
# Start from an empty variable so the next cell demonstrably loads the model from disk
loaded_model = None

In [None]:
# Load the model back from the file written above.
# NOTE: pickle.load can execute arbitrary code, so only load pickle files you created
# yourself or otherwise trust.
with open(trained_model_filename, "rb") as readFile:
    loaded_model = pickle.load(readFile)

In [None]:
pip list

In [None]:
# Confirm that the loaded model gives the same metrics as the one that was trained
Y_predicted_loaded_model = loaded_model.predict(X1_test)
print(classification_report(y1_test, Y_predicted_loaded_model))
# ROC AUC is computed from continuous decision scores; hard 0/1 predictions would
# reduce the ROC curve to a single operating point
Y_scores_loaded_model = loaded_model.decision_function(X1_test)
print(f'model  AUC score: {roc_auc_score(y1_test, Y_scores_loaded_model)}')