In [100]:
#Import relevant libraries for use within the program
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler # Import the StandardScaler function
from sklearn.model_selection import train_test_split # Import train_test_split function from scikit learn
from sklearn.cluster import KMeans

In [101]:
# Show every float in pandas output with exactly three digits after the decimal point
pd.set_option('display.float_format', '{:.3f}'.format)

# Data Loading

We load data from the heart csv file into a pandas dataframe.

In [102]:
# Location of the heart dataset, relative to the notebook's working directory
path = './heart.csv'
# Read the whole CSV file into a pandas DataFrame
dataframe = pd.read_csv(filepath_or_buffer=path)

In [103]:
# Display the first 10 records to check that the data loaded correctly
# (the bare expression on the last line of the cell is what gets rendered)
dataframe.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1
6,58,1,0,114,318,0,2,140,0,4.4,0,3,1,0
7,55,1,0,160,289,0,0,145,1,0.8,1,1,3,0
8,46,1,0,120,249,0,0,144,0,0.8,2,0,3,0
9,54,1,0,122,286,0,0,116,1,3.2,1,2,2,0


In [104]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column
dataframe.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0,1025.0
mean,54.434,0.696,0.942,131.612,246.0,0.149,0.53,149.114,0.337,1.072,1.385,0.754,2.324,0.513
std,9.072,0.46,1.03,17.517,51.593,0.357,0.528,23.006,0.473,1.175,0.618,1.031,0.621,0.5
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,48.0,0.0,0.0,120.0,211.0,0.0,0.0,132.0,0.0,0.0,1.0,0.0,2.0,0.0
50%,56.0,1.0,1.0,130.0,240.0,0.0,1.0,152.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,275.0,0.0,1.0,166.0,1.0,1.8,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


## Split Known (Feature) and Unknown (Target) Variables

In [105]:
# Feature matrix: every column apart from the label
X = dataframe.drop(columns=['target'])

# Label vector: the 'target' column on its own
Y = dataframe['target']

## Cleaning

In [106]:
def rem_outlier(dataframe, feature):
    """Clip the outliers of one column to its mean +/- 3 standard deviations.

    Despite the name, no rows are removed: every value beyond a bound is
    replaced by that bound, so the frame keeps its shape.

    Args:
        dataframe: pandas DataFrame to modify in place.
        feature: label of the numeric column to clip (age, cholesterol, ...).

    Returns:
        None. ``dataframe[feature]`` is overwritten with the clipped values,
        stored as floats.
    """
    # Work on a float copy of the column: the bounds are floats, and writing
    # them into an integer column through .loc depends on implicit upcasting
    # that recent pandas versions deprecate.
    column = dataframe[feature].astype(float)

    # Both bounds are derived from the data as it was before any clipping.
    centre = column.mean()
    spread = 3 * column.std()

    # A NaN bound (e.g. a single-row column has no std) clips nothing.
    dataframe[feature] = column.clip(lower=centre - spread, upper=centre + spread)

def clean(df, features=('trestbps', 'chol', 'thalach', 'age')):
    """Clip outliers, in place, in each of the given columns of ``df``.

    Args:
        df: DataFrame to modify; handed to ``rem_outlier`` once per column.
        features: labels of the columns to process. Defaults to the four
            columns this notebook cleans.
    """
    for var in features:
        rem_outlier(df, var)

In [107]:
# Standardise the columns that are treated as continuous

def standardise(x, columns=('thal', 'age', 'trestbps', 'chol', 'thalach', 'oldpeak')):
    """Replace each selected column of ``x`` with its z-scores, in place.

    A z-score is ``(value - column mean) / column standard deviation``.

    Args:
        x: DataFrame to modify in place.
        columns: labels of the columns to standardise. Defaults to the six
            columns this notebook standardises.

    Returns:
        None.
    """
    # NOTE(review): 'thal' is in the default list although its values in this
    # dataset are small integer codes -- confirm it is meant to be scaled as a
    # continuous variable rather than one-hot encoded.
    for var in columns:
        centred = x[var] - x[var].mean()
        spread = x[var].std()
        # A constant (or single-row) column has a zero (or NaN) standard
        # deviation; dividing by it would fill the column with NaN, so such a
        # column is only centred.
        x[var] = centred / spread if spread > 0 else centred

## Training

> K-means clustering.

We use only two (2) clusters because the result variable, 'target', is a binary category: it indicates whether or not the patient has heart disease.

We do not need to split our dataset into training and testing sets because K-means is an unsupervised machine learning technique: the model is fed only the inputs and infers the cluster assignments from them, without ever seeing the target. The accuracy is then determined by the proportion of predicted clusters that equal the target categories.

In [108]:
def train(x, random_state=0):
    """Fit a two-cluster K-means model to ``x`` and return the fitted estimator.

    Args:
        x: feature data handed to ``KMeans.fit``.
        random_state: seed forwarded to ``KMeans`` (default 0).

    Returns:
        The fitted ``KMeans`` object.
    """
    # k-means++ initialisation, two clusters; ``fit`` returns the estimator itself
    return KMeans(
        n_clusters=2,
        init='k-means++',
        random_state=random_state,
    ).fit(x)

## Analysis

In [109]:
def analyse(model, x, y, verbose=False):
    """Score a fitted two-cluster model against the known labels and report it.

    Cluster ids are arbitrary, so when fewer than half of the samples match
    the labels the predicted ids are swapped (0 <-> 1) and the accuracy is
    taken from the swapped assignment.

    Args:
        model: fitted clustering model exposing ``labels_`` and ``inertia_``
            (and ``cluster_centers_`` when ``verbose`` is true).
        x: feature DataFrame with one row per entry of ``model.labels_``.
            It is modified in place: 'Predicted' and 'Actual' columns are
            added. Pass a copy to keep the original untouched.
        y: true labels, stored as ``x['Actual']``.
        verbose: when true, also display the cluster centres, the first ten
            rows of ``x`` and the number of correctly labelled samples.

    Returns:
        The accuracy as a percentage (float).
    """
    if verbose:
        # Centres are computed over the feature columns only, so show them
        # before the bookkeeping columns are added to x.
        display(pd.DataFrame(model.cluster_centers_, columns=x.columns.values, index=['a', 'b']))

    # Add K-Means predicted values
    x['Predicted'] = model.labels_

    # Add actual observed target values
    x['Actual'] = y

    n_correct_labels = (x['Actual'] == x['Predicted']).sum()
    n_total_values = x.shape[0]

    accuracy = (n_correct_labels / n_total_values) * 100

    if accuracy < 50:
        # The clusters were numbered the other way round from the labels.
        x['Predicted'] = 1 - x['Predicted']
        n_correct_labels = n_total_values - n_correct_labels
        accuracy = 100 - accuracy

    if verbose:
        # Show the first 10 rows, including the two new columns
        display(x.head(10))
        print(f"{n_correct_labels} out of {n_total_values} samples were correctly labeled.")
    print(f"Accuracy: {accuracy :0.2f}%")
    print(f"Inertia: {model.inertia_ :0.2f}")

    return float(accuracy)

In [110]:
def classify(x, y=None):
    """Cluster ``x`` with K-means and print how well the clusters match the labels.

    Args:
        x: feature DataFrame. ``analyse`` receives a copy, so the columns it
            adds do not appear on ``x``.
        y: true labels. Defaults to the notebook-level ``Y`` so existing
            ``classify(x)`` calls keep working.
    """
    if y is None:
        y = Y
    analyse(train(x), x.copy(), y.copy())

In [111]:
# Clip outliers in X. This modifies X in place, so re-running the cell
# recomputes the bounds from data that has already been clipped.
clean(X)

In [112]:
# Convert the selected columns of X to z-scores (modifies X in place)
standardise(X)

In [113]:
# Cluster using every feature column and report accuracy against Y
classify(X)

Accuracy: 77.76%
Inertia: 7893.16


In [114]:
# Analyse again with the uncorrelated variables cut out
# (trestbps, fbs, chol and restecg are dropped from the feature set)
X_trimmed = X.drop(['trestbps', 'fbs', 'chol', 'restecg'], axis=1)
classify(X_trimmed)

Accuracy: 78.83%
Inertia: 5457.45


In [115]:
# Analyse the trimmed feature set with the coded columns (sex, cp, exang,
# slope, ca) expanded into one indicator column per value
X_binary = pd.get_dummies(X_trimmed, columns=['sex', 'cp', 'exang', 'slope', 'ca'])
classify(X_binary)

Accuracy: 77.17%
Inertia: 5340.93
