# Unsupervised Clustering on a heart failure dataset: a BMI500 class project

### Sina Dabiri
### sdabiri@emory.edu

#### Version 1.0

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# For this clustering project, scikit-learn's PCA is used; it is provided by the `decomposition` module.
from sklearn import decomposition

In [None]:
# Load the heart-failure clinical records. The CSV has been put in the project
# folder, so it is read through a path relative to the working directory.
#
# Dataset source:
#   Davide Chicco, Giuseppe Jurman: Machine learning can predict survival of
#   patients with heart failure from serum creatinine and ejection fraction alone.
#   BMC Medical Informatics and Decision Making 20, 16 (2020)
csv_path = "heart_failure_clinical_records_dataset.csv"
heart_df = pd.read_csv(csv_path)
# Leave the frame as the cell's last expression so the notebook renders it.
heart_df

## Divide the dataset into its features and its true label, X and Y.

In [None]:
# Build the feature matrix X from the twelve columns listed below.
# The outcome column DEATH_EVENT is deliberately not part of this list.
feature_columns = [
    'age',
    'anaemia',
    'creatinine_phosphokinase',
    'diabetes',
    'ejection_fraction',
    'high_blood_pressure',
    'platelets',
    'serum_creatinine',
    'serum_sodium',
    'sex',
    'smoking',
    'time',
]
X = heart_df[feature_columns]
# Display X so it can be inspected in the notebook output.
X

In [None]:
# True-label vector Y: the DEATH_EVENT column of the dataset.
Y = heart_df.loc[:, 'DEATH_EVENT']
# Display Y so it can be inspected in the notebook output.
Y


In [None]:
# Principal Component Analysis (PCA): dimensionality reduction of the data down
# to the two principal components that describe most of the variation.
# The PCA model is fitted on the X dataframe.
# NOTE(review): X is handed to PCA as-is (no scaling step in this cell). PCA is
# sensitive to feature scale, so confirm whether X should be standardized first.
n_components = 2
pca = decomposition.PCA(n_components=n_components)
# Kept as the last expression so the notebook shows the fitted estimator.
pca.fit(X)

In [None]:
# Print the variance explained by each of the fitted principal components
# (the two largest variances, since the model keeps two components).
component_variances = pca.explained_variance_
print(component_variances)


In [None]:
# Project the X dataframe onto the coordinates of the two principal
# components (PC1, PC2).
# `pca` has already been fitted on this same X in an earlier cell, so only
# `transform` is needed here; `fit_transform` would refit the identical model
# on the identical data before projecting.
X_reduced = pca.transform(X)
# Show the shape of the projection: one row per sample, one column per component.
X_reduced.shape

In [None]:
# Scatter plot of the samples in the PC1/PC2 plane, one series per outcome class.
plt.figure(figsize=(10, 5))
colors = ['blue', 'turquoise']
target_names = ['alive', 'dead']
lw = 2
# Label value 0 is drawn as 'alive' and label value 1 as 'dead'; enumerate
# yields exactly those two label values in order.
for class_value, (class_color, class_name) in enumerate(zip(colors, target_names)):
    # Boolean mask selecting the rows of X_reduced that belong to this class.
    in_class = Y == class_value
    plt.scatter(
        X_reduced[in_class, 0],
        X_reduced[in_class, 1],
        color=class_color,
        alpha=.8,
        lw=lw,
        label=class_name,
    )
plt.legend(loc='best', shadow=False, scatterpoints=1)
plt.xlabel('PC1')
plt.ylabel('PC2')
# The trailing semicolon suppresses the textual repr of the title object in the notebook.
plt.title('PCA of Heart dataset');