# Unsupervised Clustering on a heart failure dataset: a python implementation of principal component analysis for Dr. Qiu

### Sina Dabiri
### sdabiri@emory.edu

#### Version 1.0

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# For this clustering project, the scikit learn PCA is going to be used from the decomposition library.
from sklearn import decomposition

In [3]:
"""
The heart dataset's CSV has been put in the project folder. The source of this dataset is: 

Dataset from Davide Chicco, Giuseppe Jurman: Machine learning can predict survival of patients with heart failure 
                                             from serum creatinine and ejection fraction alone. BMC Medical Informatics and Decision Making 20, 16 (2020)
""" 
heart_df = pd.read_csv("heart_failure_clinical_records_dataset.csv")
heart_df

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [4]:
# checking for missing values.
heart_df.isnull().sum() 

age                         0
anaemia                     0
creatinine_phosphokinase    0
diabetes                    0
ejection_fraction           0
high_blood_pressure         0
platelets                   0
serum_creatinine            0
serum_sodium                0
sex                         0
smoking                     0
time                        0
DEATH_EVENT                 0
dtype: int64

In [5]:
# looking for missing data and general stats
heart_df.info()
heart_df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


## Divide the dataset into it's features and it's true label, X and Y.

In [8]:
# Setting the X feature and inspecting it
X = heart_df[['age', 'creatinine_phosphokinase', 
       'ejection_fraction', 'platelets',
       'serum_creatinine', 'serum_sodium']]
X

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium
0,75.0,582,20,265000.00,1.9,130
1,55.0,7861,38,263358.03,1.1,136
2,65.0,146,20,162000.00,1.3,129
3,50.0,111,20,210000.00,1.9,137
4,65.0,160,20,327000.00,2.7,116
...,...,...,...,...,...,...
294,62.0,61,38,155000.00,1.1,143
295,55.0,1820,38,270000.00,1.2,139
296,45.0,2060,60,742000.00,0.8,138
297,45.0,2413,38,140000.00,1.4,140


In [11]:
X.describe()

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium
count,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,581.839465,38.083612,263358.029264,1.39388,136.625418
std,11.894809,970.287881,11.834841,97804.236869,1.03451,4.412477
min,40.0,23.0,14.0,25100.0,0.5,113.0
25%,51.0,116.5,30.0,212500.0,0.9,134.0
50%,60.0,250.0,38.0,262000.0,1.1,137.0
75%,70.0,582.0,45.0,303500.0,1.4,140.0
max,95.0,7861.0,80.0,850000.0,9.4,148.0


In [15]:
# Principal Component Analysis (PCA): dimentionally reducing the data to two principal components that discribe most of the variation. Fitting the X dataframe using PCA.
# 1- Standardize: z= (value-mean)/stdandard deviation
X_std = (X-X.mean())/X.std()
X_std

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium
0,1.190949,0.000165,-1.527998,1.678834e-02,0.489237,-1.501519
1,-0.490457,7.502063,-0.007065,7.523047e-09,-0.284076,-0.141739
2,0.350246,-0.449186,-1.527998,-1.036336e+00,-0.090748,-1.728149
3,-0.910808,-0.485257,-1.527998,-5.455595e-01,0.489237,0.084892
4,0.350246,-0.434757,-1.527998,6.507077e-01,1.262550,-4.674340
...,...,...,...,...,...,...
294,0.098035,-0.536789,-0.007065,-1.107907e+00,-0.284076,1.444672
295,-0.490457,1.276075,-0.007065,6.791087e-02,-0.187412,0.538152
296,-1.331160,1.523425,1.851853,4.893878e+00,-0.574068,0.311522
297,-1.331160,1.887234,-0.007065,-1.261275e+00,0.005916,0.764782


In [16]:
# 2- Calculate Covariance Matrix to identify correlations
X_cov = X_std.cov()
X_cov

Unnamed: 0,age,creatinine_phosphokinase,ejection_fraction,platelets,serum_creatinine,serum_sodium
age,1.0,-0.081584,0.060098,-0.052354,0.159187,-0.045966
creatinine_phosphokinase,-0.081584,1.0,-0.04408,0.024463,-0.016408,0.05955
ejection_fraction,0.060098,-0.04408,1.0,0.072177,-0.011302,0.175902
platelets,-0.052354,0.024463,0.072177,1.0,-0.041198,0.062125
serum_creatinine,0.159187,-0.016408,-0.011302,-0.041198,1.0,-0.189095
serum_sodium,-0.045966,0.05955,0.175902,0.062125,-0.189095,1.0


In [27]:
# 3a - Compute eigenvector and eigenvalues to identify the principal components
X_cov_eigVal, X_cov_eigVec = np.linalg.eig(X_cov)
print("The eig values are: ", X_cov_eigVal)
print("The eig vector is: ", X_cov_eigVec)

The eig values are:  [1.34527211 1.15364884 0.72934891 0.82384075 0.96051258 0.98737681]
The eig vector is:  [[ 0.37378302 -0.54112916 -0.18085313 -0.68105796 -0.25693875  0.0700012 ]
 [-0.20003939  0.37714412 -0.23800393 -0.04940716 -0.62019152  0.61157347]
 [-0.2894085  -0.66521196 -0.41537521  0.53078794 -0.11191279  0.08330005]
 [-0.30235872 -0.12261369  0.02737793 -0.25799978  0.65475011  0.63050293]
 [ 0.54864476 -0.19438773  0.54333596  0.39105499 -0.08800989  0.45312098]
 [-0.58658448 -0.26384445  0.66494074 -0.18026441 -0.31682336 -0.10623682]]


In [63]:
# 3b -sort the eigenvalues and vectors based on size in descending order
X_variance = X_cov_eigVal/ X_cov_eigVal.sum()
index = np.argsort(X_variance)[::-1]
X_var_sor = X_variance[index]
X_cov_eigVal_sor = X_cov_eigVal[index]
X_cov_eigVec_sor = X_cov_eigVec[:,index]
print(X_var_sor)
print(X_cov_eigVec_sor)

[0.22421202 0.19227481 0.1645628  0.16008543 0.13730679 0.12155815]
[[ 0.37378302 -0.54112916  0.0700012  -0.25693875 -0.68105796 -0.18085313]
 [-0.20003939  0.37714412  0.61157347 -0.62019152 -0.04940716 -0.23800393]
 [-0.2894085  -0.66521196  0.08330005 -0.11191279  0.53078794 -0.41537521]
 [-0.30235872 -0.12261369  0.63050293  0.65475011 -0.25799978  0.02737793]
 [ 0.54864476 -0.19438773  0.45312098 -0.08800989  0.39105499  0.54333596]
 [-0.58658448 -0.26384445 -0.10623682 -0.31682336 -0.18026441  0.66494074]]


In [80]:
# 3c: choose top 95% variance 
var_sum = 0
idx = []
for i in range(0, len(X_var_sor)):
    var_sum += X_var_sor[i]
    if var_sum < 0.95: 
        idx.append(i)
        exit
#         X_var_sor[idx]
print("The eigen values that describe the 95% of variance are: \n", X_var_sor[idx])
print("The eigen vectors that explain the 95% of variance are: \n", X_cov_eigVec_sor[idx])

The eigen values that describe the 95% of variance are: 
 [0.22421202 0.19227481 0.1645628  0.16008543 0.13730679]
The eigen vectors that explain the 95% of variance are: 
 [[ 0.37378302 -0.54112916  0.0700012  -0.25693875 -0.68105796 -0.18085313]
 [-0.20003939  0.37714412  0.61157347 -0.62019152 -0.04940716 -0.23800393]
 [-0.2894085  -0.66521196  0.08330005 -0.11191279  0.53078794 -0.41537521]
 [-0.30235872 -0.12261369  0.63050293  0.65475011 -0.25799978  0.02737793]
 [ 0.54864476 -0.19438773  0.45312098 -0.08800989  0.39105499  0.54333596]]


In [71]:
# 4 - Printing out the eigenvalues that explain 95% of the variances.
len(X_var_sor)


6

In [29]:
# 5 -  Transforming the X dataframe to the coordinate of the two principal components (PC1, PC2).
