# Exploratory Data Analysis

### Importing essential libraries

In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
sns.set()

### Importing dataset 

In [2]:
#Import basic libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
data = pd.read_csv('Data/selected_data.csv', index_col=0)
data.reset_index(inplace=True, drop=True)
data.head()

Unnamed: 0,Age Category,Sex,Height,Weight,BMI,Marital Status,Income Category,Education Level,General Health,No of Days of Poor Physical Health,...,Stroke,Depressive Disorder,Kidney Disease,Cancer,Arthritis,Time since last routine checkup,High blood sugar/Diabetes Test,Workout,Diabetes,Heavy Drinker
0,5,0,1.6,69.0,27.0,1.0,5,2,2.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1,0.0,1
1,6,0,1.65,54.0,20.0,3.0,5,3,3.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1,0.0,1
2,3,0,1.68,91.0,32.0,1.0,5,3,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1,0.0,1
3,5,1,1.83,104.0,31.0,5.0,4,2,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1,0.0,1
4,6,0,1.57,56.0,23.0,3.0,3,3,1.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1,0.0,1


### Basic exploration of dataset

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414871 entries, 0 to 414870
Data columns (total 24 columns):
 #   Column                                        Non-Null Count   Dtype  
---  ------                                        --------------   -----  
 0   Age Category                                  414871 non-null  int64  
 1   Sex                                           414871 non-null  int64  
 2   Height                                        414871 non-null  float64
 3   Weight                                        414871 non-null  float64
 4   BMI                                           414871 non-null  float64
 5   Marital Status                                414871 non-null  float64
 6   Income Category                               414871 non-null  int64  
 7   Education Level                               414871 non-null  int64  
 8   General Health                                414871 non-null  float64
 9   No of Days of Poor Physical Health            41

The variable Coronary Heart Disease/Myocardial Infarction is renamed to CHD/MI for easier visualisation as it is our respondent variable.

In [5]:
data = data.rename(columns = {'Coronary Heart Disease/Myocardial Infarction':'CHD/MI'})

## Numerical Variables

### The numerical variables are:
Height, Weight, BMI, Physical Health,  Mental Health, Total Sleep

In [6]:
num_var = ['Height', 'Weight', 'BMI', 'Physical Health', 'Mental Health', 'Total Sleep']

num_data = pd.DataFrame(data[num_var])
num_data.head()

KeyError: "['Physical Health', 'Mental Health', 'Total Sleep'] not in index"

In [None]:
num_data.describe()

In [None]:
num_data.info()

Plot the boxplot, histogram, violin plot and pairplot for the numerical variables

In [None]:
#Draw the distributions of all variables
f, axes = plt.subplots(6, 3, figsize=(18, 24))

axes[1][1].set_xlim([20, 300])

w = 1
count = 0
for var in num_data:
    sb.boxplot(data = num_data[var], orient = "h", ax = axes[count,0])
    sb.histplot(data = num_data[var], ax = axes[count,1], bins=np.arange(num_data[var].min(), num_data[var].max() + w, w))
    sb.violinplot(data = num_data[var], orient = "h", ax = axes[count,2])
    count += 1

We want to observe the relationship between the variables and heart disease. Hence, Coronary Heart Disease/Myocardial Infarction is added to the dataset.

In [None]:
num_data = pd.concat([num_data, data['CHD/MI']], axis = 1)

Plot a heatmap to analyse the correlation for the numerical variables 

In [None]:
# Correlation Matrix
print(num_data.corr())

sns.set(font_scale=1.2)
# Plot the heatmap for the corr matrixb
f = plt.figure(figsize=(16, 16))
sb.heatmap(num_data.corr(), vmin = -1, vmax = 1, linewidths = 1,
           annot = True, fmt = ".2f", annot_kws = {"size": 12})


We observe that there is hardly any correlation between CHD/MI and the numerical variables. However, they could have a non-linear relationship.

In [None]:
num_data.describe()

Next, the numerical variables are plotted against CHD/MI to observe the distribution of values.

In [None]:
num_var = ['Height', 'Weight', 'BMI', 'Physical Health', 'Mental Health', 'Total Sleep']

sb.set(font_scale=1)
f, axes = plt.subplots(6,1,figsize=(16,24))
count = 0
for var in num_var:
    sb.boxplot(x = var, y = 'CHD/MI', data = data, orient = 'h', ax = axes[count])
    count += 1

It can be seen that most of the distributions are similar except physical health and weight. <br>
Physical Health here denotes the number of days of poor physical health the respondent experienced in the past 30 days and it shows that people who have more poor days might be more likely to have heart disease. <br>
There is a small difference in the distribution of Weight that indicates people with higher weight might be more likely to have heart disease.

## Categorical Variables

### The categorical variables are:
Age Category, Sex, Marital Status, Income Category, Education Level, General Health, Physical Health, Mental Health, Total Sleep, Smoker Status, Coronary Heart Disease/Myocardial Infarction, Stroke, Depressive Disorder, Kidney Disease, Cancer, Arthritis, Time since last routine checkup, Heavy Drinker, High blood sugar/Diabetes Test, Workout, Diabetes

In [None]:
#Drop the numerical variables to obtain the categorical data
num_var = ['Height', 'Weight', 'BMI', 'Physical Health', 'Mental Health', 'Total Sleep']

cat_data = data.drop(columns=num_var)
cat_data.head()

In [None]:
cat_data.info()

Convert the variables to categorical.

In [None]:
cat_data = cat_data.astype('category')
cat_data.info()

In [None]:
cat_data.describe()

Plot the countplot for the categorical variables.

In [None]:
f, axes = plt.subplots(18, 1, figsize=(16,80))

sb.set(font_scale=1)
count = 0
for var in cat_data:
    sb.countplot(y = var, data = cat_data, ax = axes[count])
    count += 1

To check the effect of the categorical variables on CHD/MI. Stacked Bar Chart (SBC) is used because we considered that our dataset is very imbalanced. SBC can show the percentage of CHD/MI values at each category level which allows better comparison between the different levels.

In [None]:
#Stacked Bar Chart
plt.figure(figsize=(32,14))
for var in cat_data:
    table=pd.crosstab(data[var],data['CHD/MI'])
    table.div(table.sum(1).astype(float), axis=0).plot(kind='bar', stacked=True)
    plt.xlabel(var)
    plt.ylabel('CHD/MI')
    plt.legend(loc=3)
    plt.xticks(rotation=0)

The observations are as follow:
- Age Category: The likelihood of having CHD/MI increases with age
- Sex: Men has a slightly higher percentage of having CHD/MI
- Marital Status: Widowed respondents are more likely to have CHD/MI compared to the other statuses
- Income Category: Decreasing trend of CHD/MI as income increases
- Education Level: Higher education levels show lower CHD/MI percentage
- General Health (Higher value indicates poorer health): The trend could be because of having CHD/MI affects the respondent's answer
- Smoker Status: Percentage of respondent who has CHD/MI is lower for the Never Smoked category
- Stroke: Respondent who has stroke are more likely to have CHD/MI
- Depressive Disorder: Respondent with depressive disorder has a higher percentage of CHD/MI
- Kidney Disease: The percentage of CHD/MI values of kidney disease patients are higher
- Cander: Cancer patients are more likely to have CHD/MI
- Arthritis: Having arthritis indicates a higher chance of the respondent having CHD/MI
- Time since last routine checkup: Results show that recent medical checkups might mean the respondent is more likely to have CHD/MI
- High blood sugar/Diabetes Test: Chart shows that test for high blood sugar or diabetes within past three years points to slightly higher percentage of having CHD/MI 
- Workout: People who workout have a lower chance of having CHD/MI
- Diabetes: Diabetes patients have a higher percentage of having CHD/MI
- Heavy Drinker: Heavy drinkers are slightly more likely to have CHD/MI


## Principal Component Analysis (PCA)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Drop target 
drop_list = [ 'CHD/MI']
features = list(data.drop(drop_list, axis=1).columns)

# Standardise the data 
x = data.loc[:, features].values
y = data.loc[:, ['CHD/MI']].values
x = StandardScaler().fit_transform(x)

pd.DataFrame(data = x, columns = features).head()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Drop target 
drop_list = ['CHD/MI']
features = list(data.drop(drop_list, axis=1).columns)

# Standardise the data 
x = data.loc[:, features].values
y = data.loc[:, ['CHD/MI']].values
x = StandardScaler().fit_transform(x)

pd.DataFrame(data = x, columns = features).head()

Principle Component Analysis (PCA)
Purpose of PCA: simplifies the complexity in high-dimensional data while retaining trends and patterns
Since PCA yields a feature subspace that maximizes the variance along the axes, 
it makes sense to standardize the data,  which is a requirement for the optimal performance of 
many machine learning algorithms.

In [None]:
# Importing key libraries for PCA 

from sklearn.decomposition import PCA

## Visualising result from PCA 

# Checking min no. of principal component to retain 90% of the variance 
pca = PCA(.90)   
pca.fit(x)
n = pca.n_components_

# PCA Projection to n dimension
pca_x = pca.fit_transform(x)
pca_names = ['PCA_{x}'.format(x=num) for num in range(1,n+1)]
pca_data = pd.DataFrame(data=pca_x, columns=pca_names)

f_data = pd.concat([pca_data, data['CHD/MI']], axis=1)

In [None]:
## Explained Variance plot for 90% explained variance
f, axes = plt.subplots(1,1, figsize = (13, 5))
total = pca.explained_variance_ratio_.sum()*100

print("Total Explained Variance: {:.2f}%".format(total))
sns.barplot(x=pca_names, y=pca.explained_variance_ratio_)

PCA Projection to 2D

In [None]:
pca = PCA(n_components=2)

In [None]:
principalComponents = pca.fit_transform(x)

In [None]:
principalDf = pd.DataFrame(data = principalComponents
             , columns = ['principal component 1', 'principal component 2'])

In [None]:
finalDf = pd.concat([principalDf, data[['CHD/MI']]], axis = 1)
finalDf.head(5)

In [None]:
# Use a PCA projection to 2d to visualize the entire data set. 
fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 Component PCA', fontsize = 20)


targets = [0, 1]
colors = ['r', 'g']
for target, color in zip(targets,colors):
    indicesToKeep = finalDf['CHD/MI'] == target
    ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
               , finalDf.loc[indicesToKeep, 'principal component 2']
               , c = color
               , s = 50)
ax.legend(targets)
ax.grid()

The explained variance tells us how much information (variance) can be attributed to each of the 
principal components.

In [None]:
print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))