In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import altair as alt

In [19]:
# Read the heart-disease CSV directly from its GitHub-hosted copy, then
# display the resulting frame as the cell output.
hd_df = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/sharmaroshan/Heart-UCI-Dataset/master/heart.csv"
)
hd_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs. Only the training split is displayed.
train_df, test_df = train_test_split(
    hd_df,
    test_size=0.2,
    random_state=123,
)
train_df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
102,63,0,1,140,195,0,1,179,0,0.0,2,2,2,1
261,52,1,0,112,230,0,1,160,0,0.0,2,1,2,0
228,59,1,3,170,288,0,0,159,0,0.2,1,0,3,0
288,57,1,0,110,335,0,1,143,1,3.0,1,1,3,0
78,52,1,1,128,205,1,1,184,0,0.0,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
106,69,1,3,160,234,1,0,131,0,0.1,1,1,2,1
83,52,1,3,152,298,1,1,178,0,1.2,1,0,3,1
17,66,0,3,150,226,0,1,114,0,2.6,0,0,2,1
230,47,1,2,108,243,0,1,152,0,0.0,2,0,2,0


In [8]:
# Sanity checks on the training split.
# 1) Column dtypes -- all dtypes are numeric.
print(train_df.dtypes)
# 2) True if at least one cell anywhere in the frame is missing.
print(train_df.isna().to_numpy().any())

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object
False


In [36]:
# Analysis of all the feature distributions
#
# Draws one binned histogram per column of the training split (laid out four
# charts per row), with the bars coloured by the `target` class.

feat_dist = alt.Chart(train_df).mark_bar().encode(
    x=alt.X(alt.repeat(), type='quantitative', bin=alt.Bin(maxbins=30)),
    y="count()",
    # `target` is an int64 column, so the bare "target" shorthand is inferred
    # as quantitative and rendered with a continuous colour ramp. Declaring it
    # nominal (:N) gives each class its own discrete colour and legend entry.
    color=alt.Color("target:N")
).properties(
    height=200,
    width=200
).repeat(
    train_df.columns.tolist(),
    columns=4
)
feat_dist

To see if any particular feature might be more useful when predicting the target class, the distribution of each feature was plotted and coloured according to the target value. The features are either continuous or discrete, and the continuous features have very different distribution means and spreads. The discrete features are either binary (counts at 0 or 1) or spread over three to five distinct values. This preliminary visual analysis helps to highlight what sort of preprocessing will be needed in order to incorporate the features into our predictive model.

In [35]:
# Pairwise Spearman rank correlations between every column of the training
# split (features and target), rendered as a table whose cell backgrounds are
# shaded by value.
corr = (
    train_df
    .corr(method='spearman')
    .style
    .background_gradient()
)
corr

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
age,1.0,-0.10247,-0.049172,0.271771,0.200683,0.139318,-0.145674,-0.37581,0.084412,0.264123,-0.227631,0.32929,0.10951,-0.246855
sex,-0.10247,1.0,-0.072659,-0.052798,-0.229346,0.021746,-0.010062,-0.060713,0.143083,0.116345,-0.086217,0.159675,0.224597,-0.300256
cp,-0.049172,-0.072659,1.0,0.101039,-0.093341,0.105884,-0.015156,0.28719,-0.43991,-0.142195,0.154317,-0.219713,-0.250448,0.492618
trestbps,0.271771,-0.052798,0.101039,1.0,0.10753,0.128277,-0.098703,0.023733,-0.016931,0.104438,-0.06218,0.080532,0.085912,-0.07876
chol,0.200683,-0.229346,-0.093341,0.10753,1.0,0.014047,-0.172206,-0.005508,0.068945,0.006043,-0.007742,0.092018,0.062581,-0.063723
fbs,0.139318,0.021746,0.105884,0.128277,0.014047,1.0,-0.067943,-0.015961,-0.025412,0.004244,-0.01039,0.145015,0.003502,0.001544
restecg,-0.145674,-0.010062,-0.015156,-0.098703,-0.172206,-0.067943,1.0,0.090826,-0.043779,-0.036127,0.113372,-0.102546,0.019667,0.145023
thalach,-0.37581,-0.060713,0.28719,0.023733,-0.005508,-0.015961,0.090826,1.0,-0.382661,-0.429001,0.478091,-0.248855,-0.145527,0.424708
exang,0.084412,0.143083,-0.43991,-0.016931,0.068945,-0.025412,-0.043779,-0.382661,1.0,0.295124,-0.262287,0.151482,0.264815,-0.414515
oldpeak,0.264123,0.116345,-0.142195,0.104438,0.006043,0.004244,-0.036127,-0.429001,0.295124,1.0,-0.612155,0.204812,0.26462,-0.400458


To see if any features are particularly correlated with each other, or if the target is correlated with any one particular feature, Spearman's correlation values were calculated for all features and the target value. Based on these values, no two features seem highly correlated with each other, but many slight correlations exist, as seen in the darker-coloured (bluer) cells. To examine this further visually, we will plot all of the features against each other to view these relationships.

In [39]:
# Plotting the correlations
#
# Builds a scatter-plot matrix: every column of the training split is plotted
# against every other column (and itself), one small panel per pair.
cols = train_df.columns.tolist()

# Neither axis is forced to start at zero. Several columns (e.g. trestbps,
# thalach) take values far from zero, so a zero-anchored axis would squeeze the
# points into a narrow band. Using the same scale on x and y also keeps the
# two panels for each pair of columns consistent with each other.
corr_viz = alt.Chart(train_df).mark_point(opacity=0.4, size=10).encode(
    x=alt.X(alt.repeat("row"), type='quantitative', scale=alt.Scale(zero=False)),
    y=alt.Y(alt.repeat("column"), type='quantitative', scale=alt.Scale(zero=False))
).properties(
    height=170,
    width=170
).repeat(
    column=cols,
    row=cols
)
corr_viz

In agreement with the correlation values, the visualizations show no significant correlations between any two features or between any feature and the target class. This might mean that, for this dataset, the prediction of a heart disease diagnosis depends on the accumulation of multiple features rather than on just one feature.