In [77]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

Note: Data is from the UCI Machine Learning Repository:

Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [78]:
# Location of the processed Cleveland heart-disease CSV; the literal is split
# across two lines for readability and joins back into one URL.
data = (
    'https://raw.githubusercontent.com/Codecademy/Master-Statistics-Live-Series/'
    'main/Codecademy%20Live%20Stats%20%231/processed.cleveland.data.csv'
)
heart = pd.read_csv(filepath_or_buffer=data)

In [79]:
# Preview the first rows to confirm the CSV loaded with the expected columns.
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


- age: age in years
- sex: 1=male, 0=female
- cp: chest pain type
 - Value 1: typical angina
 - Value 2: atypical angina
 - Value 3: non-anginal pain
 - Value 4: asymptomatic
- trestbps: resting blood pressure (in mm Hg on admission to the hospital)
- chol: serum cholesterol in mg/dl
- fbs: (fasting blood sugar > 120 mg/dl) (1 = true; 0 = false)
- restecg: resting electrocardiographic results
 - Value 0: normal
 - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 
 - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
- thalach: maximum heart rate achieved in an exercise test
- exang: exercise induced angina (1 = yes; 0 = no)
- oldpeak: ST depression induced by exercise relative to rest
- slope: the slope of the peak exercise ST segment
 - Value 1: upsloping
 - Value 2: flat
 - Value 3: downsloping
- ca: number of major vessels (0-3) colored by fluoroscopy
- thal: 
 - Value 3: normal
 - Value 6: fixed defect
 - Value 7: reversable defect
- heart_disease: diagnosis of heart disease (angiographic disease status)
 - Value 0: < 50% diameter narrowing
 - Value 1: > 50% diameter narrowing
"\[This field\] refers to the presence of heart disease in the patient. It is integer valued from 0 (no presence) to 4. Experiments with the Cleveland database have concentrated on simply attempting to distinguish presence (values 1,2,3,4) from absence (value 0)."


In [80]:
# Summary statistics for every column; include='all' also reports the
# non-numeric columns (unique / top / freq rows).
heart.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
unique,,,,,,,,,,,,5.0,4.0,
top,,,,,,,,,,,,0.0,3.0,
freq,,,,,,,,,,,,176.0,166.0,
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,,,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,,,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,,,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,,,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,,,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,,,2.0


In [81]:
# List each column's non-null count and dtype to spot columns that were not
# parsed as numbers.
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   age            303 non-null    float64
 1   sex            303 non-null    float64
 2   cp             303 non-null    float64
 3   trestbps       303 non-null    float64
 4   chol           303 non-null    float64
 5   fbs            303 non-null    float64
 6   restecg        303 non-null    float64
 7   thalach        303 non-null    float64
 8   exang          303 non-null    float64
 9   oldpeak        303 non-null    float64
 10  slope          303 non-null    float64
 11  ca             303 non-null    object 
 12  thal           303 non-null    object 
 13  heart_disease  303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [82]:
# Show the distinct raw values of the two columns that were read in as object dtype.
for column_name in ('ca', 'thal'):
    print(heart[column_name].unique())

['0.0' '3.0' '2.0' '1.0' '?']
['6.0' '3.0' '7.0' '?']


Replace the missing-value placeholder '?' with NaN throughout the dataset.

In [83]:
# '?' is the placeholder used for missing entries; swap it for NaN in every column.
heart = heart.replace(to_replace=['?'], value=np.nan)

After the replacement, convert the data type of `ca` from 'object' to 'float'.

In [84]:
# Cast only the 'ca' column to float, then list every column's dtype to confirm.
heart = heart.astype({'ca': 'float'})
print(heart.dtypes)

age              float64
sex              float64
cp               float64
trestbps         float64
chol             float64
fbs              float64
restecg          float64
thalach          float64
exang            float64
oldpeak          float64
slope            float64
ca               float64
thal              object
heart_disease      int64
dtype: object



*   changing cp values from the floats 1.0, 2.0, 3.0, 4.0 to their descriptive labels

*   checking if data type of cp changes from 'float64' to 'object' and the unique values

*   print the first five rows of data to see the change


In [85]:
# cp: chest pain type. An explicit code -> label dict keeps each pairing visible
# on one line (two parallel lists can silently drift out of alignment) and
# matches the dict-based recoding used for the other categorical columns.
cp_labels = {
    1.0: 'typical angina',
    2.0: 'atypical angina',
    3.0: 'non-anginal pain',
    4.0: 'asymptomatic',
}
heart['cp'] = heart['cp'].replace(cp_labels)
# Optional sanity checks on the recoded column:
# print(heart.dtypes)
# print(heart['cp'].unique())
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
0,63.0,1.0,typical angina,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,asymptomatic,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,asymptomatic,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,non-anginal pain,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,atypical angina,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


Re-run the summary to see how `cp` changed after recoding: it now reports `unique`, `top` and `freq` instead of numeric statistics.

In [86]:
# Summary of all columns again; cp now holds text labels, so it is described
# by unique / top / freq rather than mean / std.
heart.describe(include='all')

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,heart_disease
count,303.0,303.0,303,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,299.0,301.0,303.0
unique,,,4,,,,,,,,,,3.0,
top,,,asymptomatic,,,,,,,,,,3.0,
freq,,,144,,,,,,,,,,166.0,
mean,54.438944,0.679868,,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.672241,,0.937294
std,9.038662,0.467299,,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,0.937438,,1.228536
min,29.0,0.0,,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0,,0.0
25%,48.0,0.0,,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,,0.0
50%,56.0,1.0,,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0,,0.0
75%,61.0,1.0,,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,1.0,,2.0


In [87]:
# sex: 1=male, 0=female -- swap the numeric codes for readable labels.
sex_labels = {0.0: 'female', 1.0: 'male'}
heart['sex'] = heart['sex'].replace(sex_labels)
heart.dtypes

age              float64
sex               object
cp                object
trestbps         float64
chol             float64
fbs              float64
restecg          float64
thalach          float64
exang            float64
oldpeak          float64
slope            float64
ca               float64
thal              object
heart_disease      int64
dtype: object

In [88]:
# restecg: resting electrocardiographic results, recoded to short text labels.
restecg_labels = {
    # Value 0: normal
    0.0: 'normal',
    # Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
    1.0: 'ST-T wave abnormality',
    # Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
    2.0: 'left ventricular hypertrophy',
}
heart['restecg'] = heart['restecg'].replace(restecg_labels)
heart.dtypes

age              float64
sex               object
cp                object
trestbps         float64
chol             float64
fbs              float64
restecg           object
thalach          float64
exang            float64
oldpeak          float64
slope            float64
ca               float64
thal              object
heart_disease      int64
dtype: object

In [89]:
# One-hot encoding of 'restecg': the column is replaced by one indicator
# column per label.
# The indicator dtype is pinned to 'uint8' so the new columns hold 0/1 integers
# on every pandas version; pandas 2.0 changed the default indicator dtype to
# bool, which would display True/False instead.
heart = pd.get_dummies(data=heart, columns=['restecg'], dtype='uint8')
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,heart_disease,restecg_ST-T wave abnormality,restecg_left ventricular hypertrophy,restecg_normal
0,63.0,male,typical angina,145.0,233.0,1.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0,1,0
1,67.0,male,asymptomatic,160.0,286.0,0.0,108.0,1.0,1.5,2.0,3.0,3.0,2,0,1,0
2,67.0,male,asymptomatic,120.0,229.0,0.0,129.0,1.0,2.6,2.0,2.0,7.0,1,0,1,0
3,37.0,male,non-anginal pain,130.0,250.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,0,0,1
4,41.0,female,atypical angina,130.0,204.0,0.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0,1,0


Remove the 'restecg_ST-T wave abnormality' and 'restecg_left ventricular hypertrophy' columns created by the one-hot encoding, keeping only 'restecg_normal'.

In [90]:
# Discard the two non-normal indicator columns by name.
abnormal_ecg_columns = [
    'restecg_ST-T wave abnormality',
    'restecg_left ventricular hypertrophy',
]
heart = heart.drop(columns=abnormal_ecg_columns)
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,heart_disease,restecg_normal
0,63.0,male,typical angina,145.0,233.0,1.0,150.0,0.0,2.3,3.0,0.0,6.0,0,0
1,67.0,male,asymptomatic,160.0,286.0,0.0,108.0,1.0,1.5,2.0,3.0,3.0,2,0
2,67.0,male,asymptomatic,120.0,229.0,0.0,129.0,1.0,2.6,2.0,2.0,7.0,1,0
3,37.0,male,non-anginal pain,130.0,250.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0,1
4,41.0,female,atypical angina,130.0,204.0,0.0,172.0,0.0,1.4,1.0,0.0,3.0,0,0


In [91]:
# slope: the slope of the peak exercise ST segment
#   1 -> upsloping, 2 -> flat, 3 -> downsloping
# The labels are stored as an ordered categorical so they rank
# upsloping < flat < downsloping.
correct_order = ['upsloping', 'flat', 'downsloping']
slope_labels = {1.0: 'upsloping', 2.0: 'flat', 3.0: 'downsloping'}
slope_as_text = heart['slope'].replace(slope_labels)
heart['slope'] = pd.Categorical(values=slope_as_text, categories=correct_order, ordered=True)
heart['slope'].unique()

['downsloping', 'flat', 'upsloping']
Categories (3, object): ['upsloping' < 'flat' < 'downsloping']

In [92]:
# Integer position of each row's category within the categorical's ordering.
heart['slope_codes'] = heart['slope'].cat.codes
# pandas encodes a missing category as code -1; leave those rows out so the
# sentinel cannot pull the median toward the lowest category.
valid_codes = heart.loc[heart['slope_codes'] >= 0, 'slope_codes']
median_index = valid_codes.median()
# NOTE: int() truncates, so a median that lands halfway between two codes is
# reported as the lower of the two categories.
print(correct_order[int(median_index)])

flat


In [98]:
# thal: 3 = normal; 6 = fixed defect; 7 = reversable defect
# The keys are strings because this column still holds text values.
thal_labels = {
    '3.0': 'normal',
    '6.0': 'fixed defect',
    '7.0': 'reversable defect',
}
heart['thal'] = heart['thal'].replace(thal_labels)
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,heart_disease,restecg_normal,slope_codes
0,63.0,male,typical angina,145.0,233.0,1.0,150.0,0.0,2.3,downsloping,0.0,fixed defect,0,0,2
1,67.0,male,asymptomatic,160.0,286.0,0.0,108.0,1.0,1.5,flat,3.0,normal,2,0,1
2,67.0,male,asymptomatic,120.0,229.0,0.0,129.0,1.0,2.6,flat,2.0,reversable defect,1,0,1
3,37.0,male,non-anginal pain,130.0,250.0,0.0,187.0,0.0,3.5,downsloping,0.0,normal,0,1,2
4,41.0,female,atypical angina,130.0,204.0,0.0,172.0,0.0,1.4,upsloping,0.0,normal,0,0,0


In [100]:
# Dry run of the absence/presence recoding without modifying the frame.
# The slice starts at position 1, so the first row is not part of the preview.
np.where(heart['heart_disease'].eq(0), 'absence', 'presence')[1:10]

array(['presence', 'presence', 'absence', 'absence', 'absence',
       'presence', 'absence', 'presence', 'presence'], dtype='<U8')

In [101]:
# Collapse the diagnosis to two labels: 0 becomes 'absence', anything else 'presence'.
no_disease_mask = heart['heart_disease'].eq(0)
heart['heart_disease'] = np.where(no_disease_mask, 'absence', 'presence')
heart.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,thalach,exang,oldpeak,slope,ca,thal,heart_disease,restecg_normal,slope_codes
0,63.0,male,typical angina,145.0,233.0,1.0,150.0,0.0,2.3,downsloping,0.0,fixed defect,absence,0,2
1,67.0,male,asymptomatic,160.0,286.0,0.0,108.0,1.0,1.5,flat,3.0,normal,presence,0,1
2,67.0,male,asymptomatic,120.0,229.0,0.0,129.0,1.0,2.6,flat,2.0,reversable defect,presence,0,1
3,37.0,male,non-anginal pain,130.0,250.0,0.0,187.0,0.0,3.5,downsloping,0.0,normal,absence,1,2
4,41.0,female,atypical angina,130.0,204.0,0.0,172.0,0.0,1.4,upsloping,0.0,normal,absence,0,0


In [None]:
# Summary of every column (numeric and non-numeric) in the frame's current state.
heart.describe(include='all')