In [2]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

In [40]:
df=pd.read_csv('./data/processed.cleveland.data.csv')

In [41]:
df.head()

Unnamed: 0,63.0,1.0,1.0.1,145.0,233.0,1.0.2,2.0,150.0,0.0,2.3,3.0,0.0.1,6.0,0
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


As our data don't have column names let's assign them names. The names are from where the dataset is taken from  

In [42]:
df.columns = [
'age',
'sex',
'chest_pain',
'blood pressure',
'serum_cholestoral',
'fasting_blood_sugar',
'electrocardiographic',
'max_heart_rate',
'induced_angina',
'ST_depression',
'slope',
'vessels',
'thal',
'diagnosis']

df.head()

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


## Dealing with the missing data

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 302 entries, 0 to 301
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   age                   302 non-null    float64
 1   sex                   302 non-null    float64
 2   chest_pain            302 non-null    float64
 3   blood pressure        302 non-null    float64
 4   serum_cholestoral     302 non-null    float64
 5   fasting_blood_sugar   302 non-null    float64
 6   electrocardiographic  302 non-null    float64
 7   max_heart_rate        302 non-null    float64
 8   induced_angina        302 non-null    float64
 9   ST_depression         302 non-null    float64
 10  slope                 302 non-null    float64
 11  vessels               302 non-null    object 
 12  thal                  302 non-null    object 
 13  diagnosis             302 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB


We don't have any null value

In [44]:
# dtypes tells datatype of each column
df.dtypes

age                     float64
sex                     float64
chest_pain              float64
blood pressure          float64
serum_cholestoral       float64
fasting_blood_sugar     float64
electrocardiographic    float64
max_heart_rate          float64
induced_angina          float64
ST_depression           float64
slope                   float64
vessels                  object
thal                     object
diagnosis                 int64
dtype: object

We can see both vessels and thal have 'object' datatype so they might have both numbers and some other characters. Let's print all see what is going

In [45]:
df['vessels'].unique()

array(['3.0', '2.0', '0.0', '1.0', '?'], dtype=object)

In [46]:
df['thal'].unique()

array(['3.0', '7.0', '6.0', '?'], dtype=object)

In [47]:
# loc helps to access rows and columns by labels or boolean
print(str(len(df.loc[(df['vessels']=='?')
    | (df['thal']=='?')]))+" rows found")

6 rows found


In [48]:
df.loc[(df['vessels']=='?')\
    | (df['thal']=='?')]

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
86,53.0,0.0,3.0,128.0,216.0,0.0,2.0,115.0,0.0,0.0,1.0,0.0,?,0
165,52.0,1.0,3.0,138.0,223.0,0.0,0.0,169.0,0.0,0.0,1.0,?,3.0,0
191,43.0,1.0,4.0,132.0,247.0,1.0,2.0,143.0,1.0,0.1,2.0,?,7.0,1
265,52.0,1.0,4.0,128.0,204.0,1.0,0.0,156.0,1.0,1.0,2.0,0.0,?,2
286,58.0,1.0,2.0,125.0,220.0,0.0,0.0,144.0,0.0,0.4,2.0,?,7.0,0
301,38.0,1.0,3.0,138.0,175.0,0.0,0.0,173.0,0.0,0.0,1.0,?,3.0,0


In [49]:
df.shape

(302, 14)

Now we have 302 rows in total and 6 out of them are not correct so we can drop them without significant loss in training samples

In [50]:
df_new=df.loc[(df['vessels']!='?')\
       & (df['thal']!='?')]

In [51]:
df_new

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,57.0,0.0,4.0,140.0,241.0,0.0,0.0,123.0,1.0,0.2,2.0,0.0,7.0,1
297,45.0,1.0,1.0,110.0,264.0,0.0,0.0,132.0,0.0,1.2,2.0,0.0,7.0,1
298,68.0,1.0,4.0,144.0,193.0,1.0,0.0,141.0,0.0,3.4,2.0,2.0,7.0,2
299,57.0,1.0,4.0,130.0,131.0,0.0,0.0,115.0,1.0,1.2,2.0,1.0,7.0,3


## Data Preprocessing

In [52]:
X=df_new.iloc[:,:-1]
X.head()

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0


In [55]:
Y=df_new.iloc[:,-1]
Y.head()

0    2
1    1
2    0
3    0
4    0
Name: diagnosis, dtype: int64