In [113]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import neighbors, datasets
from sklearn import cross_validation
from sklearn import svm
from sklearn.linear_model import SGDClassifier
import operator

In [114]:
from sklearn.cross_validation import KFold
import matplotlib.pyplot as plt
import itertools
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns
%matplotlib inline
from IPython.display import Image

In [115]:

# add the rows names# add the 
header_row = ['age','sex','chest_pain','blood pressure','serum_cholestoral','fasting_blood_sugar',\
               'electrocardiographic','max_heart_rate','induced_angina','ST_depression','slope','vessels','thal','diagnosis']

# read csv file with Cleveland heart diseases data
heart = pd.read_csv('processed.cleveland.data.csv', names=header_row)
heart[:5]

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [116]:
len(heart)

303

# 5.Data Exploration

In [117]:
heart.describe()

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,diagnosis
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


In [118]:
# get number of people with symptom X_k
names_descr = dict()
categorical_columns = ["sex", "chest_pain", "fasting_blood_sugar", "electrocardiographic", "induced_angina", "slope", "vessels", \
                       "thal", "diagnosis"]

for c in categorical_columns:
    print(heart.groupby([c])["age"].count())

sex
0.0     97
1.0    206
Name: age, dtype: int64
chest_pain
1.0     23
2.0     50
3.0     86
4.0    144
Name: age, dtype: int64
fasting_blood_sugar
0.0    258
1.0     45
Name: age, dtype: int64
electrocardiographic
0.0    151
1.0      4
2.0    148
Name: age, dtype: int64
induced_angina
0.0    204
1.0     99
Name: age, dtype: int64
slope
1.0    142
2.0    140
3.0     21
Name: age, dtype: int64
vessels
0.0    176
1.0     65
2.0     38
3.0     20
?        4
Name: age, dtype: int64
thal
3.0    166
6.0     18
7.0    117
?        2
Name: age, dtype: int64
diagnosis
0    164
1     55
2     36
3     35
4     13
Name: age, dtype: int64


In [119]:
heart.isnull().sum()

age                     0
sex                     0
chest_pain              0
blood pressure          0
serum_cholestoral       0
fasting_blood_sugar     0
electrocardiographic    0
max_heart_rate          0
induced_angina          0
ST_depression           0
slope                   0
vessels                 0
thal                    0
diagnosis               0
dtype: int64

# 5.1 preprocess the data, to change the missing data into mean value, using simple mean imputation methods

In [120]:
heart.columns

Index(['age', 'sex', 'chest_pain', 'blood pressure', 'serum_cholestoral',
       'fasting_blood_sugar', 'electrocardiographic', 'max_heart_rate',
       'induced_angina', 'ST_depression', 'slope', 'vessels', 'thal',
       'diagnosis'],
      dtype='object')

In [122]:
heart.mean()

age                      54.438944
sex                       0.679868
chest_pain                3.158416
blood pressure          131.689769
serum_cholestoral       246.693069
fasting_blood_sugar       0.148515
electrocardiographic      0.990099
max_heart_rate          149.607261
induced_angina            0.326733
ST_depression             1.039604
slope                     1.600660
diagnosis                 0.937294
dtype: float64

In [123]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age                     303 non-null float64
sex                     303 non-null float64
chest_pain              303 non-null float64
blood pressure          303 non-null float64
serum_cholestoral       303 non-null float64
fasting_blood_sugar     303 non-null float64
electrocardiographic    303 non-null float64
max_heart_rate          303 non-null float64
induced_angina          303 non-null float64
ST_depression           303 non-null float64
slope                   303 non-null float64
vessels                 303 non-null object
thal                    303 non-null object
diagnosis               303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.2+ KB


In [124]:
heart.mean().to_dict()

{'age': 54.43894389438944,
 'sex': 0.6798679867986799,
 'chest_pain': 3.1584158415841586,
 'blood pressure': 131.68976897689768,
 'serum_cholestoral': 246.69306930693068,
 'fasting_blood_sugar': 0.1485148514851485,
 'electrocardiographic': 0.9900990099009901,
 'max_heart_rate': 149.6072607260726,
 'induced_angina': 0.32673267326732675,
 'ST_depression': 1.0396039603960396,
 'slope': 1.6006600660066006,
 'diagnosis': 0.9372937293729373}

In [128]:
for c in heart.columns[:-1]:
    heart[c] = heart[c].apply( lambda x: heart[heart[c]!='?'][c].astype(float).mean() \
                              if x == "?" else x)
    heart[c] = heart[c].astype(float)

In [129]:
heart.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
age                     303 non-null float64
sex                     303 non-null float64
chest_pain              303 non-null float64
blood pressure          303 non-null float64
serum_cholestoral       303 non-null float64
fasting_blood_sugar     303 non-null float64
electrocardiographic    303 non-null float64
max_heart_rate          303 non-null float64
induced_angina          303 non-null float64
ST_depression           303 non-null float64
slope                   303 non-null float64
vessels                 303 non-null float64
thal                    303 non-null float64
diagnosis               303 non-null int64
dtypes: float64(13), int64(1)
memory usage: 33.2 KB


### 5.2 Heart disease types discovering

In [130]:
# print the types of heart disease
set(heart.loc[:, "diagnosis"].values)

{0, 1, 2, 3, 4}

## 5.3 Let's find the ranges of each feature by disease type

### 5.3.1 Age

In [132]:
heart_d = heart[heart["diagnosis"] >= 1 ]
heart_d[:5]

Unnamed: 0,age,sex,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal,diagnosis
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,3.0,3
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,7.0,2
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,7.0,1


In [133]:
print("Minimum age to Maximum age per disease type")

heart_d.groupby(["diagnosis", ])["age"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis", ])["age"].max().astype(str)

Minimum age to Maximum age per disease type


diagnosis
1    35.0, 70.0
2    42.0, 69.0
3    39.0, 70.0
4    38.0, 77.0
Name: age, dtype: object

In [139]:
print("Mean age per disease type")
heart_d.groupby(["diagnosis", ])["age"].mean()

Mean age per disease type


diagnosis
1    55.381818
2    58.027778
3    56.000000
4    59.692308
Name: age, dtype: float64

In [145]:
heart_d.groupby(['diagnosis','sex'])['age'].count()

diagnosis  sex
1          0.0     9
           1.0    46
2          0.0     7
           1.0    29
3          0.0     7
           1.0    28
4          0.0     2
           1.0    11
Name: age, dtype: int64

In [147]:
heart_d.groupby(['diagnosis','sex']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,chest_pain,blood pressure,serum_cholestoral,fasting_blood_sugar,electrocardiographic,max_heart_rate,induced_angina,ST_depression,slope,vessels,thal
diagnosis,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1,0.0,9,9,9,9,9,9,9,9,9,9,9,9
1,1.0,46,46,46,46,46,46,46,46,46,46,46,46
2,0.0,7,7,7,7,7,7,7,7,7,7,7,7
2,1.0,29,29,29,29,29,29,29,29,29,29,29,29
3,0.0,7,7,7,7,7,7,7,7,7,7,7,7
3,1.0,28,28,28,28,28,28,28,28,28,28,28,28
4,0.0,2,2,2,2,2,2,2,2,2,2,2,2
4,1.0,11,11,11,11,11,11,11,11,11,11,11,11


### We can see that heart disease all types can be present in men with higher probability than in women 

# 5.3 Chest pain 

In [152]:
print("count each chest pain value per heart disease type")
heart_d.groupby(['diagnosis','chest_pain'])["age"].count()

count each chest pain value per heart disease type


diagnosis  chest_pain
1          1.0            5
           2.0            6
           3.0            9
           4.0           35
2          1.0            1
           2.0            1
           3.0            4
           4.0           30
3          2.0            2
           3.0            4
           4.0           29
4          1.0            1
           3.0            1
           4.0           11
Name: age, dtype: int64


### The people with chest pain = 4 often have heart disease 

### 5.3.4 blood pressure

In [153]:
print("Minimum blood pressure to Maximum  blood pressure per disease type")

heart_d.groupby(["diagnosis"])["blood pressure"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis"])["blood pressure"].max().astype(str)

Minimum blood pressure to Maximum  blood pressure per disease type


diagnosis
1    108.0, 192.0
2    100.0, 180.0
3    100.0, 200.0
4    112.0, 165.0
Name: blood pressure, dtype: object

In [155]:
print ("Mean blood pressure per disease type")
heart_d.groupby(["diagnosis", ])["blood pressure"].mean()

Mean blood pressure per disease type


diagnosis
1    133.254545
2    134.194444
3    135.457143
4    138.769231
Name: blood pressure, dtype: float64

In [157]:
# 5.3.5 serum_cholestoral

print("Minimum serum_cholestoral to Maximum serum_cholestoral per disease type")

heart_d.groupby(["diagnosis"])["serum_cholestoral"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis"])["serum_cholestoral"].max().astype(str)

Minimum serum_cholestoral to Maximum serum_cholestoral per disease type


diagnosis
1    149.0, 335.0
2    169.0, 409.0
3    131.0, 353.0
4    166.0, 407.0
Name: serum_cholestoral, dtype: object

In [158]:
heart_d.groupby(["diagnosis", ])["serum_cholestoral"].mean()

diagnosis
1    249.109091
2    259.277778
3    246.457143
4    253.384615
Name: serum_cholestoral, dtype: float64

In [160]:
# 5.3.6 fasting_blood_sugar
print("Count each fasting_blood_sugar per heart disease type")
heart_d.groupby(["diagnosis", "fasting_blood_sugar"])["age"].count()

Count each fasting_blood_sugar per heart disease type


diagnosis  fasting_blood_sugar
1          0.0                    51
           1.0                     4
2          0.0                    27
           1.0                     9
3          0.0                    27
           1.0                     8
4          0.0                    12
           1.0                     1
Name: age, dtype: int64

In [161]:
# 5.3.7 electrocardiographic results
print ("Count each electrocardiographic per heart disease type")
heart_d.groupby(["diagnosis", "electrocardiographic"])["age"].count()

Count each electrocardiographic per heart disease type


diagnosis  electrocardiographic
1          0.0                     23
           2.0                     32
2          0.0                     19
           1.0                      1
           2.0                     16
3          0.0                     12
           1.0                      1
           2.0                     22
4          0.0                      2
           1.0                      1
           2.0                     10
Name: age, dtype: int64

In [162]:
# 5.3.8 max_heart_rate

print("Minimum max_heart_rate to Maximum max_heart_rate per disease type")

heart_d.groupby(["diagnosis"])["max_heart_rate"].min().astype(str) + ', ' +  heart_d.groupby(["diagnosis"])["max_heart_rate"].max().astype(str)

Minimum max_heart_rate to Maximum max_heart_rate per disease type


diagnosis
1     88.0, 195.0
2     71.0, 170.0
3     90.0, 173.0
4    114.0, 182.0
Name: max_heart_rate, dtype: object

In [165]:
heart_d.groupby(["diagnosis", ])["max_heart_rate"].mean()

diagnosis
1    145.927273
2    135.583333
3    132.057143
4    140.615385
Name: max_heart_rate, dtype: float64

In [167]:
# 5.3.9 induced_angina
print("Count  induced_angina per heart disease type")
heart_d.groupby(["diagnosis", "induced_angina"])["age"].count()

Count  induced_angina per heart disease type


diagnosis  induced_angina
1          0.0               30
           1.0               25
2          0.0               14
           1.0               22
3          0.0               12
           1.0               23
4          0.0                7
           1.0                6
Name: age, dtype: int64

In [169]:
# 5.3.10 ST_depression
print("Count  mean ST_depression per heart disease type")
heart_d.groupby(["diagnosis"])["ST_depression"].mean()

Count  mean ST_depression per heart disease type


diagnosis
1    1.005455
2    1.780556
3    1.962857
4    2.361538
Name: ST_depression, dtype: float64

In [171]:
# 5.3.11 slope
print("Count slope per heart disease type")
heart_d.groupby(["diagnosis", "slope"])["age"].count()


Count slope per heart disease type


diagnosis  slope
1          1.0      22
           2.0      31
           3.0       2
2          1.0       7
           2.0      26
           3.0       3
3          1.0       6
           2.0      24
           3.0       5
4          1.0       1
           2.0      10
           3.0       2
Name: age, dtype: int64

In [172]:
# 5.3.12 vessels
print("Count  mean vessels per heart disease type")
heart_d.groupby(["diagnosis"])["vessels"].mean()

Count  mean vessels per heart disease type


diagnosis
1    0.739495
2    1.222222
3    1.457143
4    1.692308
Name: vessels, dtype: float64

In [173]:
# 5.3.13 thal
print("Count  mean thal per heart disease type")
heart_d.groupby(["diagnosis"])["thal"].mean()

Count  mean thal per heart disease type


diagnosis
1    5.345455
2    5.992617
3    6.285714
4    6.230769
Name: thal, dtype: float64

# 6 Data preprocessing

In [175]:
# 6.1 map dependent variable y = "diagnosis" into binary label
# if "diagnosis" == 0, member does not have disease A - we put 0
# if "diagnosis" >= 1, member possess the disease A - we put 1
heart.loc[:, "diag_int"] = heart.loc[:, "diagnosis"].apply(lambda x: 1 if x >= 1 else 0)

In [176]:
# 6.2 normalize the data
# create the normalizer and fit it 
preprocessing.Normalizer().fit_transform(heart)

array([[0.19741527, 0.00313358, 0.00313358, ..., 0.01880145, 0.        ,
        0.        ],
       [0.19057465, 0.0028444 , 0.01137759, ..., 0.00853319, 0.0056888 ,
        0.0028444 ],
       [0.22578159, 0.00336987, 0.0134795 , ..., 0.02358912, 0.00336987,
        0.00336987],
       ...,
       [0.25334975, 0.00444473, 0.01777893, ..., 0.03111313, 0.0133342 ,
        0.00444473],
       [0.17495449, 0.        , 0.00613875, ..., 0.00920813, 0.00306938,
        0.00306938],
       [0.13346695, 0.00351229, 0.01053686, ..., 0.01053686, 0.        ,
        0.        ]])