In [42]:
# import libraries
import pandas as pd
import numpy as np

In [43]:
# Load the heart-disease data set into a dataframe.
# The path is relative, so "heart.csv" must sit in the notebook's working directory.
df = pd.read_csv("heart.csv")

### DATA EXPLORATION

In [44]:
# Data exploration: preview the first 10 rows
df.head(10)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
5,57,1,0,140,192,0,1,148,0,0.4,1,0,1,1
6,56,0,1,140,294,0,0,153,0,1.3,1,0,2,1
7,44,1,1,120,263,0,1,173,0,0.0,2,0,3,1
8,52,1,2,172,199,1,1,162,0,0.5,2,0,3,1
9,57,1,2,150,168,0,1,174,0,1.6,2,0,2,1


In [45]:
# Distinct class labels that occur in the "target" column
df['target'].unique()

array([1, 0], dtype=int64)

In [46]:
# Number of rows (samples) in the dataframe -- shape[0] is the row count only
df.shape[0]

303

Features are: 
> 1. age
> 2. sex
> 3. chest pain type (4 values)
> 4. resting blood pressure
> 5. serum cholesterol in mg/dl
> 6. fasting blood sugar > 120 mg/dl
> 7. resting electrocardiographic results (values 0,1,2)
> 8. maximum heart rate achieved
> 9. exercise induced angina
> 10. oldpeak = ST depression induced by exercise relative to rest
> 11. the slope of the peak exercise ST segment
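> 12. number of major vessels (0-3) colored by fluoroscopy
> 13. thal: 3 = normal; 6 = fixed defect; 7 = reversible defect (coding from the original data set description; the rows previewed above show values such as 1, 2 and 3, so this file appears to use a different encoding)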

Target is:

> The "goal" field (stored as the `target` column in this dataframe) refers to the presence of heart disease in the patient.

> 0 (no presence) , 1 (heart disease) 

In [47]:
# Distinct class labels in the "target" column (same check as in the earlier cell)
df['target'].unique()

array([1, 0], dtype=int64)

In [48]:
# Column names; the helper functions further down treat the LAST column as the class label
df.columns

Index(['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach',
       'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'],
      dtype='object')

### Preparing / Splitting the Data

In [49]:
# Creating features and targets

features_name = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

features = df[features_name]

targets = df['target']

# Split the data into training set (70%), validation set (10%) and test set (20%).
#
# The rows are shuffled before slicing. Slicing the file in its stored order
# would carry any ordering of the CSV straight into the splits (the rows
# previewed above all have target == 1, which suggests the file may be grouped
# by class). A fixed seed keeps the split reproducible across kernel restarts.
SPLIT_SEED = 42
shuffled_rows = np.random.RandomState(SPLIT_SEED).permutation(features.shape[0])

train_size = int(0.70 * features.shape[0])
validation_size = int(0.10 * features.shape[0])

train_rows = shuffled_rows[:train_size]
validation_rows = shuffled_rows[train_size : train_size + validation_size]
test_rows = shuffled_rows[train_size + validation_size :]

# Training set (70% of all data rows) --> 212
train_features = features.iloc[train_rows]
train_targets = targets.iloc[train_rows]

# Validation set (10% of all data rows) --> 30
validation_features = features.iloc[validation_rows]
validation_targets = targets.iloc[validation_rows]

# Test set (20% of all data rows) --> 61
test_features = features.iloc[test_rows]
test_targets = targets.iloc[test_rows]

# Every row must land in exactly one of the three splits.
assert len(train_features) + len(validation_features) + len(test_features) == len(features)

print(features.shape, train_features.shape, validation_features.shape, test_features.shape)

(303, 13) (212, 13) (30, 13) (61, 13)


### What is a Decision Tree

<img src="img/equation1.png">
<img src="img/equation2.png">

In [50]:
from numpy import log2 as log

# Machine epsilon: the gap between 1.0 and the next larger representable float.
# It is added to denominators and to log arguments in the cells below so that
# an empty subset never causes a division by zero or log(0).
eps = np.finfo(float).eps

In [51]:
# Entropy of the whole data set with respect to the "target" column.
# This is the baseline that the information gain of every candidate
# split is later measured against.

target_counts = df.target.value_counts()   # number of rows per class label
n_rows = len(df.target)

# distinct class labels found in "target" (0 and 1)
values = df.target.unique()

entropy_node = 0  # running sum of -p * log2(p)

for value in values:
    # p: share of rows that carry this class label
    fraction = target_counts[value] / n_rows
    entropy_node += - fraction * np.log2(fraction)

print("Entropy node = ", entropy_node)

Entropy node =  0.994264609261905


In [52]:
# Distinct values of the "chol" column
df.chol.unique()

array([233, 250, 204, 236, 354, 192, 294, 263, 199, 168, 239, 275, 266,
       211, 283, 219, 340, 226, 247, 234, 243, 302, 212, 175, 417, 197,
       198, 177, 273, 213, 304, 232, 269, 360, 308, 245, 208, 264, 321,
       325, 235, 257, 216, 256, 231, 141, 252, 201, 222, 260, 182, 303,
       265, 309, 186, 203, 183, 220, 209, 258, 227, 261, 221, 205, 240,
       318, 298, 564, 277, 214, 248, 255, 207, 223, 288, 160, 394, 315,
       246, 244, 270, 195, 196, 254, 126, 313, 262, 215, 193, 271, 268,
       267, 210, 295, 306, 178, 242, 180, 228, 149, 278, 253, 342, 157,
       286, 229, 284, 224, 206, 167, 230, 335, 276, 353, 225, 330, 290,
       172, 305, 188, 282, 185, 326, 274, 164, 307, 249, 341, 407, 217,
       174, 281, 289, 322, 299, 300, 293, 184, 409, 259, 200, 327, 237,
       218, 319, 166, 311, 169, 187, 176, 241, 131], dtype=int64)

In [53]:
# Weighted entropy of the target after splitting on a single attribute:
# every distinct attribute value defines a subset of rows, and the subset
# entropies are combined, weighted by the share of rows in each subset.
attribute = 'chol'
target_variables = df.target.unique()   # class labels: 0 and 1
variables = df[attribute].unique()      # distinct values of the attribute (like 233)
entropy_attribute = 0
for variable in variables:
    in_subset = df[attribute] == variable
    den = int(in_subset.sum())           # rows in this subset
    entropy_each_feature = 0
    for target_variable in target_variables:
        # rows of the subset that carry this class label
        num = int((in_subset & (df.target == target_variable)).sum())
        fraction = num / (den + eps)     # p_i; eps guards the division
        entropy_each_feature += -fraction * log(fraction + eps)  # eps guards log(0)
    fraction2 = den / len(df)            # weight of the subset
    # accumulated with a negative sign, hence the abs() when reporting below
    entropy_attribute += -fraction2 * entropy_each_feature

abs(entropy_attribute)

0.43452808237653207

In [54]:
def find_entropy(df):
    """Return the entropy (in bits) of the class column of ``df``.

    The class column is taken to be the LAST column of the dataframe, so the
    function does not depend on a particular column name.
    """
    labels = df[df.keys()[-1]]
    label_counts = labels.value_counts()   # rows per class label
    n_rows = len(labels)

    total = 0
    for label in labels.unique():
        share = label_counts[label] / n_rows
        total += -share * np.log2(share)
    return total
  
  
def find_entropy_attribute(df,attribute):
    """Return the weighted entropy (in bits) of the class column after
    splitting ``df`` on ``attribute``.

    Each distinct value of ``attribute`` defines a subset of rows. The entropy
    of the class labels inside each subset is weighted by the share of rows
    that fall into the subset, and the weighted entropies are summed.

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose LAST column holds the class label.
    attribute : str
        Name of the column to split on.

    Returns
    -------
    float
        Non-negative weighted entropy; 0.0 for an empty dataframe.
    """
    Class = df.keys()[-1]   # class label is the last column, whatever its name
    n_rows = len(df)
    if n_rows == 0:
        return 0.0

    weighted_entropy = 0.0
    # One pass over the groups instead of re-filtering the whole frame for
    # every (attribute value, class label) pair with chained boolean indexing.
    for _, labels in df.groupby(attribute, sort=False)[Class]:
        probs = labels.value_counts(normalize=True).to_numpy()
        # Zero-probability entries (possible for categorical labels) contribute
        # nothing to the entropy; dropping them avoids log2(0) without an
        # epsilon fudge that would perturb the result.
        probs = probs[probs > 0]
        subset_entropy = -np.sum(probs * np.log2(probs))
        weighted_entropy += (len(labels) / n_rows) * subset_entropy

    # abs() only normalises a possible -0.0 from pure subsets.
    return float(abs(weighted_entropy))


def find_winner(df):
    """Return the name of the attribute with the highest information gain.

    Every column except the last one (the class label) is a candidate. The
    information gain of a candidate is the entropy of the whole table minus
    the weighted entropy that remains after splitting on that candidate.
    On ties, the first candidate in column order wins (``np.argmax``).
    """
    candidates = df.keys()[:-1]
    # The entropy of the unsplit table is the same for every candidate,
    # so compute it once instead of once per attribute.
    base_entropy = find_entropy(df)
    gains = [base_entropy - find_entropy_attribute(df, key) for key in candidates]
    return candidates[np.argmax(gains)]
  
  
def get_subtable(df, node,value):
    """Return the rows of ``df`` whose column ``node`` equals ``value``,
    re-indexed from 0."""
    matches = df[node].eq(value)
    return df.loc[matches].reset_index(drop=True)


def buildTree(df,tree=None):
    """Recursively build a decision tree as nested dictionaries.

    The result has the shape ``{attribute: {value: subtree_or_label}}``, where
    ``attribute`` is the column with the highest information gain and every
    distinct value of that column gets its own branch.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; the LAST column is the class label.
    tree : dict, optional
        Existing dictionary to add the branches to. A new one is created
        when omitted.

    Returns
    -------
    dict
        The (sub)tree rooted at the chosen attribute.
    """
    Class = df.keys()[-1]   # class label is the last column, whatever its name

    # Attribute with maximum information gain becomes the split node.
    node = find_winner(df)

    # Distinct values of that attribute, e.g. Salary -> Low, Med, High.
    attValue = np.unique(df[node])

    if tree is None:
        tree = {}
    # setdefault also covers a caller-supplied dict that has no entry for node yet.
    tree.setdefault(node, {})

    for value in attValue:
        subtable = get_subtable(df, node, value)
        clValue, counts = np.unique(subtable[Class], return_counts=True)

        if len(counts) == 1:
            # Pure subset: store the class label as a leaf.
            tree[node][value] = clValue[0]
            continue

        # Inside the subtable `node` is constant and can never split again.
        # Dropping it guarantees that every recursion level has one column
        # less, so the recursion terminates even when rows with identical
        # feature values carry different labels.
        remaining = subtable.drop(columns=[node])
        if remaining.shape[1] > 1:
            tree[node][value] = buildTree(remaining)
        else:
            # No attribute left to split on: fall back to the majority class
            # (ties resolve to the smallest label, as np.unique sorts them).
            tree[node][value] = clValue[np.argmax(counts)]

    return tree

In [55]:
# Build the decision tree.
# NOTE(review): the tree is fitted on the full dataframe `df`, not on the
# training split created earlier, so validation/test rows are seen while fitting.
tree = buildTree(df)


In [56]:
# Pretty-print the nested dict: {attribute: {value: subtree or class label}}
import pprint
pprint.pprint(tree)

{'chol': {126: 1,
          131: 0,
          141: 1,
          149: {'age': {49: 0, 71: 1}},
          157: 1,
          160: 1,
          164: 0,
          166: 0,
          167: 0,
          168: 1,
          169: 0,
          172: 0,
          174: 0,
          175: 1,
          176: 0,
          177: {'age': {43: 0, 46: 1, 59: 0, 65: 1}},
          178: 1,
          180: 1,
          182: 1,
          183: 1,
          184: 0,
          185: 0,
          186: 1,
          187: 0,
          188: 0,
          192: 1,
          193: {'age': {56: 1, 68: 0}},
          195: 1,
          196: 1,
          197: {'age': {44: 0, 46: 1, 53: 1, 58: 1, 63: 0, 76: 1}},
          198: {'age': {35: 0, 41: 1}},
          199: 1,
          200: 0,
          201: 1,
          203: {'age': {41: 1, 53: 0, 61: 0}},
          204: {'age': {29: 1, 41: 1, 46: 1, 47: 1, 52: 0, 59: 0}},
          205: {'age': {52: 1, 55: 0}},
          206: 0,
          207: {'age': {57: 1, 61: 0}},
          208: 1,
     

In [57]:
def predict(inst,tree, default=None):
    """Classify one instance by walking the decision tree.

    Parameters
    ----------
    inst : mapping (e.g. pandas.Series or dict)
        Attribute values of the instance, looked up as ``inst[attribute]``.
        A missing attribute raises ``KeyError``.
    tree : dict
        Nested dictionary of the shape ``{attribute: {value: subtree_or_label}}``
        with a single attribute key per level.
    default : object, optional
        Returned when the instance has an attribute value that the tree has
        no branch for (a value never seen while building the tree), or when
        the tree is empty. Defaults to ``None``.

    Returns
    -------
    The class label stored in the leaf that was reached, or ``default``.
    """
    current = tree
    # Descend until a leaf (anything that is not a dict) is reached.
    while isinstance(current, dict):
        if not current:
            return default
        attribute = next(iter(current))   # each level holds one split attribute
        branches = current[attribute]
        value = inst[attribute]
        if value not in branches:
            # Value never seen during training: no branch to follow.
            return default
        current = branches[value]
    return current

In [30]:
inst = df.iloc[6]  # row at position 6, used as a sample instance to classify


In [31]:
# Show the selected row (it still contains the 'target' value)
inst

age          56.0
sex           0.0
cp            1.0
trestbps    140.0
chol        294.0
fbs           0.0
restecg       0.0
thalach     153.0
exang         0.0
oldpeak       1.3
slope         1.0
ca            0.0
thal          2.0
target        1.0
Name: 6, dtype: float64

In [32]:
# Same feature values as the row shown above, rebuilt by hand without the 'target' entry
data = {'age':56.0, 'sex':0.0, 'cp':1.0, 'trestbps':140.0, 'chol':294.0, 'fbs':0.0, 'restecg':0.0, 'thalach':153.0, 'exang':0.0, 'oldpeak':1.3, 'slope':1.0, 'ca':0.0, 'thal':2.0}
inst = pd.Series(data)

In [33]:
# Classify the hand-built instance by walking the tree
prediction = predict(inst, tree)
prediction

1