In [26]:
import numpy as np
import pandas as pd

In [27]:
def entropy(z):
    """Binary (Shannon) entropy, in bits, of a node whose positive-class fraction is ``z``.

    Parameters
    ----------
    z : float
        Fraction of samples in the node that belong to class 1.

    Returns
    -------
    float or int
        ``-z*log2(z) - (1-z)*log2(1-z)``; ``0`` when ``z`` is exactly 0 or 1
        (a pure node), which also avoids evaluating ``log2(0)``.
    """
    # The original body read an undefined name `p`; use the actual argument.
    if z == 0 or z == 1:
        return 0
    return -z*np.log2(z) - (1-z)*np.log2(1-z)

In [28]:
# Uses index_features to decide which samples go to the left and which to the right of a split.
def split_indices(X,index_features):
    """Partition the row positions of ``X`` by the value of one feature column.

    Parameters
    ----------
    X : sequence of rows
        Each row must be indexable by ``index_features`` (e.g. a 2-D numpy
        array or a list of lists).
    index_features : int
        Column to split on, e.g. with features [Ear Shape, Face Shape,
        Whiskers], ``index_features=0`` means splitting on Ear Shape.

    Returns
    -------
    (list[int], list[int])
        ``left_indices`` holds the positions of rows whose feature value
        equals 1, ``right_indices`` all other positions.
        Example: X=[[1,0,1],[0,1,1]], index_features=0 -> ([0], [1]).
    """
    left_indices=[]
    right_indices=[]
    for idx,x in enumerate(X):
        # Test the current sample `x`, not row `index_features` of the whole matrix.
        if x[index_features]==1:
            left_indices.append(idx)
        else:
            right_indices.append(idx)

    return left_indices,right_indices
    

In [29]:
# Average uncertainty of the two child nodes after a split, weighted by their sizes.
def weighted_entropy(X,y,left_indices,right_indices):
    """Size-weighted entropy of the two children produced by a split.

    Parameters
    ----------
    X : sized collection
        Samples of the node being split; only ``len(X)`` is used, as the
        denominator of the child weights.
    y : array-like supporting list indexing (e.g. numpy array)
        Labels aligned with ``X``; summed to count positives, so they are
        expected to be 0/1.
    left_indices, right_indices : list[int]
        Positions (into ``y``) of the samples sent to each child.

    Returns
    -------
    float
        ``w_left*H(left) + w_right*H(right)`` where each weight is the child's
        share of ``len(X)``. An empty child contributes 0.
    """
    n_samples = len(X)
    total = 0.0
    for child in (left_indices, right_indices):
        if len(child) == 0:
            # Empty child: no entropy contribution (and nothing to divide by).
            continue
        p_child = sum(y[child]) / len(child)
        # Each child is weighted by ITS OWN size; the original weighted the
        # right child with the left child's size.
        total += (len(child) / n_samples) * entropy(p_child)
    return total

    

In [30]:
# How much purer the data becomes after a split (parent entropy minus children entropy).
def information_gain(X,y,left_indices,right_indices):
    """Entropy reduction obtained by splitting a node into two children.

    ``y`` holds the node's labels (summed to get the positive fraction);
    ``X``, ``left_indices`` and ``right_indices`` are forwarded unchanged to
    ``weighted_entropy``. Returns ``entropy(parent) - weighted_entropy(children)``.
    """
    positive_fraction = sum(y) / len(y)
    children_entropy = weighted_entropy(X, y, left_indices, right_indices)
    parent_entropy = entropy(positive_fraction)
    return parent_entropy - children_entropy

In [31]:
# Choose the best split among the given features.
def best_split(X,y,features):
    """Find the feature whose split yields the highest information gain.

    Parameters
    ----------
    X, y :
        Samples and labels of the node being split; passed through to
        ``split_indices`` and ``information_gain``.
    features : iterable of int
        Candidate feature (column) indices, e.g. ``[0, 1, 2, ...]``.

    Returns
    -------
    (best_feature, best_indices, best_gain)
        ``best_indices`` is the ``(left_indices, right_indices)`` pair for the
        winning feature. When no feature separates the samples into two
        non-empty sides (or ``features`` is empty) the result is
        ``(None, None, 0.0)`` instead of an UnboundLocalError.
    """
    best_gain = -1            # sentinel below any real gain, so the first valid split always wins
    best_feature = None
    best_indices = None

    for feature in features:
        left_indices,right_indices=split_indices(X,feature)

        # All samples fall on one side: this feature does not split the node.
        if len(left_indices) == 0 or len(right_indices) == 0:
            continue

        info_gain=information_gain(X,y,left_indices,right_indices)
        if info_gain>best_gain:
            best_gain = info_gain
            best_feature = feature
            best_indices = (left_indices,right_indices)

    if best_feature is None:
        # No usable split: report "no feature" with zero gain rather than the sentinel.
        return None, None, 0.0
    return best_feature,best_indices,best_gain

In [32]:
# Output is a nested dictionary: internal nodes {type, feature, Left, Right}, leaves {type, prediction, indices}.
def build_tree_recursive(X,y,indices,features,depth,max_depth):
    """Recursively grow a binary decision tree over the samples listed in ``indices``.

    Parameters
    ----------
    X, y : numpy arrays
        Full feature matrix and label vector; they are indexed with the list
        ``indices`` to obtain this node's samples.
    indices : list[int]
        Row positions (into ``X`` and ``y``) of the samples that belong to the
        current node, e.g. ``[0, 1, 2, ..., 9]`` at the root.
    features : iterable of int
        Candidate feature indices, forwarded to ``best_split``.
    depth : int
        Depth of the current node in the tree.
    max_depth : int
        A node whose ``depth`` equals ``max_depth`` becomes a leaf.

    Returns
    -------
    dict
        ``{"type": "leaf", "prediction": int, "indices": indices}`` or
        ``{"type": "node", "feature": f, "Left": subtree, "Right": subtree}``.
    """
    y_node=y[indices]

    def make_leaf():
        # Majority vote via the rounded mean of the node's labels.
        # np.round rounds halves to the nearest even value, so an exact 50/50 node predicts 0.
        return {
            "type":"leaf",
            "prediction":int(np.round(np.mean(y_node))),
            "indices":indices
        }

    # Stopping condition: depth limit reached, or every label in the node is identical.
    if depth==max_depth or len(set(y_node))==1:
        return make_leaf()

    # Search for the split on THIS node's samples only. The positions returned
    # by best_split are therefore local to x_node / y_node.
    x_node=X[indices]
    best_feature,split,info_gain = best_split(x_node,y_node,features)

    # Dead end: no feature separates the samples, or the split brings no improvement.
    if best_feature is None or info_gain<=0:
        return make_leaf()

    # Map node-local positions back to positions in the full X / y.
    # e.g. indices=[0,3,4,5,7], left_local=[0,2,4] -> left_indices=[0,4,7]
    left_local,right_local = split
    left_indices=[indices[idx] for idx in left_local]
    right_indices=[indices[idx] for idx in right_local]

    return {
        "type":"node",
        "feature":best_feature,
        "Left":build_tree_recursive(X,y,left_indices,features,depth+1,max_depth),
        "Right":build_tree_recursive(X,y,right_indices,features,depth+1,max_depth)
    }
       

In [33]:
# Text-based tree visualization.
def visualize_tree(tree,depth=0):
    """Print a tree built by ``build_tree_recursive`` as indented text.

    Parameters
    ----------
    tree : dict
        Either a leaf (``"type" == "leaf"`` with ``"prediction"`` and
        ``"indices"``) or an internal node with ``"feature"``, ``"Left"`` and
        ``"Right"``.
    depth : int, default 0
        Current nesting level; each level indents the output by three spaces.
    """
    indent="   "*depth

    if tree["type"]=="leaf":
        # Single quotes inside the f-string keep this valid on Python < 3.12.
        print(f"{indent}Leaf-> predict {tree['prediction']} , indices={tree['indices']}")
        # A leaf has no "feature"/"Left"/"Right" keys, so stop here.
        return

    print(f"{indent}Splitting on feature {tree['feature']}")
    print(f"{indent}Left:")
    visualize_tree(tree["Left"],depth+1)
    print(f"{indent}Right:")
    visualize_tree(tree["Right"],depth+1)
    

In [40]:
# Heart-disease dataset: load the CSV and turn every feature into a 0/1 column for the tree.
# NOTE(review): absolute, machine-specific path; the cell only runs where this file exists.
df=pd.read_csv("C:\\Users\\ACER\\Desktop\\Python\\JN\\heart.csv")
print(df.head())
df_local=pd.DataFrame()

# Converting continuous values into binary indicators
df_local["age>50"]=(df["age"]>50).astype(int)             # 1 if age > 50
df_local["bp_high"]=(df["trestbps"] > 130).astype(int)    # 1 if resting blood pressure > 130
df_local["col_high"]=(df["chol"] > 240).astype(int)       # 1 if cholesterol > 240 mg/dl
df_local["hr_low"]=(df["thalach"]<150).astype(int)        # 1 if maximum heart rate < 150
df_local["oldpeak_high"] = (df["oldpeak"]>1).astype(int)    # 1 if ST depression > 1, otherwise 0

# One-hot encoding of the categorical columns
df_local["cp_0"]=(df["cp"] == 0).astype(int)        # typical angina chest pain
df_local["cp_1"]=(df["cp"] == 1).astype(int)        # atypical angina chest pain
df_local["cp_2"]=(df["cp"] == 2).astype(int)        # non-anginal chest pain
df_local["cp_3"]=(df["cp"] == 3).astype(int)        # asymptomatic chest pain (most dangerous)
df_local["slope_up"] = (df["slope"] == 0).astype(int)
df_local["slope_flat"] = (df["slope"] == 1).astype(int)
df_local["slope_down"] = (df["slope"] == 2).astype(int)
df_local["restecg_0"] = (df["restecg"] == 0).astype(int)      # resting electrocardiographic results
df_local["restecg_1"] = (df["restecg"] == 1).astype(int)
df_local["restecg_2"] = (df["restecg"] == 2).astype(int)
df_local["thal_1"] = (df["thal"] == 1).astype(int)          # normal
df_local["thal_2"] = (df["thal"] == 2).astype(int)          # fixed defect
df_local["thal_3"] = (df["thal"] == 3).astype(int)          # reversible defect
df_local["ca_0"] = (df["ca"] == 0).astype(int)              # number of major vessels (0-3) colored by fluoroscopy
df_local["ca_1"] = (df["ca"] == 1).astype(int)
df_local["ca_2"] = (df["ca"] == 2).astype(int)
df_local["ca_3"] = (df["ca"] == 3).astype(int)

# Others: sex (m=1, f=0), fbs (1 if fasting blood sugar > 120 mg/dl), exang (1 if exercise-induced
# angina present) and target are already binary, so they are copied as-is.
# They must go into df_local (the original wrote to an undefined `df_bin`), otherwise they never reach X.
df_local["sex"] = df["sex"]
df_local["fbs"] = df["fbs"]
df_local["exang"] = df["exang"]

X=df_local.values      # convert the pandas DataFrame into a numpy array (column order = feature index)
y=df["target"].values  # 1 for heart disease present


#train the tree





   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  
0   2     3  
1   0     3  
2   0     3  
3   1     3  
4   3     2  
0    0
1    0
2    0
3    0
4    0
Name: target, dtype: int64
<class 'numpy.ndarray'>
