# Decision Tree Assignment 

### [Problem_1](#Problem-1)

Consider two features, age and heart disease, to create a decision tree using Gini impurity.


### [Problem_2](#Problem-2)

Consider two features, slope and heart disease, to create a decision tree using information gain.

------

### Importing the Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

#### Loading data from CSV file

In [2]:
# Read the CSV (path is relative to the working directory) into a DataFrame.
hp = pd.read_csv(filepath_or_buffer='processed.cleveland.csv')

------

## Problem 1

### Sort based on Age

In [3]:
# Keep only the feature and the label, ordered by ascending age, and also
# expose the result as a plain NumPy array (column 0 = age, column 1 = num).
new_df = hp.loc[:, ['age', 'num']].sort_values(by='age')
data = new_df.to_numpy()

new_df

Unnamed: 0,age,num
132,29,0
101,34,0
225,34,0
283,35,0
117,35,0
...,...,...
42,71,0
103,71,0
233,74,0
257,76,0


### Functions Used

In [4]:
def count(data, X):
    """Split ``data`` on ``feature < X`` and return class probabilities per branch.

    Parameters
    ----------
    data : 2-D array
        Column 0 holds the feature value and column 1 the label. Labels are
        summed to count positives, so they are expected to be 0/1.
    X : number
        Threshold. Rows with ``data[:, 0] < X`` form the True branch; all
        remaining rows form the False branch.

    Returns
    -------
    tuple
        ``(P_T_1, P_T_0, P_F_1, P_F_0, T_m, F_m)`` where ``P_T_*`` / ``P_F_*``
        are the label-1 / label-0 fractions in the True / False branch and
        ``T_m`` / ``F_m`` are the branch sizes. An empty branch reports both
        of its probabilities as 0.0 instead of dividing by zero.
    """
    m = data.shape[0]

    # Build the mask once; the False branch is its exact complement so that
    # F_1 is always counted over the same rows that F_m counts.
    is_true = data[:, 0] < X
    labels = data[:, 1]

    # For Condition = True
    T_m = np.sum(is_true)
    T_1 = np.sum(labels[is_true])
    T_0 = T_m - T_1

    # For Condition = False
    F_m = m - T_m
    F_1 = np.sum(labels[~is_true])
    F_0 = F_m - F_1

    # Calculate P(1) and P(0); guard empty branches against 0/0 -> nan
    P_T_1, P_T_0 = (T_1 / T_m, T_0 / T_m) if T_m else (0.0, 0.0)
    P_F_1, P_F_0 = (F_1 / F_m, F_0 / F_m) if F_m else (0.0, 0.0)

    return P_T_1, P_T_0, P_F_1, P_F_0, T_m, F_m

    
def GINI(P_1, P_0):
    """Return the two-class Gini impurity: 1 minus the squared class probabilities."""
    impurity = 1 - P_1**2
    impurity -= P_0**2
    return impurity

    
def avg_GINI(T_m, F_m, G_T, G_F):
    """Return the size-weighted mean of the two branch impurities.

    ``T_m`` / ``F_m`` are the branch sizes and ``G_T`` / ``G_F`` the matching
    Gini impurities; each impurity is weighted by its branch's share of the
    combined size.
    """
    total = T_m + F_m
    weighted_true = G_T * T_m / total
    weighted_false = G_F * F_m / total
    return weighted_true + weighted_false
    

### Building the Decision Tree using GINI Impurity

In [5]:
# Search for the age threshold with the lowest weighted Gini impurity.
# `data` is expected to be sorted by age (column 0) with the label in column 1.
optimalAge = 0
optimalGINI = float("inf")  # sentinel: any real impurity is smaller

# Candidate thresholds are midpoints of consecutive ages; the bound is derived
# from the data rather than hard-coding the number of rows.
for i in range(data.shape[0] - 1):
    if data[i, 0] == data[i + 1, 0]:
        # Equal neighbours add no new partition (the "midpoint" would be an
        # actual age and repeat a split already evaluated), so skip them.
        continue

    avg_age = (data[i, 0] + data[i + 1, 0]) / 2

    P_T_1, P_T_0, P_F_1, P_F_0, T_m, F_m = count(data, avg_age)

    GINI_T = GINI(P_T_1, P_T_0)
    GINI_F = GINI(P_F_1, P_F_0)

    avgGINI = avg_GINI(T_m, F_m, GINI_T, GINI_F)

    # Strict '<' keeps the first (lowest) threshold among ties.
    if avgGINI < optimalGINI:
        optimalGINI = avgGINI
        optimalAge = avg_age

print("Optimal Age : ", optimalAge)
print('Min GINI Impurity : ', optimalGINI)

Optimal Age :  54.5
Min GINI Impurity :  0.4558104848946434


------

## Problem 2

In [6]:
# Problem 2 uses the slope feature and the label; keep the original row order
# and mirror the frame as a NumPy array (column 0 = slope, column 1 = num).
new_df = hp.loc[:, ['slope', 'num']]
data = new_df.to_numpy()

new_df.head()

Unnamed: 0,slope,num
0,3,0
1,2,1
2,2,1
3,3,0
4,1,0


### Functions Used

In [22]:
def T_F(slope, num, X):
    """Count labels among the rows whose ``slope`` equals ``X``.

    Returns ``(T, F)`` where ``T`` is the sum of ``num`` over the matching
    rows and ``F`` is the number of matching rows minus ``T``.
    """
    selected = num[slope == X]
    positives = np.sum(selected)
    negatives = selected.shape[0] - positives
    return positives, negatives


def Entropy(T, F):
    """Return the binary Shannon entropy, in bits, of a node.

    Parameters
    ----------
    T, F : number
        Counts of the two classes in the node.

    Returns
    -------
    float
        ``-(p_T*log2(p_T) + p_F*log2(p_F))`` with the convention
        ``0*log2(0) == 0``, so a pure node has entropy 0. An empty node
        (``T + F == 0``) also returns 0.0.
    """
    total = T + F
    if total == 0:
        # No samples at all: nothing to be uncertain about.
        return 0.0

    E = 0.0
    for n in (T, F):
        p = n / total
        # math.log2(0) raises ValueError; the limit of p*log2(p) as p -> 0
        # is 0, so a zero-probability class simply contributes nothing.
        if p > 0:
            E -= p * math.log2(p)

    return E
    
    
def Prob(X, c):
    """Return the fraction of entries in ``X`` that are equal to ``c``."""
    matches = np.sum(X == c)
    return matches / X.shape[0]
    
    
def Multi_Entropy(num, slope):
    """Return the weighted entropy of ``num`` after splitting on ``slope``.

    Parameters
    ----------
    num : 1-D array
        Labels; passed to ``T_F``, which sums them per category.
    slope : 1-D array
        Categorical feature, aligned element-wise with ``num``.

    Returns
    -------
    number
        Sum over every distinct value ``v`` of ``slope`` of
        ``Prob(slope, v) * Entropy(T_v, F_v)``.
    """
    E_m = 0

    # Use the categories actually present instead of a hard-coded 1..3.
    # np.unique returns them sorted, so data containing exactly {1, 2, 3}
    # is accumulated in the same order as before.
    for value in np.unique(slope):
        T, F = T_F(slope, num, value)
        E_m += Prob(slope, value) * Entropy(T, F)

    return E_m
    
def Gain(num, slope):
    """Return the information gain of splitting ``num`` on ``slope``.

    Computed as the entropy of the whole label set (entries equal to 1
    versus the rest) minus ``Multi_Entropy(num, slope)``.
    """
    positives = np.sum(num == 1)
    negatives = num.shape[0] - positives

    parent_entropy = Entropy(positives, negatives)
    return parent_entropy - Multi_Entropy(num, slope)

### Building the Decision Tree using Information Gain

In [8]:
# Column 0 of `data` is the slope feature, column 1 is the label.
slope_col = data[:, 0]
num_col = data[:, 1]

# Share of samples in each slope category.
for label, value in (("P(1) : ", 1), ("P(2) : ", 2), ("P(3) : ", 3)):
    print(label, Prob(slope_col, value))

print("\nEm(slope) : ", Multi_Entropy(num_col, slope_col))

print("\nG(num, slope) : ", Gain(num_col, slope_col))

P(1) :  0.46864686468646866
P(2) :  0.46204620462046203
P(3) :  0.06930693069306931

Em(slope) :  0.8826618526182525

G(num, slope) :  0.11242190687472087
