In [154]:
import pandas as pd
import numpy as np
import math

# Part 1: For Categorical Data

In [159]:
data = pd.read_csv("Dataset/play_tennis.csv")
data.head()

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [160]:
# Calculating Initial Entropy
total = data.shape[0] # Total Number Of Columns
play_yes = data["play"].value_counts()[0]
play_no = data["play"].value_counts()[1]
p_play_yes = play_yes / total
p_play_no = play_no / total

print("Probability Of Playing =", p_play_yes)
print("Probability Of Not Playing =", p_play_no)

Probability Of Playing = 0.6428571428571429
Probability Of Not Playing = 0.35714285714285715


In [161]:
# Initial Entropy
E_0 = -(p_play_yes * math.log(p_play_yes)) - (p_play_yes * math.log(p_play_no))
print("Initial Entropy =", E_0)

Initial Entropy = 0.9459335375101269


In [162]:
columns_used = data.columns[1:5]
for i in columns_used:
    data_group = data.groupby(i) # Forming Groups For Each Columns
    
    E_column = 0 # To Store Final Entropy For The Group
    
    for j in data_group.groups.keys():
        
        data_x = data_group.get_group(j)["play"] # Getting Play Column For the Specific Group
        
        total_values = data_x.shape[0] # Number Of Rows In That Group
        
        data_x = data_x.value_counts() # Number Of Yes or No
        
        # For The Play = Yes
        count_yes = data_x[0]
        p_count_yes = count_yes / total_values # Probability Of Playing
        e_yes = math.log(p_count_yes) # Entropy For Yes
        
        # For The Play = No 
        try:
            count_no = data_x[1]
            p_count_no = count_no / total_values # Probability Of Not Playing
            e_no = math.log(p_count_no) # Entropy For No
        except:
            p_count_no = 0 # If no "no" column is found.
            e_no = 0 
        
        E_group = -(p_count_yes*e_yes) - (p_count_no*e_no) # Entropy For The Group
        E_column = E_column + ((total_values/total)*E_group) # Weighted Entropy Calculation For Each Group
        
    print("Column =", i)
    print("Weighted Entropy =", E_column)
    print("Information Gain =", E_0 - E_column)
    print()

Column = outlook
Weighted Entropy = 0.48072261929232607
Information Gain = 0.4652109182178008

Column = temp
Weighted Entropy = 0.6315010221774208
Information Gain = 0.3144325153327061

Column = humidity
Weighted Entropy = 0.5465122114944403
Information Gain = 0.3994213260156866

Column = wind
Weighted Entropy = 0.6183974457364384
Information Gain = 0.3275360917736885



In [175]:
data[data["outlook"] == "Overcast"]

Unnamed: 0,day,outlook,temp,humidity,wind,play
2,D3,Overcast,Hot,High,Weak,Yes
6,D7,Overcast,Cool,Normal,Strong,Yes
11,D12,Overcast,Mild,High,Strong,Yes
12,D13,Overcast,Hot,Normal,Weak,Yes


## Conclusion:
Since, Outlook (Overcast) column has the highest information gain, we select that column as the initial node.

# Part 2: For Numerical Column

In [148]:
data = pd.read_csv("Dataset/Iris.csv")
data.sort_values("PetalLengthCm", inplace=True)
data.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
22,23,4.6,3.6,1.0,0.2,Iris-setosa
13,14,4.3,3.0,1.1,0.1,Iris-setosa
14,15,5.8,4.0,1.2,0.2,Iris-setosa
35,36,5.0,3.2,1.2,0.2,Iris-setosa
36,37,5.5,3.5,1.3,0.2,Iris-setosa


In [149]:
def entropy(group):
    data = group.groupby("Species").count()["Id"] # Getting Number Of Occurence Of Each Species
    total = data.sum() # Total Number Of Rows
    entropy_sum = 0 # To store entropy
    for i in data :
        p = i/total
        entropy_sum = entropy_sum - (p*math.log(p))
    return(entropy_sum) # Entropy For the group

In [150]:
total = data.shape[0]
p_setosa = 50/total
p_versicolor = 50/total
p_verginica = 50/total

E_0 = -(p_setosa * math.log(p_setosa))-(p_versicolor * math.log(p_versicolor))-(p_verginica * math.log(p_verginica))
print("Initial Entropy =", E_0)

Initial Entropy = 1.0986122886681096


In [151]:
max_value = -1 # To Store Maximum Entropy
point = -1 # To Store Break Point

In [152]:
for i in data["PetalLengthCm"].unique():
    
    group_1 = data[(data["PetalLengthCm"] > i)]
    group_2 = data[~(data["PetalLengthCm"] > i)]
    
    len_group_1 = group_1.shape[0]
    len_group_2 = group_2.shape[0]
    
    E_1 = entropy(group_1)
    E_2 = entropy(group_2)
    
    w_average = (len_group_1/total * E_1) + (len_group_2/total * E_2)
    IG = E_0 - w_average
    
    print("Breaking Point =", i)
    print("Weighted Entropy =", w_average)
    print("Information Gain=", IG)
    print()
    
    # For Getting The Maximum Breakpoint
    if(IG > max_value):
        max_value = IG
        point = i

Breaking Point = 1.0
Weighted Entropy = 1.0912433629042542
Information Gain= 0.007368925763855394

Breaking Point = 1.1
Weighted Entropy = 1.083783116337971
Information Gain= 0.014829172330138585

Breaking Point = 1.2
Weighted Entropy = 1.0685784132540277
Information Gain= 0.030033875414081868

Breaking Point = 1.3
Weighted Entropy = 1.012071146421969
Information Gain= 0.08654114224614062

Breaking Point = 1.4
Weighted Entropy = 0.9001457603687307
Information Gain= 0.19846652829937883

Breaking Point = 1.5
Weighted Entropy = 0.7309878754978658
Information Gain= 0.36762441317024375

Breaking Point = 1.6
Weighted Entropy = 0.6158112441179745
Information Gain= 0.48280104455013506

Breaking Point = 1.7
Weighted Entropy = 0.5277242136737411
Information Gain= 0.5708880749943684

Breaking Point = 1.9
Weighted Entropy = 0.46209812037329684
Information Gain= 0.6365141682948128

Breaking Point = 3.0
Weighted Entropy = 0.4902565152135314
Information Gain= 0.6083557734545781

Breaking Point = 3.3


In [153]:
print("Point Where Maximum Information Gain Occurs =", point)

Point Where Maximum Information Gain Occurs = 1.9


In [158]:
for i in data[data["PetalLengthCm"] < 1.9]["Species"].tolist():
    print(i)

Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa
Iris-setosa


## Conclusion: 
Iris-Setosa Can easily be distincted from this breakpoint.