In [1]:
import pandas as pd
import math

In [2]:
# Load the training table from a CSV in the working directory. The trailing bare
# expression makes the notebook render the first rows as this cell's output.
dataset = pd.read_csv('income.csv')
dataset.head()

Unnamed: 0,Age,Income,Student,Credit_Rating,Buys Computer
0,<=30,high,no,fair,no
1,<=30,high,no,excellent,no
2,31?40,high,no,fair,yes
3,>40,medium,no,fair,yes
4,>40,low,yes,fair,yes


In [3]:
# Column names of the loaded frame, as a plain Python list so later cells can
# index into it.
columns = [name for name in dataset.columns]
print(columns)

['Age', 'Income', 'Student', 'Credit_Rating', 'Buys Computer']


In [4]:
# Prior distribution of the target label.
# The label is taken from the frame's own last column rather than from the
# mutable `columns` list: a later cell removes the label from `columns`, and
# re-running this cell afterwards would otherwise silently treat the last input
# attribute as the target.
target_series = dataset[dataset.columns[-1]]
output_column = list(target_series)
output_classes = list(target_series.unique())  # order of first appearance
prob_output_classes = []

total_rows = len(output_column)
for output_class in output_classes:
    count = output_column.count(output_class)
    print(count)
    prob_output_classes.append(count / total_rows)

print(prob_output_classes)

5
9
[0.35714285714285715, 0.6428571428571429]


In [5]:
def entropy(prob_of_classes, base=10):
    """Return the Shannon entropy of a discrete probability distribution.

    Args:
        prob_of_classes: Iterable of class probabilities. Zero entries are
            skipped (they contribute nothing and would otherwise divide by zero).
        base: Logarithm base. Defaults to 10, which is what this notebook has
            always used; pass 2 to get the entropy in bits.

    Returns:
        The sum of ``p * log_base(1 / p)`` over the non-zero probabilities.
        Returns 0 when there are no non-zero probabilities.

    Raises:
        ValueError: If ``base`` is not a positive number different from 1.
    """
    if base <= 0 or base == 1:
        raise ValueError("base must be a positive number other than 1")

    # Use the dedicated functions for the common bases so the default result is
    # bit-for-bit what math.log10 produced before the base became configurable.
    if base == 10:
        log = math.log10
    elif base == 2:
        log = math.log2
    else:
        def log(value):
            return math.log(value, base)

    return sum(
        prob_of_class * log(1 / prob_of_class)
        for prob_of_class in prob_of_classes
        if prob_of_class != 0
    )

In [6]:
# Entropy of the target-label distribution. information_gain() later uses this
# as the baseline from which each attribute's weighted entropy is subtracted.
entropy_output_class = entropy(prob_output_classes)
print(entropy_output_class)

0.28305427806152245


In [7]:
# Keep only the input attributes (everything except the last column, the label).
# The list is rebuilt from the frame instead of popped in place so that this
# cell is idempotent: re-running it cannot strip a second column.
columns = list(dataset.columns[:-1])
print(columns)

['Age', 'Income', 'Student', 'Credit_Rating']


In [8]:
# For every input attribute, tally how often each distinct value occurs and
# convert the tallies to relative frequencies. Both results are lists aligned
# with `columns`; inner lists follow the order in which values first appear.
prob_input_classes = []
count_of_subclasses = []
for column in columns:
    values = list(dataset[column])
    n_rows = len(values)

    value_counts = [values.count(value) for value in list(dataset[column].unique())]

    count_of_subclasses.append(value_counts)
    prob_input_classes.append([value_count / n_rows for value_count in value_counts])

print(count_of_subclasses)
print(prob_input_classes)

[[5, 4, 5], [4, 6, 4], [7, 7], [8, 6]]
[[0.35714285714285715, 0.2857142857142857, 0.35714285714285715], [0.2857142857142857, 0.42857142857142855, 0.2857142857142857], [0.5, 0.5], [0.5714285714285714, 0.42857142857142855]]


In [9]:
# Distinct values of each input attribute, in first-appearance order and
# aligned with `columns`.
input_classes = [list(dataset[column].unique()) for column in columns]

print(input_classes)

[['<=30', '31?40', '>40'], ['high', 'medium', 'low'], ['no', 'yes'], ['fair', 'excellent']]


In [10]:
def get_prob(column, input_class):
    """Count rows for every (attribute value, target class) combination.

    Despite the name, the result holds raw counts, not probabilities.

    The attribute is looked up by name in the global ``dataset`` and the target
    is the frame's last column. Earlier the column position was derived from the
    global ``columns`` list, which only worked while that list happened to be
    aligned with the frame's column order.

    Args:
        column: Name of the attribute column in ``dataset``.
        input_class: Iterable of attribute values to tally.

    Returns:
        A list with one entry per value in ``input_class``. Each entry is a list
        of integer counts, one per label in the global ``output_classes`` and in
        the same order.
    """
    # Extract both columns once; they do not change while counting.
    feature_values = list(dataset[column])
    target_values = list(dataset.iloc[:, -1])

    input_subclass_count = []
    for input_subclass in input_class:
        subclass_count = []
        for output_class in output_classes:
            matches = sum(
                1
                for feature, target in zip(feature_values, target_values)
                if feature == input_subclass and target == output_class
            )
            subclass_count.append(matches)
        input_subclass_count.append(subclass_count)

    return input_subclass_count

In [11]:
# Class-conditional counts for every input attribute, aligned with `columns`.
count_output_subclasses = []
for position, column_name in enumerate(columns):
    count_output_subclasses.append(get_prob(column_name, input_classes[position]))

print(count_output_subclasses)

[[[3, 2], [0, 4], [2, 3]], [[2, 2], [2, 4], [1, 3]], [[4, 3], [1, 6]], [[2, 6], [3, 3]]]


In [12]:
# Turn the per-value class counts into conditional probabilities by dividing
# each count by the number of rows that carry that attribute value.
prob_subclasses = []

for attribute_pos in range(len(count_of_subclasses)):
    value_totals = count_of_subclasses[attribute_pos]
    class_counts_by_value = count_output_subclasses[attribute_pos]

    prob_subclasses.append([
        [class_count / value_totals[value_pos] for class_count in class_counts_by_value[value_pos]]
        for value_pos in range(len(value_totals))
    ])

print(prob_subclasses)

[[[0.6, 0.4], [0.0, 1.0], [0.4, 0.6]], [[0.5, 0.5], [0.3333333333333333, 0.6666666666666666], [0.25, 0.75]], [[0.5714285714285714, 0.42857142857142855], [0.14285714285714285, 0.8571428571428571]], [[0.25, 0.75], [0.5, 0.5]]]


In [13]:
# Entropy of the target distribution within each attribute value; the nesting
# mirrors `prob_subclasses` (attribute -> value).
entropy_subclasses = [
    [entropy(class_distribution) for class_distribution in attribute_distributions]
    for attribute_distributions in prob_subclasses
]

print(entropy_subclasses)

[[0.29228525323862886, 0.0, 0.29228525323862886], [0.3010299956639812, 0.27643459094367495, 0.24421905028821556], [0.296583221518423, 0.17811125397113373], [0.24421905028821556, 0.3010299956639812]]


In [37]:
def information_gain(entropy_subclasses):
    """Compute the information gain of each attribute.

    For every attribute the per-value entropies are weighted by the matching
    entry of the global ``prob_input_classes`` and summed; that weighted sum is
    subtracted from the global ``entropy_output_class``.

    Args:
        entropy_subclasses: One list of per-value entropies per attribute,
            aligned position-by-position with ``prob_input_classes``.

    Returns:
        A list of gains, one per attribute, in the input order.
    """
    gain_list = []
    for attribute_pos, value_entropies in enumerate(entropy_subclasses):
        value_weights = prob_input_classes[attribute_pos]

        weighted_entropy = 0
        for value_pos, value_entropy in enumerate(value_entropies):
            weighted_entropy += value_entropy * value_weights[value_pos]

        gain_list.append(entropy_output_class - weighted_entropy)
    return gain_list

In [46]:
# One gain per input attribute; the next cell indexes `columns` with a position
# from this list, so the two are expected to share the same order.
gain_list = information_gain(entropy_subclasses)
print(gain_list)

[0.07427909717678755, 0.008796868813605585, 0.04570704031674411, 0.014487679755121607]


In [50]:
# Select the attribute with the largest gain; when several tie, the earliest
# position is chosen.
max_gain_col_index = max(range(len(gain_list)), key=gain_list.__getitem__)
max_gain_value = gain_list[max_gain_col_index]

print(
    "Maximum Information Gain: ",
    max_gain_value,
    "\nFirst Splitting Attribute is: " + columns[max_gain_col_index],
)

Maximum Information Gain:  0.07427909717678755 
First Splitting Attribute is: Age
