In [15]:
import pandas as pd
import numpy as np
import pprint
from collections import Counter


In [16]:
# Load the training data; read_csv already returns a DataFrame, so no extra wrapping is needed.
df = pd.read_csv('weatherid3.csv')


In [17]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Parameters
    ----------
    probs : iterable of float
        Probability of each outcome. Zero entries contribute nothing
        to the result.

    Returns
    -------
    float
        Sum of ``-p * log2(p)`` over the non-zero entries of ``probs``
        (the integer ``0`` when there are no such entries).

    Raises
    ------
    ValueError
        If any probability is negative (the logarithm is undefined).
    """
    import math
    # p * log(p) tends to 0 as p -> 0, so a zero probability is skipped
    # instead of being passed to math.log, which would raise ValueError.
    return sum(-prob * math.log(prob, 2) for prob in probs if prob != 0)

In [18]:
def entropy_of_list(a_list):
    """Return the entropy of the empirical label distribution of ``a_list``.

    Each distinct value in ``a_list`` is counted, the counts are turned
    into relative frequencies, and those frequencies are handed to
    ``entropy``.
    """
    # Tally how many times each distinct value occurs.
    tally = Counter()
    for item in a_list:
        tally[item] += 1

    n_items = float(len(a_list))

    # Relative frequency of every distinct value.
    frequencies = [count / n_items for count in tally.values()]
    return entropy(frequencies)

A Counter is a subclass of dict, so elements are stored as dictionary keys and their counts as the corresponding values; the elements are not kept in sorted order. It is the equivalent of a bag or multiset in other languages.



Syntax :

class collections.Counter([iterable-or-mapping])



from collections import Counter
  

print(Counter(['B','B','A','B','C','A','B','B','A','C']))
  

print(Counter({'A':3, 'B':5, 'C':2}))

print(Counter(A=3, B=5, C=2))


The output of all three lines is the same:

Counter({'B': 5, 'A': 3, 'C': 2})
Counter({'B': 5, 'A': 3, 'C': 2})
Counter({'B': 5, 'A': 3, 'C': 2})

In [19]:
# Entropy of the 'Answer' column over the whole DataFrame.
entropy_of_list(df['Answer'])

0.9402859586706309

In [20]:
def information_gain(df,split_attr,target_attr,trace=0):
    """Return the reduction in target entropy obtained by splitting on ``split_attr``.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to evaluate; must contain ``split_attr`` and ``target_attr``.
    split_attr : str
        Column whose distinct values define the partitions.
    target_attr : str
        Column holding the class label.
    trace : int, optional
        Accepted but not used by this function.

    Returns
    -------
    float
        Entropy of ``target_attr`` over all rows minus the weighted sum
        of its entropy within each ``split_attr`` group, each group
        weighted by its share of the rows.
    """
    n_rows = float(len(df.index))

    # Target labels bucketed by the value of the splitting attribute.
    target_by_value = df.groupby(split_attr)[target_attr]

    # Entropy inside each bucket, and the fraction of rows each bucket holds.
    subset_entropy = target_by_value.agg(entropy_of_list)
    subset_weight = target_by_value.size() / n_rows

    entropy_after = sum(subset_entropy * subset_weight)
    entropy_before = entropy_of_list(df[target_attr])
    return entropy_before - entropy_after


    
    
    
    

In [21]:

def id3(df,target_attr,attr_list,default_class=None):
    """Build an ID3 decision tree, represented as nested dicts.

    Parameters
    ----------
    df : pandas.DataFrame
        Training rows; must contain ``target_attr`` and every column
        named in ``attr_list``.
    target_attr : str
        Column holding the class label.
    attr_list : sequence of str
        Candidate columns to split on. It is not modified.
    default_class : optional
        Label returned when ``df`` has no rows.

    Returns
    -------
    A class label when the node is a leaf, otherwise a dict of the form
    ``{best_attr: {attr_value: subtree_or_label, ...}}``.
    """
    cnt = Counter(df[target_attr])

    # No rows: nothing to learn from, so fall back to the caller's default.
    if df.empty:
        return default_class

    # Pure node: every row carries the same label.
    if len(cnt) == 1:
        return next(iter(cnt))

    # Most frequent label among the rows at this node. most_common() is used
    # because max() over the keys would pick the largest label, not the
    # most frequent one.
    majority_class = cnt.most_common(1)[0][0]

    # Out of attributes but the labels still disagree: predict the majority
    # of the rows that actually reached this node.
    if not attr_list:
        return majority_class

    # Split on the attribute with the highest information gain; on ties the
    # earliest attribute in attr_list wins.
    best_attr = max(attr_list, key=lambda attr: information_gain(df, attr, target_attr))
    remaining_attrs = [attr for attr in attr_list if attr != best_attr]

    tree = {best_attr: {}}
    for attr_val, data_subset in df.groupby(best_attr):
        # This node's majority label is the fallback for an empty child.
        tree[best_attr][attr_val] = id3(data_subset, target_attr, remaining_attrs, majority_class)

    return tree









In [22]:
# Start from every column in the dataset, then drop the target so that
# only the predicting attributes remain.
attr_list = [column for column in df.columns]
print("List of Attributes are", attr_list)
attr_list.remove('Answer')
print("Predicting Attributes are", attr_list)

# Learn the decision tree for 'Answer' from the predicting attributes.
tree = id3(df, 'Answer', attr_list)

print("\nResultant decision tree is :")
print(pprint.pformat(tree))

List of Attributes are ['Outlook', 'Temperature', 'Humidity', 'Wind', 'Answer']
Predicting Attributes are ['Outlook', 'Temperature', 'Humidity', 'Wind']

Resultant decision tree is :
{'Outlook': {'overcast': 'yes',
             'rain': {'Wind': {'strong': 'no', 'weak': 'yes'}},
             'sunny': {'Humidity': {'high': 'no', 'normal': 'yes'}}}}
