In [17]:
# Max height of tree is 5

import pandas as pd
import numpy as np
buff = np.finfo(float).eps
import random

In [25]:
def entropy(data):
    target = data.columns[-1]
    entropy = 0
    values = pd.unique(data[target])
    for value in values:
        prob = data[target].value_counts()[value]/len(data[target])
        entropy += -prob * (np.log(prob))    
    #print(entropy)

    return entropy

def entropy_attribute(data,attribute):
    target = data.columns[-1]
    
    values = pd.unique(data[attribute]) 
    targets = pd.unique(data[target])
    entropy = 0
    #print(data)
    for value in values:
        temp = 0
        for t in targets:
            n = len(data[attribute][data[attribute]==value][data[target]==t])
            d = len(data[attribute][data[attribute]==value])
            f = n/(d+buff)  # buff added to not make denominator zero
            temp += -f * (np.log(f + buff))
            
        entropy += (d/len(data))*temp
        #print(entropy)
        return abs(entropy)
            
def best(data):
    entopy_attr = []
    attributes = list(data.columns)[:-1]
    #print(attributes)
    l = [] 
    for attribute in attributes:
        #print(attribute)
        #print(entropy_attribute(data,attribute))
        l.append(entropy(data) - entropy_attribute(data,attribute))
    
    #print(ig)
    return data.columns[:-1][np.argmax(l)]


def get_table(data, node,value):
    return data[data[node] == value].reset_index(drop=True)
            
def makedtree(data,height,dtree=None): 
    target = data.columns[-1]
    #print(target)
    node = best(data)
    #print(node)
    values = np.unique(data[node])
    
    if dtree is None:                    
        dtree={}
        dtree[node] = {}

    for value in values:
        
        table = get_table(data,node,value)
        classes,count = np.unique(table[target],return_counts=True)
        #print(list(table.columns))
        if len(count)==1 or height==5 or len(list(table.columns))==2:
            #print(dtree)
            #print()
            if(len(count) == 1):
                dtree[node][value] = classes[0]   
            else:            
                dtree[node][value] = classes[np.argmax(count)]                                                   
        else:       
        
            #print(table,node)
            dtree[node][value] = makedtree(table.drop([node],axis=1),height+1) 
    
    return dtree        
    

In [26]:
data1 = pd.read_csv('weather.csv')
data2 = pd.read_csv('train.csv')

In [27]:
data1.head(4)

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes


In [28]:
data2.head(4)

Unnamed: 0,Day,Season,Wind,Rain,Class
0,Weekday,Spring,,,On-Time
1,Weekday,Winter,,Slight,On-Time
2,Weekday,Winter,,Slight,On-Time
3,Weekday,Winter,High,Heavy,Late


In [29]:
# Building dtree 1
tree1 = makedtree(data1,0,None)
print("Tree 1")
print(tree1)

Tree 1
{'temperature': {'cool': {'windy': {False: 'yes', True: {'outlook': {'overcast': 'yes', 'rainy': 'no'}}}}, 'hot': {'outlook': {'overcast': 'yes', 'sunny': 'no'}}, 'mild': {'outlook': {'overcast': 'yes', 'rainy': {'windy': {False: 'yes', True: 'no'}}, 'sunny': {'humidity': {'high': 'no', 'normal': 'yes'}}}}}}


In [32]:
# Building dtree 2
tree2 = makedtree(data2,0,None)
print("Tree 2")
print(tree2)

Tree 2
{'Wind': {'High': {'Season': {'Autumn': 'On-Time', 'Spring': 'Cancelled', 'Summer': 'On-Time', 'Winter': {'Day': {'Weekday': {'Rain': {'Heavy': 'Late'}}}}}}, 'None': 'On-Time', 'Normal': {'Season': {'Autumn': 'Very-Late', 'Spring': 'On-Time', 'Summer': 'On-Time', 'Winter': {'Day': {'Saturday': 'Late', 'Weekday': 'Very-Late'}}}}}}


In [33]:
def print_tree(dtree,width):
    
    for key in dtree.keys():
        for value in list(dtree[key].keys()):
            tree_value = dtree[key][value]
            if(type(tree_value) is dict):                
                print("| "*width + key + ' = ' + str(value))
            else:
                print("| "*width + key + ' = ' + str(value),end = '')
                
            #print(tree_value)
            if(type(tree_value) is dict):
                print_tree(tree_value,width+1)
            else:
                print(": " + tree_value)

In [34]:
print_tree(tree1,0)

temperature = cool
| windy = False: yes
| windy = True
| | outlook = overcast: yes
| | outlook = rainy: no
temperature = hot
| outlook = overcast: yes
| outlook = sunny: no
temperature = mild
| outlook = overcast: yes
| outlook = rainy
| | windy = False: yes
| | windy = True: no
| outlook = sunny
| | humidity = high: no
| | humidity = normal: yes


In [35]:
print_tree(tree2,0)

Wind = High
| Season = Autumn: On-Time
| Season = Spring: Cancelled
| Season = Summer: On-Time
| Season = Winter
| | Day = Weekday
| | | Rain = Heavy: Late
Wind = None: On-Time
Wind = Normal
| Season = Autumn: Very-Late
| Season = Spring: On-Time
| Season = Summer: On-Time
| Season = Winter
| | Day = Saturday: Late
| | Day = Weekday: Very-Late


In [11]:
# Prediction

def predict(data,dtree):
    
    prediction = -1
    for value in dtree.keys():
        recurse_value = data[value]
        
        tree_value = dtree[value][recurse_value]
        if type(tree_value) is dict:
            #print(tree_value)
            prediction = predict(data,tree_value)
        
        else:
            prediction = tree_value
        
    if(prediction == -1):
        return None
    
    return prediction
    

In [12]:
print("Data 1 Predictions")
p = 0
n = 0
for j in range(0,5):
    random_no = random.randint(0,len(data1)-1)
    print("Test Input " + str(j+1))
    print(data1.iloc[random_no][:-1])
    print('\033[1m',end='')
    pred = predict(data1.iloc[random_no][:-1],tree1)
    print("Prediction = {}".format(pred))
    print('\033[0m')    
    if(data1.iloc[random_no][-1] == pred):
          p += 1
    else:
          n += 1
    print()
print('Accuracy = {}'.format(p/(p+n)))          

Data 1 Predictions
Test Input 1
outlook         sunny
temperature      mild
humidity       normal
windy            True
Name: 10, dtype: object
[1mPrediction = yes
[0m

Test Input 2
outlook        sunny
temperature     mild
humidity        high
windy          False
Name: 7, dtype: object
[1mPrediction = no
[0m

Test Input 3
outlook        sunny
temperature      hot
humidity        high
windy           True
Name: 1, dtype: object
[1mPrediction = no
[0m

Test Input 4
outlook        rainy
temperature     mild
humidity        high
windy          False
Name: 3, dtype: object
[1mPrediction = yes
[0m

Test Input 5
outlook        rainy
temperature     mild
humidity        high
windy          False
Name: 3, dtype: object
[1mPrediction = yes
[0m

Accuracy = 1.0


In [13]:
print("Data 2 Predictions")
p = 0
n = 0
for j in range(0,5):
    random_no = random.randint(0,len(data2)-1)
    print("Test Input " + str(j+1))
    print(data2.iloc[random_no][:-1])
    print('\033[1m',end='')
    pred = predict(data2.iloc[random_no][:-1],tree2)    
    print("Prediction = {}".format(pred))
    print('\033[0m')    
    if(data2.iloc[random_no][-1] == pred):
          p += 1
    else:
          n += 1                    
    print()

print('Accuracy = {}'.format(p/(p+n)))          

Data 2 Predictions
Test Input 1
Day       Saturday
Season      Winter
Wind        Normal
Rain          None
Name: 12, dtype: object
[1mPrediction = Late
[0m

Test Input 2
Day       Weekday
Season     Winter
Wind         High
Rain        Heavy
Name: 3, dtype: object
[1mPrediction = Late
[0m

Test Input 3
Day       Weekday
Season     Winter
Wind         None
Rain       Slight
Name: 2, dtype: object
[1mPrediction = On-Time
[0m

Test Input 4
Day       Weekday
Season     Summer
Wind         None
Rain       Slight
Name: 9, dtype: object
[1mPrediction = On-Time
[0m

Test Input 5
Day       Saturday
Season      Winter
Wind        Normal
Rain          None
Name: 12, dtype: object
[1mPrediction = Late
[0m

Accuracy = 1.0


In [16]:
# Accuracy coming high because we are using random examples from the training dataset