In [1]:
import pandas as pd
from pandas import DataFrame 

In [2]:
# Read the training CSV into `df`; the later cells all start from this frame.
df = pd.read_csv(
    'csv_result-monks_t100.csv',
)
# A bare name on the last line lets the notebook render the frame.
df

Unnamed: 0,id,a1,a2,a3,a4,a5,a6,class
0,1,1,1,1,1,1,1,1
1,2,1,1,1,1,1,1,1
2,3,1,1,1,3,1,1,1
3,4,1,1,1,3,1,1,1
4,5,1,1,2,1,1,1,1
...,...,...,...,...,...,...,...,...
119,120,3,3,2,1,1,1,1
120,121,3,3,2,3,1,1,1
121,122,3,3,2,3,1,1,1
122,123,3,3,2,3,1,1,1


In [3]:
# Remove the `id` column so that only the attributes and the target remain
# (it appears to be a plain row identifier - confirm against the data source).
# `errors='ignore'` keeps the cell re-runnable: `df` is rebound here, so a second
# execution would otherwise raise KeyError because `id` has already been dropped.
df = df.drop(columns=['id'], errors='ignore')
df

Unnamed: 0,a1,a2,a3,a4,a5,a6,class
0,1,1,1,1,1,1,1
1,1,1,1,1,1,1,1
2,1,1,1,3,1,1,1
3,1,1,1,3,1,1,1
4,1,1,2,1,1,1,1
...,...,...,...,...,...,...,...
119,3,3,2,1,1,1,1
120,3,3,2,3,1,1,1
121,3,3,2,3,1,1,1
122,3,3,2,3,1,1,1


# Entropy of the Training Data Set

In [4]:
def entropy(probs):
    """Return the Shannon entropy, in bits, of a discrete distribution.

    Args:
        probs: Iterable of probabilities.

    Returns:
        The sum of ``-p * log2(p)`` over ``probs``. Zero probabilities
        contribute nothing (``p * log2(p)`` tends to 0 as ``p`` tends to 0),
        so they are skipped instead of raising ``ValueError`` from the
        logarithm. An empty iterable yields 0.

    Raises:
        ValueError: If a probability is negative (logarithm domain error).
    """
    import math  # local import keeps the function usable on its own
    return sum(-prob * math.log2(prob) for prob in probs if prob != 0)

def entropy_of_list(a_list):
    """Print each label's share of ``a_list`` and return the entropy of the labels.

    Args:
        a_list: Sized iterable of hashable, mutually comparable class labels
            (for example the target column of a DataFrame).

    Returns:
        ``entropy`` applied to the relative frequency of every distinct
        label, or 0.0 when ``a_list`` is empty.
    """
    from collections import Counter
    cnt = Counter(x for x in a_list)

    num_instances = len(a_list) * 1.0
    if num_instances == 0:
        # Nothing to report and nothing uncertain about an empty selection.
        return 0.0

    # Report every label next to its own frequency. Pairing min(cnt) with
    # min(probs) would attach the smaller probability to the smaller label
    # even when that probability belongs to the other class.
    for label in sorted(cnt):
        print(" \n Probabilities of Class {0} is {1}:".format(label, cnt[label] / num_instances))

    probs = [x / num_instances for x in cnt.values()]
    return entropy(probs)
    

# Show the target column, then measure the entropy of the whole data set.
class_labels = df['class']
print("\n  INPUT DATA SET FOR ENTROPY CALCULATION:\n", class_labels)

# entropy_of_list prints the per-class shares before returning the value.
total_entropy = entropy_of_list(class_labels)

print("\n Total Entropy of Class Data Set:",total_entropy)


  INPUT DATA SET FOR ENTROPY CALCULATION:
 0      1
1      1
2      1
3      1
4      1
      ..
119    1
120    1
121    1
122    1
123    1
Name: class, Length: 124, dtype: int64
 
 Probabilities of Class 0 is 0.5:
 
 Probabilities of Class 1 is 0.5:

 Total Entropy of Class Data Set: 1.0


# Information Gain of Attributes 

In [5]:
def information_gain(df, split_attribute_name, target_attribute_name, trace=0):
    """Return the entropy reduction obtained by splitting ``df`` on one column.

    Args:
        df: DataFrame holding both the split column and the target column.
        split_attribute_name: Column whose distinct values define the partitions.
        target_attribute_name: Column holding the class labels.
        trace: Accepted but not read by this function.

    Returns:
        Entropy of the target over all rows minus the sum, over partitions,
        of (partition entropy * partition's share of the rows).
    """
    print("Information Gain Calculation of ",split_attribute_name)

    total_rows = len(df.index) * 1.0

    def share_of_rows(labels):
        # Fraction of all rows that fall into this partition.
        return len(labels) / total_rows

    # One row per distinct split value: that partition's entropy and weight.
    per_value = df.groupby(split_attribute_name).agg(
        {target_attribute_name: [entropy_of_list, share_of_rows]}
    )[target_attribute_name]
    per_value.columns = ['Entropy', 'PropObservations']

    weighted_entropy = sum(per_value['Entropy'] * per_value['PropObservations'])
    # Computed after the partitions so its class report is printed last.
    baseline_entropy = entropy_of_list(df[target_attribute_name])
    return baseline_entropy - weighted_entropy


# (attribute, label) pairs; each label is emitted exactly as written here.
gain_report_labels = [
    ('a1', 'Info-gain for a1 is :'),
    ('a2', '\n Info-gain for a2 is: '),
    ('a3', '\n Info-gain for a3 is:'),
    ('a4', '\n Info-gain for a4 is:'),
    ('a5', '\n Info-gain for a5 is:'),
    ('a6', '\n Info-gain for a6 is:'),
]

for attr_name, gain_label in gain_report_labels:
    # information_gain prints its own trace before the summary line below.
    gain_value = information_gain(df, attr_name, 'class')
    print(gain_label + str(gain_value), "\n")

Information Gain Calculation of  a1
 
 Probabilities of Class 0 is 0.3111111111111111:
 
 Probabilities of Class 1 is 0.6888888888888889:
 
 Probabilities of Class 0 is 0.47619047619047616:
 
 Probabilities of Class 1 is 0.5238095238095238:
 
 Probabilities of Class 0 is 0.2972972972972973:
 
 Probabilities of Class 1 is 0.7027027027027027:
 
 Probabilities of Class 0 is 0.5:
 
 Probabilities of Class 1 is 0.5:
Info-gain for a1 is :0.07527255560831936 

Information Gain Calculation of  a2
 
 Probabilities of Class 0 is 0.42857142857142855:
 
 Probabilities of Class 1 is 0.5714285714285714:
 
 Probabilities of Class 0 is 0.47619047619047616:
 
 Probabilities of Class 1 is 0.5238095238095238:
 
 Probabilities of Class 0 is 0.46808510638297873:
 
 Probabilities of Class 1 is 0.5319148936170213:
 
 Probabilities of Class 0 is 0.5:
 
 Probabilities of Class 1 is 0.5:

 Info-gain for a2 is: 0.005838429962909175 

Information Gain Calculation of  a3
 
 Probabilities of Class 0 is 0.4615384615

# ID3 Algorithm

In [6]:
def id3(df, target_attribute_name, attribute_names, default_class=None):
    """Build an ID3 decision tree as nested dicts.

    Args:
        df: DataFrame of training rows containing the target column and
            every column listed in ``attribute_names``.
        target_attribute_name: Name of the class-label column.
        attribute_names: Sequence of columns still available for splitting.
        default_class: Label returned when ``df`` has no rows.

    Returns:
        A class label when the node is a leaf, otherwise
        ``{attribute: {value: subtree}}`` where each subtree is again a
        label or a dict of the same shape.
    """
    from collections import Counter
    cnt = Counter(x for x in df[target_attribute_name])

    # Pure node: every row carries the same label.
    if len(cnt) == 1:
        return next(iter(cnt))

    # No rows to learn from: fall back to the label handed down by the caller.
    if df.empty:
        return default_class

    # Most frequent label among the rows at this node (first seen wins ties).
    # max(cnt.keys()) would pick the largest label value, not the commonest.
    majority_class = cnt.most_common(1)[0][0]

    # Attributes exhausted on an impure node: answer with this node's own
    # majority rather than the default inherited from the parent.
    if not attribute_names:
        return majority_class

    # Split on the attribute with the highest information gain
    # (the first one wins when several tie).
    gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
    best_attr = attribute_names[gainz.index(max(gainz))]

    tree = {best_attr: {}}
    remaining_attribute_names = [i for i in attribute_names if i != best_attr]

    # One subtree per observed value of the chosen attribute; this node's
    # majority class becomes the default for the children.
    for attr_val, data_subset in df.groupby(best_attr):
        tree[best_attr][attr_val] = id3(data_subset,
                                        target_attribute_name,
                                        remaining_attribute_names,
                                        majority_class)
    return tree

# Predicting Attributes

In [7]:
# Start from every column of `df`, then take the target out of the list in place.
attribute_names = [column for column in df.columns]
print("List of Attributes:", attribute_names)
attribute_names.remove('class')
print("Predicting Attributes:", attribute_names)

List of Attributes: ['a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'class']
Predicting Attributes: ['a1', 'a2', 'a3', 'a4', 'a5', 'a6']


In [8]:
from pprint import pprint

# Fit the tree on the full training frame using the predictor columns.
tree = id3(df,'class',attribute_names)

# Show the nested dict twice: once raw, once pretty-printed.
print("\n\nThe Resultant Decision Tree is :\n")
print(tree)
pprint(tree)

# The first top-level key of the nested dict is the attribute split on at the root.
attribute = next(iter(tree))
print("Best Attribute :\n",attribute)
root_branches = tree[attribute]
print("Tree Keys:\n",root_branches.keys())

Information Gain Calculation of  a1
 
 Probabilities of Class 0 is 0.3111111111111111:
 
 Probabilities of Class 1 is 0.6888888888888889:
 
 Probabilities of Class 0 is 0.47619047619047616:
 
 Probabilities of Class 1 is 0.5238095238095238:
 
 Probabilities of Class 0 is 0.2972972972972973:
 
 Probabilities of Class 1 is 0.7027027027027027:
 
 Probabilities of Class 0 is 0.5:
 
 Probabilities of Class 1 is 0.5:
Information Gain Calculation of  a2
 
 Probabilities of Class 0 is 0.42857142857142855:
 
 Probabilities of Class 1 is 0.5714285714285714:
 
 Probabilities of Class 0 is 0.47619047619047616:
 
 Probabilities of Class 1 is 0.5238095238095238:
 
 Probabilities of Class 0 is 0.46808510638297873:
 
 Probabilities of Class 1 is 0.5319148936170213:
 
 Probabilities of Class 0 is 0.5:
 
 Probabilities of Class 1 is 0.5:
Information Gain Calculation of  a3
 
 Probabilities of Class 0 is 0.46153846153846156:
 
 Probabilities of Class 1 is 0.5384615384615384:
 
 Probabilities of Class 0 i

# Classification Accuracy

In [9]:
def classify(instance, tree, default=None):
    """Predict a label for ``instance`` by walking ``tree`` from the root.

    Args:
        instance: Row indexable by attribute name (a dict or a pandas Series).
        tree: Nested ``{attribute: {value: subtree_or_label}}`` dict. A value
            that is not a dict is treated as a leaf label, so a bare label
            is also accepted as a one-node tree.
        default: Returned when, at any depth, the instance's value for the
            tested attribute has no branch in the tree.

    Returns:
        The leaf label reached, or ``default`` if the walk hits a missing branch.
    """
    node = tree
    # Descend iteratively so the fallback applies at every level, not only the root.
    while isinstance(node, dict):
        attribute = next(iter(node))
        branches = node[attribute]
        value = instance[attribute]
        if value not in branches:
            return default
        node = branches[value]
    return node

In [10]:
# The first key of the tree dict is the attribute chosen for the root split.
tree_keys = iter(tree)
attribute = next(tree_keys)
print("Best Attribute :\n",attribute)

Best Attribute :
 a1


In [11]:
# Predict a label for every training row. The fallback handed to `classify` is the
# integer 1 so that it has the same type as the int64 labels in `class`; the string
# '1' never compares equal to an integer label and would always be scored as a miss.
df['predicted'] = df.apply(classify, axis=1, args=(tree, 1))

# Share of rows whose prediction equals the true label.
training_accuracy = (df['class'] == df['predicted']).mean()
print('\n Accuracy is:\n' + str(training_accuracy))

df[['class', 'predicted']]



 Accuracy is:
0.7419354838709677


Unnamed: 0,class,predicted
0,1,1
1,1,1
2,1,1
3,1,1
4,1,1
...,...,...
119,1,1
120,1,1
121,1,1
122,1,1


# Classification Accuracy: Training/Testing Set

In [13]:
# Load the test rows and drop the `id` column, as was done for the training frame.
test_data = pd.read_csv("csv_result-monks_test.csv").drop(['id'], axis='columns')
print(test_data)

# Fit a tree on `df`, then label each test row; 0.0 is the fallback given to classify.
train_tree = id3(df, 'class', attribute_names)
test_predictions = test_data.apply(classify, axis=1, args=(train_tree, 0.0))
test_data['predicted2'] = test_predictions

# Fraction of test rows whose prediction equals the true label.
matches = sum(test_data['class'] == test_data['predicted2'])
row_count = 1.0 * len(test_data.index)
print ('\n\n Accuracy is : ' + str(matches / row_count))
test_data[['class', 'predicted2']]

     a1  a2  a3  a4  a5  a6  class
0     1   1   1   1   1   1      1
1     1   1   1   1   1   1      1
2     1   1   1   1   1   1      1
3     1   1   1   1   1   1      1
4     1   1   1   1   1   1      1
..   ..  ..  ..  ..  ..  ..    ...
427   3   3   2   3   1   1      1
428   3   3   2   3   1   1      1
429   3   3   2   3   1   1      1
430   3   3   2   3   1   1      1
431   3   3   2   3   1   1      1

[432 rows x 7 columns]
Information Gain Calculation of  a1
 
 Probabilities of Class 0 is 0.3111111111111111:
 
 Probabilities of Class 1 is 0.6888888888888889:
 
 Probabilities of Class 0 is 0.47619047619047616:
 
 Probabilities of Class 1 is 0.5238095238095238:
 
 Probabilities of Class 0 is 0.2972972972972973:
 
 Probabilities of Class 1 is 0.7027027027027027:
 
 Probabilities of Class 0 is 0.5:
 
 Probabilities of Class 1 is 0.5:
Information Gain Calculation of  a2
 
 Probabilities of Class 0 is 0.42857142857142855:
 
 Probabilities of Class 1 is 0.5714285714285714:
 


Unnamed: 0,class,predicted2
0,1,1.0
1,1,1.0
2,1,1.0
3,1,1.0
4,1,1.0
...,...,...
427,1,1.0
428,1,1.0
429,1,1.0
430,1,1.0


# End