In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import numpy.linalg as la
from numpy import log2 as log
import math
import operator
import pprint

In [3]:
# Column labels applied to the headerless "car.data" file. ID3.build treats
# the last column ('recommendation') as the class label.
col_names = [
    'price_buy',
    'price_main',
    'n_doors',
    'n_persons',
    'lug_boot',
    'safety',
    'recommendation',
]

# header=None: the file has no header row, so the names are supplied explicitly.
df = pd.read_csv("car.data", header=None, names=col_names)
df.head()

Unnamed: 0,price_buy,price_main,n_doors,n_persons,lug_boot,safety,recommendation
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [96]:
class ID3():
    """Minimal ID3 decision-tree learner for purely categorical data.

    The numeric helpers (``gain_setup``, ``attr_entropy``, ``dataset_entropy``,
    ``info_gain``, ``find_split_attr``) work on a 2-D integer array in which
    every column holds non-negative category codes and the LAST column is the
    class label. ``build`` accepts a pandas DataFrame with the same layout
    (last column = class) and returns the tree as nested dictionaries:
    ``{attribute: {value: class_label_or_subtree}}``.
    """

    def gain_setup(self, data):
        """Tabulate, per feature and per value code, the class distribution.

        Parameters
        ----------
        data : 2-D array of non-negative integer codes, last column = class.

        Returns
        -------
        (features_wrt_count, features_proba) : tuple of dicts keyed by the
            feature index.
            ``features_wrt_count[a][v]`` is the list of class proportions
            among rows where feature ``a`` has code ``v`` (``[0]`` when no row
            has that code).
            ``features_proba[a][v]`` is the fraction of all rows where feature
            ``a`` has code ``v``.
        """
        n_rows = data.shape[0]
        # Cast to plain ints: category codes are usually a narrow integer
        # dtype, and "+ 1" on such a value can overflow.
        labels = data[:, -1].astype(int)
        nbr_classes = int(labels.max()) + 1

        features_wrt_count = {}
        features_proba = {}
        for a in range(data.shape[1] - 1):
            column = data[:, a].astype(int)
            distributions = []
            probabilities = []
            for code in range(int(column.max()) + 1):
                # Class histogram of the rows carrying this value code.
                counts = np.bincount(labels[column == code], minlength=nbr_classes)
                total = int(counts.sum())
                probabilities.append(total / n_rows)
                if total != 0:
                    distributions.append([c / total for c in counts.tolist()])
                else:
                    # Code absent from this (sub)table: contributes nothing.
                    distributions.append([0])
            features_wrt_count[a] = distributions
            features_proba[a] = probabilities
        return features_wrt_count, features_proba

    def log_upgrade(self, x):
        """Return log2(x), using the convention log2(0) = 0 so 0*log(0) is 0."""
        if x != 0:
            return math.log2(x)
        return 0

    def log_fct(self, array):
        """Return sum(p * log2(p)) over ``array`` (the negated entropy)."""
        return sum(p * self.log_upgrade(p) for p in array)

    def attr_entropy(self, data):
        """Return ``{feature_index: conditional entropy of the class given it}``."""
        feature_dict, proba_dict = self.gain_setup(data)
        entropies = {}
        for feature in range(data.shape[1] - 1):
            weighted = 0
            for proba, distribution in zip(proba_dict[feature], feature_dict[feature]):
                weighted = weighted + proba * self.log_fct(distribution)
            entropies[feature] = -weighted
        return entropies

    def dataset_entropy(self, data):
        """Return the entropy (in bits) of the class column of ``data``."""
        _, class_occur = np.unique(data[:, -1], return_counts=True)
        classes_proba = [occur / data.shape[0] for occur in class_occur]
        return -self.log_fct(classes_proba)

    def info_gain(self, data):
        """Return ``{feature_index: information gain}`` for every feature."""
        set_ent = self.dataset_entropy(data)
        return {feature: set_ent - ent
                for feature, ent in self.attr_entropy(data).items()}

    def find_split_attr(self, data):
        """Return the index of the feature with the highest information gain.

        Ties are resolved in favour of the lowest feature index.
        """
        i_gain = self.info_gain(data)
        return max(i_gain.items(), key=operator.itemgetter(1))[0]

    def get_subtable(self, df, node, value):
        """Return the rows of ``df`` where column ``node`` equals ``value``."""
        return df[df[node] == value].reset_index(drop=True)

    def _encode(self, frame):
        """Stack the category codes of every column into one 2-D array."""
        return np.column_stack([frame[col].cat.codes.to_numpy()
                                for col in frame.columns])

    def _best_attribute(self, frame):
        """Return the name of the column with the highest information gain."""
        return frame.columns[self.find_split_attr(self._encode(frame))]

    def _branches(self, frame, node):
        """Return ``{value: leaf or subtree}`` for each value of ``node``."""
        return {value: self._grow(self.get_subtable(frame, node, value))
                for value in np.unique(frame[node])}

    def _grow(self, frame):
        """Return a class label (leaf) or a ``{attribute: branches}`` subtree."""
        class_values, counts = np.unique(frame[frame.columns[-1]],
                                         return_counts=True)
        if len(counts) == 1:
            return class_values[0]
        node = self._best_attribute(frame)
        if len(np.unique(frame[node])) == 1:
            # The best attribute cannot separate these rows (e.g. identical
            # feature values with conflicting labels). Splitting would yield
            # the same table again and recurse forever, so emit the majority
            # class instead (first class in sorted order on ties).
            return class_values[int(np.argmax(counts))]
        return {node: self._branches(frame, node)}

    def build(self, df, tree=None):
        """Build the ID3 decision tree for ``df``.

        Parameters
        ----------
        df : pandas.DataFrame whose last column is the class label and whose
            other columns are categorical attributes. It is not modified.
        tree : optional dict; when given, the root split is merged into it.

        Returns
        -------
        dict of the form ``{attribute: {value: class_label_or_subtree}}``.
        """
        # Work on a categorical copy so the caller's frame keeps its dtypes.
        # Subtables inherit the full category list, so value codes stay
        # consistent at every depth of the recursion.
        frame = df.copy()
        for col in frame.columns:
            frame[col] = frame[col].astype('category')

        node = self._best_attribute(frame)
        if tree is None:
            tree = {}
        # setdefault: a caller-supplied tree may not contain this attribute yet.
        tree.setdefault(node, {}).update(self._branches(frame, node))
        return tree
        
        
                    
        
        

In [97]:
# Fit the ID3 tree on the loaded frame; the nested-dict tree returned by
# build() is the cell's displayed output.
model = ID3()
model.build(df)

{'safety': {'high': {'n_persons': {'2': 'unacc',
    '4': {'price_buy': {'high': {'price_main': {'high': 'acc',
        'low': 'acc',
        'med': 'acc',
        'vhigh': 'unacc'}},
      'low': {'price_main': {'high': {'lug_boot': {'big': 'vgood',
          'med': {'n_doors': {'2': 'acc',
            '3': 'acc',
            '4': 'vgood',
            '5more': 'vgood'}},
          'small': 'acc'}},
        'low': {'lug_boot': {'big': 'vgood',
          'med': {'n_doors': {'2': 'good',
            '3': 'good',
            '4': 'vgood',
            '5more': 'vgood'}},
          'small': 'good'}},
        'med': {'lug_boot': {'big': 'vgood',
          'med': {'n_doors': {'2': 'good',
            '3': 'good',
            '4': 'vgood',
            '5more': 'vgood'}},
          'small': 'good'}},
        'vhigh': 'acc'}},
      'med': {'price_main': {'high': 'acc',
        'low': {'lug_boot': {'big': 'vgood',
          'med': {'n_doors': {'2': 'good',
            '3': 'good',
            '4