In [1]:
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np

In [2]:
# Load the dataset once and read both arrays from the same object instead of
# calling load_iris() a second time just to fetch the targets.
iris = load_iris()
X = iris.data    # feature matrix
y = iris.target  # class label per sample

In [3]:
# Wrap every label in its own one-element list so the labels form a column
# (one row per sample) that can be appended to the feature matrix.
someY = [[label] for label in y]

In [4]:
# Attach the label column as the last column of the feature matrix.
# NOTE: this rebinds X to the widened matrix, so running the cell again
# appends yet another label column.
X = np.concatenate((X, someY), axis=1)

In [5]:
def divideset(rows,column,value):
    """Split `rows` into two lists according to a test on one column.

    Parameters
    ----------
    rows : iterable of indexable rows
    column : index of the column to test
    value : split value. Numeric values (Python ints/floats and NumPy
        integer/floating scalars) produce the test ``row[column] >= value``;
        anything else produces ``row[column] == value``.

    Returns
    -------
    (set1, set2) : tuple of lists
        `set1` holds the rows that pass the test, `set2` the rows that do
        not; both keep the original row order.
    """
    # NumPy scalars such as np.int64 or np.float32 do not subclass the
    # built-in int/float, so they are listed explicitly; otherwise they would
    # silently be treated as categorical values.
    if isinstance(value,(int,float,np.integer,np.floating)):
        split_function=lambda row:row[column]>=value
    else:
        split_function=lambda row:row[column]==value

    # Single pass: evaluate the test once per row and route the row.
    set1=[]
    set2=[]
    for row in rows:
        if split_function(row):
            set1.append(row)
        else:
            set2.append(row)
    return (set1,set2)

In [6]:
def uniquecounts(rows):
    """Count how often each value occurs in the last column of `rows`.

    Returns a dict mapping each distinct last-column value to the number of
    rows carrying it, keyed in first-seen order.
    """
    tally={}
    for record in rows:
        label=record[-1]
        tally[label]=tally.get(label,0)+1
    return tally

In [7]:
def entropy(rows):
    """Shannon entropy, in bits, of the values in the last column of `rows`.

    The value counts come from `uniquecounts(rows)`; each count is turned
    into a probability by dividing by `len(rows)`.

    Returns 0.0 when `rows` is empty (there are no counts to sum over).
    """
    results = uniquecounts(rows)
    total = len(rows)
    ent = 0.0
    for count in results.values():
        prob = float(count) / total
        ent -= prob * np.log2(prob)
    return ent

In [8]:
# Sanity check: count the rows per value of the last column of X
# (the label column appended above).
uniquecounts(X)

{0.0: 50, 1.0: 50, 2.0: 50}

In [9]:
# Entropy (in bits) of the last column of X over the whole dataset.
entropy(X)

1.5849625007211561

In [10]:
class decisionNode:
    """One node of a binary decision tree.

    Attributes
    ----------
    col : index of the column tested at this node (-1 by default)
    value : value the column is compared against
    results : payload of a leaf node; None by default
    tb : child node for the "true" outcome of the test
    fb : child node for the "false" outcome of the test
    """

    def __init__(self, col=-1, value=None, results=None, tb=None, fb=None):
        # Split test and leaf payload.
        self.col, self.value, self.results = col, value, results
        # True / false children.
        self.tb, self.fb = tb, fb

In [11]:
def buildTree(rows, scoref=entropy):
    """Recursively grow a binary decision tree from `rows`.

    Every column except the last is a candidate split column; the last
    column is what `uniquecounts` tallies for the leaves.

    Parameters
    ----------
    rows : sequence of indexable rows
    scoref : callable taking a list of rows and returning an impurity score.
        It is used at every level of the recursion.

    Returns
    -------
    decisionNode
        An internal node (col, value, tb, fb) when some split yields a
        positive gain, otherwise a leaf whose `results` is
        `uniquecounts(rows)`. Empty input yields a default `decisionNode()`.
    """
    if len(rows) == 0:
        return decisionNode()

    curScore = scoref(rows)
    bestGain = 0.0
    bestCriteria = None
    bestSets = None

    columnCnt = len(rows[0]) - 1
    for col in range(columnCnt):
        # Distinct values of this column in first-seen order, so ties between
        # equally good splits are resolved the same way on every run.
        # Kept local: this is scratch data, not notebook-level state.
        columnValues = dict.fromkeys(row[col] for row in rows)
        for value in columnValues:
            (set1, set2) = divideset(rows, col, value)

            # A split that leaves one side empty can never be chosen, so
            # skip it before spending time scoring it.
            if len(set1) == 0 or len(set2) == 0:
                continue

            curProb = float(len(set1)) / len(rows)
            curGain = curScore - curProb * scoref(set1) - (1 - curProb) * scoref(set2)
            if curGain > bestGain:
                bestGain = curGain
                bestCriteria = (col, value)
                bestSets = (set1, set2)

    if bestGain > 0:
        # Pass scoref down so a custom scoring function applies to the whole
        # tree, not just to the root split.
        tBranch = buildTree(bestSets[0], scoref)
        fBranch = buildTree(bestSets[1], scoref)
        return decisionNode(col=bestCriteria[0], value=bestCriteria[1],
                            tb=tBranch, fb=fBranch)
    else:
        return decisionNode(results=uniquecounts(rows))

In [12]:
# Grow the tree on the full matrix with the default scoring function.
# buildTree tries every column except the last as a split column.
tree = buildTree(X)

In [13]:
def printtree(tree, indent=''):
    """Print a text rendering of `tree` to stdout.

    A node with `results` set is printed as that value. Any other node is
    printed as ``col:value? `` followed by its true (``T->``) and false
    (``F->``) subtrees, each nested two spaces deeper than `indent`.
    """
    if tree.results is not None:
        print(str(tree.results))
        return

    print(f"{tree.col!s}:{tree.value!s}? ")
    deeper = indent + '  '
    for marker, child in (('T->', tree.tb), ('F->', tree.fb)):
        print(indent + marker, end=" ")
        printtree(child, deeper)

In [14]:
# Show the fitted tree: each split prints as "column:value?" followed by its
# T-> (test passed) and F-> (test failed) branches.
printtree(tree)

2:3.0? 
T-> 3:1.8? 
  T-> 2:4.9? 
    T-> {2.0: 43}
    F-> 0:6.0? 
      T-> {2.0: 2}
      F-> {1.0: 1}
  F-> 2:5.0? 
    T-> 3:1.6? 
      T-> 0:7.2? 
        T-> {2.0: 1}
        F-> {1.0: 2}
      F-> {2.0: 3}
    F-> 3:1.7? 
      T-> {2.0: 1}
      F-> {1.0: 47}
F-> {0.0: 50}


In [15]:
def classify(observation,tree):
    """Route `observation` down `tree` and return the leaf it reaches.

    Parameters
    ----------
    observation : indexable sequence of feature values
    tree : root node exposing `results`, `col`, `value`, `tb` and `fb`

    Returns
    -------
    The `results` attribute of the first node on the path whose `results`
    is not None.

    At each split, a numeric observation value (Python int/float or NumPy
    integer/floating scalar) takes the true branch when it is >= the node's
    value; any other value takes the true branch when it equals the node's
    value.
    """
    node=tree
    while node.results is None:
        v=observation[node.col]
        # NumPy scalars such as np.int64 / np.float32 are not instances of
        # the built-in int/float, so name them explicitly; otherwise they
        # would be compared with == instead of >=.
        if isinstance(v,(int,float,np.integer,np.floating)):
            passed=v>=node.value
        else:
            passed=v==node.value
        node=node.tb if passed else node.fb
    return node.results

In [16]:
# Classify a hand-written observation of four feature values. The result is
# the `results` value stored at the leaf the observation ends up in.
classify([5.1, 0, 8, 0], tree)

{2.0: 3}