## 信息增益计算

In [1]:
from math import log
from collections import Counter
from functools import reduce 

In [2]:
# One table per attribute: each attribute value maps to the list of binary
# class labels of the samples that take that value.
# NOTE(review): 1/0 presumably mark the positive/negative class (e.g. play /
# don't play) -- confirm against the data source.
outlook = dict(
    sunny=[1] * 2 + [0] * 3,
    overcast=[1] * 4,
    rainy=[1] * 3 + [0] * 2,
)
humidity = dict(
    high=[1] * 3 + [0] * 4,
    normal=[1] * 6 + [0] * 1,
)
windy = dict(
    windy_false=[1] * 6 + [0] * 2,
    windy_true=[1] * 3 + [0] * 3,
)
temperature = dict(
    hot=[1] * 2 + [0] * 2,
    mild=[1] * 4 + [0] * 2,
    cool=[1] * 3 + [0] * 1,
)

In [3]:
def calcShannonEnt(dataSet):
    """Return the Shannon entropy, in bits, of the labels in ``dataSet``.

    Args:
        dataSet: Sized collection of hashable class labels (it is passed to
            both ``len`` and ``Counter``).

    Returns:
        float: ``-sum(p * log2(p))`` over the relative frequency ``p`` of
        each distinct label; ``0.0`` when ``dataSet`` is empty.
    """
    total = len(dataSet)
    label_counts = Counter(dataSet)
    entropy = 0.0
    # Accumulate term by term, in first-seen label order.
    for occurrences in label_counts.values():
        fraction = float(occurrences) / total
        entropy -= fraction * log(fraction, 2)
    return entropy

In [4]:
def calcInfoGain(dataSet):
    """Return the information gain, in bits, of splitting on one attribute.

    Args:
        dataSet: Mapping from each attribute value to the sized collection of
            class labels of the samples taking that value, e.g.
            ``{'high': [1, 0, 0], 'normal': [1, 1]}``.

    Returns:
        float: Entropy of all labels pooled together minus the size-weighted
        average entropy of the per-value label groups. ``0.0`` when
        ``dataSet`` is empty or every group is empty.
    """
    subsets = list(dataSet.values())
    numEntries = sum(len(subset) for subset in subsets)
    if numEntries == 0:
        # No samples at all: avoid dividing by zero below and report zero
        # gain, mirroring calcShannonEnt's 0.0 for an empty label list.
        return 0.0

    # Flatten in a single pass; keeps the original label order while avoiding
    # repeated pairwise concatenation (which also fails on an empty mapping
    # and on mixed list/tuple groups).
    allLabels = [label for subset in subsets for label in subset]
    baseEntropy = calcShannonEnt(allLabels)

    newEntropy = 0.0
    for subset in subsets:
        # Weight each group's entropy by its share of the samples.
        prob = len(subset) / float(numEntries)
        newEntropy += prob * calcShannonEnt(subset)
    return baseEntropy - newEntropy

In [5]:
# Report the information gain of each attribute, one line per attribute.
for caption, table in (
    ('outlook 信息增益：', outlook),
    ('humidity 信息增益：', humidity),
    ('windy 信息增益：', windy),
    ('temperature 信息增益：', temperature),
):
    print(caption, calcInfoGain(table))

outlook 信息增益： 0.2467498197744391
humidity 信息增益： 0.15183550136234136
windy 信息增益： 0.04812703040826927
temperature 信息增益： 0.029222565658954647


In [6]:
# The outlook attribute has the largest information gain (about 0.247) of the four attributes.