# In [2]:
import operator
from math import log
from collections import Counter

def calcShannonEnt(dataSet):
    """Return the Shannon entropy (base 2) of the class labels in dataSet.

    The last element of every row is treated as that row's class label.

    Args:
        dataSet: sequence of rows; each row must support ``row[-1]``.

    Returns:
        The entropy as a float. An empty dataSet yields 0.0.
    """
    total = len(dataSet)

    # Tally how often each class label occurs.
    tallies = Counter(row[-1] for row in dataSet)

    entropy = 0.0
    for count in tallies.values():
        p = float(count) / total  # relative frequency of this label
        entropy -= p * log(p, 2)
    return entropy

def splitDataSet(dataSet, index, value):
    """Select the rows whose column ``index`` equals ``value``, dropping that column.

    Args:
        dataSet: sequence of list rows.
        index: position of the column to test and remove.
        value: value the column must equal for a row to be kept.

    Returns:
        A new list of new row lists; the input rows are not modified.
    """
    matching = (row for row in dataSet if row[index] == value)

    result = []
    for row in matching:
        trimmed = row[:index]            # columns before ``index``
        trimmed.extend(row[index + 1:])  # columns after ``index``
        result.append(trimmed)
    return result

def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature column with the largest information gain.

    Every column except the last (the class label) is a candidate. For each
    candidate the data set is partitioned by the column's distinct values and
    the size-weighted entropy of the partitions is subtracted from the entropy
    of the whole set.

    Args:
        dataSet: non-empty sequence of list rows whose last element is the label.

    Returns:
        The winning column index, or -1 when no column has a gain strictly
        greater than 0.
    """
    featureCount = len(dataSet[0]) - 1
    startEntropy = calcShannonEnt(dataSet)
    rowCount = float(len(dataSet))

    topGain = 0.0
    topIndex = -1
    for col in range(featureCount):
        distinct = set([row[col] for row in dataSet])
        conditional = 0.0  # weighted entropy after splitting on ``col``
        for val in distinct:
            part = splitDataSet(dataSet, col, val)
            weight = len(part) / rowCount
            conditional += weight * calcShannonEnt(part)
        gain = startEntropy - conditional
        # Strict comparison: the earliest column wins ties.
        if gain > topGain:
            topGain, topIndex = gain, col
    return topIndex

def majorityCnt(classList):
    """Return the class label that occurs most often in classList.

    Fixes two defects in the previous version: the membership test was
    inverted (so the first vote raised KeyError), and ``dict.iteritems`` does
    not exist on Python 3.

    Args:
        classList: iterable of hashable class labels.

    Returns:
        The most frequent label; on a tie, the label encountered first.

    Raises:
        IndexError: if classList is empty.
    """
    # most_common(1) returns [(label, count)] for the top-ranked label.
    return Counter(classList).most_common(1)[0][0]

def createTree(dataSet, labels):
    """Recursively build a decision tree as nested dicts.

    An internal node has the form ``{featureLabel: {featureValue: subtree}}``;
    a leaf is a bare class label.

    Unlike the previous version, ``labels`` is not modified, so the caller can
    keep using the same list afterwards (for example with ``classify``).

    Args:
        dataSet: non-empty sequence of list rows whose last element is the label.
        labels: names of the feature columns, in column order.

    Returns:
        A nested dict, or a single class label when no split is made.

    Raises:
        IndexError: if dataSet is empty.
    """
    classList = [example[-1] for example in dataSet]
    # Every row has the same class: nothing left to split.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Only the label column remains: fall back to a majority vote.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)

    bestFeat = chooseBestFeatureToSplit(dataSet)
    if bestFeat == -1:
        # No feature gives positive information gain. Indexing with -1 would
        # pick the class-label column, so settle for a majority vote instead.
        return majorityCnt(classList)

    bestFeatLabel = labels[bestFeat]
    # Build a new list rather than deleting in place, so the caller's
    # ``labels`` stays intact.
    subLabels = labels[:bestFeat] + labels[bestFeat + 1:]

    myTree = {bestFeatLabel: {}}
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), subLabels)
    return myTree

def classify(inputTree, featLabels, testVec):
    """Walk a decision tree and return the class label for testVec.

    The previous version used ``inputTree.keys()[0]``, which raises TypeError
    on Python 3 because dict views are not subscriptable. It also failed when
    the tree was a bare leaf label rather than a dict.

    Args:
        inputTree: nested dict of the form ``{featureLabel: {value: subtree}}``,
            or a bare class label.
        featLabels: feature names, in the column order used by testVec.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that was reached.

    Raises:
        ValueError: if a feature name in the tree is missing from featLabels.
        KeyError: if testVec holds a feature value the tree has no branch for.
    """
    node = inputTree
    while isinstance(node, dict):
        featName = next(iter(node))  # the single key of an internal node
        branches = node[featName]
        featIndex = featLabels.index(featName)
        node = branches[testVec[featIndex]]
    return node
# Load the tab-separated lenses data set and build a decision tree from it.
# The context manager closes the file even if reading fails; blank lines are
# skipped so they do not turn into malformed one-field rows.
with open('lenses.txt') as fr:
    lenses = [inst.strip().split('\t') for inst in fr if inst.strip()]
lensesLabels = ['age','prescript','astigmatic','tearRate']
lensesTree = createTree(lenses,lensesLabels)
print(lensesTree)


# Output: {'tearRate': {'normal': {'astigmatic': {'no': {'age': {'pre': 'soft', 'presbyopic': {'prescript': {'myope': 'no lenses', 'hyper': 'soft'}}, 'young': 'soft'}}, 'yes': {'prescript': {'myope': 'hard', 'hyper': {'age': {'pre': 'no lenses', 'presbyopic': 'no lenses', 'young': 'hard'}}}}}}, 'reduced': 'no lenses'}}
