In [157]:
# !/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Project: decide whether an animal is a fish or not a fish (decision tree)
"""
from math import log
import operator

In [158]:
def createDataSet():
    """Build the toy fish / non-fish data set used by the rest of the notebook.

    Returns:
        tuple: ``(dataSet, labels)``. ``dataSet`` is a list of five rows of
        the form ``[feature_0, feature_1, class_label]``; ``labels`` holds the
        names of the two feature columns, in column order.
    """
    feature_names = ['no surfacing', 'flippers']
    samples = [
        [1, 1, 'yes'],
        [1, 1, 'yes'],
        [1, 0, 'no'],
        [0, 1, 'no'],
        [0, 1, 'no'],
    ]
    return samples, feature_names

In [159]:
# Build the toy data set and show both parts as the cell output.
dataSet, labels = createDataSet()
dataSet, labels

([[1, 1, 'yes'], [1, 1, 'yes'], [1, 0, 'no'], [0, 1, 'no'], [0, 1, 'no']],
 ['no surfacing', 'flippers'])

In [160]:
def calcShonnonEntropy(dataSet):
    """Return the Shannon entropy, in bits, of the class labels in ``dataSet``.

    The class label of a row is its last element.

    Args:
        dataSet: sequence of rows; only ``row[-1]`` of each row is read.

    Returns:
        float: ``-sum(p * log2(p))`` over the relative label frequencies.
        An empty ``dataSet`` yields ``0.0``.
    """
    total = len(dataSet)
    # Count how many rows carry each class label (insertion-ordered).
    tally = {}
    for row in dataSet:
        label = row[-1]
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1
    entropy = 0.0
    for count in tally.values():
        fraction = int(count) / float(total)
        entropy -= fraction * log(fraction, 2)
    return entropy

In [161]:
# Entropy of the class labels over the whole toy data set.
calcShonnonEntropy(dataSet)

0.9709505944546686

In [162]:
def splitDataSet(dataSet, index, value):
    """Select the rows whose column ``index`` equals ``value`` and drop that column.

    Args:
        dataSet: sequence of list rows.
        index: position of the column to test and remove.
        value: value the column must equal for a row to be kept.

    Returns:
        list: new rows (copies) without column ``index``, in the original
        order. The input rows are not modified.
    """
    matching_rows = (row for row in dataSet if row[index] == value)
    subset = []
    for row in matching_rows:
        # Everything before the column, then everything after it.
        remaining = row[:index]
        remaining.extend(row[index + 1:])
        subset.append(remaining)
    return subset
# Rows whose column 0 equals 1, returned with column 0 removed.
splitDataSet(dataSet, 0, 1)

[[1, 'yes'], [1, 'yes'], [0, 'no']]

In [163]:
def chooseBestFeatureTosplit_Gain(dataSet):
    """Pick the feature column with the largest information gain.

    For every feature column the data set is partitioned by the column's
    distinct values; the conditional entropy is the sum of each partition's
    entropy weighted by the partition's share of the whole data set.

    Args:
        dataSet: non-empty list of rows; the last item of each row is the
            class label, all preceding items are feature values.

    Returns:
        int: index of the feature with the highest strictly positive
        information gain, or ``-1`` if no feature has positive gain.

    Raises:
        IndexError: if ``dataSet`` is empty.

    Side effects:
        Prints the entropy of ``dataSet``.

    NOTE(review): a later cell defines a function with this same name, which
    replaces this definition once that cell is run.
    """
    ent = calcShonnonEntropy(dataSet)
    print(ent)
    total_rows = len(dataSet)
    baseinfoGain = 0.0
    bestFeat = -1
    num_feats = len(dataSet[0]) - 1
    for i in range(num_feats):
        uniqueFeats = set(example[i] for example in dataSet)
        newEntropy = 0.0
        for value in uniqueFeats:
            subsplitdata = splitDataSet(dataSet, i, value)
            # Weight by the subset's share of the WHOLE data set; dividing the
            # subset size by itself would make every weight equal to 1.
            prob = len(subsplitdata) / total_rows
            newEntropy += prob * calcShonnonEntropy(subsplitdata)
        infoGain = ent - newEntropy
        if infoGain > baseinfoGain:
            baseinfoGain = infoGain
            bestFeat = i
    return bestFeat
# Quick check on the toy data set; the function also prints the base entropy.
chooseBestFeatureTosplit_Gain(dataSet)

0.9709505944546686


0

In [164]:
def chooseBestFeatureTosplit_Gain(dataSet):
    """Return the index of the feature whose split gives the most information gain.

    Args:
        dataSet: non-empty list of rows; the last item of each row is the
            class label, the items before it are feature values.

    Returns:
        int: index of the best feature column, or ``-1`` when no feature
        achieves a strictly positive gain.

    Side effects:
        Prints the entropy of ``dataSet``.
    """
    feature_count = len(dataSet[0]) - 1
    base_entropy = calcShonnonEntropy(dataSet)
    print(base_entropy)
    top_gain, top_index = 0.0, -1
    for column in range(feature_count):
        distinct_values = set([row[column] for row in dataSet])
        # Entropy remaining after splitting on this column: each partition's
        # entropy weighted by its share of the rows.
        conditional_entropy = 0
        for candidate in distinct_values:
            partition = splitDataSet(dataSet, column, candidate)
            weight = len(partition) / len(dataSet)
            conditional_entropy += weight * calcShonnonEntropy(partition)
        gain = base_entropy - conditional_entropy
        if not gain > top_gain:
            continue
        top_gain, top_index = gain, column
    return top_index
# Quick check on the toy data set; the function also prints the base entropy.
chooseBestFeatureTosplit_Gain(dataSet)

0.9709505944546686


0

In [165]:
def chooseBestFeatureTosplit_GainRatio(dataSet):
    """Pick the feature column with the largest gain ratio.

    The gain ratio of a feature is its information gain divided by its
    intrinsic value ``iv`` (the entropy of the partition sizes produced by
    splitting on that feature).

    Args:
        dataSet: non-empty list of rows; the last item of each row is the
            class label, all preceding items are feature values.

    Returns:
        int: index of the feature with the highest strictly positive gain
        ratio, or ``-1`` if no feature qualifies.

    Raises:
        IndexError: if ``dataSet`` is empty.

    Side effects:
        Prints the entropy of ``dataSet``.
    """
    num_feats = len(dataSet[0]) - 1
    ent = calcShonnonEntropy(dataSet)
    print(ent)
    total_rows = len(dataSet)
    baseinfoGainRatio = 0.0
    bestFeat = -1
    for i in range(num_feats):
        uniqueFeat = set(example[i] for example in dataSet)
        newEntropy = 0.0
        iv = 0.0
        for n in uniqueFeat:
            subDataSet = splitDataSet(dataSet, i, n)
            prob = len(subDataSet) / total_rows
            iv -= prob * log(prob, 2)
            newEntropy += prob * calcShonnonEntropy(subDataSet)
        if iv == 0:
            # The feature takes a single value in this data set: it cannot
            # separate any rows, and the ratio below would divide by zero.
            continue
        infoGain = ent - newEntropy
        GainRatio = infoGain / iv
        if GainRatio > baseinfoGainRatio:
            baseinfoGainRatio = GainRatio
            bestFeat = i
    return bestFeat
# Quick check on the toy data set; the function also prints the base entropy.
chooseBestFeatureTosplit_GainRatio(dataSet)

0.9709505944546686


0

#### 创建树

![image.png](attachment:image.png)

In [166]:
def majorityCnt(classList):
    """Return the class label that occurs most often in ``classList``.

    Args:
        classList: non-empty sequence of hashable class labels.

    Returns:
        The most frequent label. On a tie, the tied label that appears first
        in ``classList`` wins (the sort is stable).

    Raises:
        IndexError: if ``classList`` is empty.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Sort the (label, count) pairs by count. The key must be an itemgetter
    # *instance* for index 1; passing the itemgetter class itself makes
    # sorted() compare itemgetter objects and fail.
    sortedclassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedclassCount[0][0]

In [167]:
def creatBranch(dataSet, labels):
    """Recursively build a decision tree as nested dictionaries.

    Args:
        dataSet: non-empty list of rows; the last item of each row is the
            class label, all preceding items are feature values.
        labels: list of feature names, one per feature column of ``dataSet``.
            The list is not modified.

    Returns:
        Either a class label (leaf) or a dict of the form
        ``{feature_name: {feature_value: subtree_or_label, ...}}``.

    Raises:
        IndexError: if ``dataSet`` is empty.

    Side effects:
        Prints the index chosen at each internal node (and whatever
        ``chooseBestFeatureTosplit_Gain`` prints).
    """
    classList = [example[-1] for example in dataSet]
    # Stop: every row carries the same class label.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop: all features are used up but the classes are still mixed.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Choose the column to split on.
    bestFeat = chooseBestFeatureTosplit_Gain(dataSet)
    print(bestFeat)
    if bestFeat < 0:
        # No feature gives a positive gain. Using -1 as an index would pick
        # the class-label column, so fall back to a majority-vote leaf.
        return majorityCnt(classList)
    # Name of the chosen column becomes the key of this node.
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    # Feature names for the sub-problems, built without touching the
    # caller's list.
    remainingLabels = labels[:bestFeat] + labels[bestFeat + 1:]
    # One branch per distinct value of the chosen column.
    uniqueVals = set(example[bestFeat] for example in dataSet)
    for value in uniqueVals:
        sublabels = remainingLabels[:]
        myTree[bestFeatLabel][value] = creatBranch(splitDataSet(dataSet, bestFeat, value), sublabels)
    return myTree
# Build the tree from the toy data set and show its top-level key.
inputTree = creatBranch(dataSet, labels)
inputTree.keys()

0.9709505944546686
0
0.9182958340544896
0


dict_keys(['no surfacing'])

#### 使用决策树

In [176]:
def classify(inputTree, featLabels, testVec):
    """Walk a decision tree and return the class label for ``testVec``.

    Args:
        inputTree: decision tree as nested dicts,
            ``{feature_name: {feature_value: subtree_or_label}}``.
        featLabels: list of feature names; a name's position gives the
            position of that feature's value in ``testVec``.
        testVec: feature values of the sample to classify.

    Returns:
        The class label stored at the leaf that ``testVec`` reaches.

    Side effects:
        At every visited node, prints the node's feature name, its branch
        dict, and ``featLabels``.
    """
    node = inputTree
    while True:
        # The single key of a node dict is the feature it tests.
        root_name = list(node.keys())[0]
        print(root_name)
        # Its value maps each feature value to a subtree or a leaf label.
        branches = node[root_name]
        print(branches)
        # Locate the tested feature's position in the sample.
        print(featLabels)
        position = featLabels.index(root_name)
        outcome = branches[testVec[position]]
        # A non-dict value is a leaf: the walk ends here.
        if not isinstance(outcome, dict):
            return outcome
        node = outcome

In [177]:
# Classify a sample whose two features are both 1; the list gives the feature order of the sample.
classify(inputTree, ['no surfacing', 'flippers'], [1, 1])

no surfacing
{0: 'no', 1: {'flippers': {0: 'no', 1: 'yes'}}}
['no surfacing', 'flippers']
flippers
{0: 'no', 1: 'yes'}
['no surfacing', 'flippers']


'yes'

TypeError: 'builtin_function_or_method' object is not subscriptable