# 模块载入

In [1]:
import numpy as np 
import pandas as pd

# 1. 创建数据

In [2]:
def createData():
    """Build the toy loan-application data set used by this notebook.

    Returns:
        pandas.DataFrame: 15 rows with the feature columns ``age``, ``work``,
        ``house`` and ``credit`` followed by the target column ``class``.
    """
    header = ['age', 'work', 'house', 'credit', 'class']
    # One tuple per sample, fields in the same order as ``header``.
    records = [
        ('young', 'no', 'no', 'normal', 'no'),
        ('young', 'no', 'no', 'good', 'no'),
        ('young', 'yes', 'no', 'good', 'yes'),
        ('young', 'yes', 'yes', 'normal', 'yes'),
        ('young', 'no', 'no', 'normal', 'no'),
        ('mid', 'no', 'no', 'normal', 'no'),
        ('mid', 'no', 'no', 'good', 'no'),
        ('mid', 'yes', 'yes', 'good', 'yes'),
        ('mid', 'no', 'yes', 'verygood', 'yes'),
        ('mid', 'no', 'yes', 'verygood', 'yes'),
        ('old', 'no', 'yes', 'verygood', 'yes'),
        ('old', 'no', 'yes', 'good', 'yes'),
        ('old', 'yes', 'no', 'good', 'yes'),
        ('old', 'yes', 'no', 'verygood', 'yes'),
        ('old', 'no', 'no', 'normal', 'no'),
    ]
    return pd.DataFrame(records, columns=header)

In [3]:
# Build the sample data set once; the later cells in this notebook reuse `data`.
data = createData()
# Bare last expression so the notebook displays the frame as the cell output.
data

Unnamed: 0,age,work,house,credit,class
0,young,no,no,normal,no
1,young,no,no,good,no
2,young,yes,no,good,yes
3,young,yes,yes,normal,yes
4,young,no,no,normal,no
5,mid,no,no,normal,no
6,mid,no,no,good,no
7,mid,yes,yes,good,yes
8,mid,no,yes,verygood,yes
9,mid,no,yes,verygood,yes


# 2. 计算信息熵与条件熵（用于求信息增益）

In [4]:
def _entropyFromCounts(counts):
    """Shannon entropy (in bits) of the distribution given by raw counts.

    Zero counts are dropped first so that the ``0 * log2(0)`` term contributes
    0 (the usual convention) instead of turning the whole result into NaN.
    """
    counts = np.asarray(counts, dtype=float)
    counts = counts[counts > 0]
    prob = counts / np.sum(counts)
    return -np.sum(prob * np.log2(prob))


def calEntropy(dataset, col=-1):
    """Entropy of the label column, optionally conditioned on one feature.

    The label is always the LAST column of ``dataset``.

    Args:
        dataset: pandas DataFrame whose last column holds the class label.
        col: ``-1`` (default) returns the entropy of the label column itself.
            Any other value is the positional index of a feature column and
            the function returns the conditional entropy of the label given
            that feature (per-value label entropy weighted by value frequency).

    Returns:
        The entropy in bits. ``calEntropy(d) - calEntropy(d, col=i)`` is the
        information gain of feature ``i``.
    """
    if col == -1:
        return _entropyFromCounts(dataset.iloc[:, -1].value_counts())

    featureName = dataset.columns[col]
    labelName = dataset.columns[-1]
    # Count of every (feature value, label) pair; indexed by a 2-level MultiIndex.
    pairCounts = dataset.iloc[:, [col, -1]].groupby([featureName, labelName]).size()

    entropy = 0
    for value in dataset.iloc[:, col].unique():
        # Label counts among the rows where the feature equals ``value``.
        # Categorical groupers may report unobserved pairs with a count of 0;
        # the helper ignores those.
        counts = np.asarray(pairCounts.loc[value], dtype=float)
        weight = np.sum(counts) / len(dataset)
        entropy += _entropyFromCounts(counts) * weight
    return entropy

In [5]:
# Information gain of each feature: H(class) - H(class | feature).
# print(...) with a single argument gives the same output on Python 2 and
# Python 3, whereas the bare `print expr` statement is a SyntaxError on 3.
print(calEntropy(data, col=-1) - calEntropy(data, col=0))
print(calEntropy(data, col=-1) - calEntropy(data, col=1))
print(calEntropy(data, col=-1) - calEntropy(data, col=2))
print(calEntropy(data, col=-1) - calEntropy(data, col=3))

0.08300749985576883
0.32365019815155627
0.4199730940219749
0.36298956253708536


# 3. 根据信息增益选择数据集的最优特征

In [6]:
def bestFeature(dataset):
    """Return the name of the feature column with the largest information gain.

    Every column except the last one is treated as a candidate feature; the
    last column is the class label. Ties are resolved in favour of the
    right-most feature (the comparison is ``>=``).

    Args:
        dataset: pandas DataFrame with at least one feature column followed by
            the label column.

    Returns:
        The column name of the best feature.

    Raises:
        ValueError: if ``dataset`` has no feature columns.
    """
    featureCount = dataset.shape[1] - 1
    if featureCount < 1:
        raise ValueError("bestFeature() needs at least one feature column before the label column")

    # The label entropy does not depend on the feature, so compute it once.
    baseEntropy = calEntropy(dataset, col=-1)

    bestIndex = 0
    # Start from -inf rather than 0: rounding can make every gain a tiny
    # negative number, and a 0 floor would then leave the index at -1, i.e.
    # select the label column itself.
    bestIG = float('-inf')
    for i in range(featureCount):
        IG = baseEntropy - calEntropy(dataset, col=i)
        if IG >= bestIG:
            bestIG = IG
            bestIndex = i
    return dataset.columns[bestIndex]


# Best split feature among the samples whose `house` value is 'no'.
bestFeature(data.loc[data['house'] == 'no', :])


'work'

# 4. 根据某一列对数据进行分割

In [7]:
def dataSplit(dataset, col):
    """Partition ``dataset`` by the distinct values of column ``col``.

    Args:
        dataset: pandas DataFrame to split (left unmodified).
        col: label of the column to split on.

    Returns:
        A pair ``(parts, values)``: ``values`` holds the distinct values of
        ``col`` in order of first appearance, and ``parts[k]`` is the sub-frame
        of rows where ``col == values[k]``, with ``col`` removed and the row
        index renumbered from 0.
    """
    splitColumn = dataset.loc[:, col]
    distinctValues = splitColumn.unique()
    parts = [
        dataset.loc[splitColumn == value, :]
        .drop([col], axis=1)          # the split column is constant in each part
        .reset_index(drop=True)
        for value in distinctValues
    ]
    return parts, distinctValues
# Example: one sub-frame per distinct `age` value, plus the values themselves.
dataSplit(data, col='age')

([  work house  credit class
  0   no    no  normal    no
  1   no    no    good    no
  2  yes    no    good   yes
  3  yes   yes  normal   yes
  4   no    no  normal    no,   work house    credit class
  0   no    no    normal    no
  1   no    no      good    no
  2  yes   yes      good   yes
  3   no   yes  verygood   yes
  4   no   yes  verygood   yes,   work house    credit class
  0   no   yes  verygood   yes
  1   no   yes      good   yes
  2  yes    no      good   yes
  3  yes    no  verygood   yes
  4   no    no    normal    no], array(['young', 'mid', 'old'], dtype=object))

# 5. 递归构建决策树

In [8]:
def decisionTree(dataset, minGain=None):
    """Recursively build a decision tree as nested dictionaries.

    The last column of ``dataset`` is the class label; every other column is a
    candidate feature. An internal node has the form
    ``{featureName: {featureValue: subtree_or_label, ...}}`` and a leaf is a
    plain class label.

    Args:
        dataset: pandas DataFrame with at least one row.
        minGain: optional pre-pruning threshold. When given, a node whose best
            information gain is below ``minGain`` becomes a leaf holding the
            majority label. ``None`` (default) disables the check.

    Returns:
        A nested dict describing the tree, or a single label for a leaf.

    Raises:
        ValueError: if ``dataset`` has no rows.
    """
    labels = dataset.iloc[:, -1]
    if len(labels) == 0:
        raise ValueError("decisionTree() requires at least one sample")

    # Pure node: every sample carries the same label. Read row 0 (not row 1)
    # so that a single-row subset does not raise IndexError.
    if labels.nunique() <= 1:
        return labels.iloc[0]

    # No features left to split on: fall back to the majority label.
    # idxmax() returns the label itself; argmax() returns a position on
    # current pandas versions.
    if dataset.shape[1] == 1:
        return labels.value_counts().idxmax()

    bestfeature = bestFeature(dataset)

    if minGain is not None:
        featureIndex = dataset.columns.get_loc(bestfeature)
        gain = calEntropy(dataset) - calEntropy(dataset, col=featureIndex)
        if gain < minGain:
            return labels.value_counts().idxmax()

    subData, colValue = dataSplit(dataset, bestfeature)
    branches = {}
    for value, subset in zip(colValue, subData):
        branches[value] = decisionTree(subset, minGain)
    return {bestfeature: branches}


In [9]:
# Build the tree for the full sample data set and display it.
decisionTree(data)

{'house': {'no': {'work': {'no': 'no', 'yes': 'yes'}}, 'yes': 'yes'}}