In [1]:
import urllib.request
import pandas as pd
import numpy as np
from io import StringIO
from collections import Counter
from collections import OrderedDict

In [2]:
# Inline CSV text of the watermelon dataset: a header row followed by 17 samples.
# Columns: 编号 (sample number), six categorical attributes (色泽, 根蒂, 敲声,
# 纹理, 脐部, 触感), two numeric attributes (密度, 含糖率) and the class label
# (好瓜) in the last column. The literal starts with a blank line because the
# text begins right after the opening quotes.
data = \
'''
编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,好瓜
2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,好瓜
3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,好瓜
4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,好瓜
5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,好瓜
6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,好瓜
7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,好瓜
8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,好瓜
9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,坏瓜
10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,坏瓜
11,浅白,硬挺,清脆,模糊,平坦,硬滑,0.245,0.057,坏瓜
12,浅白,蜷缩,浊响,模糊,平坦,软粘,0.343,0.099,坏瓜
13,青绿,稍蜷,浊响,稍糊,凹陷,硬滑,0.639,0.161,坏瓜
14,浅白,稍蜷,沉闷,稍糊,凹陷,硬滑,0.657,0.198,坏瓜
15,乌黑,稍蜷,浊响,清晰,稍凹,软粘,0.36,0.37,坏瓜
16,浅白,蜷缩,浊响,模糊,平坦,硬滑,0.593,0.042,坏瓜
17,青绿,蜷缩,沉闷,稍糊,稍凹,硬滑,0.719,0.103,坏瓜
'''

In [3]:
# Parse the inline CSV text into a DataFrame. The name `data` is rebound from
# the raw string to the parsed table, so the guard keeps this cell re-runnable:
# on a second execution `data` is already a DataFrame and StringIO(data) would
# raise a TypeError.
if isinstance(data, str):
    data = pd.read_csv(StringIO(data), sep=',')

In [4]:
# Show (rows, columns) of the parsed table. `DataFrame.get_values()` was
# deprecated in pandas 0.25 and removed in 1.0; `.shape` yields the same tuple
# without materialising an array.
print(data.shape)

(17, 10)


In [5]:
# Display the parsed DataFrame as the cell's output.
data

Unnamed: 0,编号,色泽,根蒂,敲声,纹理,脐部,触感,密度,含糖率,好瓜
0,1,青绿,蜷缩,浊响,清晰,凹陷,硬滑,0.697,0.46,好瓜
1,2,乌黑,蜷缩,沉闷,清晰,凹陷,硬滑,0.774,0.376,好瓜
2,3,乌黑,蜷缩,浊响,清晰,凹陷,硬滑,0.634,0.264,好瓜
3,4,青绿,蜷缩,沉闷,清晰,凹陷,硬滑,0.608,0.318,好瓜
4,5,浅白,蜷缩,浊响,清晰,凹陷,硬滑,0.556,0.215,好瓜
5,6,青绿,稍蜷,浊响,清晰,稍凹,软粘,0.403,0.237,好瓜
6,7,乌黑,稍蜷,浊响,稍糊,稍凹,软粘,0.481,0.149,好瓜
7,8,乌黑,稍蜷,浊响,清晰,稍凹,硬滑,0.437,0.211,好瓜
8,9,乌黑,稍蜷,沉闷,稍糊,稍凹,硬滑,0.666,0.091,坏瓜
9,10,青绿,硬挺,清脆,清晰,平坦,软粘,0.243,0.267,坏瓜


In [6]:
# Sanity checks: the table must contain no repeated rows and no missing cells.
assert not data.duplicated().any()
assert not data.isna().values.any()

In [7]:
# discrete
# Work on a plain array view of the table: the last column holds the labels,
# and the feature matrix skips the first column and the last three columns
# (for the table parsed above: the sample number, the two numeric attributes
# and the label), leaving only the categorical attributes.
table = np.array(data)
y = table[:, -1]
x = table[:, 1:-3]

# Ordered map: attribute name -> set of values observed for it, in the same
# order as the columns of `x`.
a = OrderedDict((name, set(data[name])) for name in data.columns[1:-3])

In [9]:
def entropy(labels):
    """Return the base-2 Shannon entropy of the class labels in `labels`.

    `labels` is expected to be a 1-D NumPy array: it is compared element-wise
    against each distinct value, and `labels.shape[0]` is the sample count.
    An input with no distinct values (e.g. an empty array) yields 0.
    """
    result = 0
    for value in set(labels):
        matches = labels == value
        share = np.sum(matches) / labels.shape[0]
        # A zero share contributes nothing; skipping it also avoids log2(0).
        if share != 0:
            result -= share * np.log2(share)

    return result
        

In [10]:
def gain(x, y, att):
    """Return the information gain of splitting the samples on column `att`.

    Parameters:
        x: 2-D array of attribute values, one row per sample.
        y: 1-D array of labels aligned with the rows of `x`.
        att: index of the column of `x` to split on.

    The result is entropy(y) minus the entropy of each subset induced by a
    distinct value of the column, weighted by that subset's share of the rows.
    """
    column = x[:, att]
    remaining = entropy(y)
    for value in set(column):
        mask = column == value
        remaining -= entropy(y[mask]) * np.sum(mask) / column.shape[0]
    return remaining

In [11]:
def fbestIndex(x, y, a):
    """Return the index of the column of `x` with the highest information gain.

    One gain is computed per entry of `a`, i.e. for column indices
    0 .. len(a) - 1. `np.argmax` returns the first index on ties.
    """
    gains = [gain(x, y, att=column) for column in range(len(a))]
    return np.argmax(gains)

In [12]:
def _majority_label(y):
    """Return the most frequent label in the label array `y`."""
    return Counter(y.tolist()).most_common(1)[0][0]


def generateTree(x,y,a):
    """Recursively build a decision tree by splitting on the highest-gain attribute.

    Parameters:
        x: 2-D array of attribute values, one row per sample, with one column
           per entry of `a` in the same order.
        y: 1-D array of labels aligned with the rows of `x`; must be non-empty.
        a: ordered mapping of attribute name -> set of all values that
           attribute can take.

    Returns:
        A leaf, which is a bare label, or a dict mapping every value of the
        chosen attribute to a subtree. The name of the attribute chosen at a
        node is not stored in the tree.
    """
    # Every sample carries the same label: pure leaf.
    if len(set(y)) == 1:
        return set(y).pop()

    # No attribute left to split on: fall back to the majority label. The leaf
    # is a bare label, like every other leaf, so consumers never mistake it for
    # a subtree; stopping only when `a` is empty also lets the last remaining
    # attribute be used for a split and avoids calling argmax on no gains.
    if len(a) == 0:
        return _majority_label(y)

    bestIndex = fbestIndex(x, y, a)
    bestName = list(a)[bestIndex]

    # Attributes still available below this node (the input mapping is untouched).
    remaining = a.copy()
    remaining.pop(bestName)

    tree = {}
    for bestAttr in a[bestName]:
        mask = x[:, bestIndex] == bestAttr

        if not mask.any():
            # No sample has this value here: predict the parent's majority label.
            tree[bestAttr] = _majority_label(y)
        else:
            # Keep the matching rows and drop the column that was just used.
            dvx = np.delete(x[mask, :], bestIndex, axis=1)
            tree[bestAttr] = generateTree(dvx, y[mask], remaining)

    return tree

In [13]:
# Build the decision tree from the categorical attributes `x`, the labels `y`
# and the attribute map `a`, and display it as the cell's output.
generateTree(x,y,a)

{'模糊': '坏瓜',
 '清晰': {'硬挺': '坏瓜',
  '稍蜷': {'乌黑': {'硬滑': '好瓜', '软粘': '坏瓜'}, '浅白': '好瓜', '青绿': '好瓜'},
  '蜷缩': '好瓜'},
 '稍糊': {'硬滑': '坏瓜', '软粘': '好瓜'}}