In [1]:
import numpy as np
import pandas as pd

In [2]:
#sep分隔符，加了才能正常打开文件
lenses = pd.read_table("./lenses.txt", sep="\s+")
lenses = lenses.drop('id', axis = 1)
dataset = lenses.values.tolist()

In [3]:
lenses

Unnamed: 0,age,prescription,astigmtic,rate,type
0,1,1,1,1,3
1,1,1,1,2,2
2,1,1,2,1,3
3,1,1,2,2,1
4,1,2,1,1,3
5,1,2,1,2,2
6,1,2,2,1,3
7,1,2,2,2,1
8,2,1,1,1,3
9,2,1,1,2,2


# ID3

In [5]:
#计算信息熵
def Ent(dataset):
    #统计每个分类结果分别出现的次数
    kinds = {}
    for value in dataset:
        i = value[-1]
        if i not in kinds.keys():
            count = 0
            kinds[i] = count
        kinds[i] = kinds[i] + 1
    
    ent = 0.0
    for i in kinds.keys():
        ent = ent + ((kinds[i] / len(dataset)) + np.log2(kinds[i] / len(dataset)))
    ent = -ent
    return ent

In [6]:
#dataset:原始数据集
#axis:第几个特征（就是数据集中的第几列）
#value:特征的值，例如特征age,age=1,1就是特征的值
def split_dataset(dataset, axis, value):
    new_dataset = []
    for data in dataset:
        if data[axis] == value:
            t = data[:axis]
            t.extend(data[axis+1:])
            new_dataset.append(t)
            #相当于删除第axis列，得到一个新的数据集
    return new_dataset

In [7]:
#用于计算信息增益，选择最优特征
def Gain(dataset):
    #feats:用来装特征值
    feats = []
    #最优特征
    best_feat = -1
    #最大的信息增益
    gain_max = 0.0
    #数据集的熵
    base_ent = Ent(dataset)
    #特征的个数
    feat_len = len(dataset[0]) - 1
    
    #遍历每个特征
    for i in range(feat_len):
        #提取每个特征的不重复的特征值
        for data in dataset:
            if data[i] not in feats:
                feats.append(data[i])
            #这一步可以化简代码为
#         feats = [data[i] for data in dataset]
#         feats = set(feats)

        #信息增益
        gain = 0.0
        #每个特征值的信息熵
        new_ent = 0.0
        
        #遍历每个特征值
        for feat in feats:
            new_dataset = split_dataset(dataset, i, feat)
            #计算条件概率
            prob = len(new_dataset) / len(dataset)
            #计算经验条件熵
            new_ent += prob * Ent(new_dataset)
        #得到信息增益
        gain = base_ent - new_ent
        
        #得到最大的信息增益和它对应的特征
        if gain > gain_max:
            gain_max = gain
            best_feat = i
    return best_feat

In [8]:
#选出出现得最多得特征
#labels_list:分类结果，即数据集最后一列
def voting(labels_list):
    features = {}
    #计算每种分类结果出现的次数
    for label in labels_list:
        if label not in features.keys():
            features[label] = 0
        features[label] = features[label] + 1
    #选出分类结果出现得最多的次数
    type_name = sorted(features.items(), reverse = True)
    #返回出现最多的分类结果
    return type_name[0][0]

In [9]:
#labels:特征('age', 'prescription', 'astigmtic', 'rate')
def creat_tree(dataset, labels):
    labels_list = []
    #将分类结果(数据集最后一列）放置列表中
    for data in dataset:
        labels_list.append(data[-1])
    #如果只有一种分类结果
    if labels_list.count((labels_list[0])) == len(labels_list):
        return labels_list[0]
    #如果只剩下一列（即剩下数据集最后一列）,选出出现最多次的分类结果
    if len(dataset[0]) == 1:
        return voting(labels_list)
    #获得最优特征的位置
    best_feat = Gain(dataset)
    #获得最优特征
    best_feat_label = labels[best_feat]
    tree = {best_feat_label:{}}
    #把最优特征从分类结果中删除
    del(labels[best_feat])
    #获取最优特征值
    #feat_values = []
    for data in dataset:
#         if data[best_feat] not in feat_values:
#             feat_values.append(data[best_feat])
        feat_values = [example[best_feat] for example in dataset]
        unique_vals = set(feat_values)
    #遍历最优特征值
    for feat in feat_values:
        #用来存放除去最优特征后的特征
        sub_labels = labels[:]
        #递归求解
        tree[best_feat_label][feat] = creat_tree(split_dataset(dataset, best_feat, feat), sub_labels)
    return tree

In [10]:
def classify_tree(input_tree, feat_labels, test_vec):
    first_str = list(input_tree.keys())[0]
    second_dict = input_tree[first_str]
    # 将标签字符串转换为索引
    feat_index = feat_labels.index(first_str)
    class_label = {}
    # 递归遍历整棵树
    for key in second_dict.keys():
        if test_vec[feat_index] == key:
            if type(second_dict[key]).__name__ == 'dict':
                class_label = classify_tree(second_dict[key], feat_labels, test_vec)
            else:
                class_label = second_dict[key]
    return class_label

In [11]:
labels = [ 'age', 'prescription', 'astigmtic', 'rate']

In [12]:
tree = creat_tree(dataset, labels)
tree

{'astigmtic': {1: {'rate': {1: 3,
    2: {'age': {1: 2, 2: 2, 3: {'prescription': {1: 3, 2: 2}}}}}},
  2: {'rate': {1: 3,
    2: {'prescription': {1: 1, 2: {'age': {1: 1, 2: 3, 3: 3}}}}}}}}

In [14]:
labels = [ 'age', 'prescription', 'astigmtic', 'rate']
test = lenses.drop('type', axis = 1).values.tolist()
results = []
for i in range(len(test)):
    results.append(classify_tree(tree, labels, test[i]))

没有剪枝，感觉是过拟合了

In [15]:
results

[3, 2, 3, 1, 3, 2, 3, 1, 3, 2, 3, 1, 3, 2, 3, 3, 3, 3, 3, 1, 3, 2, 3, 3]

In [17]:
lenses['type'].values.tolist()

[3, 2, 3, 1, 3, 2, 3, 1, 3, 2, 3, 1, 3, 2, 3, 3, 3, 3, 3, 1, 3, 2, 3, 3]