In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import datasets
%matplotlib inline

In [8]:
# NOTE(review): scratch arithmetic with undocumented magic numbers; the result
# is not assigned, so nothing else in the visible notebook depends on it.
0.1/150/np.sqrt(2)

0.0004714045207910317

In [11]:
# NOTE(review): scratch arithmetic; the result is not assigned. np.log is the
# natural logarithm -- if this was meant as a decibel ratio (20*log10), np.log10
# would be required. Confirm intent.
20*np.log(8.74/0.052)

102.48843500136309

In [2]:
class DecisionTree:
    """ID3-style decision tree for categorical features, stored as nested dicts.

    A built tree is either a bare class label (a leaf) or a dict of the form
    ``{feature_value: {feature_value: subtree}}`` for every distinct value of
    the feature chosen at that node.

    NOTE(review): the index of the split feature is not recorded in the tree and
    every branch is wrapped under its value twice. Both quirks are kept so the
    returned structure stays compatible with existing consumers.
    """

    # Gains at or below this threshold are treated as floating-point noise, so a
    # feature that carries no real information never triggers a split.
    _MIN_GAIN = 1e-12

    def __init__(self):
        # Root of the most recently built tree; filled in by build_tree().
        self.tree = {}

    def entropy(self, data):
        """Return the Shannon entropy (in bits) of the values in ``data``.

        Args:
            data: array-like of hashable/sortable labels.

        Returns:
            float: the entropy; ``0.0`` for empty input.
        """
        data = np.asarray(data)
        if data.size == 0:
            return 0.0
        _, counts = np.unique(data, return_counts=True)
        # Every count from np.unique is >= 1, so log2 never sees a zero.
        probs = counts / counts.sum()
        # "0.0 - x" (rather than "-x") avoids returning -0.0 for a pure set.
        return 0.0 - float(np.sum(probs * np.log2(probs)))

    def information_gain(self, feature_data, target_data):
        """Return the information gain of splitting ``target_data`` on a feature.

        The gain is ``H(target) - sum_v (|S_v| / |S|) * H(target | feature == v)``,
        i.e. each subset's entropy is weighted by the fraction of samples it holds.

        Args:
            feature_data: 1-D array-like with one feature value per sample.
            target_data: 1-D array-like with one class label per sample.

        Returns:
            float: the information gain; ``0.0`` for empty input.
        """
        feature_data = np.asarray(feature_data)
        target_data = np.asarray(target_data)
        total = len(target_data)
        if total == 0:
            return 0.0
        conditional_entropy = 0.0
        for value in np.unique(feature_data):
            mask = feature_data == value
            weight = np.count_nonzero(mask) / total
            conditional_entropy += weight * self.entropy(target_data[mask])
        return self.entropy(target_data) - conditional_entropy

    def choose_best_feature(self, features_data, target_data):
        """Return the column index with the highest information gain.

        Args:
            features_data: 2-D array-like, one row per sample, one column per feature.
            target_data: 1-D array-like of class labels.

        Returns:
            int | None: index of the best column, or ``None`` when no column
            yields a gain above ``_MIN_GAIN``. Ties go to the lowest index.
        """
        features_data = np.asarray(features_data)
        best_gain = self._MIN_GAIN
        best_feature_index = None
        for i in range(features_data.shape[1]):
            gain = self.information_gain(features_data[:, i], target_data)
            if gain > best_gain:
                best_gain = gain
                best_feature_index = i
        return best_feature_index

    def build_tree(self, features_data, target_data):
        """Build the tree, store it in ``self.tree`` and return it.

        Args:
            features_data: 2-D array-like, one row per sample, one column per feature.
            target_data: 1-D array-like of class labels.

        Returns:
            The nested-dict tree, a bare label when no split is useful, or
            ``None`` when the training data is empty.
        """
        features_data = np.asarray(features_data)
        target_data = np.asarray(target_data)
        self.tree = self._grow(features_data, target_data)
        return self.tree

    def _grow(self, features_data, target_data):
        """Recursively build the subtree for the given samples."""
        if target_data.size == 0:
            return None
        best_feature_index = self.choose_best_feature(features_data, target_data)
        if best_feature_index is None:
            return self._majority_label(target_data)
        column = features_data[:, best_feature_index]
        tree = {}
        for value in np.unique(column):
            mask = column == value
            # Branch is nested under its value twice to keep the historical shape.
            tree[value] = {value: self._grow(features_data[mask], target_data[mask])}
        return tree

    def _majority_label(self, target_data):
        """Return the most frequent label; ties go to the smallest label."""
        labels, counts = np.unique(target_data, return_counts=True)
        return labels[np.argmax(counts)]

# Demo: build a tree from the rows of 'lenses.data'.
# Column 0 of each row is skipped (presumably a row id -- confirm against the
# data file), the last column is the class label, and the columns in between
# are the features.
original_data = np.loadtxt('lenses.data', dtype='str')
features_data, target_data = original_data[:, 1:-1], original_data[:, -1]

dtree = DecisionTree()
tree = dtree.build_tree(features_data, target_data)
print(tree)

{'1': {'1': '3'}, '2': {'2': {'1': {'1': {'1': {'1': '2'}, '2': {'2': '2'}, '3': {'3': {'1': {'1': '3'}, '2': {'2': '2'}}}}}, '2': {'2': {'1': {'1': '1'}, '2': {'2': {'1': {'1': '1'}, '2': {'2': '3'}, '3': {'3': '3'}}}}}}}}
