In [1]:
import numpy as np
import pandas as pd

In [2]:
def Info_Ent(Ydata: pd.Series) ->float:
    '''
    Compute the Shannon entropy (in bits) of a label series.

    H(Y) = -sum_k p_k * log2(p_k), where p_k is the relative frequency of
    class k in ``Ydata``.

    Parameters
    ----------
    Ydata : pd.Series
        Class labels of the samples.

    Returns
    -------
    float
        Entropy in bits; 0.0 for an empty or single-class series.
    '''
    n_sample = len(Ydata)
    if n_sample == 0:
        # No samples carry no uncertainty; also avoids dividing by zero.
        return 0.0
    prob = Ydata.value_counts() / n_sample
    # Categorical dtypes can report unobserved categories with count 0;
    # drop them so 0 * log2(0) never yields NaN or a runtime warning.
    prob = prob[prob > 0]
    # The leading minus sign is part of the definition: each p * log2(p)
    # term is <= 0, so entropy itself is >= 0.
    info_ent = -(prob * np.log2(prob)).sum()
    # Adding 0.0 turns a possible -0.0 (pure node) into a plain 0.0.
    return float(info_ent) + 0.0

In [3]:
def Info_Gain(Xdata: pd.DataFrame, Ydata: pd.Series, attr: int) ->float:
    '''
    Compute the information gain obtained by splitting on one attribute.

    Gain(D, a) = H(D) - sum_v |D_v| / |D| * H(D_v), where D_v is the subset
    of samples whose attribute ``a`` equals value ``v``.

    Parameters
    ----------
    Xdata : pd.DataFrame
        Attribute table, one row per sample.
    Ydata : pd.Series
        Class labels, in the same row order as ``Xdata``.
    attr : int
        Label of the column in ``Xdata`` to split on (used as ``Xdata[attr]``).

    Returns
    -------
    float
        Entropy of ``Ydata`` minus the weighted entropy of its partitions.
    '''
    info_ent = Info_Ent(Ydata)
    column = Xdata[attr]
    n_sample = len(Xdata)
    attr_ent = 0.0
    for v in column.unique():
        # Positional boolean mask: avoids index-alignment surprises when
        # Xdata and Ydata carry different index labels.
        mask = (column == v).to_numpy()
        # Weight by the number of samples in this branch (count of True
        # entries), not by the mask length, which always equals n_sample.
        weight = mask.sum() / n_sample
        attr_ent += weight * Info_Ent(Ydata[mask])
    info_gain = info_ent - attr_ent
    return info_gain

In [None]:
class DecisionTree:
    '''
    ID3-style decision tree classifier for discrete attributes.

    The fitted tree is stored in ``self.tree`` as nested dicts of the form
    ``{attribute: {value: subtree_or_label, ...}}``; a leaf is a bare class
    label.
    '''

    def __init__(self):
        '''Create an unfitted model; ``self.tree`` is assigned by ``fit``.'''

    def fit(self, Xtrain: pd.DataFrame, Ytrain: pd.Series) ->dict:
        '''
        Build the tree from training data and store it in ``self.tree``.

        Parameters
        ----------
        Xtrain : pd.DataFrame
            Attribute table, one row per sample.
        Ytrain : pd.Series
            Class labels, in the same row order as ``Xtrain``.

        Returns
        -------
        dict
            The fitted tree (a bare label if all samples share one class).
        '''
        attr_set = Xtrain.columns.values
        self.tree = self.createTree(Xtrain, Ytrain, attr_set)
        return self.tree

    def createTree(self, Xdata, Ydata, attr_set):
        '''
        Recursively build a (sub)tree.

        Parameters
        ----------
        Xdata : pd.DataFrame
            Samples reaching this node.
        Ydata : pd.Series
            Labels of those samples.
        attr_set : array-like
            Column labels still available for splitting.

        Returns
        -------
        dict or label
            ``{}`` for no samples, a class label for a leaf, otherwise
            ``{attribute: {value: subtree_or_label}}``.
        '''
        if len(Ydata) == 0:
            return {}
        if len(Ydata.unique()) == 1:
            # Positional access: after boolean filtering the index labels
            # no longer start at 0, so Ydata[0] could raise KeyError.
            return Ydata.iloc[0]
        # Leaf with the majority class when no attribute is left, or when
        # all samples agree on every remaining attribute.
        if len(attr_set) == 0 or len(Xdata[list(attr_set)].drop_duplicates()) == 1:
            return Ydata.value_counts().index[0]

        gains = np.array([Info_Gain(Xdata, Ydata, attr) for attr in attr_set])
        attr_idx = int(gains.argmax())
        div_attr = attr_set[attr_idx]
        new_attr_set = np.delete(attr_set, attr_idx)

        childs = {}
        tree = {div_attr: childs}
        for val in Xdata[div_attr].unique():
            # Positional mask so X and Y are filtered by row order.
            samp_idx = (Xdata[div_attr] == val).to_numpy()
            child = self.createTree(Xdata[samp_idx], Ydata[samp_idx], new_attr_set)
            if isinstance(child, dict) and len(child) == 0:
                # Empty branch: fall back to this node's majority class.
                child = Ydata.value_counts().index[0]
            childs[val] = child
        return tree

    def predict_1(self, x):
        '''
        Predict the class label of a single sample.

        Parameters
        ----------
        x : mapping (e.g. a DataFrame row as pd.Series, or a dict)
            Maps attribute label to attribute value.

        Returns
        -------
        The predicted class label.

        Raises
        ------
        ValueError
            If the fitted tree is empty (the model was fitted on no data).
        KeyError
            If ``x`` has an attribute value not seen at that node in training.
        '''
        cur = self.tree
        while isinstance(cur, dict):
            if not cur:
                raise ValueError('decision tree is empty; fit it on non-empty data first')
            # Each internal node is a single-key dict: {attribute: branches}.
            attr = next(iter(cur))
            cur = cur[attr][x[attr]]
        return cur

    def predict_n(self, Xtest):
        '''
        Predict the class labels of several samples.

        Parameters
        ----------
        Xtest : pd.DataFrame or iterable of mappings
            A DataFrame is traversed row by row; any other iterable is
            assumed to yield one attribute mapping per sample.

        Returns
        -------
        list
            Predicted labels in input order.
        '''
        if isinstance(Xtest, pd.DataFrame):
            # Iterating a DataFrame directly yields column names, not rows.
            rows = (row for _, row in Xtest.iterrows())
        else:
            rows = Xtest
        return [self.predict_1(row) for row in rows]

    def evaluate(self, Xtest, Ytest):
        '''
        Compute classification accuracy on a test set.

        Parameters
        ----------
        Xtest : pd.DataFrame or iterable of mappings
            Test samples, as accepted by ``predict_n``.
        Ytest : pd.Series or sequence
            True labels, in the same order as ``Xtest``.

        Returns
        -------
        float
            Fraction of samples whose predicted label equals the true label.
        '''
        Ypred = self.predict_n(Xtest)
        # Compare positionally as arrays so Ytest's index labels are ignored.
        correct = np.asarray(Ypred) == np.asarray(Ytest)
        accuracy = correct.sum() / len(Ytest)
        return accuracy

In [None]:
# 测试