In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

数据集来源: http://archive.ics.uci.edu/ml/datasets/Heart+Disease 
- 选择这个数据集的原因在于特征既有离散型和连续型两种类型

In [3]:
df = pd.read_csv("processed.cleveland.data",header=None,names=["age","sex","cp","trestbps","chol","fbs","restectg","thalach","exang","oldpeak","slope","ca","thal","num"])
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restectg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
# numerical or categorical
for index in df:
    print(index,len(set(df[index])))

age 41
sex 2
cp 4
trestbps 50
chol 152
fbs 2
restectg 3
thalach 91
exang 2
oldpeak 40
slope 3
ca 5
thal 4
num 5


In [5]:
# drop ? values
print(len(df))
index = ["?" not in row for row in df.values]
# convert to np.float
dataset = df.values[index].astype(np.float32)
print(len(dataset))
# split to X,y
X,y = dataset[:,:-1],dataset[:,-1]
y = y.astype(np.int64)
print(X.shape)
print(y.shape)

303
297
(297, 13)
(297,)


In [6]:
kinds = ["categorical" if len(set(col))<6 else "numerical"for col in X.T]
print(len(kinds))

13


In [7]:
from sklearn.model_selection import train_test_split
from collections import Counter
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [8]:
def entropy(y):
    precs = np.array(list(Counter(y).values()))/len(y)
    ent = np.sum(-1 * precs * np.log2(precs))
    return ent

In [9]:
# 选择哪个特征进行分裂
def decide_feature(X,y,fas,kinds):
    # fas refers to feature_available;if one feature can be splitted,this feature_available is True,else False
    (n_samples,n_features) = X.shape
    ers = np.zeros(n_features)
    bestfvs = np.zeros(n_features)
    for fi in range(n_features):
        if not fas[fi]:
            continue
        
        if kinds[fi] == "categorical":
            I,H = entropy(y),0
            for fv,c in Counter(X[:,fi]).items():
                splity = y[X[:,fi] == fv]
                proc = c/n_samples
                I -= proc * entropy(splity)
                H += -proc * np.log2(proc)
            ers[fi] = I/H
        else:
            for fv in list(sorted(set(X[:,fi])))[:-1]:
                splity_less = y[X[:,fi] <= fv]
                proc_less = len(splity_less) / n_samples
                splity_more = y[X[:,fi] > fv]
                proc_more = len(splity_more) / n_samples
                I = -proc_less * entropy(splity_less) - proc_more * entropy(splity_more) + entropy(y)
                H = -1*proc_less * np.log2(proc_less) - proc_more * np.log2(proc_more)
                if I/H > ers[fi]:
                    ers[fi] = I/H
                    bestfvs[fi] = fv
    return ers,bestfvs

In [10]:
fas = np.array([True]*len(y_train))
decide_feature(X_train,y_train,fas,kinds)

(array([0.12727528, 0.06540155, 0.1236421 , 0.33174944, 0.17997159,
        0.04576981, 0.06550353, 0.3644227 , 0.13087699, 0.37516299,
        0.12154806, 0.1668392 , 0.17192469]),
 array([ 37.       ,   0.       ,   0.       , 192.       , 394.       ,
          0.       ,   0.       ,  71.       ,   0.       ,   4.4000001,
          0.       ,   0.       ,   0.       ]))

In [11]:
def build_tree(X,y,fas,kinds):
    counts = dict(Counter(y))
    if len(counts) == 1 or (fas==False).all():
        result = max(counts,key=counts.get)
        return {"counts":counts,"result":result}
    ers,bestfvs = decide_feature(X,y,fas,kinds)
    next_ = {}
    bestfi = np.argmax(ers)
    if kinds[bestfi] == "categorical":
        fas[bestfi] = False
        for fv in set(X[:,bestfi]):
            index = (X[:,bestfi] == fv) 
            next_["{}{}".format("==",fv)] = build_tree(X[index],y[index],fas,kinds)
    else:
        bestfv = bestfvs[bestfi]
        index_less = X[:,bestfi] <= bestfv
        fas_less = fas.copy()
        if len(set(X[index_less][:,bestfi])) == 1:
            fas_less[bestfi] = False
        next_["{}{}".format("<=",bestfv)] = build_tree(X[index_less],y[index_less],fas_less,kinds)
        index_more = X[:,bestfi] > bestfv
        fas_more = fas.copy()
        if len(set(X[index_more][:,bestfi])) == 1:
            fas_more[bestfi] = False
        next_["{}{}".format(">",bestfv)] = build_tree(X[index_more],y[index_more],fas_more,kinds)
    return {"fi":bestfi,"counts":counts,"result":None,"next":next_}

In [12]:
fas = np.array([True]*X_train.shape[-1])
tree = build_tree(X_train,y_train,fas,kinds)
tree



{'fi': 9,
 'counts': {3: 27, 0: 125, 2: 22, 4: 9, 1: 39},
 'result': None,
 'next': {'<=4.400000095367432': {'fi': 9,
   'counts': {3: 25, 0: 125, 2: 22, 4: 9, 1: 39},
   'result': None,
   'next': {'<=4.199999809265137': {'fi': 7,
     'counts': {3: 25, 0: 125, 2: 22, 4: 8, 1: 39},
     'result': None,
     'next': {'<=71.0': {'counts': {2: 1}, 'result': 2},
      '>71.0': {'fi': 3,
       'counts': {3: 25, 0: 125, 2: 21, 4: 8, 1: 39},
       'result': None,
       'next': {'<=192.0': {'fi': 9,
         'counts': {3: 24, 0: 125, 2: 21, 4: 8, 1: 39},
         'result': None,
         'next': {'<=3.5999999046325684': {'fi': 3,
           'counts': {3: 23, 0: 124, 2: 21, 4: 6, 1: 39},
           'result': None,
           'next': {'<=180.0': {'fi': 7,
             'counts': {3: 23, 0: 124, 2: 21, 4: 6, 1: 38},
             'result': None,
             'next': {'<=88.0': {'counts': {1: 1}, 'result': 1},
              '>88.0': {'fi': 7,
               'counts': {3: 23, 0: 124, 2: 21, 4: 6,

In [18]:
def predict_one(x,kinds,tree):
    while tree["result"] == None:
        fi = tree["fi"]
        fv = x[fi]
        flag = False
        for condition in tree["next"]:
            if eval(str(fv)+condition):
                tree = tree["next"][condition]
                flag = True
                break
        if not flag:
            counts = tree["counts"]
            return max(counts,key=counts.get)
    return tree["result"]

In [257]:
class C45DecisionTree:
    @staticmethod
    def entropy(y):
        precs = np.array(list(Counter(y).values()))/len(y)
        ent = np.sum(-1 * precs * np.log2(precs))
        return ent

    # 选择哪个特征进行分裂
    def decide_feature(self,X,y,fas,kinds):
        # fas refers to feature_available;if one feature can be splitted,this feature_available is True,else False
        (n_samples,n_features) = X.shape
        ers = np.ones(n_features) * -1
        bestfvs = np.zeros(n_features)
        for fi in range(n_features):
            if not fas[fi]:
                continue

            if kinds[fi] == "categorical":
                I,H = self.entropy(y),0
                for fv,c in Counter(X[:,fi]).items():
                    splity = y[X[:,fi] == fv]
                    proc = c/n_samples
                    I -= proc * self.entropy(splity)
                    H += -proc * np.log2(proc)
                ers[fi] = I/(H+1e-7)
            else:
#                 print(set(X[:,fi]))
                for fv in list(sorted(set(X[:,fi])))[:-1]:
                    splity_less = y[X[:,fi] <= fv]
                    proc_less = len(splity_less) / n_samples
                    splity_more = y[X[:,fi] > fv]
                    proc_more = len(splity_more) / n_samples
                    I = -proc_less * self.entropy(splity_less) - proc_more * self.entropy(splity_more) + self.entropy(y)
                    H = -1*proc_less * np.log2(proc_less) - proc_more * np.log2(proc_more)
                    if I/H > ers[fi]:
                        ers[fi] = I/H
                        bestfvs[fi] = fv
        return ers,bestfvs

    def build_tree(self,X,y,fas,kinds):
        counts = dict(Counter(y))
        result = max(counts,key=counts.get)
#         print("fas",fas,"len(counts)",len(counts))
        if len(counts) == 1 or (fas==False).all():
            return {"counts":counts,"result":result}
        ers,bestfvs = self.decide_feature(X,y,fas,kinds)
        if (ers == -1).all():
            return {"counts":counts,"result":result}
        next_ = {}
        bestfi = np.argmax(ers)
#         print(bestfi,ers)
        if kinds[bestfi] == "categorical":
            fas[bestfi] = False
            for fv in set(X[:,bestfi]):
                index = (X[:,bestfi] == fv)
#                 print("next: {} {} {}, size:{}".format(bestfi,"==",fv,len(y[index])))
                next_["{}{}".format("==",fv)] = self.build_tree(X[index],y[index],fas,kinds)
        else:
            bestfv = bestfvs[bestfi]
            index_less = X[:,bestfi] <= bestfv
            fas_less = fas.copy()
            if len(set(X[index_less][:,bestfi])) == 1:
                fas_less[bestfi] = False
#             print("next: {} {} {}, size:{}".format(bestfi,"<=",bestfv,len(y[index_less])))
            next_["{}{}".format("<=",bestfv)] = self.build_tree(X[index_less],y[index_less],fas_less,kinds)
            index_more = X[:,bestfi] > bestfv
            fas_more = fas.copy()
            if len(set(X[index_more][:,bestfi])) == 1:
                fas_more[bestfi] = False
#             print("next: {} {} {}, size:{}".format(bestfi,">=",bestfv,len(y[index_more])))
            next_["{}{}".format(">",bestfv)] = self.build_tree(X[index_more],y[index_more],fas_more,kinds)
        return {"fi":bestfi,"counts":counts,"result":None,"next":next_}
    
    def fit(self,X,y,kinds):
        fas = np.array([True]*X.shape[-1])
        self.tree = self.build_tree(X,y,fas,kinds)

    def predict_one(self,x):
        tree = self.tree
        while tree["result"] == None:
            fi = tree["fi"]
            fv = x[fi]
            flag = False
            for condition in tree["next"]:
                if eval(str(fv)+condition):
                    tree = tree["next"][condition]
                    flag = True
                    break
            if not flag:
                counts = tree["counts"]
                return max(counts,key=counts.get)
        return tree["result"]
    
    def predict(self,X):
        y_predicts = []
        for x_test in X_test:
            y_predicts.append(self.predict_one(x_test))
        return y_predicts
        
    def score(self,X_test,y_test):
        y_predicts = self.predict(X_test)
        return np.sum(y_predicts == y_test)/ len(y_test)

In [263]:
mytree = C45DecisionTree()
mytree.fit(X_train,y_train,kinds)
mytree.score(X_test,y_test)

0.48