In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

数据集来源: http://archive.ics.uci.edu/ml/datasets/Heart+Disease 
- 选择这个数据集的原因在于特征既有离散型和连续型两种类型

In [2]:
df = pd.read_csv("processed.cleveland.data",header=None,names=["age","sex","cp","trestbps","chol","fbs","restectg","thalach","exang","oldpeak","slope","ca","thal","num"])
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restectg,thalach,exang,oldpeak,slope,ca,thal,num
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
# numerical or categorical
for index in df:
    print(index,len(set(df[index])))

age 41
sex 2
cp 4
trestbps 50
chol 152
fbs 2
restectg 3
thalach 91
exang 2
oldpeak 40
slope 3
ca 5
thal 4
num 5


In [4]:
# drop ? values
print(len(df))
index = ["?" not in row for row in df.values]
# convert to np.float
dataset = df.values[index].astype(np.float32)
print(len(dataset))
# split to X,y
X,y = dataset[:,:-1],dataset[:,-1]
y = y.astype(np.int64)
print(X.shape)
print(y.shape)

303
297
(297, 13)
(297,)


In [34]:
kinds = ["categorical" if len(set(col))<6 else "numerical"for col in X.T]
print(len(kinds))

13


In [6]:
from sklearn.model_selection import train_test_split
from collections import Counter
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [7]:
def entropy(y):
    precs = np.array(list(Counter(y).values()))/len(y)
    ent = np.sum(-1 * precs * np.log2(precs))
    return ent

In [8]:
# 选择哪个特征进行分裂
def decide_feature(X,y,fas,kinds):
    # fas refers to feature_available;if one feature can be splitted,this feature_available is True,else False
    (n_samples,n_features) = X.shape
    ers = np.zeros(n_features)
    bestfvs = np.zeros(n_features)
    for fi in range(n_features):
        if not fas[fi]:
            continue
        
        if kinds[fi] == "categorical":
            I,H = entropy(y),0
            for fv,c in Counter(X[:,fi]).items():
                splity = y[X[:,fi] == fv]
                proc = c/n_samples
                I -= proc * entropy(splity)
                H += -proc * np.log2(proc)
            ers[fi] = I/H
        else:
            for fv in list(sorted(set(X[:,fi])))[:-1]:
                splity_less = y[X[:,fi] <= fv]
                proc_less = len(splity_less) / n_samples
                splity_more = y[X[:,fi] > fv]
                proc_more = len(splity_more) / n_samples
                I = -proc_less * entropy(splity_less) - proc_more * entropy(splity_more) + entropy(y)
                H = -1*proc_less * np.log2(proc_less) - proc_more * np.log2(proc_more)
                if I/H > ers[fi]:
                    ers[fi] = I/H
                    bestfvs[fi] = fv
    return ers,bestfvs

In [9]:
fas = np.array([True]*len(y_train))
decide_feature(X_train,y_train,fas,kinds)

(array([0.47644121, 0.07059057, 0.16897168, 0.29052733, 0.33776014,
        0.05824757, 0.05307299, 0.35731845, 0.17548659, 0.3820413 ,
        0.09767001, 0.14835536, 0.16813304]),
 array([ 76.       ,   0.       ,   0.       , 180.       , 131.       ,
          0.       ,   0.       ,  71.       ,   0.       ,   4.4000001,
          0.       ,   0.       ,   0.       ]))

In [12]:
def build_tree(X,y,fas,kinds):
    counts = dict(Counter(y))
    if len(counts) == 1 or (fas==False).all():
        result = max(counts,key=counts.get)
        return {"counts":counts,"result":result}
    ers,bestfvs = decide_feature(X,y,fas,kinds)
    next_ = {}
    bestfi = np.argmax(ers)
    if kinds[bestfi] == "categorical":
        fas[bestfi] = False
        for fv in set(X[:,bestfi]):
            index = (X[:,bestfi] == fv) 
            next_["{}{}".format("==",fv)] = build_tree(X[index],y[index],fas,kinds)
    else:
        bestfv = bestfvs[bestfi]
        index_less = X[:,bestfi] <= bestfv
        fas_less = fas.copy()
        if len(set(X[index_less][:,bestfi])) == 1:
            fas_less[bestfi] = False
        next_["{}{}".format("<=",bestfv)] = build_tree(X[index_less],y[index_less],fas_less,kinds)
        index_more = X[:,bestfi] > bestfv
        fas_more = fas.copy()
        if len(set(X[index_more][:,bestfi])) == 1:
            fas_more[bestfi] = False
        next_["{}{}".format(">",bestfv)] = build_tree(X[index_more],y[index_more],fas_more,kinds)
    return {"fi":bestfi,"counts":counts,"result":None,"next":next_}

In [13]:
fas = np.array([True]*X_train.shape[-1])
tree = build_tree(X_train,y_train,fas,kinds)
tree



{'fi': 0,
 'counts': {0: 127, 1: 35, 2: 23, 4: 11, 3: 26},
 'result': None,
 'next': {'<=76.0': {'fi': 9,
   'counts': {0: 127, 1: 35, 2: 23, 4: 10, 3: 26},
   'result': None,
   'next': {'<=4.400000095367432': {'fi': 9,
     'counts': {0: 127, 1: 35, 2: 23, 4: 10, 3: 24},
     'result': None,
     'next': {'<=4.199999809265137': {'fi': 7,
       'counts': {0: 127, 1: 35, 2: 23, 4: 9, 3: 24},
       'result': None,
       'next': {'<=71.0': {'counts': {2: 1}, 'result': 2},
        '>71.0': {'fi': 4,
         'counts': {0: 127, 1: 35, 2: 22, 4: 9, 3: 24},
         'result': None,
         'next': {'<=131.0': {'counts': {3: 1}, 'result': 3},
          '>131.0': {'fi': 7,
           'counts': {0: 127, 1: 35, 2: 22, 4: 9, 3: 23},
           'result': None,
           'next': {'<=90.0': {'counts': {3: 1}, 'result': 3},
            '>90.0': {'fi': 3,
             'counts': {0: 127, 1: 35, 2: 22, 4: 9, 3: 22},
             'result': None,
             'next': {'<=180.0': {'fi': 9,
           

In [18]:
def predict_one(x,kinds,tree):
    while tree["result"] == None:
        fi = tree["fi"]
        fv = x[fi]
        flag = False
        for condition in tree["next"]:
            if eval(str(fv)+condition):
                tree = tree["next"][condition]
                flag = True
                break
        if not flag:
            counts = tree["counts"]
            return max(counts,key=counts.get)
    return tree["result"]

In [257]:
class C45DecisionTree:
    @staticmethod
    def entropy(y):
        precs = np.array(list(Counter(y).values()))/len(y)
        ent = np.sum(-1 * precs * np.log2(precs))
        return ent

    # 选择哪个特征进行分裂
    def decide_feature(self,X,y,fas,kinds):
        # fas refers to feature_available;if one feature can be splitted,this feature_available is True,else False
        (n_samples,n_features) = X.shape
        ers = np.ones(n_features) * -1
        bestfvs = np.zeros(n_features)
        for fi in range(n_features):
            if not fas[fi]:
                continue

            if kinds[fi] == "categorical":
                I,H = self.entropy(y),0
                for fv,c in Counter(X[:,fi]).items():
                    splity = y[X[:,fi] == fv]
                    proc = c/n_samples
                    I -= proc * self.entropy(splity)
                    H += -proc * np.log2(proc)
                ers[fi] = I/(H+1e-7)
            else:
#                 print(set(X[:,fi]))
                for fv in list(sorted(set(X[:,fi])))[:-1]:
                    splity_less = y[X[:,fi] <= fv]
                    proc_less = len(splity_less) / n_samples
                    splity_more = y[X[:,fi] > fv]
                    proc_more = len(splity_more) / n_samples
                    I = -proc_less * self.entropy(splity_less) - proc_more * self.entropy(splity_more) + self.entropy(y)
                    H = -1*proc_less * np.log2(proc_less) - proc_more * np.log2(proc_more)
                    if I/H > ers[fi]:
                        ers[fi] = I/H
                        bestfvs[fi] = fv
        return ers,bestfvs

    def build_tree(self,X,y,fas,kinds):
        counts = dict(Counter(y))
        result = max(counts,key=counts.get)
#         print("fas",fas,"len(counts)",len(counts))
        if len(counts) == 1 or (fas==False).all():
            return {"counts":counts,"result":result}
        ers,bestfvs = self.decide_feature(X,y,fas,kinds)
        if (ers == -1).all():
            return {"counts":counts,"result":result}
        next_ = {}
        bestfi = np.argmax(ers)
#         print(bestfi,ers)
        if kinds[bestfi] == "categorical":
            fas[bestfi] = False
            for fv in set(X[:,bestfi]):
                index = (X[:,bestfi] == fv)
#                 print("next: {} {} {}, size:{}".format(bestfi,"==",fv,len(y[index])))
                next_["{}{}".format("==",fv)] = self.build_tree(X[index],y[index],fas,kinds)
        else:
            bestfv = bestfvs[bestfi]
            index_less = X[:,bestfi] <= bestfv
            fas_less = fas.copy()
            if len(set(X[index_less][:,bestfi])) == 1:
                fas_less[bestfi] = False
#             print("next: {} {} {}, size:{}".format(bestfi,"<=",bestfv,len(y[index_less])))
            next_["{}{}".format("<=",bestfv)] = self.build_tree(X[index_less],y[index_less],fas_less,kinds)
            index_more = X[:,bestfi] > bestfv
            fas_more = fas.copy()
            if len(set(X[index_more][:,bestfi])) == 1:
                fas_more[bestfi] = False
#             print("next: {} {} {}, size:{}".format(bestfi,">=",bestfv,len(y[index_more])))
            next_["{}{}".format(">",bestfv)] = self.build_tree(X[index_more],y[index_more],fas_more,kinds)
        return {"fi":bestfi,"counts":counts,"result":None,"next":next_}
    
    def fit(self,X,y,kinds):
        fas = np.array([True]*X.shape[-1])
        self.tree = self.build_tree(X,y,fas,kinds)

    def predict_one(self,x):
        tree = self.tree
        while tree["result"] == None:
            fi = tree["fi"]
            fv = x[fi]
            flag = False
            for condition in tree["next"]:
                if eval(str(fv)+condition):
                    tree = tree["next"][condition]
                    flag = True
                    break
            if not flag:
                counts = tree["counts"]
                return max(counts,key=counts.get)
        return tree["result"]
    
    def predict(self,X):
        y_predicts = []
        for x_test in X_test:
            y_predicts.append(self.predict_one(x_test))
        return y_predicts
        
    def score(self,X_test,y_test):
        y_predicts = self.predict(X_test)
        return np.sum(y_predicts == y_test)/ len(y_test)

In [263]:
mytree = C45DecisionTree()
mytree.fit(X_train,y_train,kinds)
mytree.score(X_test,y_test)

0.48

In [63]:
from tqdm import tqdm

In [255]:
class RandomForest:
    def __init__(self,n_estimators=100):
        self.n_estimators = n_estimators
        self.base_clf = C45DecisionTree
        self.clfs = []
        self.index_cols = []
        
    def fit(self,X,y,kinds):
        n_samples,n_features = X.shape
        for clf in tqdm(range(self.n_estimators)):
            index = np.random.choice(n_samples,n_samples,replace=True)
            col_index = np.random.choice(n_features,8,replace=False)
            self.index_cols.append(col_index)
            X_this,y_this = X[index][:,col_index],y[index]
            kind_this = [kinds[i] for i in col_index]
            this_clf = self.base_clf()
            this_clf.fit(X_this,y_this,kind_this)
            self.clfs.append(this_clf)
    
    def predict(self,X):
        results = np.array([clf.predict(X[:,self.index_cols[i]]) for i,clf in enumerate(self.clfs)])
        y_predicts = []
        for i in range(results.shape[-1]):
            count = dict(Counter(results[:,i]))
            print(count)
            result = max(count,key=count.get)
            y_predicts.append(result)
        return y_predicts
            
    def score(self,X,y):
        y_predicts = self.predict(X)
        return np.sum(y_predicts == y) / len(y)

In [260]:
np.random.seed(22)
rf = RandomForest(n_estimators=100)
rf.fit(X_train,y_train,kinds)

100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:33<00:00,  3.03it/s]


In [261]:
rf.score(X_test,y_test)

{4: 16, 2: 3, 0: 28, 3: 34, 1: 19}
{1: 19, 4: 16, 0: 32, 3: 31, 2: 2}
{4: 16, 2: 4, 0: 28, 3: 32, 1: 20}
{3: 33, 4: 16, 0: 30, 1: 17, 2: 4}
{4: 16, 2: 2, 0: 30, 3: 32, 1: 20}
{3: 33, 4: 16, 0: 30, 1: 18, 2: 3}
{1: 19, 4: 16, 0: 31, 3: 31, 2: 3}
{4: 17, 0: 32, 3: 31, 1: 18, 2: 2}
{4: 17, 0: 31, 3: 30, 1: 20, 2: 2}
{4: 16, 2: 2, 0: 29, 3: 33, 1: 20}
{0: 31, 4: 15, 2: 2, 3: 33, 1: 19}
{3: 31, 4: 16, 0: 32, 1: 18, 2: 3}
{4: 16, 2: 3, 0: 29, 3: 33, 1: 19}
{4: 17, 0: 31, 3: 31, 1: 19, 2: 2}
{3: 32, 4: 16, 0: 30, 1: 19, 2: 3}
{0: 30, 4: 15, 2: 3, 3: 32, 1: 20}
{4: 17, 0: 31, 3: 30, 1: 19, 2: 3}
{4: 16, 2: 4, 0: 29, 3: 33, 1: 18}
{3: 34, 4: 16, 0: 27, 1: 18, 2: 5}
{0: 33, 4: 15, 2: 2, 3: 32, 1: 18}
{4: 14, 2: 3, 0: 27, 3: 34, 1: 22}
{1: 20, 4: 15, 2: 3, 0: 28, 3: 34}
{3: 32, 4: 16, 0: 32, 1: 18, 2: 2}
{3: 32, 4: 16, 0: 31, 1: 19, 2: 2}
{3: 32, 4: 16, 0: 30, 1: 19, 2: 3}
{0: 32, 4: 16, 3: 31, 1: 19, 2: 2}
{4: 16, 2: 3, 0: 29, 3: 33, 1: 19}
{3: 33, 4: 16, 0: 29, 1: 18, 2: 4}
{3: 34, 4: 16, 0: 28

0.32

In [245]:
X_this = X_train[:,rf.index_cols[-1]]
y_this = y_train
kind_this = [kinds[i] for i in rf.index_cols[-1]]

In [242]:
n_samples,n_features = X_train.shape
index = np.random.choice(n_samples,n_samples,replace=True)
col_index = np.random.choice(n_features,8,replace=False)
X_this,y_this = X_train[index][:,col_index],y_train[index]
kind_this = [kinds[i] for i in col_index]
this_clf = C45DecisionTree()
this_clf.fit(X_this,y_this,kind_this)

In [249]:
print(kind_this)

['categorical', 'categorical', 'categorical', 'numerical', 'categorical', 'categorical', 'categorical', 'numerical']


In [253]:
this_clf = C45DecisionTree()
this_clf.fit(X_this,y_this,kind_this)

fas [ True  True  True  True  True  True  True  True] len(counts) 5
{29.0, 34.0, 35.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 76.0, 77.0}
{128.0, 129.0, 130.0, 132.0, 134.0, 135.0, 136.0, 138.0, 140.0, 142.0, 144.0, 145.0, 146.0, 148.0, 150.0, 152.0, 155.0, 158.0, 160.0, 165.0, 170.0, 172.0, 174.0, 178.0, 180.0, 192.0, 94.0, 100.0, 101.0, 102.0, 105.0, 106.0, 108.0, 110.0, 112.0, 114.0, 115.0, 117.0, 118.0, 120.0, 122.0, 123.0, 124.0, 125.0, 126.0}
3 [0.11104268 0.08086429 0.02671945 0.49209132 0.13309159 0.04612286
 0.15348344 0.26938034]
next: 3 <= 76.0, size:221
fas [ True  True  True  True  True  True  True  True] len(counts) 5
{29.0, 34.0, 35.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65

next: 1 == 0.0, size:10
fas [False False  True  True  True  True False  True] len(counts) 1
next: 1 == 1.0, size:13
fas [False False  True  True  True  True False  True] len(counts) 4
{66.0, 67.0, 39.0, 43.0, 45.0, 48.0, 49.0, 51.0, 52.0, 54.0, 58.0, 62.0}
{110.0, 112.0, 115.0, 118.0, 120.0, 124.0, 125.0}
7 [-1.         -1.          0.10341415  0.29030231  0.32787627  0.16258562
 -1.          0.75161353]
next: 7 <= 112.0, size:2
fas [False False  True  True  True  True False  True] len(counts) 2
{58.0, 45.0}
{112.0, 110.0}
3 [-1.        -1.         0.         1.         0.9999999  0.9999999
 -1.         1.       ]
next: 3 <= 45.0, size:1
fas [False False  True False  True  True False  True] len(counts) 1
next: 3 >= 45.0, size:1
fas [False False  True False  True  True False  True] len(counts) 1
next: 7 >= 112.0, size:11
fas [False False  True  True  True  True False  True] len(counts) 3
{66.0, 67.0, 39.0, 43.0, 48.0, 49.0, 51.0, 52.0, 54.0, 58.0, 62.0}
{115.0, 118.0, 120.0, 124.0, 125.

5 [-1.          0.         -1.          0.17606518  0.          0.33155967
 -1.          0.17606518]
next: 5 == 0.0, size:3
fas [False  True False  True  True False False  True] len(counts) 1
next: 5 == 2.0, size:2
fas [False  True False  True  True False False  True] len(counts) 2
{54.0}
{122.0, 124.0}
7 [-1.  0. -1. -1.  0. -1. -1.  1.]
next: 7 <= 122.0, size:1
fas [False  True False  True  True False False False] len(counts) 1
next: 7 >= 122.0, size:1
fas [False  True False  True  True False False False] len(counts) 1
next: 2 == 1.0, size:1
fas [False  True False  True  True False False  True] len(counts) 1
next: 7 >= 140.0, size:1
fas [False  True  True  True  True  True False False] len(counts) 1
next: 3 >= 55.0, size:31
fas [False  True  True  True  True  True False  True] len(counts) 5
{64.0, 67.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0}
{128.0, 130.0, 132.0, 134.0, 170.0, 138.0, 140.0, 174.0, 145.0, 152.0, 150.0, 120.0, 124.0, 125.0}
3 [-1.          0.08201348  0.132953

In [143]:
this_clf.tree

{'fi': 1,
 'counts': {0: 129, 1: 29, 2: 21, 4: 11, 3: 32},
 'result': None,
 'next': {'<=95.0': {'counts': {3: 4}, 'result': 3},
  '>95.0': {'fi': 7,
   'counts': {0: 129, 1: 29, 2: 21, 4: 11, 3: 28},
   'result': None,
   'next': {'==3.0': {'fi': 1,
     'counts': {0: 101, 2: 4, 4: 2, 3: 4, 1: 7},
     'result': None,
     'next': {'<=132.0': {'fi': 6,
       'counts': {2: 4, 3: 3, 0: 7},
       'result': None,
       'next': {'==0.0': {'fi': 5,
         'counts': {0: 7, 2: 1},
         'result': None,
         'next': {'==0.0': {'counts': {0: 6}, 'result': 0},
          '==2.0': {'counts': {0: 1}, 'result': 0},
          '==3.0': {'counts': {2: 1}, 'result': 2}}},
        '==1.0': {'fi': 1,
         'counts': {2: 3, 3: 3},
         'result': None,
         'next': {'<=125.0': {'fi': 3,
           'counts': {2: 1, 3: 3},
           'result': None,
           'next': {'==0.0': {'counts': {2: 1}, 'result': 2},
            '==1.0': {'counts': {3: 3}, 'result': 3}}},
          '>125.0': {

In [79]:
X_this[:,2]

array([1., 1., 3., 1., 1., 1., 2., 2., 1., 1., 3., 3., 3., 1., 2., 2., 1.,
       1., 1., 3., 1., 1., 2., 1., 2., 1., 1., 2., 1., 3., 1., 1., 1., 2.,
       2., 2., 1., 2., 2., 1., 1., 1., 3., 2., 2., 1., 2., 1., 1., 1., 1.,
       2., 1., 1., 1., 3., 2., 1., 2., 1., 2., 1., 2., 2., 1., 2., 2., 2.,
       2., 3., 1., 1., 1., 1., 2., 1., 1., 1., 1., 1., 1., 2., 1., 2., 1.,
       1., 1., 2., 3., 1., 2., 3., 1., 2., 1., 2., 3., 1., 1., 2., 1., 1.,
       1., 1., 1., 2., 1., 2., 3., 1., 3., 2., 1., 2., 2., 2., 1., 2., 1.,
       1., 1., 3., 2., 2., 1., 2., 1., 2., 2., 2., 3., 1., 2., 1., 2., 2.,
       3., 1., 2., 2., 1., 1., 2., 1., 1., 2., 3., 2., 1., 1., 1., 1., 2.,
       1., 1., 1., 1., 2., 2., 2., 1., 2., 1., 2., 2., 2., 2., 1., 2., 1.,
       2., 1., 3., 1., 2., 2., 1., 2., 2., 1., 1., 1., 1., 2., 1., 1., 1.,
       3., 2., 1., 1., 2., 1., 1., 3., 1., 1., 3., 2., 3., 2., 2., 1., 2.,
       1., 1., 2., 1., 2., 2., 1., 1., 1., 2., 2., 2., 1., 1., 1., 1., 1.,
       1.], dtype=float32

In [58]:
y_this.shape[-1]

222

In [89]:
-0.5 * np.log2(0.5) - 0.5* np.log2(0.5)

1.0

In [90]:
1 * np.log2(1)

0.0