In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from collections import Counter
from tqdm import tqdm
%matplotlib inline

In [6]:
df = pd.read_csv("processed.cleveland.data",header=None,names=["age","sex","cp","trestbps","chol","fbs","restectg","thalach","exang","oldpeak","slope","ca","thal","num"])
index = ["?" not in row for row in df.values]
dataset = df.values[index].astype(np.float32)
X,y = dataset[:,:-1],dataset[:,-1]
y = y.astype(np.int64)
X_train,X_test,y_train,y_test = train_test_split(X,y)

In [10]:
kinds = ["categorical" if len(set(col))<6 else "numerical"for col in X.T]

In [1]:
class C45DecisionTree:
    @staticmethod
    def entropy(y):
        precs = np.array(list(Counter(y).values()))/len(y)
        ent = np.sum(-1 * precs * np.log2(precs))
        return ent

    # 选择哪个特征进行分裂
    def decide_feature(self,X,y,fas,kinds):
        # fas refers to feature_available;if one feature can be splitted,this feature_available is True,else False
        (n_samples,n_features) = X.shape
        ers = np.ones(n_features) * -1
        bestfvs = np.zeros(n_features)
        for fi in range(n_features):
            if not fas[fi]:
                continue

            if kinds[fi] == "categorical":
                I,H = self.entropy(y),0
                for fv,c in Counter(X[:,fi]).items():
                    splity = y[X[:,fi] == fv]
                    proc = c/n_samples
                    I -= proc * self.entropy(splity)
                    H += -proc * np.log2(proc)
                ers[fi] = I/(H+1e-7)
            else:
#                 print(set(X[:,fi]))
                for fv in list(sorted(set(X[:,fi])))[:-1]:
                    splity_less = y[X[:,fi] <= fv]
                    proc_less = len(splity_less) / n_samples
                    splity_more = y[X[:,fi] > fv]
                    proc_more = len(splity_more) / n_samples
                    I = -proc_less * self.entropy(splity_less) - proc_more * self.entropy(splity_more) + self.entropy(y)
                    H = -1*proc_less * np.log2(proc_less) - proc_more * np.log2(proc_more)
                    if I/(H+1e-7) > ers[fi]:
                        ers[fi] = I/(H+1e-7)
                        bestfvs[fi] = fv
        return ers,bestfvs

    def build_tree(self,X,y,fas,kinds):
        counts = dict(Counter(y))
        result = max(counts,key=counts.get)
#         print("fas",fas,"len(counts)",len(counts))
        if len(counts) == 1 or (fas==False).all():
            return {"counts":counts,"result":result}
        ers,bestfvs = self.decide_feature(X,y,fas,kinds)
        if (ers == -1).all():
            return {"counts":counts,"result":result}
        next_ = {}
        bestfi = np.argmax(ers)
#         print(bestfi,ers)
        if kinds[bestfi] == "categorical":
            fas[bestfi] = False
            for fv in set(X[:,bestfi]):
                index = (X[:,bestfi] == fv)
#                 print("next: {} {} {}, size:{}".format(bestfi,"==",fv,len(y[index])))
                next_["{}{}".format("==",fv)] = self.build_tree(X[index],y[index],fas,kinds)
        else:
            bestfv = bestfvs[bestfi]
            index_less = X[:,bestfi] <= bestfv
            fas_less = fas.copy()
            if len(set(X[index_less][:,bestfi])) == 1:
                fas_less[bestfi] = False
#             print("next: {} {} {}, size:{}".format(bestfi,"<=",bestfv,len(y[index_less])))
            next_["{}{}".format("<=",bestfv)] = self.build_tree(X[index_less],y[index_less],fas_less,kinds)
            index_more = X[:,bestfi] > bestfv
            fas_more = fas.copy()
            if len(set(X[index_more][:,bestfi])) == 1:
                fas_more[bestfi] = False
#             print("next: {} {} {}, size:{}".format(bestfi,">=",bestfv,len(y[index_more])))
            next_["{}{}".format(">",bestfv)] = self.build_tree(X[index_more],y[index_more],fas_more,kinds)
        return {"fi":bestfi,"counts":counts,"result":None,"next":next_}
    
    def fit(self,X,y,kinds):
        fas = np.array([True]*X.shape[-1])
        self.tree = self.build_tree(X,y,fas,kinds)

    def predict_one(self,x):
        tree = self.tree
        while tree["result"] == None:
            fi = tree["fi"]
            fv = x[fi]
            flag = False
            for condition in tree["next"]:
                if eval(str(fv)+condition):
                    tree = tree["next"][condition]
                    flag = True
                    break
            if not flag:
                counts = tree["counts"]
                return max(counts,key=counts.get)
        return tree["result"]
    
    def predict(self,X):
        y_predicts = []
        for x_test in X_test:
            y_predicts.append(self.predict_one(x_test))
        return y_predicts
        
    def score(self,X_test,y_test):
        y_predicts = self.predict(X_test)
        return np.sum(y_predicts == y_test)/ len(y_test)

In [37]:
tree = C45DecisionTree()
tree.fit(X_train,y_train,kinds)
tree.score(X_test,y_test)

0.48

In [29]:
class RandomForest:
    def __init__(self,n_estimators=100):
        self.n_estimators = n_estimators
        self.base_clf = C45DecisionTree
        self.clfs = []
        self.index_cols = []
        
    def fit(self,X,y,kinds):
        n_samples,n_features = X.shape
        for clf in tqdm(range(self.n_estimators)):
            index = np.random.choice(n_samples,n_samples,replace=True)
            col_index = np.random.choice(n_features,8,replace=False)
            self.index_cols.append(col_index)
            X_this,y_this = X[index][:,col_index],y[index]
            kind_this = [kinds[i] for i in col_index]
            this_clf = self.base_clf()
            this_clf.fit(X_this,y_this,kind_this)
            self.clfs.append(this_clf)
    
    def predict(self,X):
        results = np.array([clf.predict(X[:,self.index_cols[i]]) for i,clf in enumerate(self.clfs)])
        y_predicts = []
        for i in range(results.shape[-1]):
            count = dict(Counter(results[:,i]))
#             print(count)
            result = max(count,key=count.get)
            y_predicts.append(result)
        return y_predicts
            
    def score(self,X,y):
        y_predicts = self.predict(X)
        return np.sum(y_predicts == y) / len(y)

In [30]:
np.random.seed(22)
rf = RandomForest(n_estimators=10)
rf.fit(X_train,y_train,kinds)

100%|████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.89it/s]


In [31]:
rf.score(X_test,y_test)

0.18666666666666668

In [32]:
class RandomForest2:
    def __init__(self,n_estimators=100):
        self.n_estimators = n_estimators
        self.base_clf = C45DecisionTree
        self.clfs = []
        self.index_cols = []
        
    def fit(self,X,y,kinds):
        n_samples,n_features = X.shape
        for clf in tqdm(range(self.n_estimators)):
            index = np.random.choice(n_samples,n_samples,replace=True)
            X_this,y_this = X[index],y[index]
            this_clf = self.base_clf()
            this_clf.fit(X_this,y_this,kinds)
            self.clfs.append(this_clf)
    
    def predict(self,X):
        results = np.array([clf.predict(X) for i,clf in enumerate(self.clfs)])
        y_predicts = []
        for i in range(results.shape[-1]):
            count = dict(Counter(results[:,i]))
#             print(count)
            result = max(count,key=count.get)
            y_predicts.append(result)
        return y_predicts
            
    def score(self,X,y):
        y_predicts = self.predict(X)
        return np.sum(y_predicts == y) / len(y)

In [35]:
np.random.seed(22)
rf = RandomForest2(n_estimators=100)
rf.fit(X_train,y_train,kinds)

100%|██████████████████████████████████████████████████████████████████████████████████████| 100/100 [01:00<00:00,  1.66it/s]


In [36]:
rf.score(X_test,y_test)

0.56