In [1]:
import pandas as pd
import numpy as np
from utils import ClassificationTree

# Read glass.csv and split it column-wise: the last column is used as the
# class label (Y), every column before it as a feature (X).
data = pd.read_csv('glass.csv')
X, Y = (
    data.iloc[:, :-1].values,
    data.iloc[:, -1].values,
)

In [2]:
class RandomForest:
    """Majority-vote ensemble of ``ClassificationTree`` models.

    Every tree is trained on a random subset of the rows and is told, through
    the ``col_num`` argument of ``ClassificationTree.fit``, how many columns it
    may work with. Predictions are the most frequent label among the trees.

    Parameters
    ----------
    n_estimators : int, default 100
        Number of trees trained by ``fit``.
    subsample : float, default 1
        Fraction of the rows handed to each tree. Rows are drawn WITHOUT
        replacement (a prefix of a random permutation), so with ``subsample=1``
        every tree receives all rows, only in a different order.
    colsample : float, default 1
        Fraction of the columns; forwarded to each tree as
        ``col_num = round(n_columns * colsample)``. What the tree does with
        ``col_num`` is defined by ``ClassificationTree`` (not visible here).
    max_depth, min_samples_leaf :
        Forwarded unchanged to the ``ClassificationTree`` constructor.

    Notes
    -----
    Row sampling uses the global ``np.random`` state; call ``np.random.seed``
    before ``fit`` for a repeatable row selection.
    """

    def __init__(self, n_estimators=100, subsample=1, colsample=1,
                 max_depth=float('inf'), min_samples_leaf=1):
        self.n_estimators = n_estimators
        self.subsample = subsample
        self.colsample = colsample
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        # Filled by fit(); starts empty so predict() can detect an unfitted forest.
        self.clf_list = []

    @staticmethod
    def _sample_size(fraction, total):
        """Return ``round(fraction * total)`` clamped to the range ``[1, total]``.

        The lower bound stops a small fraction from rounding down to an empty
        sample; the upper bound keeps the count within what is available.
        """
        return min(total, max(1, round(fraction * total)))

    def fit(self, X, Y):
        """Train ``n_estimators`` trees, each on its own random subset of rows.

        Parameters
        ----------
        X : array-like, 2-D
            Feature matrix (one row per sample). Converted with ``np.asarray``
            so DataFrames and nested lists are accepted as well as ndarrays.
        Y : array-like
            Labels, one per row of ``X``.

        Any previously trained trees are discarded.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        n_rows, n_cols = len(Y), X.shape[1]
        rows_per_tree = self._sample_size(self.subsample, n_rows)
        cols_per_tree = self._sample_size(self.colsample, n_cols)

        self.clf_list = []
        for _ in range(self.n_estimators):
            # Prefix of a random permutation == sampling rows without replacement.
            idx = np.random.permutation(n_rows)[:rows_per_tree]
            tree = ClassificationTree(max_depth=self.max_depth,
                                      min_samples_leaf=self.min_samples_leaf)
            tree.fit(X[idx, :], Y[idx], col_num=cols_per_tree)
            self.clf_list.append(tree)

    def find_most_frequent(self, x):
        """Return the most frequent value in the 1-D sequence ``x``.

        Ties are resolved in favour of the smallest value. Unlike
        ``np.bincount``, this works for any sortable label type (negative
        integers, floats, strings), not only non-negative integers.
        """
        # np.unique returns the distinct values sorted ascending, so the first
        # maximum found by argmax belongs to the smallest of the tied labels.
        labels, counts = np.unique(np.asarray(x), return_counts=True)
        return labels[counts.argmax()]

    def predict(self, new_X):
        """Predict one label per row of ``new_X`` by majority vote of the trees.

        Assumes each tree's ``predict`` returns a 2-D array with one row per
        sample (the results are joined along axis 1) - TODO confirm against
        ``ClassificationTree``.

        Raises
        ------
        ValueError
            If the forest holds no trained trees (``fit`` not called, or
            ``n_estimators`` is 0).
        """
        if not self.clf_list:
            raise ValueError('RandomForest has no trained trees; call fit() first.')
        # One column of votes per tree, one row per sample.
        votes = np.concatenate([tree.predict(new_X) for tree in self.clf_list], axis=1)
        # Explicit per-row vote with dtype=votes.dtype: np.apply_along_axis sizes
        # its output from the first result and could truncate string labels.
        return np.array([self.find_most_frequent(row) for row in votes],
                        dtype=votes.dtype)

In [3]:
# Fit forests with 1 through 9 trees and print, for each, the share of correct
# predictions on the same data it was trained on (printed labels read
# "number of base classifiers" and "accuracy").
clf = [RandomForest(n_estimators=n_trees) for n_trees in range(1, 10)]
for c in clf:
    c.fit(X, Y)
    n_correct = np.sum(c.predict(X) == Y.reshape(-1))
    print('基分类器个数：', c.n_estimators, '准确率：', round(n_correct / len(Y), 3))

基分类器个数： 1 准确率： 0.944
基分类器个数： 2 准确率： 0.967
基分类器个数： 3 准确率： 0.977
基分类器个数： 4 准确率： 0.972
基分类器个数： 5 准确率： 0.991
基分类器个数： 6 准确率： 1.0
基分类器个数： 7 准确率： 0.995
基分类器个数： 8 准确率： 0.981
基分类器个数： 9 准确率： 1.0
