# 从零实现随机森林

In [284]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

## 1. 加载数据

In [285]:
# Load the iris dataset as a (features, labels) pair instead of a Bunch object.
X, y = load_iris(return_X_y=True)

In [286]:
# Hold out 20% of the rows as a test set.
# NOTE(review): no random_state is passed, so the split (and every accuracy
# computed from it) changes from run to run -- consider fixing a seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [287]:
# Sanity-check the sizes of the train/test features and labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

## 2. 随机森林

In [288]:
def bootstrap_sample(X, y):
    """Draw a bootstrap resample: rows sampled uniformly with replacement.

    Parameters
    ----------
    X : array-like of shape (n_samples, ...)
        Feature rows. Converted with ``np.asarray`` so that lists and pandas
        objects are indexed by position rather than by label.
    y : array-like of shape (n_samples, ...)
        Targets aligned row-for-row with ``X``.

    Returns
    -------
    tuple of np.ndarray
        ``(X_sample, y_sample)``, each with ``n_samples`` rows. The same row
        indices are applied to both, so features and targets stay paired.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not have the same number of rows.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = X.shape[0]
    if y.shape[0] != n_samples:
        # Without this check a longer y would be silently sampled from its
        # first n_samples entries only.
        raise ValueError(
            f"X and y must have the same number of rows, "
            f"got {n_samples} and {y.shape[0]}"
        )
    # Uses NumPy's global RNG, so np.random.seed() controls reproducibility.
    indices = np.random.choice(n_samples, n_samples, replace=True)
    return X[indices], y[indices]

In [289]:
def random_forest(X, y, n_trees=10, max_depth=None, min_samples_split=2, max_features='sqrt'):
    """Train a bagged ensemble of decision trees.

    Each tree is fitted on its own bootstrap resample of ``(X, y)``.

    Parameters
    ----------
    X, y : training features and labels, passed to ``bootstrap_sample``.
    n_trees : number of trees to train.
    max_depth, min_samples_split, max_features :
        forwarded unchanged to every ``DecisionTreeClassifier``.

    Returns
    -------
    list
        The fitted ``DecisionTreeClassifier`` instances, in training order.
    """
    # The hyperparameters are identical for every tree, so gather them once.
    tree_params = {
        'max_depth': max_depth,
        'min_samples_split': min_samples_split,  # minimum samples required to split a node
        'max_features': max_features,
    }
    forest = []
    for _ in range(n_trees):
        # Resample first, then fit, one tree at a time.
        X_boot, y_boot = bootstrap_sample(X, y)
        forest.append(DecisionTreeClassifier(**tree_params).fit(X_boot, y_boot))
    return forest

In [290]:
# Fit a 20-tree forest (depth <= 5, at least 10 samples to split) on the training split.
trees = random_forest(X_train, y_train, n_trees=20, max_depth=5, min_samples_split=10, max_features='sqrt')

In [291]:
# Collect one prediction vector per tree; together they form the vote table.
y_pres = [tree.predict(X_test) for tree in trees]

In [292]:
from scipy import stats
# Majority vote: stack the per-tree predictions into (n_trees, n_test) and take
# the most common label in each column. Reading `.mode` and flattening keeps
# the result 1-D even on SciPy releases before 1.11, where `mode` retains the
# reduced axis by default and would return shape (1, n_test).
y_pre = np.ravel(stats.mode(np.asarray(y_pres), axis=0).mode)

In [293]:
# Expect one voted label per test sample.
y_pre.shape

(30,)

In [294]:
# Accuracy: fraction of test samples whose voted label equals the true label.
(y_pre == y_test).mean()

0.9333333333333333

## 3. sklearn RF

In [295]:
from sklearn.ensemble import RandomForestClassifier

In [296]:
# Reference run: scikit-learn's RandomForestClassifier on the same split.
# NOTE(review): n_estimators=10 / min_samples_split=5 differ from the 20 trees /
# min_samples_split=10 used for the hand-built forest, so the two accuracies
# are not a like-for-like comparison -- confirm this is intended.
model = RandomForestClassifier(
    n_estimators=10,
    criterion='gini',
    max_depth=5,
    min_samples_split=5,
).fit(X_train, y_train)
y_pre = model.predict(X_test)

In [297]:
# Accuracy of the scikit-learn forest on the same test split.
(y_pre == y_test).mean()

0.9333333333333333