# ランダム・フォレスト(Random Forest)

## ランダム・フォレストとは

- バギング(bagging)の代表的手法
- 一部の特徴のみ(サンプルも一部のみの場合あり)を使って学習した複数の決定木の多数決や平均値で予測
- 複数の決定木を組み合わせることで予測精度や汎化能力向上
- 決定木を基にしているが、決定木のような結果に対する説明力はない
- [特徴選択にも利用可能](random_forest_feature_importance.ipynb)

## 分類

### データの準備

In [None]:
from sklearn.datasets import make_moons, make_circles
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

# Two synthetic 2D toy datasets generated with fixed seeds; each entry is
# unpacked as an (X, y) pair by the plotting and training loops.
moons = make_moons(noise=0.3, random_state=0)
circles = make_circles(noise=0.2, factor=0.5, random_state=0)
datasets = [moons, circles]

def get_min_max(x, margin=.1):
    """Return a padded (lower, upper) range covering the values of *x*.

    Used to derive axis limits that leave a small gap around the data.

    Args:
        x: Object exposing ``min()`` and ``max()`` methods (the notebook
            passes NumPy arrays).
        margin: Padding subtracted from the minimum and added to the
            maximum. Defaults to 0.1, the value previously hard-coded.

    Returns:
        Tuple ``(x.min() - margin, x.max() + margin)``.
    """
    lower = x.min() - margin
    upper = x.max() + margin
    return lower, upper

# One square panel per dataset, stacked vertically.
n_rows = len(datasets)
fig = plt.figure(figsize=(3, 3 * n_rows))

for row, (X, y) in enumerate(datasets, start=1):
    ax = fig.add_subplot(n_rows, 1, row)

    # Colour each point by its label.
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr')

    # Pad the axis limits slightly beyond the data and hide the ticks.
    x_min, x_max = get_min_max(X[:, 0])
    y_min, y_max = get_min_max(X[:, 1])
    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

plt.show()

### 学習

In [None]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

titles = ['Decision Tree', 'Random Forest']
classifiers = [
    DecisionTreeClassifier,
    RandomForestClassifier
]

# Settings shared by both estimators, plus the extras only the forest takes.
common_params = dict(max_depth=5, random_state=0)
forest_params = dict(n_estimators=20, max_features=1)

# Fitted models are stored dataset-major: for each dataset, one model per
# entry of `classifiers`, in that order.
models = []
for X, y in datasets:
    for clf in classifiers:
        params = dict(common_params)
        if clf is RandomForestClassifier:
            params.update(forest_params)
        models.append(clf(**params).fit(X, y))

### 結果の可視化

In [None]:
# Number of grid points per axis used to sample each model's predictions.
resolution = 200

n_datasets = len(datasets)
n_classifiers = len(classifiers)

# Grid of panels: one row per dataset, one column per classifier.
plt.figure(figsize=(3 * n_classifiers, 3 * n_datasets))

for i, model in enumerate(models):
    # `models` is filled dataset-major (outer loop over datasets, inner loop
    # over classifiers), so both the row and the column index must be derived
    # from the number of classifiers. Using len(datasets) for the column only
    # works while the two counts happen to be equal.
    ds_id, clf_id = divmod(i, n_classifiers)

    X, y = datasets[ds_id]
    x_min, x_max = get_min_max(X[:, 0])
    y_min, y_max = get_min_max(X[:, 1])

    # Evaluate the model on a regular grid covering the padded data range.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, resolution),
                         np.linspace(y_min, y_max, resolution))
    grid_points = np.c_[xx.ravel(), yy.ravel()]
    # Take the probability column at index 1 and reshape it back onto the grid.
    Z = model.predict_proba(grid_points)[:, 1].reshape(xx.shape)

    ax = plt.subplot(n_datasets, n_classifiers, i + 1)

    # Only the top row carries the classifier names.
    if ds_id == 0:
        ax.set_title(titles[clf_id])

    # Probability surface in the background, training points on top.
    ax.pcolormesh(xx, yy, Z, cmap='bwr', alpha=.5)
    ax.scatter(X[:, 0], X[:, 1], c=y, cmap='bwr')

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

plt.show()

## 回帰

### データの準備

In [None]:
# Seed NumPy's global RNG so the sampled data is identical on every run.
np.random.seed(0)

n_sample = 300
n_noise = 5  # not referenced by any cell visible here

# Inputs: a single column drawn uniformly between -3 and 3.
x = np.random.uniform(low=-3, high=3, size=(n_sample, 1))
# Targets: sin(x) plus Gaussian noise (scale 0.3), flattened to 1-D.
noise = np.random.normal(scale=.3, size=x.shape)
y = (np.sin(x) + noise).ravel()

# Padded plot ranges, kept as globals for the result-visualization cell.
x_min, x_max = get_min_max(x)
y_min, y_max = get_min_max(y)

plt.figure(figsize=(6, 3))
ax = plt.gca()

ax.scatter(x, y)

ax.set_xlim(x_min, x_max)
ax.set_ylim(y_min, y_max)
ax.set_xticks(())
ax.set_yticks(())

plt.show()

### 学習

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# A single decision tree and a 20-tree forest, both seeded for
# reproducibility; the forest uses all available cores (n_jobs=-1).
tree = DecisionTreeRegressor(random_state=0)
forest = RandomForestRegressor(n_estimators=20, n_jobs=-1, random_state=0)
regressors = [tree, forest]

# Fit both models on the same noisy sine data.
for regressor in regressors:
    regressor.fit(x, y)

### 結果の可視化

In [None]:
titles = ['Decision Tree', 'Random Forest']

# Dense, evenly spaced inputs across the plot range, shaped as one column
# so they can be passed to predict().
sample_x = np.linspace(x_min, x_max, n_sample * 4).reshape(-1, 1)

fig = plt.figure(figsize=(12, 3))

# One panel per regressor, side by side.
for idx, (title, model) in enumerate(zip(titles, regressors), start=1):
    ax = fig.add_subplot(1, 2, idx)
    ax.set_title(title)

    # Training data with the model's prediction curve drawn over it in red.
    ax.scatter(x, y)
    ax.plot(sample_x, model.predict(sample_x), color='r')

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
    ax.set_xticks(())
    ax.set_yticks(())

plt.show()