# 随机森林的 OOB 评估

## sklearn 实现

In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

In [5]:
# Load the iris features and labels as a pair of arrays.
X, y = load_iris(return_X_y=True)
# Hold out 20% of the samples for testing; the split is seeded so it repeats.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Build a 100-tree forest with out-of-bag scoring switched on and fit it on
# the training split (fit returns the estimator, so the calls are chained).
rf = RandomForestClassifier(
    n_estimators=100,
    oob_score=True,
    random_state=42,
).fit(X_train, y_train)
# Report the out-of-bag score recorded during fitting.
print(f"OOB Score: {rf.oob_score_:.4f}")
# Score the fitted forest on the held-out test split for comparison.
test_score = rf.score(X_test, y_test)
print(f"Test Score: {test_score:.4f}")

OOB Score: 0.9167
Test Score: 1.0000


## 自定义 OOB 评估的随机森林类


In [4]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from collections import defaultdict

class CustomRandomForest:
    """Bagged ensemble of decision trees with an out-of-bag (OOB) accuracy estimate.

    Each tree is trained on a bootstrap resample of the training rows. The
    rows a tree did not see (its out-of-bag rows) are predicted by that tree,
    and the predictions are accumulated as per-class votes. ``oob_score``
    then takes the majority vote per row and compares it with the true label.

    Args:
        n_estimators: Number of trees to train.
        random_state: Optional integer seed. When ``None`` (the default) the
            global NumPy random state is used for the bootstrap draws and the
            trees are built without an explicit seed. When given, both the
            bootstrap draws and the per-tree seeds are derived from it, so
            repeated fits give the same result.

    Attributes set by ``fit``:
        trees: The fitted trees, in training order.
        classes_: Sorted unique labels found in ``y``.
        oob_predictions: Integer array of shape (n_samples, n_classes);
            entry ``[i, k]`` counts the trees for which row ``i`` was
            out-of-bag and that predicted ``classes_[k]`` for it.
        oob_count: Array of shape (n_samples,) with the number of trees for
            which each row was out-of-bag.
    """

    def __init__(self, n_estimators=100, random_state=None):
        self.n_estimators = n_estimators
        self.random_state = random_state
        self.trees = []
        self.classes_ = None
        self.oob_predictions = None
        self.oob_count = None

    def fit(self, X, y):
        """Train the ensemble and accumulate out-of-bag votes.

        Any state left by a previous call is discarded, so refitting does not
        grow the ensemble.

        Args:
            X: Feature matrix with one row per sample (converted with
                ``np.asarray``).
            y: One-dimensional array of class labels, one per row of ``X``.

        Returns:
            The fitted instance.

        Raises:
            ValueError: If ``X`` is empty or ``X`` and ``y`` disagree on the
                number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("Cannot fit on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y have inconsistent lengths: {n_samples} != {y.shape[0]}"
            )

        seeded = self.random_state is not None
        # Without a seed keep drawing from the global NumPy state, so callers
        # that rely on np.random.seed(...) behave as before.
        rng = np.random.RandomState(self.random_state) if seeded else np.random

        classes = np.unique(y)
        votes = np.zeros((n_samples, classes.shape[0]), dtype=np.int64)
        counts = np.zeros(n_samples)
        trees = []
        all_rows = np.arange(n_samples)

        for _ in range(self.n_estimators):
            # Bootstrap sample: n_samples draws with replacement.
            indices = rng.choice(n_samples, n_samples, replace=True)
            oob_mask = np.ones(n_samples, dtype=bool)
            oob_mask[indices] = False
            oob_rows = all_rows[oob_mask]

            tree_seed = int(rng.randint(np.iinfo(np.int32).max)) if seeded else None
            tree = DecisionTreeClassifier(random_state=tree_seed)
            tree.fit(X[indices], y[indices])
            trees.append(tree)

            if oob_rows.size:
                # One batched predict per tree; labels are categorical, so
                # each prediction is recorded as a vote for its class rather
                # than being added up numerically.
                predicted = tree.predict(X[oob_rows])
                # oob_rows holds unique row indices, so this fancy-indexed
                # increment touches every (row, class) cell at most once.
                votes[oob_rows, np.searchsorted(classes, predicted)] += 1
                counts[oob_rows] += 1

        self.trees = trees
        self.classes_ = classes
        self.oob_predictions = votes
        self.oob_count = counts
        return self

    def oob_score(self, y_true):
        """Return the out-of-bag accuracy against ``y_true``.

        Each row's OOB prediction is the class with the most votes; ties go
        to the class that comes first in ``classes_``. Rows that were never
        out-of-bag have no prediction and are left out of the average.

        Args:
            y_true: True labels of the training rows, in the order passed to
                ``fit``.

        Returns:
            Fraction of out-of-bag rows whose majority vote equals the true
            label, or ``nan`` when no row was ever out-of-bag.

        Raises:
            RuntimeError: If called before ``fit``.
            ValueError: If ``y_true`` does not have one label per training row.
        """
        if (
            self.oob_predictions is None
            or self.oob_count is None
            or self.classes_ is None
        ):
            raise RuntimeError("fit() must be called before oob_score()")

        y_true = np.asarray(y_true)
        if y_true.shape[0] != self.oob_count.shape[0]:
            raise ValueError(
                "y_true must have one label per training sample: "
                f"{y_true.shape[0]} != {self.oob_count.shape[0]}"
            )

        covered = self.oob_count > 0
        if not covered.any():
            return float("nan")

        winners = np.argmax(self.oob_predictions[covered], axis=1)
        oob_pred = self.classes_[winners]
        return float(np.mean(oob_pred == y_true[covered]))

# Demo: estimate the out-of-bag score of the custom forest on iris.
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build a 100-tree ensemble and fit it on the training split only.
custom_rf = CustomRandomForest(n_estimators=100)
custom_rf.fit(X_train, y_train)

# The OOB score is computed against the labels the forest was trained on.
oob_accuracy = custom_rf.oob_score(y_train)
print(f"OOB Score: {oob_accuracy:.4f}")

OOB Score: 0.9500
