# 随机森林回归模型示例

## 基于 sklearn 系统库的实现

In [131]:
import pandas as pd
from sklearn import preprocessing
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston

### （1）构建训练数据集

In [132]:
boston_house = load_boston()

boston_feature_name = boston_house.feature_names
boston_features = boston_house.data
boston_target = boston_house.target

数据集情况展示：

In [133]:
print(boston_house.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [134]:
boston_feature_name

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [135]:
boston_features[:5,:]

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, 0.0000e+00, 5.3800e-01,
        6.5750e+00, 6.5200e+01, 4.0900e+00, 1.0000e+00, 2.9600e+02,
        1.5300e+01, 3.9690e+02, 4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        6.4210e+00, 7.8900e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9690e+02, 9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, 0.0000e+00, 4.6900e-01,
        7.1850e+00, 6.1100e+01, 4.9671e+00, 2.0000e+00, 2.4200e+02,
        1.7800e+01, 3.9283e+02, 4.0300e+00],
       [3.2370e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        6.9980e+00, 4.5800e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9463e+02, 2.9400e+00],
       [6.9050e-02, 0.0000e+00, 2.1800e+00, 0.0000e+00, 4.5800e-01,
        7.1470e+00, 5.4200e+01, 6.0622e+00, 3.0000e+00, 2.2200e+02,
        1.8700e+01, 3.9690e+02, 5.3300e+00]])

In [136]:
boston_target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

### （2）模型构建与训练

In [137]:
# 模型创建
# help(RandomForestRegressor)
rgs = RandomForestRegressor(n_estimators=15) 
print(rgs)

RandomForestRegressor(n_estimators=15)


In [138]:
# 训练
rgs = rgs.fit(boston_features, boston_target)

In [139]:
# 预测
predictions = rgs.predict(boston_features)
print(predictions)

[24.22       21.66666667 34.14       33.78666667 34.6        26.34666667
 22.31333333 22.68       16.08       18.9        15.5        19.16666667
 21.22       20.40666667 19.00666667 20.02       22.6        17.66
 19.72666667 18.65333333 14.15333333 19.28       15.46       14.33333333
 15.58666667 14.86       16.86       14.66666667 18.84       21.43333333
 12.96666667 16.55333333 13.19333333 13.48666667 13.45333333 19.88666667
 20.37333333 21.72       23.43333333 30.41333333 35.13333333 29.60666667
 25.14666667 24.88666667 20.72       19.15333333 19.51333333 17.72666667
 16.18666667 18.88666667 20.06       21.12       24.67333333 21.94
 19.37333333 34.35333333 23.10666667 31.39333333 23.41333333 19.90666667
 18.42       17.66666667 22.97333333 25.68       31.64666667 23.84666667
 19.54666667 21.24666667 18.15333333 20.86666667 23.75333333 21.09333333
 22.88       23.8        24.26666667 22.12       20.32       21.22666667
 21.12666667 20.68666667 27.52       24.33333333 24.05333333 23

### （3）结果指标

In [140]:
from sklearn.metrics import mean_squared_error  # 均方误差
from sklearn.metrics import mean_absolute_error # 平方绝对误差
from sklearn.metrics import r2_score            # R square
print(mean_squared_error(boston_target, predictions))
print(mean_absolute_error(boston_target, predictions))
print(r2_score(boston_target, predictions))

1.6536119455423814
0.8510671936758897
0.9804119801046642


---

## 基础从零开始实现

In [141]:
import csv
from random import seed
from random import randrange
from math import sqrt

### （1）数据集的创建

In [142]:
# 加载数据集
def loadCSV(filename):
    dataSet = []
    with open(filename, 'r') as file:
        csvReader = csv.reader(file)
        for line in csvReader:
            dataSet.append(line)
    return dataSet


# 类型转换（标签除外，其他全部变为 float 类型）
def column_to_float(dataSet):
    features_len = len(dataSet[0])
    for data in dataSet:
        for column in range(features_len - 1):
            data[column] = float(data[column].strip())


# 交叉集划分
def splitDataSet(dataSet, n_folds):
    fold_size = int(len(dataSet) / n_folds)
    dataSet_split = []
    for i in range(n_folds):
        fold = []
        while len(fold) < fold_size: 
            index = randrange(len(dataSet) - 1)
            fold.append(dataSet[index])
        dataSet_split.append(fold)
    return dataSet_split


# 子集构造
def get_subsample(dataSet, ratio):
    subdataSet = []
    len_subdataSet = round(len(dataSet) * ratio) # 长度为整数
    while len(subdataSet) < len_subdataSet:
        index = randrange(len(dataSet) - 1)
        subdataSet.append(dataSet[index])
    return subdataSet

### （2）基础决策树的构造：节点 / 数据分裂

In [143]:
# 从切分点 (index, value) 分割数据
def data_split(dataSet, index, value):
    left = []
    right = []
    for row in dataSet:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right


# 分割代价的计算（这里是分类问题，计算 “基尼指数”）
def split_loss(left, right, class_values):
    loss = 0.0
    for class_value in class_values:
        left_size = len(left)
        if left_size != 0:  # 避免除数为 0
            prop = [row[-1] for row in left].count(class_value) / float(left_size)
            loss += (prop * (1.0 - prop))
        
        right_size = len(right)
        if right_size != 0:
            prop = [row[-1] for row in right].count(class_value) / float(right_size)
            loss += (prop * (1.0 - prop))
    return loss


# 寻找最优拆分点（先随机挑选 n 个特征，再在这些特征里计算分割时的最优）
def get_best_split(dataSet, n_features):
    features = []
    class_values = list(set(row[-1] for row in dataSet))
    b_loss = float('inf')

    while len(features) < n_features:
        index = randrange(len(dataSet[0]) - 1)
        if index not in features:
            features.append(index)

    for index in features:
        for row in dataSet:
            left, right = data_split(dataSet, index, row[index])
            loss = split_loss(left, right, class_values)
            if loss < b_loss:
                b_index, b_value, b_loss, b_left, b_right = index, row[index], loss, left, right
    return {'index': b_index, 'value': b_value, 'left': b_left, 'right': b_right}


# 利用投票法，决策子节点的输出标签
def decide_label(data):
    output = [row[-1] for row in data]
    return max(set(output), key=output.count)


# 决策树分裂过程
def sub_split(root, n_features, max_depth, min_size, depth):
    left = root['left']
    right = root['right']

    # 清空当前节点数据，节省空间
    del(root['left'])
    del(root['right'])

    if not left or not right:
        root['left'] = root['right'] = decide_label(left + right)
        return

    if depth > max_depth:
        root['left'] = decide_label(left)
        root['right'] = decide_label(right)
        return

    if len(left) < min_size:
        root['left'] = decide_label(left)
    else:
        root['left'] = get_best_split(left, n_features)
        sub_split(root['left'], n_features, max_depth, min_size, depth + 1)

    if len(right) < min_size:
        root['right'] = decide_label(right)
    else:
        root['right'] = get_best_split(right, n_features)
        sub_split(root['right'], n_features, max_depth, min_size, depth + 1)


# 构造决策树
def build_tree(dataSet, n_featrures, max_depth, min_size):
    root = get_best_split(dataSet, n_features)
    sub_split(root, n_features, max_depth, min_size, 1)
    return root


### （3）创建随机森林

In [144]:
def random_forest(train, ratio, n_feature, max_depth, min_size, n_trees):
    trees = []
    for i in range(n_trees):
        train = get_subsample(train, ratio)
        tree = build_tree(train, n_features, max_depth, min_size)
        # print('tree %d: '%i, tree)
        trees.append(tree)
    return trees

### （4）预测

In [145]:
# 预测测试集结果
def predict(tree, row):
    predictions = []
    if row[tree['index']] < tree['value']:
        if isinstance(tree['left'], dict):
            return predict(tree['left'], row)
        else:
            return tree['left']
    else:
        if isinstance(tree['right'], dict):
            return predict(tree['right'], row)
        else:
            return tree['right']


def bagging_predict(trees, row):
    predictions = [predict(tree, row) for tree in trees]
    return max(set(predictions), key=predictions.count)

### （5）精确度计算

In [146]:
def accuracy(predict_values, actual):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predict_values[i]:
            correct += 1
    return correct / float(len(actual))

### 训练测试

In [147]:
seed(1)

dataSet = loadCSV('./data/sonar.all-data.csv')
column_to_float(dataSet)

n_folds = 5
max_depth = 15
min_size = 1
ratio = 1.0
n_features = 15
n_trees = 10

folds = splitDataSet(dataSet, n_folds)

scores = []
for fold in folds:
    # 构造训练集
    train_set = folds[:]  # 拷贝
    train_set.remove(fold)
    train_set = sum(train_set, [])

    # 构造测试集
    test_set = []
    for row in fold:
        row_copy = list(row) # 拷贝
        row_copy[-1] = None
        test_set.append(row_copy)
    actual = [row[-1] for row in fold]

    rf_trees = random_forest(train_set, ratio, n_features, max_depth, min_size, n_trees)
    
    # predict_values = [predict(trees,row) for row in test]
    predict_values = [bagging_predict(rf_trees, row) for row in test_set]
    accurcy = accuracy(predict_values, actual)
    scores.append(accurcy)

print ('scores:%s' % scores)
print ('mean score:%s' % (sum(scores) / float(len(scores))))

scores:[0.7804878048780488, 0.6097560975609756, 0.7317073170731707, 0.7073170731707317, 0.7073170731707317]
mean score:0.7073170731707317
