# 多决策树 - 随机森林 - 随机决策
对决策树的一个常见批评是，一旦在回答问题后对训练集进行划分，就不可能重新考虑这个决策。例如，如果我们将男性和女性分开，那么每个后续问题都只涉及男性或女性，而且该方法不能考虑其他类型的问题（例如，年龄不到一岁，不论性别如何）。**随机森林**尝试在每个步骤中引入一定程度的**随机化**，创建备选树并将它们组合来获得最终预测。这类综合多个回答同一问题的分类器的算法，被称为**集成方法**。  
随机森林基于训练实例的子集（**带放回随机选择**）来构建每棵决策树，并且在每次分裂时只使用从特征集中随机选取的少量特征。这种树生长过程重复多次，产生一组分类器。在预测时，给定一个实例，每棵训练好的树都会像普通决策树一样预测其目标类。得票最多的类（即被最多的树预测的类）就是集成分类器给出的类。  
随机森林就是许多棵树：每棵树建立在数据的不同随机子集（带放回抽样）上，并且在每次分裂时使用特征的不同随机子集（无放回抽样）。这使得树彼此不同，并使它们在不同的方面过拟合。然后对它们的预测取平均，得到更平稳、更少过拟合的估计。

声呐信号分析 [来源](https://github.com/apachecn/AiLearning/blob/master/docs/ml/7.%E9%9B%86%E6%88%90%E6%96%B9%E6%B3%95-%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97%E5%92%8CAdaBoost.md#%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97)
---

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
np.set_printoptions(suppress=True, precision=4, threshold=16)

In [None]:
def load_data(file_path):
    """Parse a comma-separated text file into a list of rows.

    Every non-blank line becomes one row.  Each field is stripped of
    surrounding whitespace; a field that ``float()`` accepts is stored as a
    ``float`` and any other field is kept as the stripped ``str``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of rows, each row being a list of ``float`` / ``str`` fields.
    """
    data = []
    with open(file_path) as f:
        for line in f:
            # A raw line still carries its newline, so emptiness has to be
            # tested after stripping; otherwise blank lines become [''] rows.
            line = line.strip()
            if not line:
                continue
            row = []
            for field in line.split(','):
                field = field.strip()
                # str.isdigit() rejects decimals and negative numbers, so let
                # float() decide what counts as numeric.
                try:
                    row.append(float(field))
                except ValueError:
                    row.append(field)
            data.append(row)
    return data

In [None]:
# Load the sonar file and turn the parsed rows into one NumPy array
data = np.array(load_data('sonar-all-data.txt'))
# Every column but the last is converted to float features; the last column is kept as the label
X, y = data[:, :-1].astype(float), data[:, -1]
# indices_u encodes each label as its index into the sorted unique labels
labels, indices_u = np.unique(y, return_inverse=True)
# Same encoding with index 0 replaced by -1 (gives -1 / +1 when there are two classes)
y_ = np.where(indices_u == 0, -1, indices_u)
y_

In [None]:
# Node of a binary CART tree
class Node:
    """A node of a binary CART classification tree.

    A leaf carries ``label``; an internal node carries the index of the
    feature it splits on (``feature``), the split threshold (``value``) and
    its children in ``tree`` under the keys ``'left'`` and ``'right'``.
    """

    def __init__(self, root=True, label=None, feature=None, value=None):
        self.root = root  # True when this node is a leaf (despite the name)
        self.label = label  # class predicted by this node
        self.feature = feature  # index of the feature this node splits on
        self.tree = {}  # child nodes, keyed by 'left' / 'right'
        self.value = value  # split threshold

    def __repr__(self):
        # Only show the fields that are actually set.  The check must be
        # ``is None`` rather than truthiness: class label 0 and feature
        # index 0 are perfectly valid values and must not be hidden.
        fields = (
            ('label', self.label),
            ('feature', self.feature),
            ('value', self.value),
        )
        result = {key: val for key, val in fields if val is not None}
        if self.tree:
            result['tree'] = self.tree
        return '{}'.format(result)

    def add_node(self, val, node):
        """Attach ``node`` as the child stored under key ``val``."""
        self.tree[val] = node

    def predict(self, X):
        """Return the label for a single sample ``X`` (indexable by feature).

        Samples whose split feature is ``>=`` the threshold follow the
        ``'left'`` child, all others the ``'right'`` child.
        """
        if self.root is True:
            return self.label
        if X[self.feature] >= self.value:
            return self.tree['left'].predict(X)
        return self.tree['right'].predict(X)

In [None]:
class CART:
    """Binary CART classification tree that splits on the Gini index.

    At every internal node the samples whose feature value is ``>=`` the
    split value go to the ``'left'`` child and the remaining ones to the
    ``'right'`` child.  A feature is dropped from the candidate list once it
    has been used, so it is split on at most once along any path.
    """

    def __init__(self, epsilon=0.1, sample_least=5):
        self.epsilon = epsilon  # a node whose own Gini index is below this becomes a leaf
        self.sample_least = sample_least  # a node with fewer samples than this becomes a leaf
        self._tree = {}  # replaced by the root Node in fit()

    @staticmethod
    def _gini(counts):
        """Return Gini(D) = 1 - sum_k (|C_k| / |D|)^2 from per-class counts."""
        return 1 - np.sum((counts / np.sum(counts)) ** 2)

    @staticmethod
    def gini_(y1, y2):
        """Return the Gini index of a two-way split.

        Gini(D, A) = |D1| / |D| * Gini(D1) + |D2| / |D| * Gini(D2)

        An empty part evaluates to Gini 1 but carries weight 0.
        """
        sizes = np.array([len(y1), len(y2)])
        ginis = np.array([
            CART._gini(np.unique(part, return_counts=True)[1])
            for part in (y1, y2)
        ])
        return (ginis * (sizes / sizes.sum())).sum()

    @staticmethod
    def data_split(X, feature, value):
        """Split the rows of ``X`` on ``X[:, feature] >= value``.

        Returns:
            ``(left, right)``: lists of row indices; ``left`` holds the rows
            whose feature value is ``>= value``, ``right`` all other rows.
        """
        mask = X[:, feature] >= value
        return np.flatnonzero(mask).tolist(), np.flatnonzero(~mask).tolist()

    def choose_best_value(self, data, y, feature):
        """Return ``(value, gini)`` of the best threshold for one feature.

        Candidate thresholds are the midpoints between consecutive sorted
        feature values.

        Raises:
            ValueError: if ``data`` has fewer than two rows, so that no
                candidate threshold exists.
        """
        column = np.sort(data[:, feature])
        # Repeated midpoints describe the same split; evaluate each only once.
        candidates = np.unique((column[1:] + column[:-1]) / 2)
        if candidates.size == 0:
            raise ValueError('at least two samples are needed to choose a split value')

        ginis = []
        for value in candidates:
            left, right = self.data_split(data, feature, value)
            ginis.append(self.gini_(y[left], y[right]))
        best = int(np.argmin(ginis))
        return candidates[best], ginis[best]

    def choose_best_feature(self, X, y, features):
        """Return the array ``[gini, feature, value]`` of the best split.

        The three numbers share one float array, so callers have to cast the
        feature index back to ``int``.
        """
        temp = np.zeros((len(features), 3))
        for i, feature in enumerate(features):
            value, gini = self.choose_best_value(X, y, feature)
            temp[i] = [gini, feature, value]
        return temp[np.argmin(temp[:, 0])]

    def build_tree(self, X, y, features):
        """Recursively build the (sub)tree for samples ``X`` / ``y``.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: 1-D array of class labels, aligned with the rows of ``X``.
            features: column indices that may still be used for splitting.

        Returns:
            The ``Node`` at the root of the (sub)tree.

        Raises:
            ValueError: if ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError('cannot build a tree from an empty sample set')
        classes, counts = np.unique(y, return_counts=True)
        majority = classes[counts.argmax()]

        # 1. Too few samples -> majority vote.  At least two samples are
        #    required in any case, otherwise there is no split candidate.
        if len(X) < max(self.sample_least, 2):
            return Node(root=True, label=majority)

        # 2. No feature left to split on -> majority vote.
        if len(features) == 0:
            return Node(root=True, label=majority)

        # 3. The node itself is already (almost) pure -> majority vote.
        #    The threshold is applied to the Gini index of this sample set,
        #    not to the Gini index of the best split: testing the split would
        #    throw away exactly the splits that separate the classes best.
        if self._gini(counts) < self.epsilon:
            return Node(root=True, label=majority)

        # 4. Best feature / threshold among the remaining candidates.
        _, feature, value = self.choose_best_feature(X, y, features)
        feature = int(feature)  # stored next to floats, so it came back as float
        left, right = self.data_split(X, feature, value)

        # 5. A split that leaves one side empty (e.g. a constant feature)
        #    separates nothing; recursing into the empty side would fail.
        if not left or not right:
            return Node(root=True, label=majority)

        # 6. Build the internal node and recurse (left: >=, right: <).
        node_tree = Node(root=False, feature=feature, value=value)
        remaining = list(features)
        remaining.remove(feature)
        node_tree.add_node('left', self.build_tree(X[left], y[left], remaining))
        node_tree.add_node('right', self.build_tree(X[right], y[right], remaining))
        return node_tree

    def fit(self, X, y, features):
        """Build the tree from ``X`` / ``y`` using only ``features``; return its root."""
        print('选择的特征', features)
        self._tree = self.build_tree(X, y, features)
        return self._tree

    def predict(self, X):
        """Return a float array with the predicted label of every row of ``X``."""
        ret = np.zeros(X.shape[0])
        for i, sample in enumerate(X):
            ret[i] = self._tree.predict(sample)
        return ret

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        y_pred = self.predict(X)
        return np.sum(y_pred == y) / len(y)

In [None]:
# Fit a single CART tree on the whole data set, allowing every column as a split feature
cart_tree = CART()
feature_all = list(range(X.shape[1]))
# The tree is trained on the 0-based label indices
cart_tree.fit(X, indices_u, feature_all)

In [None]:
cart_tree.score(X, indices_u)  # accuracy on X, the same data the tree was fitted on

In [None]:
class RamdomForest:
    """Random forest of CART trees built with bagging.

    Every tree is trained on a bootstrap sample of the rows and on a random
    subset of ``feature_nums`` columns drawn once per tree.  Predictions are
    the sign of the averaged tree outputs, so labels are expected to be
    encoded as -1 / +1.
    """

    def __init__(self, T=21, random_state=0, feature_nums=6):
        self.T = T  # number of trees grown by bagging
        np.random.seed(random_state)  # NOTE: seeds NumPy's global generator
        self.feature_nums = feature_nums  # number of features given to each tree
        self.trees = []
        self.self_validation = 0.0  # out-of-bag accuracy, set by fit()

    @staticmethod
    def bagging(n_sample, tree_num):
        """Draw one bootstrap sample of row indices per tree.

        Returns:
            ``(indices, oobs)``: ``indices`` has shape ``(tree_num, n_sample)``
            and holds the rows drawn with replacement for each tree;
            ``oobs[i]`` is the array of rows tree ``i`` did not draw.
        """
        indices = np.random.randint(0, n_sample, size=(tree_num, n_sample))
        all_rows = np.arange(n_sample)
        oobs = [np.setdiff1d(all_rows, sample) for sample in indices]
        return indices, oobs

    @staticmethod
    def random_feature(tree_num, n_feature, feature_nums):
        """Pick ``feature_nums`` distinct feature indices for every tree.

        Returns:
            Integer array of shape ``(tree_num, feature_nums)``.

        Raises:
            ValueError: if ``feature_nums`` exceeds ``n_feature``.
        """
        if feature_nums > n_feature:
            raise ValueError(
                'feature_nums ({}) cannot exceed the number of features ({})'.format(
                    feature_nums, n_feature))
        feature_array = np.zeros((tree_num, feature_nums), dtype=int)
        for i in range(tree_num):
            feature_index = np.arange(n_feature)
            np.random.shuffle(feature_index)
            feature_array[i] = feature_index[:feature_nums]
        return feature_array

    def fit(self, X, y):
        """Grow ``T`` trees and store the out-of-bag accuracy.

        Args:
            X: 2-D array of samples (rows) by features (columns).
            y: 1-D array of -1 / +1 labels aligned with the rows of ``X``.

        The out-of-bag accuracy ends up in ``self_validation``.  It is
        computed over the samples left out by at least one tree and is NaN
        when there is no such sample.
        """
        n_sample, n_feature = X.shape
        indices, oobs = self.bagging(n_sample, self.T)
        features = self.random_feature(self.T, n_feature, self.feature_nums)

        # Start from scratch: without this a second fit() would append new
        # trees while predict() keeps using the first T (stale) ones.
        self.trees = []
        for rows, feats in zip(indices, features):
            tree = CART()
            tree.fit(X[rows], y[rows], feats.tolist())
            self.trees.append(tree)

        # Out-of-bag validation: every sample is judged only by the trees
        # that did not see it during training.
        vote_sum = np.zeros(n_sample)
        vote_count = np.zeros(n_sample, dtype=int)
        for tree, oob in zip(self.trees, oobs):
            if len(oob) == 0:
                continue
            vote_sum[oob] += tree.predict(X[oob])
            vote_count[oob] += 1

        # A sample drawn by every tree has no out-of-bag vote; leave it out
        # instead of scoring a NaN mean as a -1 prediction.
        covered = vote_count > 0
        if covered.any():
            # The sign of the vote sum equals the sign of the vote mean.
            y_oob = np.where(vote_sum[covered] >= 0, 1, -1)
            self.self_validation = np.mean(y_oob == y[covered])
        else:
            self.self_validation = float('nan')

    def predict(self, X):
        """Return the -1 / +1 prediction for every row of ``X``."""
        y_pred = np.zeros((self.T, X.shape[0]))
        for i in range(self.T):
            y_pred[i] = self.trees[i].predict(X)
        return np.where(np.average(y_pred, axis=0) > 0, 1, -1)

    def score(self, X, y):
        """Return the fraction of rows of ``X`` whose prediction equals ``y``."""
        y_pred = self.predict(X)
        return np.sum(y == y_pred) / len(y)

In [None]:
# Train the hand-written forest (default T and feature_nums) on the -1 / +1 labels
rf = RamdomForest(random_state=12345)
rf.fit(X, y_)

In [None]:
# Out-of-bag accuracy stored by fit()
rf.self_validation

In [None]:
rf.score(X, y_)  # accuracy on the training data; the author notes it beat the single decision tree

**使用sklearn解决**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz

# Same number of trees (21) and seed as the hand-written forest; oob_score=True requests the out-of-bag score
rf = RandomForestClassifier(n_estimators=21, random_state=12345, n_jobs=-1, oob_score=True)
rf.fit(X, y_)

In [None]:
# Accuracy on the data the forest was fitted on
rf.score(X, y_)

In [None]:
# Inspect the out-of-bag score
rf.oob_score_

## sk-learn 官方User Guide的例子 

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectFromModel
from sklearn import metrics
import matplotlib.pyplot as plt
import graphviz
from sklearn import datasets
import pandas as pd
import numpy as np
np.set_printoptions(precision=4, suppress=True, threshold=15)
pd.options.display.max_rows = 20

In [None]:
# Switch to the iris data set; X and y are rebound from here on
iris = datasets.load_iris()
X, y = iris.data, iris.target

[数据科学和人工智能技术笔记 十三、树和森林](https://github.com/apachecn/ds-ai-tech-notes/blob/master/13.md)

In [None]:
# Fit a random forest on the full iris data
clf = RandomForestClassifier(random_state=12345, n_jobs=-1)
clf.fit(X, y)

**特征的重要性**

In [None]:
# Feature importances: the closer a value is to 1, the more important the feature
importances = clf.feature_importances_  # the importance scores add up to 100%
importances

In [None]:
# Bar chart of the feature importances computed on the whole data set
plt.bar(range(X.shape[1]), importances)
plt.title('Feature Importances')  # fixed typo in the displayed title ("Importamces")
# Label every bar with its feature name, rotated so the names do not overlap
plt.xticks(range(X.shape[1]), iris.feature_names, rotation=90)

**使用随机森林的特征选择**

In [None]:
# Hold out 40% of the data for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

In [None]:
# Forest of 10000 trees trained on every feature of the training split
clf = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf.fit(X_train, y_train)

In [None]:
# Test accuracy when classifying with all 4 features
y_pred = clf.predict(X_test)
metrics.accuracy_score(y_test, y_pred)

In [None]:
# Score of the same forest on the test split, via the estimator's own score()
clf.score(X_test, y_test)

In [None]:
# Create a selector object that uses the random forest classifier to
# identify the features whose importance reaches the 0.15 threshold
sfm = SelectFromModel(clf, threshold=0.15)
sfm.fit(X_train, y_train)
# Keep only the selected columns in both splits
X_important_train = sfm.transform(X_train)
X_important_test = sfm.transform(X_test)
X_important_train

In [None]:
sfm.get_support(indices=True)  # indices of the selected features

In [None]:
# Names of the selected features, picked with the selector's support mask
important_names = np.array(iris.feature_names)[sfm.get_support()]
important_names  # the most important features (2 in the author's run)

In [None]:
# Train a random forest on the most important features only
clf_important = RandomForestClassifier(n_estimators=10000, random_state=0, n_jobs=-1)
clf_important.fit(X_important_train, y_train)
# Test accuracy of the model that uses only the selected features
y_pred = clf_important.predict(X_important_test)
metrics.accuracy_score(y_test, y_pred)

**在随机森林中处理不平衡类别**

In [None]:
# Make the classes imbalanced by dropping the first 30 observations
# (the slices below start at index 30)
X = X[30:, ]
y = y[30:]
# Re-split the reduced data, again holding out 40% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

其中类自动加权，与它们在数据中出现的频率成反比  
$$w_j = \frac {n}{kn_j}$$
$w_j$是$j$类的权重, $n$是总观测数, $n_j$是类$j$的观测数, $k$为类的总数

In [None]:
# class_weight='balanced' turns on the automatic class weighting described above
clf = RandomForestClassifier(random_state=0, n_jobs=-1, class_weight='balanced')
clf.fit(X_train, y_train)

In [None]:
# Test accuracy of the class-weighted forest
y_pred = clf.predict(X_test)
metrics.accuracy_score(y_test, y_pred)