随机森林
---
[例子](https://github.com/apachecn/AiLearning/blob/master/docs/ml/7.%E9%9B%86%E6%88%90%E6%96%B9%E6%B3%95-%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97%E5%92%8CAdaBoost.md#%E9%9A%8F%E6%9C%BA%E6%A3%AE%E6%9E%97)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
np.set_printoptions(suppress=True, precision=4, threshold=16)

In [None]:
def load_data(file_path):
    """Read a comma-separated text file into a list of rows.

    Each non-blank line becomes one row (a list). Every field is stripped of
    surrounding whitespace; fields that parse as a number (integers, decimals,
    negative values, scientific notation) are stored as ``float``, everything
    else is kept as the stripped string.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of rows, each a list of ``float`` / ``str`` values.
    """
    data = []
    with open(file_path) as f:
        for line in f:
            # Lines keep their trailing newline, so strip before testing for
            # emptiness; otherwise blank lines would yield a bogus [''] row.
            line = line.strip()
            if not line:
                continue
            line_data = []
            for field in line.split(','):
                field = field.strip()
                try:
                    # float() accepts what str.isdigit() rejects: "0.02", "-1.5".
                    line_data.append(float(field))
                except ValueError:
                    line_data.append(field)
            data.append(line_data)
    return data
                    

In [None]:
# Load the sonar file into a single array, then split it column-wise:
# every column except the last is cast to float and used as the feature
# matrix X; the last column is kept unchanged as the label vector y.
data = np.array(load_data('sonar-all-data.txt'))
X, y = data[:, :-1].astype(float), data[:, -1]

In [None]:
# Display the feature matrix.
X

In [None]:
# labels: the sorted distinct values of y.
# indices_u: for every entry of y, the position of its value inside labels.
labels, indices_u = np.unique(y, return_inverse=True)

In [None]:
# Display the shape of the inverse-index array.
indices_u.shape

In [None]:
# Decision-tree node; children are stored in a dict keyed by feature value.
class Node:
    """A node of a decision tree whose children are looked up by feature value.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, root=True, label=None, feature_name=None, feature=None):
        """Create a node.

        Args:
            root: ``True`` when this node is a leaf, i.e. ``predict`` answers
                with ``label`` directly instead of descending.
            label: Class assigned to this node.
            feature_name: Accepted for call compatibility; not stored.
            feature: Index of the feature this node uses to pick a child.
        """
        self.root = root  # True means this node is a leaf
        self.label = label  # class of this node
        self.feature = feature  # index of the feature used to choose a child
        self.tree = {}  # children: feature value -> Node

    def __repr__(self):
        """Return a dict-style string holding only the attributes that are set."""
        result = {}
        # Compare against None rather than relying on truthiness: a class
        # label of 0 and a feature index of 0 are both valid and must show up.
        if self.label is not None:
            result['label'] = self.label
        if self.feature is not None:
            result['feature'] = self.feature
        if self.tree:
            result['tree'] = self.tree
        return '{}'.format(result)

    def add_node(self, val, node):
        """Attach ``node`` as the child reached when the feature equals ``val``."""
        self.tree[val] = node

    def predict(self, features):
        """Return the label for one sample.

        Args:
            features: Indexable sample; ``features[self.feature]`` selects the
                child to descend into.

        Raises:
            KeyError: If the sample's feature value has no child on this node.
        """
        if self.root is True:
            return self.label
        return self.tree[features[self.feature]].predict(features)

In [None]:
class CART:
    """Binary CART classification tree grown with the Gini index.

    Tree nodes are plain dicts. A leaf is ``{'label': majority_class}``. An
    internal node also carries ``'feature'`` (column index), ``'value'``
    (threshold), ``'left'`` (subtree for rows with ``x[feature] >= value``)
    and ``'right'`` (subtree for rows with ``x[feature] < value``).
    """

    def __init__(self, epsilon=0.1, sample_least=5):
        """Store the stopping parameters.

        Args:
            epsilon: Gini threshold; a node whose own Gini impurity is at or
                below it is turned into a leaf.
            sample_least: A node holding fewer samples than this becomes a leaf.
        """
        self.epsilon = epsilon
        self.sample_least = sample_least
        self._tree = {}  # root node; stays empty until fit() is called

    @staticmethod
    def _impurity(y):
        """Return Gini(D) = 1 - sum_k (|C_k| / |D|)^2, or 0.0 for an empty set."""
        if len(y) == 0:
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        return float(1.0 - np.sum((counts / counts.sum()) ** 2))

    @staticmethod
    def gini_(y1, y2):
        """Return the Gini index of a two-way split.

        Gini(D, A) = |D1| / |D| * Gini(D1) + |D2| / |D| * Gini(D2)

        Args:
            y1: Labels that fell into the first subset.
            y2: Labels that fell into the second subset.
        """
        # The weights are the subset sizes, not the label values themselves.
        sizes = np.array([len(y1), len(y2)], dtype=float)
        total = sizes.sum()
        if total == 0:
            return 0.0
        ginis = np.array([CART._impurity(y1), CART._impurity(y2)])
        return float(np.sum(ginis * (sizes / total)))

    @staticmethod
    def data_splt(X, feature, value):
        """Split row indices of ``X`` on one feature.

        Args:
            X: 2-D sample matrix.
            feature: Column index (cast to ``int``, so float-typed indices work).
            value: Threshold.

        Returns:
            ``(left, right)`` lists of row indices: ``left`` holds the rows
            with ``X[:, feature] >= value``, ``right`` all remaining rows.
        """
        column = np.asarray(X)[:, int(feature)]
        goes_left = column >= value
        return (np.flatnonzero(goes_left).tolist(),
                np.flatnonzero(~goes_left).tolist())

    def chose_best_value(self, data, y, feature):
        """Find the threshold on one feature that minimises the split Gini index.

        Candidate thresholds are the midpoints between consecutive distinct
        values of the column, so every candidate separates the data into two
        non-empty parts.

        Args:
            data: 2-D sample matrix.
            y: Labels aligned with the rows of ``data``.
            feature: Column index to evaluate.

        Returns:
            ``(value, gini)``. When the column has fewer than two distinct
            values no split exists and ``(nan, inf)`` is returned.
        """
        y = np.asarray(y)
        column = np.asarray(data)[:, int(feature)].astype(float)
        distinct = np.unique(column[~np.isnan(column)])
        if len(distinct) < 2:
            return np.nan, np.inf
        value_split = (distinct[1:] + distinct[:-1]) / 2

        gini_one_feature = []
        for value in value_split:
            goes_left = column >= value
            gini_one_feature.append(self.gini_(y[goes_left], y[~goes_left]))
        min_index = int(np.argmin(gini_one_feature))
        return value_split[min_index], gini_one_feature[min_index]

    def chose_best_feature(self, X, y, features):
        """Pick the best (feature, threshold) pair among the candidate features.

        Args:
            X: 2-D sample matrix.
            y: Labels aligned with the rows of ``X``.
            features: Non-empty sequence of candidate column indices.

        Returns:
            A length-3 float array ``[gini, feature, value]`` for the candidate
            with the smallest Gini index. ``gini`` is ``inf`` when no candidate
            feature can be split.
        """
        temp = np.zeros((len(features), 3))
        for i, feature in enumerate(features):
            value, gini = self.chose_best_value(X, y, feature)
            temp[i] = [gini, feature, value]
        # Column 0 holds the Gini index of every candidate.
        best = np.argmin(temp[:, 0])
        return temp[best]

    def build_tree(self, X, y, features):
        """Recursively grow a subtree and return its root node (a dict).

        A leaf labelled with the majority class is produced when any of these
        holds: fewer than ``sample_least`` samples, no candidate features left,
        node impurity at or below ``epsilon``, or no feature can be split.
        A feature used for a split is removed from the candidates passed to
        the children.

        Args:
            X: 2-D sample matrix.
            y: Labels aligned with the rows of ``X``.
            features: Candidate column indices.

        Raises:
            ValueError: If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(y) == 0:
            raise ValueError('cannot build a tree from an empty sample set')

        classes, counts = np.unique(y, return_counts=True)
        majority = classes[counts.argmax()]
        leaf = {'label': majority}

        # 1. Too few samples: majority vote.
        if len(X) < self.sample_least:
            return leaf

        # 2. No feature left to split on: majority vote.
        if len(features) == 0:
            return leaf

        # 3. The samples are already (nearly) of one class.
        if self._impurity(y) <= self.epsilon:
            return leaf

        # 4. Best feature / threshold by Gini index.
        gini, feature, value = self.chose_best_feature(X, y, features)
        if not np.isfinite(gini):
            # Every candidate column is constant on this subset.
            return leaf
        feature = int(feature)

        # left: >= value, right: < value
        left, right = self.data_splt(X, feature, value)
        if not left or not right:
            return leaf

        remaining = [f for f in features if int(f) != feature]
        return {'left': self.build_tree(X[left], y[left], remaining),
                'right': self.build_tree(X[right], y[right], remaining),
                'feature': feature,
                'value': float(value),
                'label': majority}

    def fit(self, X, y, features):
        """Grow the tree using only the columns listed in ``features``.

        Args:
            X: 2-D sample matrix.
            y: Labels aligned with the rows of ``X``.
            features: Column indices the tree is allowed to split on.

        Returns:
            ``self``.
        """
        self._tree = self.build_tree(np.asarray(X), np.asarray(y),
                                     [int(f) for f in features])
        return self

    def predict(self, X, y=None):
        """Predict a label for every row of ``X``.

        Args:
            X: 2-D sample matrix (a single 1-D sample is also accepted).
            y: Ignored; kept only so existing two-argument calls keep working.

        Returns:
            A 1-D array with one predicted label per row.

        Raises:
            RuntimeError: If the tree has not been fitted.
        """
        if not self._tree:
            raise RuntimeError('CART is not fitted yet; call fit() first')
        predictions = []
        for row in np.atleast_2d(np.asarray(X)):
            node = self._tree
            # Only internal nodes carry a 'feature' key.
            while 'feature' in node:
                if row[node['feature']] >= node['value']:
                    node = node['left']
                else:
                    node = node['right']
            predictions.append(node['label'])
        return np.array(predictions)

In [None]:
class RamdomForest:
    """Random forest: T CART trees, each fitted on a bootstrap sample of the
    rows and a random subset of the feature columns.

    The class name keeps its original spelling; ``RandomForest`` is provided
    as an alias below the class.
    """

    def __init__(self, T=20, random_state=0, feature_nums=6):
        """Store the hyper-parameters and seed NumPy's global RNG.

        Args:
            T: Number of trees to grow.
            random_state: Seed passed to ``np.random.seed``.
            feature_nums: Number of feature columns drawn for every tree.
        """
        self.T = T
        self.random_state = random_state
        np.random.seed(random_state)
        self.feature_nums = feature_nums
        self.trees = []  # fitted trees, filled by fit()
        self.features = None  # (T, feature_nums) column indices, set by fit()
        self.oobs = None  # per-tree out-of-bag row indices, set by fit()

    @staticmethod
    def bagging(n_sample, tree_num, random_state):
        """Draw bootstrap row indices for every tree.

        Args:
            n_sample: Number of rows in the training set.
            tree_num: Number of trees.
            random_state: Unused; kept for call compatibility. The draw uses
                NumPy's global RNG.

        Returns:
            ``(indices, oobs)``. ``indices`` is an int array of shape
            ``(tree_num, n_sample)`` sampled with replacement. ``oobs`` is a
            1-D object array whose i-th element is the array of row indices
            that tree i never drew (its out-of-bag rows).
        """
        indices = np.random.randint(0, n_sample, size=(tree_num, n_sample))
        # Out-of-bag sets differ in length between trees, so they cannot be
        # stacked into a regular 2-D array; keep them in an object array.
        oobs = np.empty(tree_num, dtype=object)
        all_rows = np.arange(n_sample)
        for i, sample in enumerate(indices):
            oobs[i] = np.setdiff1d(all_rows, sample)
        return indices, oobs

    @staticmethod
    def random_feature(tree_num, n_feature, feature_nums):
        """Pick ``feature_nums`` distinct columns out of ``n_feature`` per tree.

        Returns:
            An int array of shape ``(tree_num, feature_nums)``.

        Raises:
            ValueError: If ``feature_nums`` exceeds ``n_feature``.
        """
        if feature_nums > n_feature:
            raise ValueError(
                'feature_nums ({}) exceeds the number of features ({})'.format(
                    feature_nums, n_feature))
        # Integer dtype so the rows can be used directly as column indices.
        feature_array = np.zeros((tree_num, feature_nums), dtype=int)
        for i in range(tree_num):
            feature_array[i, :] = np.random.permutation(n_feature)[:feature_nums]
        return feature_array

    def fit(self, X, y):
        """Fit ``T`` trees and keep them in ``self.trees``.

        Args:
            X: 2-D sample matrix.
            y: Labels aligned with the rows of ``X``.

        Returns:
            ``self``.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_sample, n_feature = X.shape
        indices, oobs = self.bagging(n_sample, self.T, self.random_state)
        features = self.random_feature(self.T, n_feature, self.feature_nums)
        self.features = features
        self.oobs = oobs
        self.trees = []
        for sample, tree_features in zip(indices, features):
            # CART is the tree class defined in this notebook.
            tree = CART()
            tree.fit(X[sample], y[sample], tree_features.tolist())
            self.trees.append(tree)
        return self

    def self_validation(self):
        """Out-of-bag validation. Not implemented yet; returns ``None``."""
        return None

    def score(self, X, y):
        """Score the forest on ``(X, y)``. Not implemented yet; returns ``None``."""
        return None


# Correctly spelled alias for the class above.
RandomForest = RamdomForest

In [None]:
# Scratch: a is a 4x3 matrix; b is a (2, 4) array of random row indices into a,
# drawn from [0, a.shape[0]).
a = np.arange(12).reshape(4, 3)
b = np.random.randint(a.shape[0], size=(2, a.shape[0]))
b

In [None]:
# Scratch: indexing a with the 2-D index array b selects whole rows of a,
# giving one stack of rows per row of b.
a[b]

In [None]:
# Scratch: values of range(10) that do not appear in the second argument.
np.setdiff1d(range(10), [1, 3, 5, 7, 9])

In [None]:
# Scratch: shuffle a list in place with NumPy's global RNG.
# (The `random` module imported here is not used in this cell.)
import random
a = list(range(10))
np.random.shuffle(a)
a

In [None]:
# Scratch: shuffle a fresh list in place again and display the new order.
a = list(range(10))
np.random.shuffle(a)
a

In [None]:
# Scratch: majority vote - pick the distinct value with the largest count.
# NOTE: this rebinds the notebook-level name `y` that the data-loading cell
# assigned; re-run that cell before using `y` as the label vector again.
y = [1, 0, 1, 0, 1, 0, 1, 1]
u, counts = np.unique(y, return_counts=True)
u[counts.argmax()]

In [None]:
# Scratch: midpoints between consecutive elements of a.
a = np.arange(10)
b = a[1:]
(a[:-1] + b) /2

In [None]:
# Scratch: index of the smallest element of the list.
np.argmin([1, 2,3,9, 0, 4])

In [None]:
# Scratch: a two-element array (presumably standing in for two subset sizes).
a = np.array([4, 6])

In [None]:
# Scratch: multiplying a list by an array is elementwise; each list entry is
# scaled by the matching entry's share of a's total.
[1, 2] * (a/a.sum())

In [None]:
# Scratch: list.copy() returns an independent list, so assigning into b and
# removing an element from a do not affect each other.
a = [1 ,2, 3, 4, 8]
b =a.copy()
b[1] = 111
a.remove(8)

In [None]:
# Display a.
a