## 随机森林的直接调用  

#### 这一节课我们利用现成的随机森林库函数对蘑菇进行有毒和无毒的简单分类   

- 数据来源： https://www.kaggle.com/uciml/mushroom-classification/data  
- 对比模型： 随机森林，决策树，Logistic回归模型

In [1]:

%matplotlib inline
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

np.random.seed(19)

  from numpy.core.umath_tests import inner1d


### 读入数据

In [2]:
data_folder = "./input"
data = pd.read_csv(os.path.join(data_folder, "mushrooms.csv"), header=None)


In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
1,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
2,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
3,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
4,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u


### 处理二分类问题的标签

In [None]:
data[0] = data.apply(lambda row: 0 if row[0] == 'e' else 1, axis=1)

### 处理每列的数据

In [None]:
# 每一列如果有null，用"missing"代替
cols = np.arange(1,23)
for col in cols:
    if np.any(data[col].isnull()):
        data.loc[data[col].isnull(), col] = 'missing'


In [None]:
labelEncoders = dict()

# 对每一列进行one hot encoding
for col in cols:
    encoder = LabelEncoder()
    values = data[col].tolist()
    values.append('missing')  #加入missing这种值
    encoder.fit(values)
    labelEncoders[col] = encoder

# 计算one hot encoding之后的列数
dimensionality = 0
for col, encoder in labelEncoders.items():
    dimensionality += len(encoder.classes_)
print("dimensionality:  %d" % (dimensionality))

In [None]:
# 用于测试数据的变换
def transform(df):
    N, _ = df.shape
    X = np.zeros((N, dimensionality))
    i = 0
    for col ,encoder in labelEncoders.items():
        k = len(encoder.classes_)
        X[np.arange(N), encoder.transform(df[col]) + i] = 1
        i += k
    return X

In [None]:
# 准备数据和标签
X = transform(data)
Y = data[0].as_matrix()

### Logistic回归的表现

In [None]:
logistic_model = LogisticRegression()
print("logistic Regression performance: %f" % (cross_val_score(logistic_model, X, Y, cv=8).mean()))

### 决策树的表现

In [None]:
tree_model = DecisionTreeClassifier()
print("Decision Tree performance: %f" % (cross_val_score(tree_model, X, Y, cv=8).mean()))

### 随机森林的表现

In [None]:
forest = RandomForestClassifier(n_estimators=20)
print("Random Forest performance: %f" % (cross_val_score(tree_model, X, Y, cv=8).mean()))

In [None]:
from sklearn.base import BaseEstimator

### 伪随机森林的实现

In [None]:
class FakeRandomForest(BaseEstimator):
    
    def __init__(self, M):
        self.M = M
        
        
    def fit(self, X, Y, n_features=None):
        N,D = X.shape
        if n_features is None:
            # 特征的个数
            n_features = int(np.sqrt(D))
        
        # 袋子。。
        self.models = []
        
        # 特征
        self.features = []
        
        for m in range(self.M):
            tree = DecisionTreeClassifier()
            
            #有放回的随机抽取N个数据
            idx = np.random.choice(N, size=N, replace=True)
            X_current = X[idx]
            Y_current = Y[idx]
            
            #随机抽取n_features个特征
            features = np.random.choice(D, size=n_features, replace=False)
            
            #训练当前的决策树模型
            tree.fit(X_current[:, features], Y_current)
            self.features.append(features)
            self.models.append(tree)
            
    
    def predict(self, X):
        N = len(X)
        results = np.zeros(N)
        for features, tree in zip(self.features, self.models):
            results += tree.predict(X[:, features])
        return np.round(results/ self.M)
    
    def score(self, X, Y):
        prediction = self.predict(X)
        return np.mean(prediction == Y)
        
            
            
            

### Bagging决策树的实现

In [None]:
class BaggedTreeClassifier(BaseEstimator):
    def __init__(self, M):
        self.M = M

    def fit(self, X, Y):
        N = len(X)
        self.models = []
        for m in range(self.M):
            idx = np.random.choice(N, size=N, replace=True)
            Xb = X[idx]
            Yb = Y[idx]

            model = DecisionTreeClassifier(max_depth=2)
            model.fit(Xb, Yb)
            self.models.append(model)

    def predict(self, X):
        # no need to keep a dictionary since we are doing binary classification
        predictions = np.zeros(len(X))
        for model in self.models:
            predictions += model.predict(X)
        return np.round(predictions / self.M)

    def score(self, X, Y):
        P = self.predict(X)
        return np.mean(Y == P)

In [None]:
baggedtc = BaggedTreeClassifier(20)

In [None]:
cross_val_score(baggedtc, X, Y, cv=8).mean()

In [None]:
fakerf = FakeRandomForest(20)

In [None]:
cross_val_score(fakerf, X, Y, cv=8).mean()

### 用随机森林做regression  

#### 这一节课我们利用现成的随机森林库函数对房价做预测

- 数据来源： https://www.kaggle.com/harlfoxem/housesalesprediction/data  
- 对比模型： 随机森林，线性回归模型

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [None]:
house_data = pd.read_csv(os.path.join(data_folder, "kc_house_data.csv"))

In [None]:
house_data.head()

In [None]:
house_data.columns

In [None]:
# price is the target
NUMERICAL_COLS = ['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'sqft_above','sqft_basement',
                 'sqft_living15', 'sqft_lot15']

### 处理一下每一列的数据

In [None]:
# fit 每一列数据的scaler
scalers = dict()
for col in NUMERICAL_COLS:
    scaler = StandardScaler()
    scaler.fit(house_data[col].as_matrix().astype(np.float64).reshape(-1,1))
    scalers[col] = scaler
    

In [None]:
def transform_numerical(df):
    N, _ = df.shape
    D = len(NUMERICAL_COLS)
    result = np.zeros((N,D))
    i = 0
    for col, scaler in scalers.items():
        result[:, i] = scaler.transform(df[col].as_matrix().astype(np.float64).reshape(1,-1))
        i += 1
    return result    


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
hdata = transform_numerical(house_data)

In [None]:
train_data, test_data = train_test_split(hdata, test_size=0.2)

In [None]:
trainX, trainY = train_data[:,1:], train_data[:, 0]
testX, testY = test_data[:, 1:], test_data[:, 0]


In [None]:
rfregressor = RandomForestRegressor(n_estimators=100)
rfregressor.fit(trainX, trainY)
predictions = rfregressor.predict(testX)

### 可视化预测的结果

In [None]:
plt.scatter(testY, predictions)
plt.xlabel("target")
plt.ylabel("prediction")
ymin = np.round(min(min(testY), min(predictions)))
ymax = np.ceil(max(max(testY), max(predictions)))
r = range(int(ymin), int(ymax) + 1)
plt.plot(r,r)
plt.show()

In [None]:
plt.plot(testY, label='targets')
plt.plot(predictions, label='predictions')
plt.legend()
plt.show()

In [None]:
lr = LinearRegression()
print("linear regression performance: %f" % (cross_val_score(lr, trainX, trainY).mean()))

In [None]:
print("random forest regressor performance: %f" % (cross_val_score(rfregressor, trainX, trainY).mean()))

In [None]:
lr.fit(trainX, trainY)
print("linear regression test score: %f" % (lr.score(testX, testY)))

In [None]:
rfregressor.fit(trainX, trainY)
print("random forest regressor test score: %f" % (rfregressor.score(testX, testY)))