# 【問題1】ブレンディングのスクラッチ実装

ブレンディング をスクラッチ実装し、単一モデルより精度があがる例を 最低3つ 示してください。  
精度があがるとは、検証用データに対する平均二乗誤差（MSE）が小さくなることを指します。

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# データ読み込み
df = pd.read_csv('train.csv')

# 目的変数とクラスに分ける
X = df[['GrLivArea', 'YearBuilt']].values
y = df['SalePrice'].values

# データ分割
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0)

# 標準化
sc= StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_val_std = sc.transform(X_val)

# 対数変換
y_train_log = np.log(y_train)
y_valid_log = np.log(y_val)

In [3]:
weak_models = []
df_mse = pd.DataFrame(columns=["Model", "MSE"])

**例1**  
- 標準化
- 対数変換なし
- 線形回帰 + SVM + 決定木

In [4]:
# 線形回帰
print("Linear Regression")
weak1 = LinearRegression()
weak1.fit(X_train_std, y_train)
y_pred1 = weak1.predict(X_val_std)
mse1 = mean_squared_error(y_pred1, y_val)
print("MSE = {:.3}".format(mse1))

# SVM
print("SVM")
C = 0.1
k = "rbf"
weak2 = SVR(kernel=k, C=C)
weak2.fit(X_train_std, y_train)
y_pred2 = weak2.predict(X_val_std)
mse2 = mean_squared_error(y_pred2, y_val)
print("MSE = {:.3}".format(mse2))

# 決定木
print("Tree")
max_depth = 10
weak3 = DecisionTreeRegressor(max_depth=max_depth)
weak3.fit(X_train_std, y_train)
y_pred3 = weak3.predict(X_val_std)
mse3 = mean_squared_error(y_pred3, y_val)
print("MSE = {:.3}".format(mse3))

# 平均で Blending
print("Blending by Mean")
y_blend = (y_pred1 + y_pred2 + y_pred3)/3
mse_blend = mean_squared_error(y_blend, y_val)
print("MSE = {:.3}".format(mse_blend))

# Blending：MSEの逆数で加重平均
print("Blending by Weighted Mean")
reciprocal_mse = 1/mse1 + 1/mse2 + 1/mse3
y_blend = (1/mse1*y_pred1 + 1/mse2*y_pred2 + 1/mse3*y_pred3)/reciprocal_mse
mse_blend = mean_squared_error(y_blend, y_val)
print("MSE = {:.3}".format(mse_blend))

Linear Regression
MSE = 2.94e+09
SVM
MSE = 7.24e+09
Tree
MSE = 2.66e+09
Blending by Mean
MSE = 2.84e+09
Blending by Weighted Mean
MSE = 2.48e+09


**例2**  
- 標準化  
- 対数変換  
- 線形回帰 + SVM + 決定木

In [5]:
# 線形回帰
print("Linear Regression")
weak1 = LinearRegression()
weak1.fit(X_train_std, y_train_log)
y_pred1_log = weak1.predict(X_val_std)
y_pred1 = np.exp(y_pred1_log)
mse1 = mean_squared_error(y_pred1, y_val)
print("MSE = {:.3}".format(mse1))

# SVM
print("SVM")
C = 0.1
k = "rbf"
weak2 = SVR(kernel=k, C=C)
weak2.fit(X_train_std, y_train_log)
y_pred2_log = weak2.predict(X_val_std)
y_pred2 = np.exp(y_pred2_log)
mse2 = mean_squared_error(y_pred2, y_val)
print("MSE = {:.3}".format(mse2))

# 決定木
print("Tree")
max_depth = 10
weak3 = DecisionTreeRegressor(max_depth=max_depth)
weak3.fit(X_train_std, y_train_log)
y_pred3_log = weak3.predict(X_val_std)
y_pred3 = np.exp(y_pred3_log)
mse3 = mean_squared_error(y_pred3, y_val)
print("MSE = {:.3}".format(mse3))

# Blending：平均
print("Blending by Mean")
y_blend = (y_pred1 + y_pred2 + y_pred3)/3
mse_blend = mean_squared_error(y_blend, y_val)
print("MSE = {:.3}".format(mse_blend))

# Blending：MSEの逆数で加重平均
print("Blending by Weighted Mean")
reciprocal_mse = 1/mse1 + 1/mse2 + 1/mse3
y_blend = (1/mse1*y_pred1 + 1/mse2*y_pred2 + 1/mse3*y_pred3)/reciprocal_mse
mse_blend = mean_squared_error(y_blend, y_val)
print("MSE = {:.3}".format(mse_blend))

Linear Regression
MSE = 8.65e+09
SVM
MSE = 2.7e+09
Tree
MSE = 2.53e+09
Blending by Mean
MSE = 2.9e+09
Blending by Weighted Mean
MSE = 2.26e+09


**例3**  
- 標準化なし
- 対数変換なし
- 深さの違う決定木を10個

In [6]:
# 決定木10個
n = 10
mses = np.zeros(n)
y_preds = np.zeros((y_val.shape[0], n))

for i in range(1, n+1):
    print("Tree", i)
    max_depth = i
    weak = DecisionTreeRegressor(max_depth=max_depth)
    weak.fit(X_train, y_train)
    y_pred = weak.predict(X_val)
    mse = mean_squared_error(y_pred, y_val)
    print("MSE = {:.3}".format(mse))    
    y_preds[:, i-1] = y_pred
    mses[i-1] = mse

# Blending：平均
print("Blending by Mean")
y_blend = np.mean(y_preds, axis=1)
mse_blend = mean_squared_error(y_blend, y_val)
print("MSE = {:.3}".format(mse_blend))

# Blending：MSEの逆数で加重平均
print("Blending by Weighted Mean")
reciprocal_mse = np.reciprocal(mses).reshape(1, -1)
y_blend = np.sum((reciprocal_mse * y_preds) / np.sum(reciprocal_mse), axis=1)
mse_blend = mean_squared_error(y_blend, y_val)
print("MSE = {:.3}".format(mse_blend))

Tree 1
MSE = 4.95e+09
Tree 2
MSE = 3.71e+09
Tree 3
MSE = 2.7e+09
Tree 4
MSE = 2.69e+09
Tree 5
MSE = 2.17e+09
Tree 6
MSE = 2.19e+09
Tree 7
MSE = 2e+09
Tree 8
MSE = 2.09e+09
Tree 9
MSE = 2.27e+09
Tree 10
MSE = 2.49e+09
Blending by Mean
MSE = 2.04e+09
Blending by Weighted Mean
MSE = 1.96e+09


# 【問題2】バギングのスクラッチ実装

バギング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [7]:
def bagging_tree(n_subsets):
    print("=====n_subsets: ", n_subsets, "=====")
    mses = np.zeros(n_subsets)
    y_preds = np.zeros((y_val.shape[0], n_subsets))

    for subset in range(n_subsets):
        X_part, _, y_part, _ = train_test_split(X_train_std, y_train_log, train_size=0.8,
                                                shuffle=True, random_state=subset)
        tree = DecisionTreeRegressor(max_depth=8)
        tree.fit(X_part, y_part)
        y_pred_log = tree.predict(X_val_std)
        y_pred = np.exp(y_pred_log)
        mse = mean_squared_error(y_pred, y_val)
        print("MSE = {:.3}".format(mse))
        y_preds[:, subset-1] = y_pred
        mses[subset-1] = mse

    print("Bagging:")
    y_bagging = np.mean(y_preds, axis=1)
    mse_bagging = mean_squared_error(y_bagging, y_val)
    print("MSE: {:.3}".format(mse_bagging))
    mse_min = np.min(mses)
    return mse_bagging, mse_min

n_bag = 10
bag_mse = np.zeros((n_bag, 2))
for i in range(1, n_bag+1):
    bag_mse[i-1, :] = bagging_tree(i)

=====n_subsets:  1 =====
MSE = 2.42e+09
Bagging:
MSE: 2.42e+09
=====n_subsets:  2 =====
MSE = 2.42e+09
MSE = 2.22e+09
Bagging:
MSE: 2.16e+09
=====n_subsets:  3 =====
MSE = 2.37e+09
MSE = 2.26e+09
MSE = 3.13e+09
Bagging:
MSE: 2.1e+09
=====n_subsets:  4 =====
MSE = 2.41e+09
MSE = 2.23e+09
MSE = 3.13e+09
MSE = 2.04e+09
Bagging:
MSE: 2e+09
=====n_subsets:  5 =====
MSE = 2.38e+09
MSE = 2.23e+09
MSE = 3.13e+09
MSE = 2.04e+09
MSE = 2.06e+09
Bagging:
MSE: 1.88e+09
=====n_subsets:  6 =====
MSE = 2.36e+09
MSE = 2.23e+09
MSE = 3.12e+09
MSE = 2.03e+09
MSE = 2.05e+09
MSE = 2.05e+09
Bagging:
MSE: 1.8e+09
=====n_subsets:  7 =====
MSE = 2.37e+09
MSE = 2.24e+09
MSE = 3.13e+09
MSE = 2.03e+09
MSE = 2e+09
MSE = 1.91e+09
MSE = 2.19e+09
Bagging:
MSE: 1.7e+09
=====n_subsets:  8 =====
MSE = 2.36e+09
MSE = 2.22e+09
MSE = 3.12e+09
MSE = 2.08e+09
MSE = 2.05e+09
MSE = 1.84e+09
MSE = 2.13e+09
MSE = 2.01e+09
Bagging:
MSE: 1.66e+09
=====n_subsets:  9 =====
MSE = 2.41e+09
MSE = 2.24e+09
MSE = 3.13e+09
MSE = 2.05e+09


# 【問題3】スタッキングのスクラッチ実装

スタッキング をスクラッチ実装し、単一モデルより精度があがる例を 最低1つ 示してください。

In [8]:
class Stage():
    def __init__(self, n_folds, n_models, regressor):
        self.n_folds = n_folds  # k
        self.n_models = n_models  # m
        # regressor（インスタンス）を　（k行 m列） だけ複製する
        self.model = [[regressor for m in range(n_models)] for k in range(n_folds)]
    
    def fit(self, X, y):
        """スタッキングでアンサンブル学習
        :parameters
            X (2d-ndarray, (n_samples, n_features)): 訓練データ
            y (1d-ndarray, (n_samples, )): 正解値
        :returns
            next_X (2d-ndarray, (n_samples, n_models)): ブレンドデータ（次のステージの学習に使う）
        """
        n_samples = X.shape[0]
        
        # blendデータを格納する変数
        next_X = np.zeros((n_samples, self.n_models))
        
        # mの方向にループ
        for m in range(self.n_models):
            # m番目のモデルを使って、学習
            next_X[:, m] = self.fit_each_model(X, y, m)
        return next_X
    
    def fit_each_model(self, X, y, m):
        """１つのモデルで、K分割しながら学習
        :parameters
            X (2d-ndarray, (n_samples, n_features)): 訓練データ
            y (1d-ndarray, (n_sampels,)): 正解値
            m (int): 何番目のモデルかというインデックス
        :returns
            y_blend (1d-ndarray, (n_samples,)): K分割それぞれをブレンドした予測値
        """
        n_samples = X.shape[0]
        # ランダムにインデックスを用意
        random_index = np.random.permutation(np.arange(n_samples))
        
        # k分割した後のサイズを決定（データ数がkで割り切れない分はそれぞれに割り振る）
        fold_sizes = np.full(self.n_folds, X.shape[0] // self.n_folds, dtype=np.int)
        fold_sizes[:n_samples % self.n_folds] += 1
        
        # ブレンドした予測値を格納する配列
        y_blend = np.zeros(n_samples)
        
        # k分割したもの、それぞれで学習していく
        current = 0
        k = 0
        for fold_size in fold_sizes:
            # indexのはじめと終わり
            start, stop = current, current + fold_size
            
            # 予測に使うデータのindex
            index = random_index[start:stop]
            
            # 学習に使うデータのindex
            other_index = self.set_others(random_index, start, stop)
            
            # k行m列のモデルに学習させる
            self.model[k][m].fit(X[other_index], y[other_index])
            
            # 予測
            y_blend[index] = self.model[k][m].predict(X[index])
            
            # 次のインデックス
            current = stop
            k += 1
        return y_blend
    
    def set_others(self, array, start, stop):
        """指定インデックスの初めと終わりに対して、指定されていないインデックスを返す関数
        :parameters
            array (1d-ndarray, (n_samples)): 乱数配列（インデックス）
            start (int): 始まり
            stop (int): 終わり
        :returns
            others (1d-ndarray, (??)): 乱数配列（start ~ stop以外のインデックス）
        """
        if start == 0:
            others = array[stop:]
        elif stop == len(array):
            others = array[:start]
        else:
            others = np.concatenate([array[:start], array[stop:]])
        return others
    
    def predict(self, X):
        """予測
        :parameters
            X (2d-ndarray, (n_samples, n_features)): 本番データ
        :returns
            y_pred_mean (1d-ndarray, (n_samples,)): このステージでの予測値
        """
        # k行 m列のモデルで予測した結果をデータ数分だけ格納する三次元配列: y_pred （n_samples, k, m)
        y_pred = np.zeros((X.shape[0], self.n_folds, self.n_models))
        
        # k行 m列のモデルで、予測していく
        for m in range(self.n_models):
            for k in range(self.n_folds):
                y_pred[:, k, m] = self.model[k][m].predict(X)
        
        # k軸方向に平均をとる >>> 結果は （n_samples, m） の二次元配列になる
        y_pred_mean = np.mean(y_pred, axis=1)
        return y_pred_mean

In [9]:
# モデル定義
k0, m0, reg0 = 5, 10, DecisionTreeRegressor(max_depth=5)
k1, m1, reg1 = 2, 1, LinearRegression()
stage0 = Stage(k0, m0, reg0)
stage1 = Stage(k1, m1, reg1)

# 学習
X_blend0 = stage0.fit(X_train_std, y_train_log)
X_blend1 = stage1.fit(X_blend0, y_train_log)

In [10]:
# 推定
y_pred0_log = stage0.predict(X_val_std)
y_pred1_log = stage1.predict(y_pred0_log)

# 評価
y_pred = np.exp(y_pred1_log)
mse_stack = mean_squared_error(y_pred, y_val)
print("MSE = {:.3}".format(mse_stack))

MSE = 2.5e+09


In [11]:
from sklearn.ensemble import RandomForestRegressor

# モデル定義
k0, m0, reg0 = 5, 10, RandomForestRegressor(n_estimators=10, max_depth=5)
k1, m1, reg1 = 4, 8, SVR(kernel="rbf", C=1, gamma="scale")
k2, m2, reg2 = 3, 2, DecisionTreeRegressor(max_depth=5)
k3, m3, reg3 = 2, 1, LinearRegression()
stage0 = Stage(k0, m0, reg0)
stage1 = Stage(k1, m1, reg1)
stage2 = Stage(k2, m2, reg2)
stage3 = Stage(k3, m3, reg3)

# 学習
X_blend0 = stage0.fit(X_train_std, y_train_log)
X_blend1 = stage1.fit(X_blend0, y_train_log)
X_blend2 = stage2.fit(X_blend1, y_train_log)
X_blend3 = stage3.fit(X_blend2, y_train_log)
print("X_train :", X_train_std.shape)
print("X_blend0:", X_blend0.shape)
print("X_blend1:", X_blend1.shape)
print("X_blend2:", X_blend2.shape)
print("X_blend3:", X_blend3.shape)
print()

# 推定
y_pred0_log = stage0.predict(X_val_std)
y_pred1_log = stage1.predict(y_pred0_log)
y_pred2_log = stage2.predict(y_pred1_log)
y_pred3_log = stage3.predict(y_pred2_log)
print("X_val :", X_val_std.shape)
print("y_pred0 :", y_pred0_log.shape)
print("y_pred1 :", y_pred1_log.shape)
print("y_pred2 :", y_pred2_log.shape)
print("y_pred3 :", y_pred3_log.shape)
print()

# 評価
y_pred = np.exp(y_pred3_log)
mse_stack = mean_squared_error(y_pred, y_val)

print("MSE = {:.3}".format(mse_stack))

X_train : (1168, 2)
X_blend0: (1168, 10)
X_blend1: (1168, 8)
X_blend2: (1168, 2)
X_blend3: (1168, 1)

X_val : (292, 2)
y_pred0 : (292, 10)
y_pred1 : (292, 8)
y_pred2 : (292, 2)
y_pred3 : (292, 1)

MSE = 2.77e+09
