## Random Forest Regressor

In [4]:
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# Load the Boston housing dataset as an example.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn — confirm the pinned version.
boston = load_boston()
X = boston["data"]
Y = boston["target"]
names = boston["feature_names"]

# Fix the seed so the forest, and therefore the importance ranking printed
# below, is reproducible across kernel restarts.
rf = RandomForestRegressor(random_state=0)
rf.fit(X, Y)

# Pair each importance (rounded to 4 decimals) with its feature name and
# list them from most to least important.
importances = np.round(rf.feature_importances_, 4)
print("Features sorted by their score:")
print(sorted(zip(importances, names), reverse=True))

Features sorted by their score:
[(0.4482, 'RM'), (0.3506, 'LSTAT'), (0.0776, 'DIS'), (0.0319, 'CRIM'), (0.0181, 'AGE'), (0.0174, 'NOX'), (0.0161, 'TAX'), (0.0147, 'PTRATIO'), (0.014, 'B'), (0.0059, 'INDUS'), (0.0043, 'RAD'), (0.0008, 'ZN'), (0.0004, 'CHAS')]


## AIC

In [6]:
# Import pandas along with the Series and DataFrame classes.
import pandas as pd
from pandas import Series, DataFrame

# Wrap the raw feature matrix in a DataFrame (columns are named in a later step).
boston_df = pd.DataFrame(data=boston["data"])

In [7]:
# Label the columns with the dataset's feature names.
boston_df.columns = boston["feature_names"]

In [8]:
# Preview the first three rows of the feature table.
boston_df.head(n=3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03


In [11]:
# Append the regression target as an extra column next to the features.
boston_df["target"] = boston.target

In [12]:
# Preview the first three rows again, now including the target column.
boston_df.head(n=3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7


In [13]:
# ライブラリのimport
import scipy as sp
from scipy import stats
import statsmodels.formula.api as smf
import statsmodels.api as sm
# 表示桁数の指定
%precision 3

mod_full = smf.ols("target ~ CRIM + ZN + INDUS + CHAS + NOX + RM + AGE + DIS + RAD + TAX + PTRATIO + B + LSTAT", boston_df).fit()

In [14]:
# Type II ANOVA table for the full model, rounded to three decimals.
anova_full = sm.stats.anova_lm(mod_full, typ=2)
anova_full.round(3)

Unnamed: 0,sum_sq,df,F,PR(>F)
CRIM,241.728,1.0,10.734,0.001
ZN,257.226,1.0,11.422,0.001
INDUS,2.591,1.0,0.115,0.735
CHAS,219.279,1.0,9.737,0.002
NOX,488.606,1.0,21.696,0.0
RM,1865.903,1.0,82.852,0.0
AGE,0.073,1.0,0.003,0.955
DIS,1232.544,1.0,54.729,0.0
RAD,478.172,1.0,21.232,0.0
TAX,242.054,1.0,10.748,0.001


In [15]:
# Display the fitted coefficients of the full model (intercept plus one per predictor).
mod_full.params

Intercept    36.491103
CRIM         -0.107171
ZN            0.046395
INDUS         0.020860
CHAS          2.688561
NOX         -17.795759
RM            3.804752
AGE           0.000751
DIS          -1.475759
RAD           0.305655
TAX          -0.012329
PTRATIO      -0.953464
B             0.009393
LSTAT        -0.525467
dtype: float64

In [34]:
# AIC of the full model, rounded to three decimals.
np.round(mod_full.aic, 3)

3025.677

In [35]:
# Reduced model: the full formula without the AGE and INDUS terms.
reduced_model_spec = smf.ols(
    formula="target ~ CRIM + ZN + CHAS + NOX + RM + DIS + RAD + TAX + PTRATIO + B + LSTAT",
    data=boston_df,
)
mod_non_age_indus = reduced_model_spec.fit()

In [36]:
# AIC of the reduced model, rounded to three decimals (compare with the full model's AIC).
np.round(mod_non_age_indus.aic, 3)

3021.798

In [37]:
# Type II ANOVA table for the reduced model, rounded to three decimals.
anova_reduced = sm.stats.anova_lm(mod_non_age_indus, typ=2)
anova_reduced.round(3)

Unnamed: 0,sum_sq,df,F,PR(>F)
CRIM,243.797,1.0,10.867,0.001
ZN,257.397,1.0,11.473,0.001
CHAS,227.648,1.0,10.147,0.002
NOX,542.98,1.0,24.202,0.0
RM,1958.209,1.0,87.283,0.0
DIS,1449.704,1.0,64.618,0.0
RAD,499.632,1.0,22.27,0.0
TAX,272.991,1.0,12.168,0.001
PTRATIO,1207.854,1.0,53.838,0.0
B,276.15,1.0,12.309,0.0


### AICを元に回帰を行う

In [16]:
import sklearn
from sklearn.linear_model import LinearRegression

In [17]:
# Create a scikit-learn linear regression estimator with default settings.
lreg = LinearRegression()

In [18]:
# Remove the AGE and INDUS predictors.
# NOTE: this rebinds boston_df, so any cell whose formula still references
# AGE or INDUS (the full-model fit) must be run before this one.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
boston_df = boston_df.drop(columns=['AGE', 'INDUS'], errors='ignore')

In [20]:
# Explanatory variables: every column except the target.
# Use the `columns=` keyword; passing the axis positionally (`drop('target', 1)`)
# is deprecated and raises a TypeError on pandas 2.x.
X_multi = boston_df.drop(columns='target')

In [22]:
# Response variable: the target column.
Y_target = boston_df["target"]

In [23]:
# Fit the linear model on the complete data set (all rows, reduced feature set).
lreg.fit(X=X_multi, y=Y_target)

  linalg.lstsq(X, y)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [28]:
# Import the splitter explicitly: a bare `import sklearn` does not by itself
# guarantee that the `sklearn.model_selection` submodule has been loaded.
import sklearn
from sklearn.model_selection import train_test_split

# Split the features and the target into training and test sets.
# A fixed random_state makes the split, and the errors computed from it,
# reproducible across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_multi, boston_df.target, random_state=0
)

In [29]:
# Check how the rows were divided between the training and test sets.
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(379, 11) (127, 11) (379,) (127,)


In [30]:
# Start from a fresh estimator instance.
lreg = LinearRegression()

# Fit the model using only the training portion of the data.
lreg.fit(X=X_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [31]:
# Predict on both splits with the model fitted on the training data.
pred_train, pred_test = (lreg.predict(part) for part in (X_train, X_test))

In [32]:
# Report the mean squared error on the training split, then on the test split.
mse_train = np.mean((Y_train - pred_train) ** 2)
print('X_trainを使ったモデルの平均二乗誤差＝{:0.2f}'.format(mse_train))
mse_test = np.mean((Y_test - pred_test) ** 2)
print('X_testを使ったモデルの平均二乗誤差＝{:0.2f}'.format(mse_test))

X_trainを使ったモデルの平均二乗誤差＝21.02
X_testを使ったモデルの平均二乗誤差＝25.73
