In [1]:
# 回归与分类区别： 回归预测目标是连续变量

##### 2.1.2.1 线性回归器
##### 利用线性回归器预测美国波士顿地区房价

In [13]:
# 导入数据
from sklearn.datasets import load_boston
boston = load_boston()
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [14]:
import pandas as pd
import numpy as np

X = boston.data
y = boston.target

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33)

print('min value:', np.min(boston.target))
print('max value:', np.max(boston.target))
print('avg value:', np.mean(boston.target))

min value: 5.0
max value: 50.0
avg value: 22.532806324110677


In [15]:
X

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [16]:
# 目标值的差异也较大，需要对特定以及目标值进行标准化处理
from sklearn.preprocessing import StandardScaler
ss_X = StandardScaler()
ss_y = StandardScaler()

X_train = ss_X.fit_transform(X_train)
X_test = ss_X.transform(X_test)
y_train = ss_y.fit_transform(y_train.reshape(-1, 1))
y_test = ss_y.transform(y_test.reshape(-1, 1))

In [22]:
# 利用LR和SGDR预测
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_y_predict = lr.predict(X_test)

from sklearn.linear_model import SGDRegressor
sgd = SGDRegressor()
sgd.fit(X_train, y_train)
sgd_y_predict = sgd.predict(X_test)

  y = column_or_1d(y, warn=True)


In [26]:
# 性能评估，MAE、MSE and R-squared
# LR
print('The value of default measurement of LinearRegression is', lr.score(X_test, y_test))

from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
print('The R-squared value is ', r2_score(y_test, lr_y_predict))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))

# SGD
print('The value of default measurement of SGDRegression is', sgd.score(X_test, y_test))

from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
print('The R-squared value is ', r2_score(y_test, sgd_y_predict))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgd_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(sgd_y_predict)))

The value of default measurement of LinearRegression is 0.6757955014529481
The R-squared value is  0.6757955014529481
The mean_squared_error value is  25.139236520353453
The mean_absolute_error value is  3.532532543705398
The value of default measurement of SGDRegression is 0.6638073053640492
The R-squared value is  0.6638073053640493
The mean_squared_error value is  26.06881676455684
The mean_absolute_error value is  3.497126410297959


##### 2.1.2.2 支持向量机（回归）
##### 使用三种不同核函数配置的支持向量机回归模型，训练使用上小节的数据

In [27]:
from sklearn.svm import SVR

# 使用线性核函数配置
linear_svr = SVR(kernel='linear')
linear_svr.fit(X_train, y_train)
linear_svr_y_predict = linear_svr.predict(X_test)

# 使用多项式核函数配置
poly_svr = SVR(kernel='poly')
poly_svr.fit(X_train, y_train)
poly_svr_y_predict = poly_svr.predict(X_test)

# 使用径向基核函数配置
rbf_svr = SVR(kernel='rbf')
rbf_svr.fit(X_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(X_test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [28]:
# 性能评估

# Linear
print('The value of default measurement of Linear SVR is', linear_svr.score(X_test, y_test))

from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
print('The R-squared value is ', r2_score(y_test, linear_svr_y_predict))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(linear_svr_y_predict)))

# Poly
print('The value of default measurement of Poly SVR is', poly_svr.score(X_test, y_test))

from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
print('The R-squared value is ', r2_score(y_test, poly_svr_y_predict))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(poly_svr_y_predict)))

# RBF
print('The value of default measurement of RBF SVR is', rbf_svr.score(X_test, y_test))

from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
print('The R-squared value is ', r2_score(y_test, rbf_svr_y_predict))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rbf_svr_y_predict)))


The value of default measurement of Linear SVR is 0.650659546421538
The R-squared value is  0.650659546421538
The mean_squared_error value is  27.088311013556027
The mean_absolute_error value is  3.4328013877599624
The value of default measurement of Poly SVR is 0.40365065102550846
The R-squared value is  0.40365065102550846
The mean_squared_error value is  46.24170053103929
The mean_absolute_error value is  3.73840737104651
The value of default measurement of RBF SVR is 0.7559887416340944
The R-squared value is  0.7559887416340944
The mean_squared_error value is  18.920948861538733
The mean_absolute_error value is  2.6067819999501114


##### 2.1.2.3 K近邻（回归）
##### 使用平均回归配置和距离加权回归配置的K近邻预测波士顿房价

In [30]:
from sklearn.neighbors import KNeighborsRegressor

# 平均回归
uni_knr = KNeighborsRegressor(weights='uniform')
uni_knr.fit(X_train, y_train)
uni_knr_y_predict = uni_knr.predict(X_test)

# 距离加权回归
dis_knr = KNeighborsRegressor(weights='distance')
dis_knr.fit(X_train, y_train)
dis_knr_y_predict = dis_knr.predict(X_test)

In [31]:
# 性能评估
# 平均回归配置
print('The R-squared value is ', uni_knr.score(X_test, y_test))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(uni_knr_y_predict)))

# 距离加权回归配置
print('The R-squared value is ', dis_knr.score(X_test, y_test))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dis_knr_y_predict)))


The R-squared value is  0.6907212176346006
The mean_squared_error value is  23.981877165354337
The mean_absolute_error value is  2.9650393700787396
The R-squared value is  0.7201094821421603
The mean_squared_error value is  21.703073090490353
The mean_absolute_error value is  2.801125502210876


##### 2.1.2.4 回归树
##### 对波士顿房价预测

In [36]:
# 预测
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
dtr.fit(X_train, y_train)
dtr_y_predict = dtr.predict(X_test)

# 性能评估
print('The R-squared value is ', dtr.score(X_test, y_test))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dtr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(dtr_y_predict)))


The R-squared value is  0.6784847505828786
The mean_squared_error value is  24.930708661417327
The mean_absolute_error value is  3.185826771653543


##### 2.1.2.5 集成模型
##### 利用RandomForestRegressor、ExtraTreesRegressor以及GradientBoostingRegressor对波士顿房价回归预测

In [39]:
# 预测
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor,GradientBoostingRegressor

rfr = RandomForestRegressor()
rfr.fit(X_train, y_train)
rfr_y_predict = rfr.predict(X_test)

etr = ExtraTreesRegressor()
etr.fit(X_train, y_train)
etr_y_predict = etr.predict(X_test)

gbr = GradientBoostingRegressor()
gbr.fit(X_train, y_train)
gbr_y_predict = gbr.predict(X_test)

# 性能评估
print('The R-squared value is ', rfr.score(X_test, y_test))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(rfr_y_predict)))

print('The R-squared value is ', etr.score(X_test, y_test))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(etr_y_predict)))

print('The R-squared value is ', gbr.score(X_test, y_test))
print('The mean_squared_error value is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(gbr_y_predict)))
print('The mean_absolute_error value is ', mean_absolute_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(gbr_y_predict)))


# 极端回归森林模型，输出每种特征对预测目标的贡献度
# zip 方法在 Python 2 和 Python 3 中的不同：在 Python 3.x 中为了减少内存，zip() 返回的是一个对象。如需展示列表，需手动 list() 转换。
print(np.sort(list(zip(etr.feature_importances_, boston.feature_names)), axis=0))

The R-squared value is  0.8147918016383529
The mean_squared_error value is  14.361283464566933
The mean_absolute_error value is  2.4850393700787405
The R-squared value is  0.7784702050241268
The mean_squared_error value is  17.177707086614166
The mean_absolute_error value is  2.533385826771654
The R-squared value is  0.8318439570491649
The mean_squared_error value is  13.039037258930096
The mean_absolute_error value is  2.28950808485688
[['0.0028786674021670905' 'AGE']
 ['0.013446840966940798' 'B']
 ['0.013471827149711131' 'CHAS']
 ['0.01421739194933717' 'CRIM']
 ['0.01875419610891404' 'DIS']
 ['0.019676456908952852' 'INDUS']
 ['0.027727845268650273' 'LSTAT']
 ['0.029516559636520594' 'NOX']
 ['0.03961016066006889' 'PTRATIO']
 ['0.040860495348125386' 'RAD']
 ['0.11290802965356389' 'RM']
 ['0.2804510814887173' 'TAX']
 ['0.3864804474583307' 'ZN']]


  """
  if __name__ == '__main__':
  y = column_or_1d(y, warn=True)
