In [1]:
# 从 sklearn.datasets 导入波士顿房价数据
from sklearn.datasets import load_boston
# 从读取房价数据存储在变量 boston 中
boston = load_boston()
# 输出数据描述
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [2]:
# 房价数据分割

In [3]:
# 导入数据分割器
from sklearn.cross_validation import train_test_split
# 导入 numpy 
import numpy as np

x = boston.data
y = boston.target

In [4]:
# 随机采样 25% 的数据构建测试样本，其余作为训练样本
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 33, test_size = 0.25)

In [5]:
# 分析回归目标值的差异
print("The max target value is ", np.max(boston.target))
print("The min target value is ", np.min(boston.target))
print("The average target value is ", np.mean(boston.target))

The max target value is  50.0
The min target value is  5.0
The average target value is  22.5328063241


In [6]:
# 目标房价之间的差异较大，需要对特殊以及目标值进行标准化处理

In [12]:
# 导入数据标准化模块
from sklearn.preprocessing import StandardScaler
# 分别初始化对特征和目标值得标准化器
ss_x = StandardScaler()
ss_y = StandardScaler()
# 分别对训练和测试数据的特征以及目标值进行标准化处理
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)
y_train = ss_y.fit_transform(y_train)
y_test = ss_y.transform(y_test)



In [13]:
# 使用线性回归模型 LinearRegression 和 SGDRegressor 分别对房价进行预测

In [14]:
# 导入 LinearRegression
from sklearn.linear_model import LinearRegression
# 使用默认配置初始化线性回归器 LinearRegression
lr = LinearRegression()
# 使用训练数据进行参数估计
lr.fit(x_train, y_train)
# 对测试数据进行回归预测
lr_y_predict = lr.predict(x_test)

In [15]:
# 导入 SGDRegressor
from sklearn.linear_model import SGDRegressor
# 使用默认配置初始化线性回归器 SGDRegressor
sgdr = SGDRegressor()
# 使用训练数据进行参数估计
sgdr.fit(x_train, y_train)
# 对测试数据进行回归预测
sgdr_y_predict = sgdr.predict(x_test)

In [16]:
# 衡量预测值与真实值之间的差距，可以通过多种测评函数进行评价，最为直观的评价指标包括，
# 平均绝对误差以及均方误差，这同样也是线性回归模型所要优化的目标

In [17]:
# 使用三种回归评价机制以及两种调用 R-squared 评价模块的方法，对模型的回归性能做出评价

In [18]:
# 使用 LinearRegression 模型自带的评估模块，并输出评估结果
print('The value of default measurement of LinearRegression is ', lr.score(x_test, y_test))

The value of default measurement of LinearRegression is  -9992.64929904


In [21]:
# 从 sklearn.metrics 依次导入 r2_score, mean_squared_error 以及 mean_absoluate_erroe 用于回归性能的评估
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# 使用 r2_score 模块，并输出评估结果
print('The value of R-squared of LinearRegression is ', r2_score(y_test, lr_y_predict))
# 使用 mean_squared_error 模块，并输出评估结果
print('The mean squared error of LinearRegression is ', mean_squared_error(ss_y.inverse_transform(y_test), ss_y.inverse_transform(lr_y_predict)))

The value of R-squared of LinearRegression is  -9992.64929904
The mean squared error of LinearRegression is  774920.504052


In [22]:
# 使用三种不同核函数配置的支持向量机回归模型进行训练，并且分别对测试数据做出预测

In [23]:
# 从 sklearn.svm 中导入支持向量机(回归)模型
from sklearn.svm import SVR
# 使用线性核函数配置的支持向量机进行回归训练，并且对测试样本进行预测
linear_svr = SVR(kernel = 'linear')
linear_svr.fit(x_train, y_train)
linear_svr_y_predict = linear_svr.predict(x_test)

In [24]:
# 使用多项式核函数配置的支持向量机进行回归训练，并且对测试样本进行预测
poly_svr = SVR(kernel = 'poly')
poly_svr.fit(x_train, y_train)
poly_svr_y_predict = poly_svr.predict(x_test)

In [25]:
# 使用径向基核函数配置的支持向量机进行回归训练，并对测试样本进行预测
rbf_svr = SVR(kernel = 'rbf')
rbf_svr.fit(x_train, y_train)
rbf_svr_y_predict = rbf_svr.predict(x_test)

In [26]:
# 对相同测试集上的不同核函数做性能评估

In [27]:
# p72(86/198)

In [28]:
# K 近邻(回归)对房价数据进行回归预测

In [29]:
# 导入 K 近邻回归器
from sklearn.neighbors import KNeighborsRegressor

In [30]:
# 初始化 K近邻回归器，并且调整配置，使得预测的方式为平均回归 (weights = 'uniform')
uni_knr = KNeighborsRegressor(weights = 'uniform')
uni_knr.fit(x_train, y_train)
uni_knr_y_predict = uni_knr.predict(x_test)

In [31]:
# 初始化 K 近邻回归器，并且调整配置，使得预测的方式为根据距离加权回归(weights = 'distance')

In [32]:
dis_knr = KNeighborsRegressor(weights = 'distance')
dis_knr.fit(x_train, y_train)
dis_knr_y_predict = dis_knr.predict(x_test)

In [None]:
# 性能评估 p75(89/198)