In [2]:
import os

import joblib
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


In [3]:
"""
线性回归直接预测房子价格
:return: None
"""
# Load the Boston housing data set.
# NOTE: scikit-learn emits a warning for this loader (visible in this cell's
# output) that discourages its use and names fetch_california_housing as an
# alternative data set.
lb = load_boston()

# Show the feature matrix, the target vector and the feature names, followed
# by a separator line. One print call with a newline separator produces the
# same text as printing each item on its own.
print(
    "获取特征值",
    lb.data,
    "目标值",
    lb.target,
    lb.feature_names,
    '-' * 50,
    sep="\n",
)

# Hold out 25% of the samples as the test set; random_state pins the split so
# the cell gives the same partition on every run.
x_train, x_test, y_train, y_test = train_test_split(
    lb.data,
    lb.target,
    test_size=0.25,
    random_state=1,
)

print(x_train.shape)

获取特征值
[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]
目标值
[24.  21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 15.  18.9 21.7 20.4
 18.2 19.9 23.1 17.5 20.2 18.2 13.6 19.6 15.2 14.5 15.6 13.9 16.6 14.8
 18.4 21.  12.7 14.5 13.2 13.1 13.5 18.9 20.  21.  24.7 30.8 34.9 26.6
 25.3 24.7 21.2 19.3 20.  16.6 14.4 19.4 19.7 20.5 25.  23.4 18.9 35.4
 24.7 31.6 23.3 19.6 18.7 16.  22.2 25.  33.  23.5 19.4 22.  17.4 20.9
 24.2 21.7 22.8 23.4 24.1 21.4 20.  20.8 21.2 20.3 28.  23.9 24.8 22.9
 23.9 26.6 22.5 22.2 23.6 28.7 22.6 22.  22.9 25.  20.6 28.4 21.4 38.7
 43.8 33.2 27.5 26.5 18.6 19.3 20.1 19.5 19.5 20.


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [6]:
# Standardize the features and the target, each with its own scaler.
# Both scalers are fitted on the training split only and then applied to the
# test split, so no test-set statistics leak into the scaling.
std_x = StandardScaler().fit(x_train)
x_train = std_x.transform(x_train)
x_test = std_x.transform(x_test)

# StandardScaler expects 2-D input, while the target is 1-D, hence the
# reshape(-1, 1) into a single column. std_y is kept around so predictions
# can later be mapped back with std_y.inverse_transform.
std_y = StandardScaler().fit(y_train.reshape(-1, 1))
y_train = std_y.transform(y_train.reshape(-1, 1))
y_test = std_y.transform(y_test.reshape(-1, 1))

# Display the standardized training target.
y_train

array([[-0.19582006],
       [ 0.0847902 ],
       [ 0.63478631],
       [-0.26316652],
       [ 0.0847902 ],
       [ 0.298054  ],
       [-1.41928078],
       [ 0.17458549],
       [ 1.54396354],
       [-1.0039776 ],
       [-1.08254847],
       [ 1.95926673],
       [-1.34070991],
       [-0.19582006],
       [-0.51010354],
       [-0.31928857],
       [ 0.15213666],
       [ 0.69090836],
       [ 0.21948313],
       [ 0.16336107],
       [-0.36418621],
       [ 0.67968395],
       [-0.20704447],
       [ 1.28580211],
       [-0.88050909],
       [-0.21826888],
       [-0.71214293],
       [-1.01520201],
       [ 0.9490698 ],
       [ 0.57866426],
       [ 0.23070754],
       [ 0.23070754],
       [ 0.30927841],
       [ 2.40824314],
       [-0.04990272],
       [ 0.43274692],
       [-0.91418232],
       [-0.17337123],
       [-0.25194211],
       [ 0.0847902 ],
       [-1.03765083],
       [-0.68969411],
       [ 0.27560518],
       [-0.2407177 ],
       [ 0.01744374],
       [-0

In [8]:
# Fit a LinearRegression estimator (the original notes describe this as the
# normal-equation approach) on the standardized training data.
lr = LinearRegression()
lr.fit(x_train, y_train)

# The coefficients hint at how each (standardized) feature relates to the
# (standardized) target.
print('回归系数', lr.coef_)

# Predictions come out in standardized target units because the model was
# trained on the standardized y_train.
y_predict = lr.predict(x_test)

# Map the predictions back to the original target scale.
y_lr_predict = std_y.inverse_transform(y_predict)

# Persist the trained model. Create the output directory first so that the
# save does not fail on a fresh checkout where ./tmp does not exist yet.
os.makedirs("./tmp", exist_ok=True)
joblib.dump(lr, "./tmp/test.pkl")

print("正规方程测试集里面每个房子的预测价格：", y_lr_predict)
# y_test and y_predict are both standardized here, so this MSE is expressed
# on the standardized scale, not on the original target scale.
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

回归系数 [[-0.12026411  0.15044778  0.02951803  0.07470354 -0.28043353  0.22170939
   0.02190624 -0.35275513  0.29939558 -0.2028089  -0.23911894  0.06305081
  -0.45259462]]
正规方程测试集里面每个房子的预测价格： [[32.37816533]
 [27.95684437]
 [18.07213891]
 [21.63166556]
 [18.93029508]
 [19.96277202]
 [32.2834674 ]
 [18.06715668]
 [24.72989076]
 [26.85359369]
 [27.23326816]
 [28.57021239]
 [21.18778302]
 [26.94393815]
 [23.37892579]
 [20.89176865]
 [17.11746934]
 [37.73997945]
 [30.51980066]
 [ 8.44489436]
 [20.86557977]
 [16.21989418]
 [25.13605925]
 [24.77658813]
 [31.40497629]
 [11.02741407]
 [13.82097563]
 [16.80208261]
 [35.94637198]
 [14.7155729 ]
 [21.23939821]
 [14.15079469]
 [42.72492585]
 [17.83887162]
 [21.84610225]
 [20.40178099]
 [17.50287927]
 [27.00093206]
 [ 9.80760408]
 [20.00288662]
 [24.27066782]
 [21.06719021]
 [29.47089776]
 [16.48482565]
 [19.38852695]
 [14.54778282]
 [39.39838319]
 [18.09810655]
 [26.22164983]
 [20.60676525]
 [25.09994066]
 [24.48366723]
 [25.02297948]
 [26.84986898]
 

# 2 加载保存的模型

In [9]:
# Reload the estimator that was saved to ./tmp/test.pkl.
model = joblib.load("./tmp/test.pkl")

# x_test is already standardized, and the model was trained on a standardized
# target, so these predictions are in standardized units until they are passed
# through std_y.inverse_transform.
y_predict = model.predict(x_test)
print("保存的模型预测的结果：", y_predict)

# Mean squared error on the standardized scale.
mse_scaled = mean_squared_error(y_test, y_predict)
print("正规方程的均方误差：", mse_scaled)

# Mean squared error after mapping both the true and the predicted targets
# back to the original target scale.
mse_price = mean_squared_error(
    std_y.inverse_transform(y_test),
    std_y.inverse_transform(y_predict),
)
print("正规方程inverse后的均方误差：", mse_price)


保存的模型预测的结果： [[ 1.12620955]
 [ 0.62994234]
 [-0.47955756]
 [-0.08002168]
 [-0.38323459]
 [-0.26734514]
 [ 1.11558027]
 [-0.48011678]
 [ 0.26773583]
 [ 0.50610896]
 [ 0.54872518]
 [ 0.69878929]
 [-0.12984488]
 [ 0.51624959]
 [ 0.11609798]
 [-0.16307075]
 [-0.58671359]
 [ 1.72804157]
 [ 0.91761907]
 [-1.56015899]
 [-0.16601029]
 [-0.68746111]
 [ 0.31332585]
 [ 0.27297733]
 [ 1.01697482]
 [-1.27028638]
 [-0.95672557]
 [-0.62211389]
 [ 1.5267197 ]
 [-0.8563123 ]
 [-0.12405138]
 [-0.91970532]
 [ 2.28757241]
 [-0.50574043]
 [-0.05595243]
 [-0.21806897]
 [-0.54345359]
 [ 0.52264682]
 [-1.40720286]
 [-0.26284251]
 [ 0.21619076]
 [-0.14338071]
 [ 0.79988591]
 [-0.65772411]
 [-0.33180076]
 [-0.87514574]
 [ 1.91418761]
 [-0.47664284]
 [ 0.43517699]
 [-0.1950607 ]
 [ 0.30927175]
 [ 0.24009869]
 [ 0.30063331]
 [ 0.50569088]
 [-1.94512422]
 [ 0.20018782]
 [-1.30384514]
 [ 0.50366068]
 [-0.6220835 ]
 [ 1.47453167]
 [-0.31823582]
 [ 0.57109939]
 [-0.64702253]
 [-0.35840699]
 [-1.27347275]
 [ 1.08939349