In [1]:
import os

from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression, SGDRegressor, Ridge, LogisticRegression, Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, classification_report, roc_auc_score
import joblib
import pandas as pd
import numpy as np

In [2]:
"""
线性回归直接预测房子价格
:return: None
"""
# Fetch the California housing data set, using the local 'data' folder as
# the download/cache location.
fe_cal = fetch_california_housing(data_home='data')

# Feature matrix: its shape, a divider, then the first sample.
print("获取特征值", fe_cal.data.shape, '-' * 50, fe_cal.data[0], sep='\n')
# Target vector (per the original note, expressed in units of 100,000 USD).
print("目标值", fe_cal.target, sep='\n')
# Full data set description, followed by a divider.
print(fe_cal.DESCR, '-' * 50, sep='\n')
# Names of the feature columns.
print(fe_cal.feature_names)

获取特征值
(20640, 8)
--------------------------------------------------
[   8.3252       41.            6.98412698    1.02380952  322.
    2.55555556   37.88       -122.23      ]
目标值
[4.526 3.585 3.521 ... 0.923 0.847 0.894]
.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

:Number of Instances: 20640

:Number of Attributes: 8 numeric, predictive attributes and the target

:Attribute Information:
    - MedInc        median income in block group
    - HouseAge      median house age in block group
    - AveRooms      average number of rooms per household
    - AveBedrms     average number of bedrooms per household
    - Population    block group population
    - AveOccup      average number of household members
    - Latitude      block group latitude
    - Longitude     block group longitude

:Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/

In [3]:
# Display the shape of the target array to confirm it is 1-D (one value per sample).
fe_cal.target.shape

(20640,)

In [4]:
# Hold out 25% of the samples as a test set; random_state fixes the split.
x_train, x_test, y_train, y_test = train_test_split(
    fe_cal.data,
    fe_cal.target,
    test_size=0.25,
    random_state=1,
)
print(x_train.shape)

# Standardize the features. The scaler is fitted on the training split only
# and the same statistics are then applied to both splits.
std_x = StandardScaler()
std_x.fit(x_train)
x_train = std_x.transform(x_train)  # standardized training features
x_test = std_x.transform(x_test)    # standardized test features

# The target is intentionally left in its original units for now.
# To standardize it as well, a second scaler (std_y) would be required; it
# expects 2-D input, so the 1-D target must be reshaped first, where -1 lets
# NumPy infer that dimension from the remaining elements:
#   y_train = std_y.fit_transform(y_train.reshape(-1, 1))
#   y_test = std_y.transform(y_test.reshape(-1, 1))
# Predictions would then have to be mapped back with std_y.inverse_transform.

(15480, 8)


In [5]:
# Small demonstration of reshape(-1, 1) on a 1-D array.
test1 = np.array([1, 2, 3])
print(test1.shape)  # 1-D: three elements

# -1 asks NumPy to infer that axis, giving a single column with three rows.
test1.reshape((-1, 1)).shape

(3,)


(3, 1)

In [8]:
# Ordinary least squares linear regression (solved via the normal equation).
# `os` is already imported in the notebook's first cell, so it is not
# re-imported here.
lr = LinearRegression()
# fit() is the time-consuming step.
lr.fit(x_train, y_train)
# The regression coefficients indicate how each feature relates to the target.
print('回归系数', lr.coef_)

# Predict house prices for the test set. The target was not standardized,
# so no inverse_transform is needed to get real prices.
y_predict = lr.predict(x_test)

# Persist the fitted model (its learned weights together with the estimator
# object). Saving can happen any time after fit().
model_path = "./tmp/test.pkl"
# Create the output directory on a fresh checkout so the dump cannot fail.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
# Remove a previous model file only if one exists; an unconditional unlink
# raises FileNotFoundError on the first run.
if os.path.exists(model_path):
    os.unlink(model_path)
joblib.dump(lr, model_path)

print("正规方程测试集里面每个房子的预测价格：", y_predict[0:10])
# Test-set loss as mean squared error: sum((y_test - y_predict)^2) / n.
print("正规方程的均方误差：", mean_squared_error(y_test, y_predict))

回归系数 [ 0.83167028  0.12159502 -0.26758589  0.30983997 -0.00518054 -0.04040421
 -0.90736902 -0.88212727]
正规方程测试集里面每个房子的预测价格： [2.12391852 0.93825754 2.7088455  1.70873764 2.82954754 3.50376456
 3.0147162  1.62781292 1.74317518 2.01897806]
正规方程的均方误差： 0.5356532845422559


In [9]:
# Simulate production use: restore the persisted estimator from disk.
# NOTE(review): joblib.load unpickles the file, so only load model files
# that come from a trusted source.
model = joblib.load("./tmp/test.pkl")

# The target was left unscaled during training, so the predictions are
# already in the original units and need no inverse_transform.
y_predict = model.predict(x_test)

print("保存的模型预测的结果：", y_predict)
print("正规方程的均方误差：", mean_squared_error(y_true=y_test, y_pred=y_predict))

保存的模型预测的结果： [2.12391852 0.93825754 2.7088455  ... 1.24263061 2.73771901 1.75800594]
正规方程的均方误差： 0.5356532845422559


In [10]:
# Tiny worked example of mean squared error (mean of the squared residuals).
# Residuals are 0.5, -0.5, 0 and -1, so MSE = (0.25 + 0.25 + 0 + 1) / 4 = 0.375.
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
mean_squared_error(y_true=y_true, y_pred=y_pred)

0.375

In [11]:
# Linear regression trained by stochastic gradient descent; preferable to the
# normal equation when the data set is large.
# - learning_rate selects the step-size schedule, e.g. 'constant',
#   'invscaling' or 'adaptive'; eta0 is the initial learning rate for those.
# - With learning_rate='optimal' the step size is derived from alpha (the
#   regularization strength) rather than from eta0.
# - penalty selects the regularization term, e.g. 'l1' or 'l2'.
# - random_state pins the data shuffling so the coefficients and the error
#   below are reproducible from run to run.
sgd = SGDRegressor(eta0=0.01, penalty='l2', max_iter=1000, random_state=1)

# Train on the standardized features and the unscaled target.
sgd.fit(x_train, y_train)
print('梯度下降的回归系数', sgd.coef_)

# Predict test-set prices. The target was not standardized, so predictions
# are already in the original units and need no inverse_transform.
y_predict = sgd.predict(x_test)
print("梯度下降的均方误差：", mean_squared_error(y_test, y_predict))

梯度下降的回归系数 [ 0.84587582  0.11632998 -0.28845255  0.28975851  0.00220635 -0.00404437
 -0.90067997 -0.87520169]
梯度下降的均方误差： 0.5394549585969277


In [13]:
# Toy one-dimensional gradient descent on a quadratic loss.
w = 1          # starting point
alpha = 0.15   # learning rate (step size)


def loss(w):
    """Return the quadratic loss 2*w**2 + 3*w + 2 evaluated at ``w``."""
    return 2 * w ** 2 + 3 * w + 2


def dao_shu(w):
    """Return the derivative of ``loss`` at ``w``, i.e. 4*w + 3."""
    return 4 * w + 3


# Take 30 steps against the gradient, printing the iterate and its loss each
# time. The minimum is at w = -3/4, where the loss equals 7/8.
for i in range(30):
    w -= alpha * dao_shu(w)
    print(f'w {w} 损失{loss(w)}')

w -0.050000000000000044 损失1.855
w -0.47000000000000003 损失1.0318
w -0.638 损失0.9000879999999998
w -0.7052 损失0.8790140800000001
w -0.7320800000000001 损失0.8756422527999999
w -0.742832 损失0.875102760448
w -0.7471328 损失0.8750164416716801
w -0.74885312 损失0.875002630667469
w -0.749541248 损失0.875000420906795
w -0.7498164992 损失0.8750000673450873
w -0.74992659968 损失0.8750000107752143
w -0.749970639872 损失0.8750000017240342
w -0.7499882559488 损失0.8750000002758456
w -0.7499953023795201 损失0.8750000000441351
w -0.7499981209518081 损失0.8750000000070619
w -0.7499992483807232 损失0.8750000000011295
w -0.7499996993522893 损失0.8750000000001807
w -0.7499998797409158 损失0.8750000000000286
w -0.7499999518963663 损失0.8750000000000044
w -0.7499999807585465 损失0.8750000000000004
w -0.7499999923034186 损失0.8750000000000004
w -0.7499999969213674 损失0.8750000000000002
w -0.749999998768547 损失0.875
w -0.7499999995074188 损失0.8750000000000002
w -0.7499999998029675 损失0.8749999999999998
w -0.749999999921187 损失0.875
w -0.7499999999