In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read the Boston housing CSV; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer=r'Dataset/boston.csv')

波士顿房价数据集 字段说明
- CRIM 所在镇的犯罪率
- ZN 占地面积超过25000平方英尺的住宅用地所占比例
- INDUS 所在镇非零售区域所占比例
- CHAS 是否位于河边
- NOX 一氧化氮浓度
- RM 平均房间数量
- AGE 1940年前建成房屋所占比例
- DIS 房屋距离波士顿五大就业中心的加权距离
- RAD 辐射状公路的可达性指数
- TAX 财产税额度
- PTRATIO 所在镇师生比例
- B $1000\times (B_k-0.63)^2$，其中 $B_k$ 为所在镇黑人人口比例（对应数据中的 black 列）
- LSTAT 所在镇弱势群体人口所占比例
- MEDV 自住房屋价格的中位数（单位：千美元）

In [3]:
# Remove the 'Unnamed: 0' column (presumably a row index that was written into
# the CSV by an earlier to_csv call -- confirm against the file).
# errors='ignore' makes this cell idempotent: re-running it after the column
# has already been removed is a no-op instead of raising KeyError.
data = data.drop(columns=['Unnamed: 0'], errors='ignore')
data

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [4]:
class LinearRegression:
    """Ordinary least-squares linear regression with an intercept term.

    After ``Fit`` the learned parameters are stored in ``w_`` as a column
    ``np.matrix`` of shape ``(n_features + 1, 1)``; row 0 is the intercept
    (bias) and the remaining rows follow the column order of ``x``.
    """

    @staticmethod
    def _design_matrix(x):
        """Return a new float array: ``x`` with a leading column of ones.

        A fresh array is always built, so the caller's DataFrame/array is
        never modified (the previous implementation inserted an 'intercept'
        column into the caller's frame, which broke any second call on the
        same data).

        Raises:
            ValueError: if ``x`` is not 2-D.
        """
        features = np.asarray(x, dtype=float)
        if features.ndim != 2:
            raise ValueError("x must be 2-D with shape (n_samples, n_features)")
        bias_column = np.ones((features.shape[0], 1))
        return np.hstack([bias_column, features])

    def Fit(self, x, y):
        """Estimate the weights from training data.

        Args:
            x: 2-D feature matrix (DataFrame or array-like), one row per sample.
            y: 1-D target values, one per row of ``x``.

        Side effects:
            Sets ``self.w_``. ``x`` and ``y`` are left untouched.
        """
        design = self._design_matrix(x)
        # Targets as a column vector so the solution is a column as well.
        target = np.asarray(y, dtype=float).reshape(-1, 1)
        # Solve min ||design @ w - target||^2 directly. This gives the same
        # answer as the normal equation (X^T X)^-1 X^T y when X^T X is
        # invertible, but avoids forming an explicit inverse, which is
        # numerically fragile and fails outright for a singular X^T X.
        solution, *_ = np.linalg.lstsq(design, target, rcond=None)
        # Kept as np.matrix so existing code that reads w_ sees the same type.
        self.w_ = np.asmatrix(solution)

    def Predict(self, x):
        """Predict targets for the samples in ``x``.

        Args:
            x: 2-D feature matrix with the same columns (and order) used in
                ``Fit``.

        Returns:
            1-D ``np.ndarray`` with one prediction per row of ``x``.
        """
        design = self._design_matrix(x)
        predictions = design @ np.asarray(self.w_)
        # Flatten the (n_samples, 1) column into a 1-D array.
        return np.asarray(predictions).ravel()

In [12]:
# Shuffle all rows; the fixed random_state makes the split reproducible.
t = data.sample(len(data), random_state=0)

# First 400 shuffled rows form the training set. The last column is the
# target, every other column is a feature.
train_x, train_y = t.iloc[:400, :-1], t.iloc[:400, -1]
# The remaining rows form the test set.
test_x, test_y = t.iloc[400:, :-1], t.iloc[400:, -1]

lr = LinearRegression()
lr.Fit(train_x, train_y)    # train
y_hat = lr.Predict(test_x)  # predict on the held-out rows

# Loss value: mean squared error on the test set.
squared_errors = (test_y - y_hat) ** 2
display(np.mean(squared_errors))
# Fitted weights.
display(lr.w_)

17.09753138466829

matrix([[ 4.00542166e+01],
        [-1.10490198e-01],
        [ 4.11074548e-02],
        [ 1.14986147e-02],
        [ 2.03209693e+00],
        [-1.95402764e+01],
        [ 3.28900304e+00],
        [ 6.91671720e-03],
        [-1.39738261e+00],
        [ 3.78327573e-01],
        [-1.54938397e-02],
        [-8.64470498e-01],
        [ 8.29999966e-03],
        [-5.66991979e-01]])

In [13]:
#可视化
import matplotlib as mpl
import matplotlib.pyplot as plt
# Select the font used for all plot text (presumably chosen so the Chinese
# title and labels render; the font has to be installed on the machine).
mpl.rc("font", **{"family": 'MicroSoft YaHei', "weight": 'bold'})

In [11]:
%matplotlib
# Overlay predicted and true target values for the test samples, in order.
fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(y_hat, 'ro-', label='预测值')
ax.plot(test_y.values, 'go--', label='真实值')
ax.set_title('线性回归预测-最小二乘法')
ax.set_xlabel('样本序号')
ax.set_ylabel('房价')
ax.legend()
plt.show()

Using matplotlib backend: Qt5Agg
