In [1]:
import numpy as np
import pandas as pd

In [2]:
data=pd.read_csv(r'Dataset/boston.csv')
data = data.drop(['Unnamed: 0'],axis=1)
data

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [3]:
class LinearRegression:
    def __init__(self,alpha,times):
        #alpha: learning rate
        self.alpha = alpha
        self.times = times
        
    def Fit(self,x,y):
        x = np.array(x) #[样本数量，特征数量]
        y = np.array(y) #[样本数量]
        # 创建初始权重，此处设为0，长度比特征数量多1，即为bias
        self.w_ = np.zeros(1 + x.shape[1])
        # 创建损失列表，用来保存每次迭代后的损失值
        self.loss_ = []
        
        # 进行循环，每次迭代不断调整w_，使loss_不断减小
        for i in range(self.times):
            y_hat = np.dot(x,self.w_[1:]) + self.w_[0] #计算预测值：x点积y加w0
            # 计算loss function，加入到损失列表中
            error = y - y_hat
            self.loss_.append(np.sum(error**2)/2)
            # 调整权重：权重(j) = 权重(j-1) + 学习率 * sum((y - y_hat) * x(j))
            self.w_[0] += self.alpha * np.sum(error)
            self.w_[1:] += self.alpha * np.dot(x.T, error)
            
    def Predict(self, x):
        x = np.array(x)
        result = np.dot(x, self.w_[1:]) + self.w_[0]
        return result  

In [5]:
class StandardScalar():
    # 该类对数据进行标准化处理
    
    def Fit(self, x):
        #计算样本的均值与标准差
        x = np.asarray(x)
        self.std_ = np.std(x, axis=0)
        self.mean_ = np.mean(x, axis=0)
        
    def Transform(self, x):
        # 对样本进行标准化处理：将x的每一列都变成标准正态分布的数据
        return(x - self.mean_) / self.std_
    
    def FitTransform(self, x):
        # 训练数据并转换，返回转换之后的结果
        
        self.Fit(x)
        return self.Transform(x)

In [14]:
#随机打乱data
t = data.sample(len(data),random_state=0)
#前400条作为训练集
train_x = t.iloc[:400,:-1]
train_y = t.iloc[:400,-1]
#后面作为测试集
test_x = t.iloc[400:,:-1]
test_y = t.iloc[400:,-1]

# 对数据进行标准化处理
s1 = StandardScalar()
train_x = s1.FitTransform(train_x)
test_x = s1.FitTransform(test_x)
s2 = StandardScalar()
train_y = s2.FitTransform(train_y)
test_y = s2.FitTransform(test_y)

lr = LinearRegression(alpha=0.0005, times=25)
lr.Fit(train_x, train_y)      #训练
y_hat = lr.Predict(test_x)  #测试

#查看损失函数值
display(np.mean((test_y - y_hat) ** 2))
#display(lr.w_)
#display(lr.loss_)

0.14889265006244543

In [7]:
 #可视化
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc("font", family='MicroSoft YaHei', weight='bold')

In [8]:
%matplotlib
plt.figure(figsize=(10,10))
plt.plot(y_hat, 'ro-', label='预测值')
plt.plot(test_y.values, 'go--', label='真实值')
plt.title('线性回归预测-最小二乘法')
plt.xlabel('样本序号')
plt.ylabel('房价')
plt.legend()
plt.show()

Using matplotlib backend: Qt5Agg
