In [1]:
from sklearn.datasets import load_boston

# Load the Boston housing dataset that ships with scikit-learn.
# NOTE(review): load_boston was removed in scikit-learn 1.2 — confirm the
# environment pins an older release before re-running this cell.
boston = load_boston()
# boston.keys() lists the fields contained in the dataset;
# boston.target holds the y_i values.

# The dataset has many features; column index 10 is an arbitrary pick
# used as the single input of the one-variable regression below.
X, y = boston.data[:, 10], boston.target

## 一元线性回归

模型（预测函数）：$\hat{y}_i=ax_i+b$

损失函数：$\sum_{i=1}^m(y_i-\hat{y}_i)^2$

求参数$a,b$的方法（最小化上述损失函数）：$a=\frac{\sum_{i=1}^m(x_i-\bar{x})(y_i-\bar{y})}{\sum_{i=1}^m(x_i-\bar{x})^2}$  ,  $b=\bar{y}-a\bar{x}$

此处使用波士顿房价数据集进行展示

In [2]:
# 划分数据集
import numpy as np
from sklearn.model_selection import train_test_split  # 划分数据集的函数

# Split into training and test sets at a ratio of 8:2, with the random seed fixed to 0.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=0)  # see the documentation for the other parameters


def calculate_a_b(x_train, y_train):
    """
    Fit the one-variable least-squares line ``y = a * x + b``.

    The slope is ``sum((x - x_mean) * (y - y_mean)) / sum((x - x_mean) ** 2)``
    and the intercept is ``y_mean - a * x_mean``. If every x value is
    identical the denominator is zero and the result is ``nan``.

    :param x_train: training feature values (array-like, flattened to 1-D)
    :param y_train: training target values (array-like, flattened to 1-D),
        one per entry of ``x_train``
    :return: tuple ``(a, b)`` holding the slope and the intercept
    :raises ValueError: if the inputs are empty or differ in length
    """
    # Work on the arguments rather than on the notebook-level X_train, so the
    # result depends only on what the caller passes in.
    x = np.asarray(x_train, dtype=float).ravel()
    y = np.asarray(y_train, dtype=float).ravel()
    if x.size == 0:
        raise ValueError("x_train and y_train must not be empty")
    if x.size != y.size:
        raise ValueError(
            "x_train and y_train must have the same length, got %d and %d"
            % (x.size, y.size)
        )

    x_mean = x.mean()
    y_mean = y.mean()
    x_dev = x - x_mean
    # Dot products give the two sums of the closed-form solution in one pass each.
    a = np.dot(x_dev, y - y_mean) / np.dot(x_dev, x_dev)
    return a, y_mean - a * x_mean


# Fit on the training split; the cell displays the returned (a, b) pair.
calculate_a_b(X_train, y_train)

(-2.3722481101221096, 66.40921832618511)

In [3]:
# 完整代码
import numpy as np


class LinearRegression:
    """One-feature linear regression fitted with the closed-form least-squares solution.

    After ``fit`` has run, ``a`` holds the slope and ``b`` the intercept of
    the line ``y = a * x + b``.
    """

    def __init__(self):
        # Training data exactly as passed to fit(); None until fit() is called.
        self._x_train = None
        self._y_train = None
        # Slope and intercept; None until fit() is called.
        self.a = None
        self.b = None

    def fit(self, x_train, y_train):
        """
        Estimate the slope ``a`` and intercept ``b`` from training data.

        If every x value is identical the denominator of the slope is zero
        and ``a`` and ``b`` become ``nan``.

        :param x_train: training feature values (array-like, flattened to 1-D)
        :param y_train: training target values (array-like, flattened to 1-D),
            one per entry of ``x_train``
        :return: this instance, so calls can be chained
        :raises ValueError: if the inputs are empty or differ in length
        """
        x = np.asarray(x_train, dtype=float).ravel()
        y = np.asarray(y_train, dtype=float).ravel()
        if x.size == 0:
            raise ValueError("x_train and y_train must not be empty")
        if x.size != y.size:
            raise ValueError(
                "x_train and y_train must have the same length, got %d and %d"
                % (x.size, y.size)
            )

        # Store the inputs only after validation so a failed fit leaves the
        # previous state untouched.
        self._x_train = x_train
        self._y_train = y_train

        x_mean = x.mean()
        y_mean = y.mean()
        x_dev = x - x_mean
        self.a = np.dot(x_dev, y - y_mean) / np.dot(x_dev, x_dev)
        self.b = y_mean - self.a * x_mean
        return self

    def _predict(self, x):
        """Return ``a * x + b`` for a single value or a NumPy array ``x``."""
        return self.a * x + self.b

    def predict(self, x_test):
        """
        Predict targets for every value in ``x_test``.

        :param x_test: iterable of feature values
        :return: NumPy array with one prediction per input value
        :raises RuntimeError: if called before ``fit``
        """
        if self.a is None or self.b is None:
            raise RuntimeError("predict() called before fit()")
        if not isinstance(x_test, np.ndarray):
            # Materialize first so any iterable (including generators) is accepted.
            x_test = list(x_test)
        return self._predict(np.asarray(x_test, dtype=float))



In [4]:
# Quick try-out: fit on the full single-feature data and predict the same inputs.
lr = LinearRegression().fit(X, y)  # fit() returns the estimator itself
lr.predict(X)

array([29.33984545, 23.9469072 , 23.9469072 , 22.00544944, 22.00544944,
       22.00544944, 29.55556297, 29.55556297, 29.55556297, 29.55556297,
       29.55556297, 29.55556297, 29.55556297, 17.04394626, 17.04394626,
       17.04394626, 17.04394626, 17.04394626, 17.04394626, 17.04394626,
       17.04394626, 17.04394626, 17.04394626, 17.04394626, 17.04394626,
       17.04394626, 17.04394626, 17.04394626, 17.04394626, 17.04394626,
       17.04394626, 17.04394626, 17.04394626, 17.04394626, 17.04394626,
       20.92686179, 20.92686179, 20.92686179, 20.92686179, 22.86831956,
       22.86831956, 23.73118968, 23.73118968, 23.73118968, 23.73118968,
       23.73118968, 23.73118968, 23.73118968, 23.73118968, 23.73118968,
       26.1040825 , 26.1040825 , 26.1040825 , 26.1040825 , 16.82822873,
       23.73118968, 25.02549485, 29.7712805 , 19.84827414, 19.84827414,
       19.84827414, 19.84827414, 19.84827414, 19.84827414, 22.22116697,
       27.61410521, 27.61410521, 21.57401438, 21.57401438, 21.57