## 数据

In [1]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd

In [2]:
# Load the prostate data set: the file is tab-separated and its first column
# is used as the row index.
prostate = pd.read_csv(
    "../../../datasets/prostate/prostate.data",
    index_col=0,
    sep="\t",
)
prostate.head()

Unnamed: 0,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa,train
1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783,T
2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519,T
3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519,T
4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519,T
5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564,T


In [3]:
# Split on the data set's own "train" flag ("T" rows go to training, "F" rows
# to testing) and drop the flag column from both parts.
train_data = prostate.drop("train", axis=1)[prostate["train"] == "T"]
test_data = prostate.drop("train", axis=1)[prostate["train"] == "F"]

# Standardize every remaining column (predictors and the response alike).
# The scaler is fitted on the training rows only and then reused for the test
# rows. fit_transform already fits the scaler, so no separate fit() call is
# needed beforehand.
scaler = StandardScaler()
train_data_std = scaler.fit_transform(train_data)
test_data_std = scaler.transform(test_data)

# The last column is used as the response; all preceding columns are predictors.
X_train, y_train = train_data_std[:, :-1], train_data_std[:, -1]
X_test, y_test = test_data_std[:, :-1], test_data_std[:, -1]

## 线性回归

In [4]:
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.metrics import mean_squared_error

In [5]:
class LinearRegression(BaseEstimator, RegressorMixin):
    """Linear regression fitted by batch gradient descent on the mean squared error.

    Parameters
    ----------
    epochs : int, default=200
        Maximum number of gradient-descent iterations.
    learning_rate : float, default=0.1
        Step size used when updating the weights.

    Attributes
    ----------
    W : ndarray of shape (p + 1, 1)
        Weights set by ``fit``. Row 0 is the intercept; the remaining rows are
        the coefficients of the columns of ``X``, in order.
    """

    def __init__(self, epochs=200, learning_rate=0.1):
        self.epochs = epochs
        self.learning_rate = learning_rate

    def __loss_function(self, y_true, y_pred):
        """Return the mean squared error between ``y_true`` and ``y_pred``."""
        # Flatten both sides first: subtracting an (n,) vector from an (n, 1)
        # column would broadcast to an (n, n) matrix and give a wrong loss.
        residuals = np.ravel(y_true) - np.ravel(y_pred)
        return np.sum(residuals ** 2) / len(residuals)

    def fit(self, X, y=None):
        """Fit the weights by gradient descent.

        Parameters
        ----------
        X : array-like of shape (n, p)
            Training inputs.
        y : array-like of shape (n,) or (n, 1)
            Training targets. Required; the ``None`` default only keeps the
            signature unchanged.

        Returns
        -------
        self : LinearRegression
            The fitted estimator.

        Raises
        ------
        TypeError
            If ``y`` is not provided.
        """
        if y is None:
            raise TypeError("fit() requires the target values y")

        X = np.asarray(X)
        # Work with the targets as a column so they line up with X_b @ W;
        # np.asarray also lets lists and pandas Series through.
        y_col = np.asarray(y).reshape(-1, 1)

        n, p = X.shape
        X_b = np.c_[np.ones(n), X]  # prepend a column of ones for the intercept
        # Random start drawn from NumPy's global random state.
        self.W = np.random.randn(p + 1, 1)

        prev_loss = None
        for _ in range(self.epochs):
            y_pred = X_b @ self.W  # linear model
            loss = self.__loss_function(y_col, y_pred)
            # Stop once the loss of the current iteration no longer changes
            # compared with the previous iteration.
            if prev_loss is not None and abs(loss - prev_loss) < 1e-8:
                break
            prev_loss = loss
            grad = 2 / n * X_b.T @ (y_pred - y_col)  # gradient of the MSE
            self.W = self.W - self.learning_rate * grad  # weight update
        else:
            # The loop ran through all epochs without meeting the tolerance.
            print("Linear regression doesn't converge!")
        return self

    def predict(self, X):
        """Return predictions for ``X`` as an array of shape (n, 1).

        Requires ``fit`` to have been called, since it reads ``self.W``.
        """
        X = np.asarray(X)
        X_b = np.c_[np.ones(len(X)), X]
        return X_b @ self.W
        

In [6]:
# Fit the gradient-descent LinearRegression class defined in this notebook on
# the standardized training data. fit() returns the estimator itself, so the
# cell displays its repr.
lr = LinearRegression()
lr.fit(X_train, y_train)

LinearRegression(epochs=200, learning_rate=0.1)

In [7]:
# Mean squared error of the fitted model on the training split.
# The printed label reads "training sample error".
y_pred = lr.predict(X_train)
train_err = mean_squared_error(y_train, y_pred)
print("训练样本误差: {:.3f}".format(train_err))

训练样本误差: 0.306


In [8]:
# Mean squared error of the fitted model on the test split.
# The printed label reads "test sample error".
# NOTE: this rebinds y_pred, which previously held the training predictions.
y_pred = lr.predict(X_test)
test_err = mean_squared_error(y_test, y_pred)
print("测试样本误差: {:.3f}".format(test_err))

测试样本误差: 0.363


In [9]:
# Fitted weights: row 0 is the intercept, the remaining rows are the
# coefficients of the predictor columns in order.
lr.W

array([[ 3.89420688e-17],
       [ 5.92901984e-01],
       [ 2.42342835e-01],
       [-1.18042783e-01],
       [ 1.75536895e-01],
       [ 2.56370846e-01],
       [-2.38884785e-01],
       [-1.68942001e-02],
       [ 2.29053766e-01]])

### 使用 `sklearn`

In [10]:
from sklearn.linear_model import LinearRegression

In [11]:
sk_lr = LinearRegression()
sk_lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [12]:
# Mean squared error of the scikit-learn model on the training split.
# The printed label reads "training sample error".
# NOTE: this rebinds train_err, which previously held the error of `lr`.
y_train_preds = sk_lr.predict(X_train)
train_err = mean_squared_error(y_train, y_train_preds)
print("训练样本误差: {:.3f}".format(train_err))

训练样本误差: 0.306


In [13]:
# Mean squared error of the scikit-learn model on the test split.
# The printed label reads "test sample error".
# NOTE: this rebinds test_err, which previously held the error of `lr`.
y_test_preds = sk_lr.predict(X_test)
test_err = mean_squared_error(y_test, y_test_preds)
print("测试样本误差: {:.3f}".format(test_err))

测试样本误差: 0.363


In [14]:
# Render floats with three decimals. This is a global pandas display option,
# so it also affects every frame displayed after this cell.
pd.set_option("display.float_format", '{:.3f}'.format)

# Row labels: the intercept first, then the predictor names, i.e. every column
# of the training frame except the last one.
variables = ["Intercept"] + train_data.columns[:-1].tolist()

# One row per fitted parameter of the scikit-learn model, intercept on top.
pd.DataFrame({
    "变量": variables,
    "值": np.hstack((sk_lr.intercept_, sk_lr.coef_)),
})

Unnamed: 0,变量,值
0,Intercept,0.0
1,lcavol,0.593
2,lweight,0.242
3,age,-0.118
4,lbph,0.176
5,svi,0.256
6,lcp,-0.239
7,gleason,-0.017
8,pgg45,0.23
