In [24]:
import numpy as np
import pandas as pd

### 加载数据

In [25]:
df = pd.read_csv('abalone.txt', names=['y', 'x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7'], sep='\t')
df.head(2)

Unnamed: 0,y,x0,x1,x2,x3,x4,x5,x6,x7
0,1,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,1,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7


In [27]:
X = df.iloc[:, 1:].values
Y = df.iloc[:, 0].values

X = np.concatenate([np.ones((X.shape[0], 1)), X], axis=1)

X.shape, Y.shape

((4177, 9), (4177,))

### 正规方程

In [28]:
class NormEquationRegression:
    
    def __init__(self, alpha=0.01):
        self._alpha = alpha
        
    def fit(self, X, Y):
        Y = Y.reshape((-1, 1))
        m, n = X.shape
        
        l2 = np.eye(n)
        l2[0, 0] = 0
        
        self._weight = np.linalg.inv(X.T @ X + self._alpha * l2) @ X.T @ Y
        
    def predict(self, X):
        return X @ self._weight
    
    def mse(self, Y, Y_pre):
        return np.mean(np.square(Y - Y_pre))
    
    def r2(self, Y, Y_pre):
        Y = Y.reshape((-1, 1))
        return 1 - np.sum(np.square(Y - Y_pre)) / np.sum(np.square(Y - np.mean(Y)))

In [29]:
regression = NormEquationRegression(alpha=0.01)
regression.fit(X, Y)

In [30]:
Y_pre = regression.predict(X)

In [31]:
regression.mse(Y, Y_pre), regression.r2(Y, Y_pre)

(0.68334334055992807, 0.011015464546781373)

### sklearn 实现

In [32]:
from sklearn.linear_model import Ridge
from sklearn import metrics

In [33]:
ridge = Ridge(alpha=0.01, solver='cholesky')
ridge.fit(X, Y)

Ridge(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='cholesky', tol=0.001)

In [34]:
Y_pre = ridge.predict(X)

In [35]:
metrics.mean_squared_error(Y, Y_pre), metrics.r2_score(Y, Y_pre)

(0.66847189809461249, 0.011015464546781373)