# 梯度下降的向量化

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
# Load the Boston housing data bundled with scikit-learn.
# NOTE(review): datasets.load_boston was removed in scikit-learn 1.2, so this
# cell only runs on older releases.
boston = datasets.load_boston()

# Keep only the samples whose target is strictly below 50.0
# (presumably 50.0 is a censoring cap in this dataset -- confirm).
# The mask is computed once and applied to features and targets alike so the
# two arrays stay row-aligned.
below_cap = boston.target < 50.0
X = boston.data[below_cap]
y = boston.target[below_cap]

In [3]:
# Split the filtered data into train and test parts with the project's own
# train_test_split helper.
# seed=666 presumably fixes the shuffle so the split is reproducible -- confirm
# against playML.model_selection.
from playML.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, seed=666)

## 使用正规方程解法

In [4]:
# Baseline: fit the project's LinearRegression with its fit_normal solver and
# time the call with the %time line magic.
from playML.linear_regression import LinearRegression

lin_reg1 = LinearRegression()
%time lin_reg1.fit_normal(X_train, y_train)
# Evaluate on the held-out test split; the value is shown as the cell output.
lin_reg1.score(X_test, y_test)

CPU times: user 827 µs, sys: 2.74 ms, total: 3.57 ms
Wall time: 8.71 ms


0.81298026026586467

## 使用梯度下降法

In [5]:
# Gradient descent with fit_gd's default settings on the raw, unscaled
# features. In the recorded run this emitted warnings and the score came out
# as nan.
lin_reg2 = LinearRegression()
lin_reg2.fit_gd(X_train, y_train)
lin_reg2.score(X_test, y_test)

  return np.sum((y - X_b.dot(theta)) ** 2) / len(X_b)
  if (abs(J(theta, X_b, y) - J(last_theta, X_b, y)) < epsilon):


nan

In [6]:
# Inspect the fitted coefficients of the gradient-descent model.
lin_reg2.coef_

array([ nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,  nan,
        nan,  nan])

### 调整步长η

In [7]:
# Refit the same model object with an explicit, much smaller step size
# (eta=1e-6) and score it on the test split.
lin_reg2.fit_gd(X_train, y_train, eta=1e-6)
lin_reg2.score(X_test, y_test)

0.27556634853389206

结果很差。。。  
那么，增加循环次数呢？

In [8]:
# Same small step size, but with n_iters raised to 1e6 (presumably the
# iteration cap -- confirm in playML). %time reports how long the fit takes.
%time lin_reg2.fit_gd(X_train, y_train, eta=1e-6, n_iters=1e6)
lin_reg2.score(X_test, y_test)

CPU times: user 36 s, sys: 159 ms, total: 36.1 s
Wall time: 36.4 s


0.75418523539807647

训练时间很长，但是损失函数仍然远未达到最小

## 使用梯度下降法前，最好进行数据归一化

In [9]:
# Import StandardScaler from the public package path. The private
# sklearn.preprocessing.data module was deprecated in scikit-learn 0.22 and
# later removed, while this path works on both old and current releases.
from sklearn.preprocessing import StandardScaler

In [10]:
# Fit the scaler on the training features only; the fitted scaler is what the
# cell displays.
standard_scaler = StandardScaler()
standard_scaler.fit(X_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [11]:
# Apply the scaler that was fitted on X_train to both splits, so the test
# features are transformed with the training-set statistics.
X_train_standard = standard_scaler.transform(X_train)
X_test_standard = standard_scaler.transform(X_test)

In [12]:
# Gradient descent again with fit_gd's default settings, this time on the
# standardized features; %time reports the fit duration.
lin_reg3 = LinearRegression()
%time lin_reg3.fit_gd(X_train_standard, y_train)
# Score on the test split transformed by the same scaler.
lin_reg3.score(X_test_standard, y_test)

CPU times: user 212 ms, sys: 5.18 ms, total: 217 ms
Wall time: 223 ms


0.81298806201222351

## 梯度下降法的优势

In [13]:
# Synthetic regression problem with more features (n) than samples (m), used
# to compare the training time of the two solvers.
m = 1000
n = 5000

# Fix the global NumPy seed so the generated data -- and therefore the timings
# measured on it -- are reproducible under "Restart & Run All".
np.random.seed(666)

# Features drawn from a standard normal distribution (mean 0, std 1).
big_X = np.random.normal(size=(m, n))

# Ground-truth parameters the fit should recover (or get as close to as
# possible): true_theta[0] is the intercept, true_theta[1:] are the n
# coefficients.
true_theta = np.random.uniform(0.0, 100.0, size=n + 1)

# Linear response plus Gaussian noise with mean 0 and standard deviation 10.
big_y = big_X.dot(true_theta[1:]) + true_theta[0] + np.random.normal(0., 10., size=m)

In [14]:
big_reg1 = LinearRegression()
# Only the training time matters here, so no train/test split is done.
%time big_reg1.fit_normal(big_X, big_y)

CPU times: user 25.9 s, sys: 643 ms, total: 26.6 s
Wall time: 9.16 s


LinearRegression()

In [15]:
big_reg2 = LinearRegression()
# big_X was drawn with mean 0 and standard deviation 1, so no feature
# standardization is needed before gradient descent.
%time big_reg2.fit_gd(big_X, big_y)

CPU times: user 8.58 s, sys: 139 ms, total: 8.72 s
Wall time: 5.01 s


LinearRegression()

这个例子中，特征数比较大，梯度下降法比正规方程解法更快（上面的记录中，墙钟时间约为 5.0 s 对 9.2 s，CPU 时间约为 8.7 s 对 26.6 s）