# 随机梯度下降法

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Seed NumPy's global RNG so that Restart & Run All regenerates exactly the
# same synthetic data (and the same draws for any later np.random call).
SEED = 666
np.random.seed(SEED)

# Number of synthetic samples.
m = 100000

# One standard-normal feature; X is the same data viewed as an (m, 1) column.
x = np.random.normal(size=m)
X = x.reshape(-1,1)
# Targets follow y = 4x + 3 plus Gaussian noise with standard deviation 3.
y = 4.*x + 3. + np.random.normal(0,3,size=m)

In [4]:
# 损失函数
def J(theta, X_b, y):
    """
    Mean squared error of the linear model ``X_b.dot(theta)`` against ``y``.

    :param theta: parameter vector
    :param X_b: design matrix whose rows are samples
    :param y: target values, one per row of ``X_b``
    :return: sum of squared residuals divided by ``len(y)``, or
             ``float('inf')`` if the evaluation raises an ordinary exception
    """
    try:
        return np.sum((y - X_b.dot(theta))**2) / len(y)
    except Exception:
        # Best-effort fallback: treat a failed evaluation as an infinitely bad
        # loss. Only ``Exception`` is caught so that KeyboardInterrupt and
        # SystemExit still propagate and a long run can be interrupted.
        return float('inf')
    
def derivative_J(theta:np.ndarray, X_b:np.ndarray, y:np.ndarray):
    """
    Gradient of the mean-squared-error loss evaluated at ``theta``.

    :param theta: parameter vector at which the gradient is taken
    :param X_b: design matrix whose rows are samples
    :param y: target values, one per row of ``X_b``
    :return: ``X_b.T.dot(X_b.dot(theta) - y) * 2 / len(y)``
    """
    # Prediction error for every sample under the current parameters.
    residual = X_b.dot(theta) - y
    sample_count = len(y)
    return X_b.T.dot(residual) * 2. / sample_count

# 批量梯度下降法
def gradient_descent(X_b, y, initial_theta, eta=0.01, n_iters=1e4, epsilon=1e-8):
    """
    Batch gradient descent on the loss ``J`` using the gradient ``derivative_J``.

    :param X_b: design matrix passed through to ``J`` and ``derivative_J``
    :param y: target values
    :param initial_theta: starting parameter vector (not modified in place)
    :param eta: step size applied to the gradient
    :param n_iters: maximum number of update steps
    :param epsilon: stop as soon as the loss changes by less than this amount
                    between two consecutive parameter vectors
    :return: the parameter vector after the last update that was performed
    """
    theta = initial_theta
    # Loss at the current theta. It is carried across iterations so each
    # step evaluates J once instead of recomputing the previous loss.
    last_cost = J(theta, X_b, y)
    cur_iter = 0
    while cur_iter < n_iters:
        gradient = derivative_J(theta, X_b, y)
        theta = theta - eta * gradient
        cost = J(theta, X_b, y)
        if abs(cost - last_cost) < epsilon:
            break
        last_cost = cost
        cur_iter += 1
    return theta

## 批量梯度下降法效果

In [6]:
%%time
# Put a column of ones in front of the features so theta[0] is the intercept.
X_b = np.column_stack((np.ones(len(X)), X))
initial_theta = np.zeros(X_b.shape[-1])
eta = 0.01
# The true slope and intercept are known, so the result is simply compared by
# eye; no train/test split is made.
theta = gradient_descent(X_b, y, initial_theta, eta=eta)

CPU times: user 1.36 s, sys: 93.2 ms, total: 1.45 s
Wall time: 1.45 s


In [7]:
theta

array([ 3.01042744,  4.00071587])

## 随机梯度下降法效果

In [8]:
def derivative_J_sgd(theta:np.ndarray, X_b_i:np.ndarray, y_i):
    """
    Stochastic search direction computed from the given sample only.

    :param theta: current parameter vector
    :param X_b_i: the selected sample's row of the design matrix
    :param y_i: the target value belonging to that sample
    :return: ``X_b_i.T.dot(X_b_i.dot(theta) - y_i) * 2``
    """
    # Prediction error of the chosen sample under the current parameters.
    residual = X_b_i.dot(theta) - y_i
    return X_b_i.T.dot(residual) * 2.

In [9]:
def sgd(X_b, y, initial_theta, n_iters, t0=5, t1=50):
    """
    Stochastic gradient descent with a decaying learning rate.

    Each step draws one row index uniformly at random (with replacement, via
    the global ``np.random`` state) and moves ``theta`` against the search
    direction returned by ``derivative_J_sgd`` for that single sample.

    :param X_b: design matrix; one row is used per step
    :param y: target values, indexed in step with ``X_b``
    :param initial_theta: starting parameter vector (not modified in place)
    :param n_iters: number of single-sample update steps to perform
    :param t0: numerator of the learning-rate schedule ``t0 / (t + t1)``
    :param t1: offset in the schedule's denominator; a larger value gives a
               smaller initial step
    :return: the parameter vector after ``n_iters`` updates
    """
    def learning_rate(t):
        # Step size shrinks as the step counter t grows.
        return t0/(t+t1)

    theta = initial_theta
    for cur_iter in range(n_iters):
        # Pick one sample at random.
        rand_i = np.random.randint(len(X_b))
        gradient = derivative_J_sgd(theta, X_b[rand_i], y[rand_i])
        # Move against the search direction by the current learning rate.
        theta = theta - learning_rate(cur_iter) * gradient
    return theta

In [10]:
%%time
# Rebuild the design matrix (ones column first) so this cell does not depend
# on the batch-gradient-descent cell having been run.
X_b = np.column_stack((np.ones(len(X)), X))
initial_theta = np.zeros(X_b.shape[-1])
# Take a third as many single-sample steps as there are rows in X_b.
theta = sgd(X_b, y, initial_theta, n_iters=X_b.shape[0] // 3)

CPU times: user 276 ms, sys: 6.11 ms, total: 282 ms
Wall time: 283 ms


In [11]:
theta

array([ 3.02984824,  3.9936953 ])

结论：批量梯度下降法和随机梯度下降法最终得到的参数相差不大；但随机梯度下降法每次迭代只使用一个样本，这里总共只进行了 m/3 次单样本更新（还不到对全部样本的一轮遍历），计算量小得多，耗时也短得多。