SGD (Stochastic Gradient Descent)

In [57]:
import pandas as pd
import numpy as np

from sklearn.linear_model import SGDRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

import warnings
warnings.filterwarnings("ignore")

In [30]:
data = pd.read_csv('advertising.csv')

In [19]:
data.head()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
0,1,230.1,37.8,69.2,22.1
1,2,44.5,39.3,45.1,10.4
2,3,17.2,45.9,69.3,9.3
3,4,151.5,41.3,58.5,18.5
4,5,180.8,10.8,58.4,12.9


In [20]:
data.info

<bound method DataFrame.info of      Unnamed: 0     TV  Radio  Newspaper  Sales
0             1  230.1   37.8       69.2   22.1
1             2   44.5   39.3       45.1   10.4
2             3   17.2   45.9       69.3    9.3
3             4  151.5   41.3       58.5   18.5
4             5  180.8   10.8       58.4   12.9
..          ...    ...    ...        ...    ...
195         196   38.2    3.7       13.8    7.6
196         197   94.2    4.9        8.1    9.7
197         198  177.0    9.3        6.4   12.8
198         199  283.6   42.0       66.2   25.5
199         200  232.1    8.6        8.7   13.4

[200 rows x 5 columns]>

In [21]:
data.describe()

Unnamed: 0.1,Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0,200.0
mean,100.5,147.0425,23.264,30.554,14.0225
std,57.879185,85.854236,14.846809,21.778621,5.217457
min,1.0,0.7,0.0,0.3,1.6
25%,50.75,74.375,9.975,12.75,10.375
50%,100.5,149.75,22.9,25.75,12.9
75%,150.25,218.825,36.525,45.1,17.4
max,200.0,296.4,49.6,114.0,27.0


In [24]:
X = data[['TV', 'Radio', 'Newspaper']].values
Y = data[['Sales']].values

normalization of features

In [25]:
X = (X - X.mean(axis = 0) ) / X.std(axis = 0, ddof=1)

In [27]:
X[:5]

array([[ 0.9674246 ,  0.97906559,  1.77449253],
       [-1.19437904,  1.0800974 ,  0.66790272],
       [-1.51235985,  1.52463736,  1.77908419],
       [ 0.05191939,  1.21480648,  1.28318502],
       [ 0.39319551, -0.83950698,  1.27859336]])

In [31]:
Y[:5]

array([[22.1],
       [10.4],
       [ 9.3],
       [18.5],
       [12.9]])

Add [1, 1, 1, ...., 1]

In [32]:
X = np.hstack((np.ones((200,1)), X))

In [33]:
X[:5]

array([[ 1.        ,  0.9674246 ,  0.97906559,  1.77449253],
       [ 1.        , -1.19437904,  1.0800974 ,  0.66790272],
       [ 1.        , -1.51235985,  1.52463736,  1.77908419],
       [ 1.        ,  0.05191939,  1.21480648,  1.28318502],
       [ 1.        ,  0.39319551, -0.83950698,  1.27859336]])

In [36]:
def mserror(Y, Y_pred):
    return np.sum((Y_pred - Y)**2) / Y.shape[0]

In [44]:
def linear_prediction(X, w):
    return np.dot(X, w)

row to column

In [35]:
def reshape_to_row(mat):
    return np.reshape(mat, (mat.shape[0],1))

SGD_Step, train_ind - index pow Xi, eta - step GD

In [37]:
def stochastic_gradient_step(X, Y, w, train_ind, eta=0.01):
    X_vec = reshape_to_row(X[train_ind]).T
    Y_vec = reshape_to_row(Y[train_ind])
    Xw = np.dot(X_vec,w)
    delta = w - eta * 2 / X.shape[0] * np.dot(X_vec.T, (Xw - Y_vec))
    return delta

In [40]:
stochastic_gradient_step(X, Y, np.zeros((4,1)), 1, 1)

array([[ 0.104     ],
       [-0.12421542],
       [ 0.11233013],
       [ 0.06946188]])

In [45]:
def stochastic_gradient_descent(X, Y, w_init, eta=1e-2, max_iter=1e4,
                                min_weight_dist=1e-8, seed=42, verbose=False):
    # Инициализируем расстояние между векторами весов на соседних
    # итерациях большим числом. 
    weight_dist = np.inf
    # Инициализируем вектор весов
    w_old = w_init
    # Сюда будем записывать ошибки на каждой итерации
    errors = []
    # Счетчик итераций
    iter_num = 0
    # Будем порождать псевдослучайные числа 
    # (номер объекта, который будет менять веса), а для воспроизводимости
    # этой последовательности псевдослучайных чисел используем seed.
    np.random.seed(seed)        
    # Основной цикл
    while weight_dist > min_weight_dist and iter_num < max_iter:
        # порождаем псевдослучайный 
        # индекс объекта обучающей выборки
        random_ind = np.random.randint(X.shape[0])        
        # Ваш код здесь
        w_new = stochastic_gradient_step(X, Y, w_old, random_ind)
        weight_dist = np.linalg.norm(w_old - w_new)
        error = mserror(Y, linear_prediction(X, w_old))
        w_old = w_new
        errors.append(error)
        
        #Отладка
        iter_num += 1
        if verbose:
            print('iter_num = ', iter_num, '; random_ind = ', random_ind,
                  'weight_dist = ', weight_dist, '; w_old = ', w_old,
                  'w_new = ', w_new)
    return w_old, errors

In [49]:
w_init = np.zeros((4,1))

In [53]:
stoch_grad_desc_weights, stoch_errors_by_iter = stochastic_gradient_descent(X, Y, w_init, max_iter = 1e5)

In [54]:
print ('w, найденное с помощью стохастического градиентного спуска, равно ', stoch_grad_desc_weights)

w, найденное с помощью стохастического градиентного спуска, равно  [[1.39903864e+01]
 [3.89172998e+00]
 [2.79230916e+00]
 [6.17758541e-03]]


In [64]:
stoch_errors_by_iter[-1]

2.7871848010184426

In [71]:
n_iter = 10000
model = SGDRegressor(max_iter=n_iter, alpha=0.01)
model.fit(X, Y)

SGDRegressor(alpha=0.01, max_iter=10000)

In [72]:

print(mean_squared_error(Y, model.predict(X)))

2.788116053017394


In [73]:
#Coefficeint of SGDRegressor model
model.coef_

array([ 6.74994652e+00,  3.88926805e+00,  2.75771243e+00, -2.78079168e-03])