In [2]:
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

### Nesterov Momentum для линейной регрессии

In [3]:
N = 100
X = np.random.uniform(low=0, high=100, size=N)

In [4]:
Y = 2*X + 1 + np.random.normal(scale=5, size=N)

In [28]:
EPOCHS = 120
LEARNING_RATE = 0.0001

costs = []
params = []
preds = []
params = np.random.normal(size=(2,))

# Переменные для записи старых импульсов
vt_0_old = 0
vt_1_old = 0
# Коэффициент сохранения
y_k = 0.8


# Функция просчета нашего сдвига по градиенту с учетом импульса, с учетом, что на первом шаге импульса еще нет
def count_vt(i, der, LEARNING_RATE, y_k, vt_old):
    if i ==0:
        return LEARNING_RATE*der
    else:
        return y_k*vt_old + LEARNING_RATE*der

for i in range(EPOCHS):
    # Для статистики считаем общее предсказание и функцию потерь
    predictions = params[0] + params[1] * X
    preds.append(predictions)
    
    cost = np.sum(np.square(predictions - Y)) / (2 * len(predictions))
    costs.append(cost)
    
    i = np.random.choice(len(X))
    current_prediction = params[0] + params[1] * X[i]
    
    # Расчет значений градиента по Nesterov-momentum
    der_0 = (current_prediction - Y[i]) - y_k*vt_0_old
    der_1 = (current_prediction - Y[i]) * X[i] - y_k*vt_1_old
    
    # Сдвиг для каждого параметра  
    vt_0 = count_vt(i, der_0, LEARNING_RATE, y_k, vt_0_old)
    vt_1 = count_vt(i, der_1, LEARNING_RATE, y_k, vt_1_old)
    
    params[0] -= vt_0
    params[1] -= vt_1
    
    # Перезапись старого значения
    vt_0_old = vt_0
    vt_1_old = vt_1
    
    print("param0:", params[0], "param1:", params[1], "cost:", cost)    

param0: -2.6623163928614697 param1: 0.12728435376953093 cost: 5540.604847359278
param0: -2.650816165121629 param1: 0.519050639132576 cost: 4914.765741391404
param0: -2.63565226892413 param1: 1.0981301574823592 cost: 3122.456952090423
param0: -2.6205087159173157 param1: 1.6531164458069951 cost: 1216.7649066314182
param0: -2.606263463865337 param1: 2.2429889706889443 cost: 222.61177621613894
param0: -2.5951963229764368 param1: 2.703681289312286 cost: 58.98153522057322
param0: -2.586644800359331 param1: 3.0716527807664735 cost: 571.1316072505491
param0: -2.5831669890705515 param1: 3.265493603824238 cost: 1383.3738792981032
param0: -2.5894858865601194 param1: 2.6975947505757376 cost: 1955.1863015017434
param0: -2.5951898873645147 param1: 2.2328668505574325 cost: 560.8640300115314
param0: -2.6015924351237785 param1: 1.7351712594770934 cost: 54.106302729854924
param0: -2.6055733012306486 param1: 1.3620023151029845 cost: 144.50226081395257
param0: -2.606146653959247 param1: 1.141538391510403 

### AdaGrad

In [10]:
EPOCHS = 120
LEARNING_RATE = 0.1

costs = []
params = []
preds = []
params = np.random.normal(size=(2,))

# Сумма квадратов обновлений
G_0 = 0
G_1 = 0

# Сглаживающий параметр
e = 10**(-8)

for _ in range(EPOCHS):
    # Для статистики считаем общее предсказание и функцию потерь
    predictions = params[0] + params[1] * X
    preds.append(predictions)
    
    cost = np.sum(np.square(predictions - Y)) / (2 * len(predictions))
    costs.append(cost)
    
    i = np.random.choice(len(X))
    current_prediction = params[0] + params[1] * X[i]
    
    G_0 += (current_prediction - Y[i])**2
    G_1 += ((current_prediction - Y[i]) * X[i])**2
    
    params[0] -= (LEARNING_RATE/(G_0+e)**0.5) * (current_prediction - Y[i])
    params[1] -= (LEARNING_RATE/(G_1+e)**0.5) * ((current_prediction - Y[i]) * X[i])
    
    print("param0:", params[0], "param1:", params[1], "cost:", cost)    

param0: -1.308879212336194 param1: 1.0923254213636568 cost: 1697.4295768407767
param0: -1.2855365728277337 param1: 1.0948628713345687 cost: 1387.33195163969
param0: -1.2611351243538782 param1: 1.1008758178246925 cost: 1378.9377426307472
param0: -1.2093089805637225 param1: 1.1338127218630276 cost: 1360.5283084709226
param0: -1.1307524634880595 param1: 1.2254511445489324 cost: 1265.4197855094699
param0: -1.107320137659902 param1: 1.2349461597497962 cost: 1021.6271592568561
param0: -1.1019263260960575 param1: 1.2355602238508705 cost: 997.2876375122567
param0: -1.0784201698636198 param1: 1.2453312146661009 cost: 995.5757211354047
param0: -1.022781969508197 param1: 1.314590477456369 cost: 970.8830275147989
param0: -0.9845151133125375 param1: 1.3495843843528805 cost: 808.6552393464158
param0: -0.9444277671369686 param1: 1.3964051571293357 cost: 732.0647404369472
param0: -0.9370689525188693 param1: 1.3968385014952283 cost: 635.97505678657
param0: -0.9309548427174336 param1: 1.3975368445925826

### RMSProp для линейной регрессии

In [6]:
EPOCHS = 120
LEARNING_RATE = 0.1

costs = []
params = []
preds = []
params = np.random.normal(size=(2,))

# Усреднённый по истории квадрат градиента (t-1)
EG_0_old = 0
EG_1_old = 0

# Сглаживающий параметр
e = 10**(-8)

# Коэффициент сохранения
y_k = 0.8

for _ in range(EPOCHS):
    # Для статистики считаем общее предсказание и функцию потерь
    predictions = params[0] + params[1] * X
    preds.append(predictions)
    
    cost = np.sum(np.square(predictions - Y)) / (2 * len(predictions))
    costs.append(cost)
    
    i = np.random.choice(len(X))
    current_prediction = params[0] + params[1] * X[i]
    
    # Усреднённый по истории квадрат градиента
    EG_0 = y_k*EG_0_old + (1-y_k)*(current_prediction - Y[i])**2
    EG_1 = y_k*EG_1_old + (1-y_k)*((current_prediction - Y[i]) * X[i])**2
    
    params[0] -= (LEARNING_RATE/(EG_0+e)**0.5) * (current_prediction - Y[i])
    params[1] -= (LEARNING_RATE/(EG_1+e)**0.5) * ((current_prediction - Y[i]) * X[i])
    
    EG_0_old = EG_0
    EG_1_old = EG_1
    
    print("param0:", params[0], "param1:", params[1], "cost:", cost)    

param0: -2.1460073074376123 param1: -0.48320500115396864 cost: 11682.23037983212
param0: -1.9956196096786367 param1: -0.3480635979383116 cost: 9844.020879507198
param0: -1.7822482004450957 param1: -0.12511714341454058 cost: 8807.655656139264
param0: -1.6202933249416591 param1: 0.045443237728341135 cost: 7227.6058730375335
param0: -1.4939282645580318 param1: 0.1700688635382715 cost: 6124.4627036918
param0: -1.3488368979911671 param1: 0.3420557825103201 cost: 5375.527733843488
param0: -1.3289574316758404 param1: 0.34485056493409305 cost: 4424.667768922682
param0: -1.2105872424958075 param1: 0.4602147962651033 cost: 4408.577840522835
param0: -1.1909915325631402 param1: 0.46300697351171727 cost: 3822.2410789470373
param0: -1.1408596163459164 param1: 0.48691156250336054 cost: 3807.3241133187403
param0: -0.9999133630088866 param1: 0.6496923801889416 cost: 3689.4275826179264
param0: -0.9878477159131482 param1: 0.6502711830679253 cost: 2949.1596505598836
param0: -0.9176039509560603 param1: 0.6

param0: 1.0090282544440334 param1: 1.9855019113900316 cost: 12.136133795271148
param0: 0.9383005531704751 param1: 1.9490638281512755 cost: 11.731544656602887
param0: 1.0753681551903846 param1: 2.0102939647692257 cost: 16.220051385863904
param0: 1.108345938972832 param1: 2.0175914456103063 cost: 11.141468130637456
param0: 0.961272410480021 param1: 1.839568509569693 cost: 11.364391485520532
param0: 1.1161272508903137 param1: 1.986045025996058 cost: 53.719842910352526
param0: 1.228106236400134 param1: 2.000984887923209 cost: 11.606351606847976
param0: 1.3111785653985495 param1: 2.0631911865425168 cost: 11.110060806445897
param0: 1.3775453402614484 param1: 2.1753629452111536 cost: 16.965785539258043
