# 4.1 Learning is parameter estimation

## 4.1.1 A hot problem

`t_c` holds the temperatures in Celsius, and `t_u` holds the readings in the unknown units.

In [4]:
import torch

# Temperature readings as 1-D tensors: t_c in Celsius, t_u in the unknown units.
t_c = torch.tensor(
    [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]
)
t_u = torch.tensor(
    [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]
)

## 4.1.2 Choosing a linear model as a first try

In [None]:
# Illustrative statement of the linear model (this cell has no execution count):
# the Celsius temperature is approximated as weight * unknown-unit reading + bias.
# NOTE(review): w and b are not defined at this point in the notebook.
t_c = w * t_u + b

## 4.1.3 Less loss is what you want

## 4.1.4 From problem to PyTorch

In [5]:
def model(t_u, w, b):
    """Apply the linear model to the inputs.

    Args:
        t_u: Input readings (a tensor in this notebook; any operand that
            supports ``*`` and ``+`` works).
        w: Weight (slope) of the model.
        b: Bias (offset) of the model.

    Returns:
        The prediction ``w * t_u + b``.
    """
    scaled = w * t_u
    return scaled + b

def loss_fn(t_p, t_c):
    """Return the mean squared error between predictions and targets.

    Args:
        t_p: Predicted values.
        t_c: Target values.

    Returns:
        The mean of the element-wise squared differences ``(t_p - t_c) ** 2``.
    """
    residual = t_p - t_c
    return (residual ** 2).mean()

# Start from w = 1 and b = 0, then evaluate the untrained model on the data.
w, b = torch.ones(1), torch.zeros(1)

t_p = model(t_u, w, b)
print(t_p)

# Mean squared error of these initial predictions against the Celsius targets.
loss = loss_fn(t_p, t_c)
print(loss)

tensor([35.7000, 55.9000, 58.2000, 81.9000, 56.3000, 48.9000, 33.9000, 21.8000,
        48.4000, 60.4000, 68.4000])
tensor(1763.8846)


## 4.1.5 Down along the gradient

This code is saying that in a small neighborhood of the current values of w and b, a unit increase in w leads to some change in the loss. If the change is negative, you need to increase w to minimize the loss, whereas if the change is positive, you need to decrease w.

In [6]:
# Step used for the central finite difference, and step size for the update.
delta = 0.1
learning_rate = 1e-2

# Estimate d(loss)/dw numerically around the current w, then move w against it.
loss_rate_of_change_w = (
    loss_fn(model(t_u, w + delta, b), t_c)
    - loss_fn(model(t_u, w - delta, b), t_c)
) / (2.0 * delta)
w = w - learning_rate * loss_rate_of_change_w

# Same estimate for b; note that it is evaluated with the already-updated w.
loss_rate_of_change_b = (
    loss_fn(model(t_u, w, b + delta), t_c)
    - loss_fn(model(t_u, w, b - delta), t_c)
) / (2.0 * delta)
b = b - learning_rate * loss_rate_of_change_b

## 4.1.6 Getting analytical

In a model with two or more parameters, you compute the individual derivatives of the loss with respect to each parameter and put them in a vector of derivatives: the gradient.

In [9]:
def dloss_fn(t_p, t_c):
    """Return the derivative of the squared error with respect to ``t_p``.

    This is the element-wise derivative of ``(t_p - t_c) ** 2``; no averaging
    over samples is applied here.

    Args:
        t_p: Predicted values.
        t_c: Target values.

    Returns:
        ``2 * (t_p - t_c)``, with the same shape as the difference.
    """
    return 2 * (t_p - t_c)

In [10]:
def dmodel_dw(t_u, w, b):
    """Return the derivative of the linear model ``w * t_u + b`` w.r.t. ``w``.

    The derivative is the input ``t_u`` itself; ``w`` and ``b`` are accepted
    but not used.
    """
    return t_u
def dmodel_db(t_u, w, b):
    """Return the derivative of the linear model ``w * t_u + b`` w.r.t. ``b``.

    The derivative is the constant ``1.0`` (a Python float, not a tensor);
    none of the arguments are used.
    """
    return 1.0

In [11]:
def grad_fn(t_u, t_c, t_p, w, b):
    """Return the gradient of the mean squared loss w.r.t. ``w`` and ``b``.

    Applies the chain rule: the per-sample loss derivative from ``dloss_fn``
    is multiplied by the model derivative for each parameter, and the result
    is averaged over the samples.

    Args:
        t_u: Model inputs.
        t_c: Target values.
        t_p: Model predictions for ``t_u`` (expected to be a tensor so that
            ``.mean()`` is available).
        w: Current weight.
        b: Current bias.

    Returns:
        A tensor stacking ``[dloss/dw, dloss/db]``.
    """
    # Evaluate the loss derivative once and reuse it for both parameters.
    dloss_dtp = dloss_fn(t_p, t_c)
    dloss_dw = dloss_dtp * dmodel_dw(t_u, w, b)
    dloss_db = dloss_dtp * dmodel_db(t_u, w, b)
    return torch.stack([dloss_dw.mean(), dloss_db.mean()])

## 4.1.7 The training loop

In [12]:
def training_loop(n_epochs, learning_rate, params, t_u, t_c):
    """Run gradient descent on ``params`` and return the final values.

    Each epoch unpacks ``params`` into weight and bias, computes predictions
    and loss, takes one step against the gradient, and prints the loss that
    was measured before the step.

    Args:
        n_epochs: Number of update steps to perform.
        learning_rate: Factor applied to the gradient at every step.
        params: Two-element tensor holding ``(w, b)``.
        t_u: Model inputs.
        t_c: Target values.

    Returns:
        The parameters after the last update.
    """
    for step in range(n_epochs):
        weight, bias = params

        predictions = model(t_u, weight, bias)
        current_loss = loss_fn(predictions, t_c)

        gradient = grad_fn(t_u, t_c, predictions, weight, bias)
        update = learning_rate * gradient
        params = params - update

        # Epochs are reported 1-based.
        print('Epoch %d, Loss %f' % (step + 1, float(current_loss)))
    return params

### 1. loss is becoming inf
This result is a clear sign that `params` is receiving updates that are too large.

In [16]:
# Train on the raw inputs with learning_rate = 1e-2, starting from w = 1, b = 0.
training_loop(
    n_epochs=100,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_u,
    t_c=t_c,
)

Epoch 1, Loss 1763.884644
Epoch 2, Loss 5802484.500000
Epoch 3, Loss 19408031744.000000
Epoch 4, Loss 64915909902336.000000
Epoch 5, Loss 217130439561707520.000000
Epoch 6, Loss 726257020202974707712.000000
Epoch 7, Loss 2429181687085405986357248.000000
Epoch 8, Loss 8125117236949438203699396608.000000
Epoch 9, Loss 27176865195881116022129584766976.000000
Epoch 10, Loss 90901075478458130961171361977860096.000000
Epoch 11, Loss inf
Epoch 12, Loss inf
Epoch 13, Loss inf
Epoch 14, Loss inf
Epoch 15, Loss inf
Epoch 16, Loss inf
Epoch 17, Loss inf
Epoch 18, Loss inf
Epoch 19, Loss inf
Epoch 20, Loss inf
Epoch 21, Loss inf
Epoch 22, Loss inf
Epoch 23, Loss nan
Epoch 24, Loss nan
Epoch 25, Loss nan
Epoch 26, Loss nan
Epoch 27, Loss nan
Epoch 28, Loss nan
Epoch 29, Loss nan
Epoch 30, Loss nan
Epoch 31, Loss nan
Epoch 32, Loss nan
Epoch 33, Loss nan
Epoch 34, Loss nan
Epoch 35, Loss nan
Epoch 36, Loss nan
Epoch 37, Loss nan
Epoch 38, Loss nan
Epoch 39, Loss nan
Epoch 40, Loss nan
Epoch 41, Loss

tensor([nan, nan])

### 2. Choose smaller learning rate
How can you limit the magnitude of `learning_rate * grad`?

Well, that process looks easy. You could simply choose a smaller learning_rate. 

In [18]:
# Same run as before, but with a smaller learning rate.
training_loop(
    n_epochs=100,
    learning_rate=1e-4,  # previously 1e-2
    params=torch.tensor([1.0, 0.0]),
    t_u=t_u,
    t_c=t_c,
)

Epoch 1, Loss 1763.884644
Epoch 2, Loss 323.090546
Epoch 3, Loss 78.929634
Epoch 4, Loss 37.552845
Epoch 5, Loss 30.540285
Epoch 6, Loss 29.351152
Epoch 7, Loss 29.148882
Epoch 8, Loss 29.113848
Epoch 9, Loss 29.107145
Epoch 10, Loss 29.105242
Epoch 11, Loss 29.104168
Epoch 12, Loss 29.103222
Epoch 13, Loss 29.102297
Epoch 14, Loss 29.101379
Epoch 15, Loss 29.100470
Epoch 16, Loss 29.099548
Epoch 17, Loss 29.098631
Epoch 18, Loss 29.097715
Epoch 19, Loss 29.096796
Epoch 20, Loss 29.095884
Epoch 21, Loss 29.094959
Epoch 22, Loss 29.094049
Epoch 23, Loss 29.093134
Epoch 24, Loss 29.092213
Epoch 25, Loss 29.091297
Epoch 26, Loss 29.090382
Epoch 27, Loss 29.089460
Epoch 28, Loss 29.088549
Epoch 29, Loss 29.087635
Epoch 30, Loss 29.086718
Epoch 31, Loss 29.085808
Epoch 32, Loss 29.084888
Epoch 33, Loss 29.083965
Epoch 34, Loss 29.083057
Epoch 35, Loss 29.082142
Epoch 36, Loss 29.081219
Epoch 37, Loss 29.080309
Epoch 38, Loss 29.079393
Epoch 39, Loss 29.078474
Epoch 40, Loss 29.077559
Epoch 

tensor([ 0.2327, -0.0438])

### 3. Change the input
You can see that the first-epoch gradient for the weight is about 50 times larger than the gradient for the bias, so the weight and bias live in differently scaled spaces.

You have a simpler way to keep things in check: change the inputs so that the gradients aren’t so different. You can make sure that the range of the input doesn’t get too far from the range of -1.0 to 1.0, roughly speaking. 

In [19]:
# Scale the inputs by 0.1 so the weight and bias gradients are less different in size.
t_un = 0.1 * t_u

In [20]:
# Train on the rescaled inputs t_un, with the learning rate back at 1e-2.
training_loop(
    n_epochs=100,
    learning_rate=1e-2,
    params=torch.tensor([1.0, 0.0]),
    t_u=t_un,
    t_c=t_c,
)

Epoch 1, Loss 80.364342
Epoch 2, Loss 37.574917
Epoch 3, Loss 30.871077
Epoch 4, Loss 29.756193
Epoch 5, Loss 29.507149
Epoch 6, Loss 29.392458
Epoch 7, Loss 29.298828
Epoch 8, Loss 29.208717
Epoch 9, Loss 29.119417
Epoch 10, Loss 29.030487
Epoch 11, Loss 28.941875
Epoch 12, Loss 28.853565
Epoch 13, Loss 28.765556
Epoch 14, Loss 28.677851
Epoch 15, Loss 28.590431
Epoch 16, Loss 28.503321
Epoch 17, Loss 28.416496
Epoch 18, Loss 28.329975
Epoch 19, Loss 28.243738
Epoch 20, Loss 28.157801
Epoch 21, Loss 28.072151
Epoch 22, Loss 27.986799
Epoch 23, Loss 27.901731
Epoch 24, Loss 27.816956
Epoch 25, Loss 27.732466
Epoch 26, Loss 27.648256
Epoch 27, Loss 27.564342
Epoch 28, Loss 27.480711
Epoch 29, Loss 27.397358
Epoch 30, Loss 27.314295
Epoch 31, Loss 27.231512
Epoch 32, Loss 27.149006
Epoch 33, Loss 27.066790
Epoch 34, Loss 26.984844
Epoch 35, Loss 26.903173
Epoch 36, Loss 26.821791
Epoch 37, Loss 26.740675
Epoch 38, Loss 26.659838
Epoch 39, Loss 26.579279
Epoch 40, Loss 26.498987
Epoch 41,

tensor([ 2.7553, -2.5162])