In [137]:
import torch

# Prefer Apple's Metal backend (MPS) when it is usable at runtime and fall back
# to the CPU otherwise. torch.backends.mps.is_available() is the runtime check;
# the older torch.has_mps attribute is deprecated.
# If this does not work, just set device to 'cpu'.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
# device = 'cpu'

In [138]:
class IntroModel(torch.nn.Module):
    """
    Sample class for SWR2: a model consisting of a single linear layer.

    Parameters
    ----------
    input_size : int
        Number of features of each input vector.
    output_size : int
        Number of values produced for each input vector.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # The notebook reads ``linear_mapping.weight`` / ``linear_mapping.bias``
        # directly, so this attribute name is part of the model's interface.
        self.linear_mapping = torch.nn.Linear(
            in_features=input_size,
            out_features=output_size,
        )

    def forward(self, input_, *args):
        """Apply the linear layer to ``input_``; extra positional ``args`` are ignored."""
        return self.linear_mapping(input_)

In [139]:
# Build a model that maps one input number to two output numbers and move it
# to the selected device.
intro_model = IntroModel(1, 2).to(device)
xx = torch.tensor([1.0], device=device)

# Call the module itself instead of .forward(): Module.__call__ dispatches to
# forward() and additionally runs any registered hooks.
intro_model(xx)

tensor([-1.1909,  0.2974], grad_fn=<AddBackward0>)

In [140]:
xx = torch.tensor([1.0, 2.0, 3.0, 4.0], device=device)

# intro_model(xx)  # RuntimeError: shape [4] is read as ONE sample with four
#                  # features, but the model expects one feature per sample.

# Reshape to [4, 1]: four samples with one feature each. Using -1 lets PyTorch
# infer the number of samples from the data.
xx = xx.view(-1, 1)

# Now it works and gives the four results in parallel. The first dimension is
# interpreted as the batch dimension and all computations are done in parallel
# along that dimension.
yy = intro_model(xx)

yy.shape  # [4, 2]

torch.Size([4, 2])

In [141]:
# The linear_mapping maps one number onto two numbers. This is done by a linear
# mapping with four parameters. The four parameters are two weights and two
# biases.

intro_model.linear_mapping.weight

Parameter containing:
tensor([[-0.9958],
        [-0.6654]], requires_grad=True)

In [142]:
# The two biases of the linear mapping, one per output number.
intro_model.linear_mapping.bias

Parameter containing:
tensor([-0.1951,  0.9628], requires_grad=True)

In [143]:
# The linear mapping multiplies the input by the weights and adds the biases.
xx = torch.tensor([3.1415], device=device)

# Recompute both outputs by hand: output_i = weight_i * input + bias_i
yy0 = intro_model.linear_mapping.weight[0] * xx + intro_model.linear_mapping.bias[0]
yy1 = intro_model.linear_mapping.weight[1] * xx + intro_model.linear_mapping.bias[1]

yy = intro_model(xx)

# Compare element-wise with a small tolerance instead of exact ==, because two
# mathematically equal floating-point computations may differ in the last bits.
torch.isclose(yy, torch.cat((yy0, yy1)))  # they are the same

tensor([True, True])

In [144]:
# Now we want to change the behavior of the intro_model, so that it produces
# two times the input as the first number and the negative input + 3 as the
# second number.

# torch.cat joins the two one-element tensors into a tensor of shape [2] that
# lives on the same device as xx (no detour through Python scalars).
yy_true = torch.cat((2 * xx, -xx + 3))

# now we let the intro_model predict the (wrong) numbers
yy = intro_model(xx)

# calculate an error (mean squared error between target and prediction)
error = torch.mean((yy_true - yy) ** 2)

# backpropagate the error to get gradients on all weights and biases
error.backward()

In [145]:
intro_model.linear_mapping.weight.grad

tensor([[-30.1789],
        [ -3.0978]])

In [146]:
intro_model.linear_mapping.bias.grad

tensor([-9.6065, -0.9861])

In [147]:
# The gradients tell us how we have to change the weights and biases to
# minimize the resulting error. We only take a small step in that direction;
# the size of this step is controlled by the learning rate.

In [148]:
learning_rate = 0.01

# One step of plain gradient descent: parameter <- parameter - lr * gradient.
# The update is done in place under torch.no_grad() so that the existing
# Parameter objects are kept and the update itself is not tracked by autograd.
with torch.no_grad():
    for param in intro_model.parameters():
        param -= learning_rate * param.grad
        # Discard the used gradient; otherwise the next backward() call would
        # add its gradient on top of this one.
        param.grad = None

In [149]:
# Now let's do this 30 times in a loop and see if the error gets smaller:

for epoch in range(30):
    # forward pass and mean squared error for the current parameters
    yy = intro_model(xx)
    error = torch.mean((yy_true - yy) ** 2)
    error.backward()

    # Gradient-descent step, done in place so the Parameter objects are kept.
    with torch.no_grad():
        for param in intro_model.parameters():
            param -= learning_rate * param.grad
            # Reset the gradient so it does not accumulate across epochs.
            param.grad = None

    print(f"Error in epoch {epoch} is: {float(error)}")

Error in epoch 0 is: 37.04347610473633
Error in epoch 1 is: 29.428565979003906
Error in epoch 2 is: 23.3790283203125
Error in epoch 3 is: 18.573074340820312
Error in epoch 4 is: 14.755064010620117
Error in epoch 5 is: 11.72191333770752
Error in epoch 6 is: 9.312274932861328
Error in epoch 7 is: 7.397978782653809
Error in epoch 8 is: 5.877198696136475
Error in epoch 9 is: 4.669041633605957
Error in epoch 10 is: 3.7092416286468506
Error in epoch 11 is: 2.946743965148926
Error in epoch 12 is: 2.3409907817840576
Error in epoch 13 is: 1.8597605228424072
Error in epoch 14 is: 1.4774566888809204
Error in epoch 15 is: 1.173740029335022
Error in epoch 16 is: 0.9324582815170288
Error in epoch 17 is: 0.7407752275466919
Error in epoch 18 is: 0.5884963274002075
Error in epoch 19 is: 0.4675213396549225
Error in epoch 20 is: 0.37141457200050354
Error in epoch 21 is: 0.2950640320777893
Error in epoch 22 is: 0.23440854251384735
Error in epoch 23 is: 0.18622197210788727
Error in epoch 24 is: 0.147940754

In [150]:
# The model can now create the desired output for this single input xx =
# 3.1415, but what happens if we give it a new input?

# The input must live on the same device as the model; otherwise the forward
# pass fails with a device-mismatch error when the model is not on the CPU.
xx = torch.tensor([1.5], device=device)
yy_true = torch.cat((2 * xx, -xx + 3))
yy = intro_model(xx)
error = torch.mean((yy_true - yy) ** 2)

print(f"Error: {float(error)}")
# For this new number we have a huge error :-(

Error: 0.5558255314826965


In [151]:
# But can we minimize the error for this value as well?

for epoch in range(30):
    # forward pass and mean squared error for the current parameters
    yy = intro_model(xx)
    error = torch.mean((yy_true - yy) ** 2)
    error.backward()

    # Gradient-descent step, done in place so the Parameter objects are kept.
    with torch.no_grad():
        for param in intro_model.parameters():
            param -= learning_rate * param.grad
            # Reset the gradient so it does not accumulate across epochs.
            param.grad = None

    print(f"Error in epoch {epoch} is: {float(error)}")

Error in epoch 0 is: 0.5558255314826965
Error in epoch 1 is: 0.5202839374542236
Error in epoch 2 is: 0.48701515793800354
Error in epoch 3 is: 0.4558735489845276
Error in epoch 4 is: 0.42672333121299744
Error in epoch 5 is: 0.3994370698928833
Error in epoch 6 is: 0.3738956153392792
Error in epoch 7 is: 0.349987268447876
Error in epoch 8 is: 0.32760781049728394
Error in epoch 9 is: 0.3066592812538147
Error in epoch 10 is: 0.28705033659935
Error in epoch 11 is: 0.26869526505470276
Error in epoch 12 is: 0.25151386857032776
Error in epoch 13 is: 0.23543114960193634
Error in epoch 14 is: 0.22037681937217712
Error in epoch 15 is: 0.20628505945205688
Error in epoch 16 is: 0.193094402551651
Error in epoch 17 is: 0.18074722588062286
Error in epoch 18 is: 0.16918955743312836
Error in epoch 19 is: 0.1583709567785263
Error in epoch 20 is: 0.14824417233467102
Error in epoch 21 is: 0.1387648731470108
Error in epoch 22 is: 0.12989172339439392
Error in epoch 23 is: 0.12158595770597458
Error in epoch 24

In [152]:
# The problem is that this might increase the error for the first number again.
# Solution: Let us do it for many numbers over and over again.

import numpy as np
xxs = np.array(np.random.normal(size=100), dtype=np.float32)

for epoch in range(30):
    np.random.shuffle(xxs)  # we don't want to have the same order in each epoch
    errors = list()
    for sample in xxs:
        # one training example: input tensor and the target it should map to
        xx = torch.tensor([sample], device=device)
        yy_true = torch.cat((2 * xx, -xx + 3))
        yy = intro_model(xx)
        error = torch.mean((yy_true - yy) ** 2)
        error.backward()

        # Gradient-descent step, done in place so the Parameter objects are kept.
        with torch.no_grad():
            for param in intro_model.parameters():
                param -= learning_rate * param.grad
                # Reset the gradient so it does not accumulate across samples.
                param.grad = None
        errors.append(float(error))
    print(f"Average Error in epoch {epoch} is: {np.mean(errors)}")

print(intro_model.linear_mapping.weight)
print(intro_model.linear_mapping.bias)

Average Error in epoch 0 is: 0.9782249208830762
Average Error in epoch 1 is: 0.11989089231647085
Average Error in epoch 2 is: 0.014665360006765694
Average Error in epoch 3 is: 0.0017985189180308226
Average Error in epoch 4 is: 0.0002200765327825138
Average Error in epoch 5 is: 2.6860218667934532e-05
Average Error in epoch 6 is: 3.289766183272036e-06
Average Error in epoch 7 is: 4.0378691707587677e-07
Average Error in epoch 8 is: 4.9406868107837457e-08
Average Error in epoch 9 is: 6.056442563018294e-09
Average Error in epoch 10 is: 7.324438680567091e-10
Average Error in epoch 11 is: 9.109616762315032e-11
Average Error in epoch 12 is: 3.490397207372986e-11
Average Error in epoch 13 is: 3.0160756531172337e-11
Average Error in epoch 14 is: 2.963679114620366e-11
Average Error in epoch 15 is: 2.956294389844238e-11
Average Error in epoch 16 is: 2.938204200775285e-11
Average Error in epoch 17 is: 2.9478133059535064e-11
Average Error in epoch 18 is: 2.954460509574375e-11
Average Error in epoch 

### Can you find a -1, 0, 2, and 3 in the weights and biases? Where are these four numbers present as well?

Yes, you can find these numbers. 2 and -1 are the weights: the single input is multiplied by 2 to produce the first output and by -1 to produce the second output.
0 and 3 are the biases that are added to the first and second output, respectively.
This corresponds to the formula that we wanted to learn from the beginning: the two outputs are `2 * x` (i.e. `2 * x + 0`) and `-x + 3`.

In [153]:
# Instantiate a second intro_model that takes a vector with two numbers as
# input and outputs a single number. Train the second intro model to produce
# the sum of the two numbers.

In [156]:
# A fresh model for the new task: two input numbers -> one output number.
intro_model2 = IntroModel(2, 1).to(device)

# One hand-made example: the target is the sum of the two inputs (1 + 2 = 3).
xx = torch.tensor([1.0, 2.0], device=device)
yy_true = torch.tensor([3.0], device=device)

# Prediction of the untrained model and its mean squared error.
yy = intro_model2(xx)
error = (yy_true - yy).pow(2).mean()

print(f"Initial error: {error.item()}")

Initial error: 1.7182968854904175


### I tried to optimize this code to run faster :)

In [157]:
# assuming that we have to train the model for a few epochs to learn the new formula
xxs = np.array(np.random.normal(size=(10000, 2)), dtype=np.float32)
xxs_tensor = torch.from_numpy(xxs).to(device)
batch_size = 1000
num_batches = len(xxs_tensor) // batch_size
learning_rate = 0.01  # constant during training, so it is set once up front
for epoch in range(50):
    xxs_tensor = xxs_tensor[torch.randperm(len(xxs_tensor))]  # we don't want to have the same order in each epoch
    errors = list()
    for i in range(num_batches):
        batch_start = i * batch_size
        batch_end = (i + 1) * batch_size
        xx_batch = xxs_tensor[batch_start:batch_end]
        # target: sum of the two input numbers, kept as a column ([batch, 1])
        # so that it lines up with the model output
        yy_true_batch = xx_batch.sum(dim=1, keepdim=True)
        yy_batch = intro_model2(xx_batch)
        error_batch = torch.mean((yy_true_batch - yy_batch) ** 2)
        error_batch.backward()

        # Gradient-descent step, done in place so the Parameter objects are kept.
        with torch.no_grad():
            for param in intro_model2.parameters():
                param -= learning_rate * param.grad
                # Reset the gradient so it does not accumulate across batches.
                param.grad = None
        errors.append(float(error_batch))
    print(f"Average Error in epoch {epoch} is: {np.mean(errors)}")

print(intro_model2.linear_mapping.weight)
print(intro_model2.linear_mapping.bias)

Average Error in epoch 0 is: 1.410851275920868
Average Error in epoch 1 is: 0.9383809208869934
Average Error in epoch 2 is: 0.6241150557994842
Average Error in epoch 3 is: 0.4150745332241058
Average Error in epoch 4 is: 0.2760907977819443
Average Error in epoch 5 is: 0.18362259715795518
Average Error in epoch 6 is: 0.12213777154684066
Average Error in epoch 7 is: 0.08124077990651131
Average Error in epoch 8 is: 0.05403101779520512
Average Error in epoch 9 is: 0.03594069890677929
Average Error in epoch 10 is: 0.02390327546745539
Average Error in epoch 11 is: 0.0159001374617219
Average Error in epoch 12 is: 0.010575468372553586
Average Error in epoch 13 is: 0.007034363644197583
Average Error in epoch 14 is: 0.004678836092352867
Average Error in epoch 15 is: 0.0031122481916099785
Average Error in epoch 16 is: 0.0020701530389487743
Average Error in epoch 17 is: 0.0013769677374511958
Average Error in epoch 18 is: 0.0009159039240330457
Average Error in epoch 19 is: 0.0006092309020459652
Aver