# DL 1, Pytorch

Привет! На этом занятии мы познакомимся с PyTorch, научимся работать с GPU и вычислять градиенты. В конце попробуем написать нейросеть для распознавания рукописных цифр.

**Для этого ноутбука надо включить GPU runtime**.

In [None]:
import torch

In [None]:
# torch.sqrt requires an input tensor; calling it with no arguments raises TypeError.
torch.sqrt(torch.tensor(4.0))

In [None]:
torch.*Tensor?

## Simple Pytorch

Изучим простые методы pytorch. Они сильно напоминают numpy, и часто можно использовать имя метода из numpy в pytorch.

In [None]:
t = torch.Tensor(2, 3, 4)

In [None]:
t

In [None]:
t.size(0)

In [None]:
t.random_(10)
t

In [None]:
t = torch.zeros_like(t)
t

In [None]:
t = torch.rand(2, 3, 4)
t

In [None]:
r = torch.Tensor(t)
r.resize_(3, 5)
r

In [None]:
r.reshape(-1)

In [None]:
a, b = torch.rand(1, 4), torch.rand(1, 4)
a + b

In [None]:
a * b

In [None]:
a @ b.T

In [None]:
a, b = torch.rand(3, 4), torch.rand(4, 5)
a @ b

In [None]:
a.norm(), b.sum() # np.linalg.norm

In [None]:
a / 10

In [None]:
torch.Tensor(3, 4, 5, 2).transpose(0, 2).shape

In [None]:
t = torch.arange(10)
t, t.dtype

In [None]:
t = t.to(torch.float32)
t, t.dtype

## Autograd & GPU

Теперь поработаем с GPU и получением градиентов.

In [None]:
a, b = torch.rand(3, 4), torch.rand(4, 5)
a, b

In [None]:
a = a.to("cuda")

In [None]:
!nvidia-smi

In [None]:
a @ b

In [None]:
b = b.to("cuda")

In [None]:
a @ b

In [None]:
t.device

In [None]:
a.requires_grad_(True)

s = (a @ b).sum()
s

In [None]:
h = a @ b

In [None]:
h.backward()

In [None]:
h, s

In [None]:
s.backward()
s

In [None]:
s.size()

In [None]:
h.grad

In [None]:
a, b

In [None]:
a.grad, b.grad

In [None]:
# Two leaf tensors that autograd tracks from the moment they are created.
a = torch.rand(3, 4, requires_grad=True)
b = torch.rand(4, 5, requires_grad=True)

# Reduce the matrix product to a scalar so backward() needs no explicit gradient argument.
s = torch.matmul(a, b).sum()
s.backward()

In [None]:
a, b

In [None]:
a.grad, b.grad

## Neural Network


Давайте определим простую ML-задачу и попробуем решить её с помощью нейросети.

In [None]:
# Synthetic linear-regression data: y = X @ w_true + b_true + noise.
X = torch.rand(1000, 10)
w_true = torch.rand(10, 1) * 10
b_true = torch.tensor(3.1415926)
# The noise must be a (1000, 1) column: a flat (1000,) vector would broadcast
# against the (1000, 1) product and silently turn y into a (1000, 1000) matrix.
eps = torch.rand(1000, 1) * 1e-3
y = X @ w_true + b_true + eps

In [None]:
w = torch.rand(10, 1, requires_grad=True)
b = torch.rand(1, requires_grad=True)

In [None]:
# Out-of-place update is fine:      x = x + y
# In-place update -- DO NOT DO:     x += y

In [None]:
# Forward pass of the linear model followed by the mean-squared-error loss.
# The prediction is built out-of-place in a single expression; adding X @ w
# a second time in place would make the model 2 * X @ w + b.
y_hat = X @ w + b
L = ((y - y_hat) ** 2).mean()
L

In [None]:
L.backward()

In [None]:
w, w.grad

In [None]:
lr = 1e-2

# One manual gradient-descent step. The update is done in place so that w and b
# stay the same leaf tensors (still requiring grad), and it moves *against*
# the gradient to decrease the loss.
with torch.inference_mode():
    w -= w.grad * lr
    b -= b.grad * lr

    # Drop the accumulated gradients so the next backward() starts from scratch.
    w.grad = None
    b.grad = None

In [None]:
w, b

In [None]:
# X and y do not change inside the loop, so move them to the CPU once
# instead of repeating the transfer on every iteration.
X_cpu, y_cpu = X.to("cpu"), y.to("cpu")

for idx in range(5000):
    y_hat = X_cpu @ w + b
    L = ((y_hat - y_cpu) ** 2).mean()
    L.backward()
    # Parameter updates must not be recorded by autograd.
    with torch.inference_mode():
        w -= w.grad * lr
        b -= b.grad * lr
        # Drop the gradients so the next backward() starts from scratch.
        w.grad = None
        b.grad = None
    if idx % 100 == 0:
        # .item() prints the plain number rather than the tensor repr.
        print(f"Current Loss: {L.item()}")

In [None]:
torch.norm(w_true - w), torch.norm(b_true - b)

In [None]:
w_true.tolist(), w.tolist()

In [None]:
b_true, b

Воспользуемся высокоуровневым способом описания нейросетей в PyTorch:

In [None]:
class Linear(torch.nn.Module):
    """Thin module wrapper around a single ``torch.nn.Linear`` layer."""

    def __init__(self, in_shape, out_shape):
        """Create the wrapped layer mapping ``in_shape`` features to ``out_shape``."""
        super().__init__()

        self.layer = torch.nn.Linear(in_features=in_shape, out_features=out_shape)

    def forward(self, x):
        """Apply the wrapped linear layer to ``x`` and return the result."""
        projected = self.layer(x)
        return projected

In [None]:
# Model, optimizer and loss for the 10-feature regression problem.
model = Linear(in_shape=10, out_shape=1)
optimizer = torch.optim.SGD(params=model.parameters(), lr=lr)
criterion = torch.nn.MSELoss()

In [None]:
model.layer.weight

In [None]:
model.to("cuda")
model.layer.weight

In [None]:
# Move the data to the GPU.
X, y = X.to("cuda"), y.to("cuda")

for idx in range(5000):
    # Forward pass and loss.
    y_hat = model(X)
    L = criterion(y_hat, y)

    # Backward pass, parameter update, then gradient reset for the next step.
    L.backward()
    optimizer.step()
    optimizer.zero_grad()

    if not idx % 100:
        print(f"Current Loss: {L}")

In [None]:
list(model.parameters())

In [None]:
# compare matrices

## MNIST

Перейдём к другой задаче — распознаванию рукописных цифр. Загрузим датасет MNIST и напишем нейросеть, которая отличает цифры меньше 5 от цифр, больших или равных 5.

In [1]:
from keras.datasets import mnist
import torch


(train_X, train_y), (test_X, test_y) = mnist.load_data()

In [2]:
print("\n".join(" ".join("X" if ch > 128 else "." for ch in row) for row in train_X[90].tolist()))

. . . . . . . . . . . . . . . . . . . . . . . . . . . .
. . . . . . . . . . . . . . . . . . . . . . . . . . . .
. . . . . . . . . . . . . . . . . . X X . . . . . . . .
. . . . . . . . . . . . . . . . X X X X . . . . . . . .
. . . . . . . . . . . . . . . . X X X X . . . . . . . .
. . . . . . . . . . . . . . X X X X . . . . . . . . . .
. . . . . . . . . . . . . X X X . . . . . . . . . . . .
. . . . . . . . . . . . X X X . . . . . . . . . . . . .
. . . . . . . . . . X X X X . . . . . . . . . . . . . .
. . . . . . . . . . X X X . . . . . . . . . . . . . . .
. . . . . . . . . X X X . . . . . . . . . . . . . . . .
. . . . . . . . . X X X . . . . . . . . . . . . . . . .
. . . . . . . . X X X X . . . . . . . . X . . . . . . .
. . . . . . . . X X X . . . . . . . X X X X . . . . . .
. . . . . . . . X X X . . . . X X X X X X X . . . . . .
. . . . . . . X X X X . . X X X . . . X X X . . . . . .
. . . . . . . . X X X X . X X . . . . X X X . . . . . .
. . . . . . . . X X X X X X . . . X . X X X . . 

In [3]:
train_X[0].shape

(28, 28)

Создадим датасет для нашей задачи:

In [4]:
train_y[0]

5

In [6]:
# Divide the raw pixel values by 255 (presumably 8-bit intensities -- confirm against the dataset).
X_train_tensor = torch.as_tensor(train_X).div(255)
X_test_tensor = torch.as_tensor(test_X).div(255)

# Binary targets as floats: 1.0 where the label is below 5, 0.0 otherwise.
y_train_tensor = torch.as_tensor(train_y < 5).mul(1.0)
y_test_tensor = torch.as_tensor(test_y < 5).mul(1.0)

In [7]:
y_train_tensor[0]

tensor(0.)

Опишем простую двухслойную сеть:

In [8]:
torch.nn.Linear?

In [9]:
class NNClassifier(torch.nn.Module):
    """Two-layer perceptron: Linear(784 -> 100), Tanh, Linear(100 -> 1), Sigmoid."""

    def __init__(self):
        super().__init__()

        # Submodules are registered in the order in which forward() applies them.
        self.layer_0 = torch.nn.Linear(in_features=28 * 28, out_features=100)
        self.act_0 = torch.nn.Tanh()
        self.layer_1 = torch.nn.Linear(in_features=100, out_features=1)
        self.act_1 = torch.nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for ``x``, whose last dimension must be 784."""
        hidden = self.act_0(self.layer_0(x))
        logits = self.layer_1(hidden)
        return self.act_1(logits)

Напишем для неё Loss-функцию

(подсказка: вспомните logistic regression)

In [10]:
class NLLLoss(torch.nn.Module):
    """Binary negative log-likelihood (binary cross-entropy), averaged over all elements.

    Args:
        eps: Predicted probabilities are clamped to ``[eps, 1 - eps]`` before
            the logarithm is taken. The default is chosen for float32 inputs.
    """

    def __init__(self, eps=1e-7):
        super().__init__()
        self.eps = eps

    def forward(self, y_pred, y_true):
        """Return the mean of ``-(y * log(p) + (1 - y) * log(1 - p))``.

        ``y_pred`` holds probabilities and ``y_true`` holds 0/1 targets. The
        arithmetic is elementwise, so the two tensors should have the same
        shape; otherwise they are broadcast against each other.
        """
        # A saturated prediction (exactly 0 or 1) would give log(0) = -inf and
        # 0 * -inf = nan, so keep the probabilities strictly inside (0, 1).
        p = y_pred.clamp(min=self.eps, max=1 - self.eps)
        return (-(y_true * torch.log(p) + (1 - y_true) * torch.log(1 - p))).mean()

Повторим цикл обучения для новой сети!

In [12]:
model = NNClassifier()
criterion = NLLLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1)

model

NNClassifier(
  (layer_0): Linear(in_features=784, out_features=100, bias=True)
  (act_0): Tanh()
  (layer_1): Linear(in_features=100, out_features=1, bias=True)
  (act_1): Sigmoid()
)

In [13]:
y_hat = model(X_train_tensor[:10].reshape(10, -1))
y_hat

tensor([[0.4907],
        [0.5019],
        [0.5106],
        [0.4836],
        [0.5218],
        [0.5381],
        [0.4761],
        [0.4797],
        [0.4929],
        [0.4680]], grad_fn=<SigmoidBackward0>)

In [14]:
y_hat.size(), y_train_tensor[:10].size()

(torch.Size([10, 1]), torch.Size([10]))

In [15]:
criterion(y_hat[:, 0], y_train_tensor[:10])

tensor(0.7064, grad_fn=<MeanBackward0>)

In [19]:
# Flatten the first 10000 images once; the batch is the same on every iteration.
X_batch = X_train_tensor[:10000].reshape(10000, -1)
y_batch = y_train_tensor[:10000]

for idx in range(50000):
    y_hat = model(X_batch)
    # The model returns shape (N, 1) while the targets are (N,). Take column 0
    # so the loss is computed elementwise instead of broadcasting to (N, N).
    L = criterion(y_hat[:, 0], y_batch)
    L.backward()
    optimizer.step()
    optimizer.zero_grad()
    if idx % 100 == 0:
        print(f"Current Loss: {L}")