## Домашнее задание №6. Рекуррентные сети 2

### Генерация данных

In [29]:
import torch
from random import randint

# Synthetic sequence task: every sample is N random digits (0-9). The target
# repeats the first digit at position 0 and, at every later position, holds
# (digit + first digit) modulo 10.
N = 20            # sequence length
M = 1000          # total number of samples
K = int(M * 0.8)  # the first K samples form the training split, the rest the test split

x_train = torch.zeros(K, N, dtype=int)
y_train = torch.zeros(K, N, dtype=int)
x_test = torch.zeros(M - K, N, dtype=int)
y_test = torch.zeros(M - K, N, dtype=int)

for j in range(M):
    # Digits are drawn one by one, left to right, from the `random` module.
    X = torch.tensor([randint(0, 9) for _ in range(N)], dtype=torch.int64)
    # Sums lie in 0..18, so "% 10" equals "subtract 10 when the sum is >= 10".
    Y = (X + X[0]) % 10
    # The first target is the first digit itself, not twice that digit.
    Y[0] = X[0]
    if j >= K:
        x_test[j - K] = X
        y_test[j - K] = Y
    else:
        x_train[j] = X
        y_train[j] = Y

In [30]:
x_train.shape, x_test.shape

(torch.Size([800, 20]), torch.Size([200, 20]))

In [91]:
import time


def _run_batches(model, loss, inputs, targets, batch_size, trainer=None):
    """Run one pass over ``inputs`` in consecutive mini-batches.

    Args:
        model: callable module; its output is reshaped to ``(-1, num_classes)``
            where ``num_classes`` is the size of the output's last dimension.
        loss: criterion called as ``loss(predictions, flat_targets)``.
        inputs: indexable batch-first collection of input sequences.
        targets: tensor of targets aligned with ``inputs`` along dimension 0.
        batch_size: number of sequences per mini-batch; the final batch may
            be smaller so that no sample is skipped.
        trainer: optimizer. When given, every batch is followed by a backward
            pass and an optimizer step; when ``None`` the pass only measures.

    Returns:
        ``(mean_loss, accuracy)`` where ``mean_loss`` is the mean of the
        per-batch losses and ``accuracy`` is the fraction of correctly
        predicted target positions. Both are ``nan`` when ``inputs`` is empty.
    """
    loss_sum, batches = 0., 0
    correct, seen = 0, 0
    for start in range(0, len(inputs), batch_size):
        X_batch = inputs[start:start + batch_size]
        Y_batch = targets[start:start + batch_size].flatten()
        if trainer is not None:
            trainer.zero_grad()
        # Call the module itself (not .forward) so registered hooks still run.
        y_pred = model(X_batch)
        # Merge the batch and position axes: every position is one sample
        # for the classification loss.
        y_pred = y_pred.view(-1, y_pred.shape[-1])
        l = loss(y_pred, Y_batch)
        if trainer is not None:
            l.backward()
            trainer.step()
        loss_sum += l.item()
        correct += (y_pred.argmax(dim=1) == Y_batch).sum().item()
        batches += 1
        seen += Y_batch.shape[0]
    if batches == 0:
        # Nothing was processed; avoid dividing by zero.
        return float('nan'), float('nan')
    return loss_sum / batches, correct / seen


def train_model(model, num_epochs, batch_size, data, target, lr=.001):
    """Train ``model`` with Adam and cross-entropy, printing metrics per epoch.

    Args:
        model: ``torch.nn.Module`` mapping a batch of index sequences to
            per-position class scores.
        num_epochs: number of passes over the training split.
        batch_size: number of sequences per mini-batch (must be positive).
        data: pair ``(train_inputs, test_inputs)``.
        target: pair ``(train_targets, test_targets)``.
        lr: learning rate passed to Adam.

    Raises:
        ValueError: if ``batch_size`` is not positive.

    After every epoch one line is printed with the epoch index, the wall time
    of the epoch (training plus evaluation), and the loss and accuracy on both
    splits. The function returns ``None``; after at least one epoch the model
    is left in evaluation mode.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    train, test = data[0], data[1]
    y_train, y_test = target[0], target[1]

    loss = torch.nn.CrossEntropyLoss()
    trainer = torch.optim.Adam(model.parameters(), lr=lr)

    for ep in range(num_epochs):
        start = time.time()

        model.train()
        train_loss, train_acc = _run_batches(
            model, loss, train, y_train, batch_size, trainer)

        # Evaluation needs neither train-mode layer behaviour nor gradients.
        model.eval()
        with torch.no_grad():
            test_loss, test_acc = _run_batches(
                model, loss, test, y_test, batch_size)

        print("ep: {}, time: {:.3f}, train_loss: {}, train_acc: {}, test_loss: {}, test_acc: {}".format(
            ep, time.time() - start, train_loss, train_acc,
            test_loss, test_acc)
        )

### Модели

### 1. RNN

In [98]:
class RNN(torch.nn.Module):
    """Embedding -> single-layer ``torch.nn.RNN`` -> per-step linear classifier.

    Args:
        dictionary_size: number of distinct input tokens (embedding rows).
        embedding_size: dimensionality of each token embedding.
        num_hiddens: size of the recurrent hidden state.
        num_classes: number of output scores produced for every time step.

    The defaults reproduce the original fixed architecture (10 tokens,
    20-dimensional embeddings, 128 hidden units, 10 classes).
    """

    def __init__(self, dictionary_size=10, embedding_size=20, num_hiddens=128,
                 num_classes=10):
        super().__init__()
        self.embedding = torch.nn.Embedding(dictionary_size, embedding_size)
        # batch_first=True: batched inputs are laid out as (batch, seq, feature).
        self.rnn = torch.nn.RNN(embedding_size, num_hiddens, batch_first=True)
        self.linear = torch.nn.Linear(num_hiddens, num_classes)

    def forward(self, seq, state=None):
        """Return class scores for every position of ``seq``.

        Args:
            seq: integer tensor of token indices.
            state: optional initial hidden state forwarded to ``torch.nn.RNN``;
                ``None`` lets the layer start from its default (zero) state.

        Returns:
            Tensor with the same leading dimensions as ``seq`` and a trailing
            dimension of ``num_classes`` scores.
        """
        embed = self.embedding(seq)
        # Forward the caller-supplied state; it used to be silently ignored.
        o, _ = self.rnn(embed, state)
        return self.linear(o)

In [99]:
model = RNN()
train_model(model, 120, 20, (x_train, x_test), (y_train, y_test))

ep: 0, time: 0.339, train_loss: 2.3097122609615326, train_acc: 0.113375, test_loss: 2.3002854108810427, test_acc: 0.13425
ep: 1, time: 0.221, train_loss: 2.294386678934097, train_acc: 0.1389375, test_loss: 2.3018823146820067, test_acc: 0.1395
ep: 2, time: 0.277, train_loss: 2.291664016246796, train_acc: 0.1433125, test_loss: 2.302675747871399, test_acc: 0.1455
ep: 3, time: 0.253, train_loss: 2.2895939648151398, train_acc: 0.1460625, test_loss: 2.30360586643219, test_acc: 0.146
ep: 4, time: 0.278, train_loss: 2.287821626663208, train_acc: 0.1488125, test_loss: 2.304696869850159, test_acc: 0.1485
ep: 5, time: 0.218, train_loss: 2.2862353146076204, train_acc: 0.1506875, test_loss: 2.305756115913391, test_acc: 0.14925
ep: 6, time: 0.227, train_loss: 2.2846920788288116, train_acc: 0.153, test_loss: 2.306724500656128, test_acc: 0.1505
ep: 7, time: 0.240, train_loss: 2.2830352604389192, train_acc: 0.1551875, test_loss: 2.3074434995651245, test_acc: 0.15125
ep: 8, time: 0.227, train_loss: 2.28

In [100]:
# Print the first five test sequences next to their targets and the model's
# per-position argmax predictions. Inference needs no gradients, so the
# forward passes run under torch.no_grad() to avoid building autograd graphs.
with torch.no_grad():
    for i in range(5):
        print(f'x_test: {x_test[i]}\ny_test: {y_test[i]}\ny_pred: {model(x_test[i]).argmax(dim=1)}\n=====')

x_test: tensor([9, 2, 9, 7, 7, 2, 8, 6, 4, 4, 4, 8, 5, 3, 6, 3, 4, 8, 9, 5])
y_test: tensor([9, 1, 8, 6, 6, 1, 7, 5, 3, 3, 3, 7, 4, 2, 5, 2, 3, 7, 8, 4])
y_pred: tensor([9, 1, 8, 6, 6, 1, 7, 5, 3, 3, 3, 7, 4, 2, 5, 2, 3, 7, 8, 4])
=====
x_test: tensor([2, 3, 7, 5, 0, 9, 6, 3, 1, 9, 5, 3, 5, 5, 3, 4, 6, 9, 6, 5])
y_test: tensor([2, 5, 9, 7, 2, 1, 8, 5, 3, 1, 7, 5, 7, 7, 5, 6, 8, 1, 8, 7])
y_pred: tensor([2, 5, 9, 7, 2, 1, 8, 5, 3, 1, 7, 5, 7, 7, 5, 6, 8, 1, 8, 7])
=====
x_test: tensor([4, 5, 3, 8, 8, 8, 8, 9, 3, 8, 9, 7, 6, 8, 0, 5, 8, 4, 0, 3])
y_test: tensor([4, 9, 7, 2, 2, 2, 2, 3, 7, 2, 3, 1, 0, 2, 4, 9, 2, 8, 4, 7])
y_pred: tensor([4, 9, 7, 2, 2, 2, 2, 3, 7, 2, 3, 1, 0, 2, 4, 9, 2, 8, 4, 7])
=====
x_test: tensor([7, 0, 9, 6, 0, 3, 1, 4, 3, 7, 6, 0, 9, 5, 4, 8, 2, 6, 3, 8])
y_test: tensor([7, 7, 6, 3, 7, 0, 8, 1, 0, 4, 3, 7, 6, 2, 1, 5, 9, 3, 0, 5])
y_pred: tensor([7, 7, 6, 3, 7, 0, 8, 1, 0, 4, 3, 7, 6, 2, 1, 5, 9, 3, 0, 5])
=====
x_test: tensor([0, 8, 3, 9, 0, 6, 0, 3, 2, 9, 7, 2, 

**Вывод:** Как метрики, так и визуальная проверка показывают, что наша модель RNN справилась с поставленной задачей.

### 2. GRU

In [126]:
class GRU(torch.nn.Module):
    """Sequence tagger: embedding -> recurrent layer -> per-step linear head.

    Args:
        rnnClass: recurrent layer class, instantiated as
            ``rnnClass(embedding_size, num_hiddens, batch_first=True)``; its
            forward call must return a pair whose first item is the per-step
            output.
        dictionary_size: number of distinct input tokens.
        embedding_size: dimensionality of the token embeddings.
        num_hiddens: hidden size of the recurrent layer.
        num_classes: number of scores produced for every time step.
    """

    def __init__(self, rnnClass, dictionary_size, embedding_size, num_hiddens, num_classes):
        super().__init__()

        self.num_hiddens = num_hiddens
        self.embedding = torch.nn.Embedding(dictionary_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = torch.nn.Linear(num_hiddens, num_classes)

    def forward(self, X):
        """Map token indices ``X`` to class scores for every position."""
        embedded = self.embedding(X)
        # Only the per-step outputs are used; the final state is discarded.
        step_outputs, _final_state = self.hidden(embedded)
        return self.output(step_outputs)

In [127]:
model = GRU(torch.nn.GRU, 10, 20, 128, 10)
train_model(model, 20, 20, (x_train, x_test), (y_train, y_test))

ep: 0, time: 0.573, train_loss: 2.3048239827156065, train_acc: 0.108875, test_loss: 2.2988078117370607, test_acc: 0.1215
ep: 1, time: 0.577, train_loss: 2.296009248495102, train_acc: 0.1315, test_loss: 2.2978891849517824, test_acc: 0.1285
ep: 2, time: 0.556, train_loss: 2.292725068330765, train_acc: 0.132875, test_loss: 2.297342896461487, test_acc: 0.1355
ep: 3, time: 0.532, train_loss: 2.2898297905921936, train_acc: 0.1370625, test_loss: 2.2966761589050293, test_acc: 0.14025
ep: 4, time: 0.530, train_loss: 2.2863627552986143, train_acc: 0.141125, test_loss: 2.2953736543655396, test_acc: 0.14075
ep: 5, time: 0.552, train_loss: 2.2809235990047454, train_acc: 0.1485625, test_loss: 2.291536235809326, test_acc: 0.14925
ep: 6, time: 0.515, train_loss: 2.2677515387535094, train_acc: 0.1610625, test_loss: 2.2668930768966673, test_acc: 0.1755
ep: 7, time: 0.501, train_loss: 2.182557448744774, train_acc: 0.2201875, test_loss: 2.107844829559326, test_acc: 0.2715
ep: 8, time: 0.525, train_loss: 1

In [107]:
# Print the first five test sequences next to their targets and the model's
# per-position argmax predictions. Inference needs no gradients, so the
# forward passes run under torch.no_grad() to avoid building autograd graphs.
with torch.no_grad():
    for i in range(5):
        print(f'x_test: {x_test[i]}\ny_test: {y_test[i]}\ny_pred: {model(x_test[i]).argmax(dim=1)}\n=====')

x_test: tensor([9, 2, 9, 7, 7, 2, 8, 6, 4, 4, 4, 8, 5, 3, 6, 3, 4, 8, 9, 5])
y_test: tensor([9, 1, 8, 6, 6, 1, 7, 5, 3, 3, 3, 7, 4, 2, 5, 2, 3, 7, 8, 4])
y_pred: tensor([9, 1, 8, 6, 6, 1, 7, 5, 3, 3, 3, 7, 4, 2, 5, 2, 3, 7, 8, 4])
=====
x_test: tensor([2, 3, 7, 5, 0, 9, 6, 3, 1, 9, 5, 3, 5, 5, 3, 4, 6, 9, 6, 5])
y_test: tensor([2, 5, 9, 7, 2, 1, 8, 5, 3, 1, 7, 5, 7, 7, 5, 6, 8, 1, 8, 7])
y_pred: tensor([2, 5, 9, 7, 2, 1, 8, 5, 3, 1, 7, 5, 7, 7, 5, 6, 8, 1, 8, 7])
=====
x_test: tensor([4, 5, 3, 8, 8, 8, 8, 9, 3, 8, 9, 7, 6, 8, 0, 5, 8, 4, 0, 3])
y_test: tensor([4, 9, 7, 2, 2, 2, 2, 3, 7, 2, 3, 1, 0, 2, 4, 9, 2, 8, 4, 7])
y_pred: tensor([4, 9, 7, 2, 2, 2, 2, 3, 7, 2, 3, 1, 0, 2, 4, 9, 2, 8, 4, 7])
=====
x_test: tensor([7, 0, 9, 6, 0, 3, 1, 4, 3, 7, 6, 0, 9, 5, 4, 8, 2, 6, 3, 8])
y_test: tensor([7, 7, 6, 3, 7, 0, 8, 1, 0, 4, 3, 7, 6, 2, 1, 5, 9, 3, 0, 5])
y_pred: tensor([7, 7, 6, 3, 7, 0, 8, 1, 0, 4, 3, 7, 6, 2, 1, 5, 9, 3, 0, 5])
=====
x_test: tensor([0, 8, 3, 9, 0, 6, 0, 3, 2, 9, 7, 2, 

**Вывод:** Модель GRU сошлась за значительно меньшее количество эпох, нежели простая RNN.

### 3. LSTM

In [128]:
class LSTM(torch.nn.Module):
    """Sequence tagger: embedding -> recurrent layer -> per-step linear head.

    Args:
        rnnClass: recurrent layer class, instantiated as
            ``rnnClass(embedding_size, num_hiddens, batch_first=True)``; the
            first item of its forward result is taken as the per-step output.
        dictionary_size: number of distinct input tokens.
        embedding_size: dimensionality of the token embeddings.
        num_hiddens: hidden size of the recurrent layer.
        num_classes: number of scores produced for every time step.
    """

    def __init__(self, rnnClass, dictionary_size, embedding_size, num_hiddens, num_classes):
        super().__init__()

        self.num_hiddens = num_hiddens
        self.embedding = torch.nn.Embedding(dictionary_size, embedding_size)
        self.hidden = rnnClass(embedding_size, num_hiddens, batch_first=True)
        self.output = torch.nn.Linear(num_hiddens, num_classes)

    def forward(self, X):
        """Map token indices ``X`` to class scores for every position."""
        embedded = self.embedding(X)
        # Keep the per-step outputs and drop whatever state follows them.
        step_outputs, *_states = self.hidden(embedded)
        return self.output(step_outputs)

In [131]:
model = LSTM(torch.nn.LSTM, 10, 20, 128, 10)
train_model(model, 40, 20, (x_train, x_test), (y_train, y_test))

ep: 0, time: 0.553, train_loss: 2.303363800048828, train_acc: 0.112875, test_loss: 2.2988298654556276, test_acc: 0.131
ep: 1, time: 0.616, train_loss: 2.2960260272026063, train_acc: 0.1323125, test_loss: 2.296048808097839, test_acc: 0.14
ep: 2, time: 0.588, train_loss: 2.291681706905365, train_acc: 0.142125, test_loss: 2.294112968444824, test_acc: 0.14275
ep: 3, time: 0.635, train_loss: 2.2857984244823455, train_acc: 0.1466875, test_loss: 2.2893948554992676, test_acc: 0.14575
ep: 4, time: 0.642, train_loss: 2.25736209154129, train_acc: 0.18025, test_loss: 2.2266605854034425, test_acc: 0.2075
ep: 5, time: 0.603, train_loss: 2.174029541015625, train_acc: 0.22125, test_loss: 2.133845663070679, test_acc: 0.2275
ep: 6, time: 0.596, train_loss: 2.07178615629673, train_acc: 0.244875, test_loss: 2.0395862340927122, test_acc: 0.256
ep: 7, time: 0.629, train_loss: 1.9723564386367798, train_acc: 0.2651875, test_loss: 1.929352045059204, test_acc: 0.294
ep: 8, time: 0.643, train_loss: 1.87925521433

In [132]:
# Print the first five test sequences next to their targets and the model's
# per-position argmax predictions. Inference needs no gradients, so the
# forward passes run under torch.no_grad() to avoid building autograd graphs.
with torch.no_grad():
    for i in range(5):
        print(f'x_test: {x_test[i]}\ny_test: {y_test[i]}\ny_pred: {model(x_test[i]).argmax(dim=1)}\n=====')

x_test: tensor([9, 2, 9, 7, 7, 2, 8, 6, 4, 4, 4, 8, 5, 3, 6, 3, 4, 8, 9, 5])
y_test: tensor([9, 1, 8, 6, 6, 1, 7, 5, 3, 3, 3, 7, 4, 2, 5, 2, 3, 7, 8, 4])
y_pred: tensor([9, 1, 8, 6, 6, 1, 7, 5, 3, 3, 3, 7, 4, 2, 5, 2, 3, 7, 8, 4])
=====
x_test: tensor([2, 3, 7, 5, 0, 9, 6, 3, 1, 9, 5, 3, 5, 5, 3, 4, 6, 9, 6, 5])
y_test: tensor([2, 5, 9, 7, 2, 1, 8, 5, 3, 1, 7, 5, 7, 7, 5, 6, 8, 1, 8, 7])
y_pred: tensor([2, 5, 9, 7, 2, 1, 8, 5, 3, 1, 7, 5, 7, 7, 5, 6, 8, 1, 8, 7])
=====
x_test: tensor([4, 5, 3, 8, 8, 8, 8, 9, 3, 8, 9, 7, 6, 8, 0, 5, 8, 4, 0, 3])
y_test: tensor([4, 9, 7, 2, 2, 2, 2, 3, 7, 2, 3, 1, 0, 2, 4, 9, 2, 8, 4, 7])
y_pred: tensor([4, 9, 7, 2, 2, 2, 2, 3, 7, 2, 3, 1, 0, 2, 4, 9, 2, 8, 4, 7])
=====
x_test: tensor([7, 0, 9, 6, 0, 3, 1, 4, 3, 7, 6, 0, 9, 5, 4, 8, 2, 6, 3, 8])
y_test: tensor([7, 7, 6, 3, 7, 0, 8, 1, 0, 4, 3, 7, 6, 2, 1, 5, 9, 3, 0, 5])
y_pred: tensor([7, 7, 6, 3, 7, 0, 8, 1, 0, 4, 3, 7, 6, 2, 1, 5, 9, 3, 0, 5])
=====
x_test: tensor([0, 8, 3, 9, 0, 6, 0, 3, 2, 9, 7, 2, 

**Вывод:** Модель LSTM сошлась за значительно меньшее количество эпох, нежели простая RNN, но чуть большее, чем GRU.