### Задание 1.
Обучите нейронную сеть решать шифр Цезаря.

Что необходимо сделать:

Написать алгоритм шифра Цезаря для генерации выборки (сдвиг на К каждой буквы. Например, при сдвиге на 2 буква “А” переходит в букву “В” и тп)
Сделать нейронную сеть
Обучить ее (вход - зашифрованная фраза, выход - дешифрованная фраза)
Проверить качество

In [1]:
import random
import torch
import time

key = 10
vocab = [char for char in 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ']

def encrypt(text):
    indexes = [vocab.index(char) for char in text]
    encrypted_indexes = [(idx + key) % len(vocab) for idx in indexes]
    encrypted_chars = [vocab[idx] for idx in encrypted_indexes]
    encrypted = ''.join(encrypted_chars)
    return encrypted

num_examples = 128
message_length = 32

def dataset(num_examples):
    dataset = []
    for x in range(num_examples):
        ex_out = ''.join([random.choice(vocab) for x in range(message_length)])
        ex_in = encrypt(''.join(ex_out))
        ex_in = [vocab.index(x) for x in ex_in]
        ex_out = [vocab.index(x) for x in ex_out]
        dataset.append([torch.tensor(ex_in), torch.tensor(ex_out)])
    return dataset


In [2]:
embedding_dim = 10
hidden_dim = 10
vocab_size = len(vocab)

embed = torch.nn.Embedding(vocab_size, embedding_dim)
lstm = torch.nn.LSTM(embedding_dim, hidden_dim)
linear = torch.nn.Linear(hidden_dim, vocab_size)
softmax = torch.nn.functional.softmax
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(list(embed.parameters()) +
                             list(lstm.parameters()) +
                             list(linear.parameters()), lr=0.001)

def zero_hidden():
    return (torch.zeros(1, 1, hidden_dim),
            torch.zeros(1, 1, hidden_dim))

num_epochs = 5

accuracies, max_accuracy = [], 0
for x in range(num_epochs):
    for encrypted, original in dataset(num_examples):
        lstm_in = embed(encrypted)
        lstm_in = lstm_in.unsqueeze(1)
        lstm_out, lstm_hidden = lstm(lstm_in, zero_hidden())
        scores = linear(lstm_out)
        scores = scores.transpose(1, 2)
        original = original.unsqueeze(1)
        loss = loss_fn(scores, original) 
        loss.backward()
        optimizer.step()

with torch.no_grad():
    matches, total = 0, 0
    for encrypted, original in dataset(num_examples):
        lstm_in = embed(encrypted)
        lstm_in = lstm_in.unsqueeze(1)
        lstm_out, lstm_hidden = lstm(lstm_in, zero_hidden())
        scores = linear(lstm_out)
        predictions = softmax(scores, dim=2)
        _, batch_out = predictions.max(dim=2)
        batch_out = batch_out.squeeze(1)
        matches += torch.eq(batch_out, original).sum().item()
        total += torch.numel(batch_out)
    accuracy = matches / total
    print('Accuracy: {:4.2f}%'.format(accuracy * 100))


Accuracy: 99.73%


In [4]:
embedding_dim = 10
hidden_dim = 10
vocab_size = len(vocab)

class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embed = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.linear = torch.nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, sentences, state=None):
        embed = self.embed(sentences)
        o, s = self.rnn(embed)
        out = self.linear(o)
        return out

model = Network()

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

for ep in range(10):
    start = time.time()
    train_loss = 0.
    train_passed = 0
    for encrypted, original in dataset(num_examples):
        optimizer.zero_grad()
        answers = model.forward(encrypted.unsqueeze(1))
        answers = answers.view(-1, vocab_size)
        loss = criterion(answers, original)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_passed += 1
    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

with torch.no_grad():
    matches, total = 0, 0
    for encrypted, original in dataset(num_examples):
        answers = model.forward(encrypted.unsqueeze(1))
        predictions = torch.nn.functional.softmax(answers, dim=2)
        _, batch_out = predictions.max(dim=2)
        batch_out = batch_out.squeeze(1)
        matches += torch.eq(batch_out, original).sum().item()
        total += torch.numel(batch_out)
    accuracy = matches / total
    print('Accuracy: {:4.2f}%'.format(accuracy * 100))


Epoch 0. Time: 0.092, Train loss: 3.199
Epoch 1. Time: 0.070, Train loss: 2.515
Epoch 2. Time: 0.070, Train loss: 1.941
Epoch 3. Time: 0.073, Train loss: 1.479
Epoch 4. Time: 0.084, Train loss: 1.154
Epoch 5. Time: 0.076, Train loss: 0.921
Epoch 6. Time: 0.072, Train loss: 0.727
Epoch 7. Time: 0.076, Train loss: 0.596
Epoch 8. Time: 0.070, Train loss: 0.500
Epoch 9. Time: 0.071, Train loss: 0.412
Accuracy: 100.00%


## Задание 2
Выполнить практическую работу из лекционного ноутбука.

Построить RNN-ячейку на основе полносвязных слоев
Применить построенную ячейку для генерации текста с выражениями героев сериала “Симпсоны”

In [5]:
import re
import time
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from string import ascii_lowercase
from sklearn.model_selection import train_test_split

In [6]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,0,10368,35,29,"Lisa Simpson: Maggie, look. What's that?",235000,True,9,5.0,Lisa Simpson,Simpson Home,"Maggie, look. What's that?",maggie look whats that,4.0
1,1,10369,35,30,Lisa Simpson: Lee-mur. Lee-mur.,237000,True,9,5.0,Lisa Simpson,Simpson Home,Lee-mur. Lee-mur.,lee-mur lee-mur,2.0
2,2,10370,35,31,Lisa Simpson: Zee-boo. Zee-boo.,239000,True,9,5.0,Lisa Simpson,Simpson Home,Zee-boo. Zee-boo.,zee-boo zee-boo,2.0
3,3,10372,35,33,Lisa Simpson: I'm trying to teach Maggie that ...,245000,True,9,5.0,Lisa Simpson,Simpson Home,I'm trying to teach Maggie that nature doesn't...,im trying to teach maggie that nature doesnt e...,24.0
4,4,10374,35,35,"Lisa Simpson: It's like an ox, only it has a h...",254000,True,9,5.0,Lisa Simpson,Simpson Home,"It's like an ox, only it has a hump and a dewl...",its like an ox only it has a hump and a dewlap...,18.0


In [7]:
sents = df['normalized_text'].tolist()
text = [[char for char in sent] for sent in sents if isinstance(sent, str)]


In [8]:
CHAR_TO_INDEX = {w: i for i, w in enumerate(ascii_lowercase, 1)}
CHAR_TO_INDEX[' '] = 27

In [9]:
np.mean([len(sent) for sent in text])

46.36635754292535

In [11]:
MAX_LEN = 50

X = torch.zeros((len(text), MAX_LEN), dtype=int)
for i in range(len(text)):
    for j, w in enumerate(text[i]):
        if j >= MAX_LEN:
            break
        if w.lower() in CHAR_TO_INDEX:
            X[i][j] = CHAR_TO_INDEX[w.lower()]

X[:10]

tensor([[13,  1,  7,  7,  9,  5, 27, 12, 15, 15, 11, 27, 23,  8,  1, 20, 19, 27,
         20,  8,  1, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [12,  5,  5,  0, 13, 21, 18, 27, 12,  5,  5,  0, 13, 21, 18,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [26,  5,  5,  0,  2, 15, 15, 27, 26,  5,  5,  0,  2, 15, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 9, 13, 27, 20, 18, 25,  9, 14,  7, 27, 20, 15, 27, 20,  5,  1,  3,  8,
         27, 13,  1,  7,  7,  9,  5, 27, 20,  8,  1, 20, 27, 14,  1, 20, 21, 18,
          5, 27,  4, 15,  5, 19, 14, 20, 27,  5, 14,  4, 27, 23],
        [ 9, 20, 19, 27, 12,  9, 11,  5, 27,  1, 14, 27, 15, 24, 27, 15, 14, 12,
       

In [13]:
MAX_LEN = 25

y = torch.zeros((len(text), MAX_LEN), dtype=int)
for i in range(len(text)):
    for j, w in enumerate(text[i]):
        if j >= MAX_LEN:
            break
        if w.lower() in CHAR_TO_INDEX:
            y[i][j] = CHAR_TO_INDEX[w.lower()]

y[:10]

tensor([[13,  1,  7,  7,  9,  5, 27, 12, 15, 15, 11, 27, 23,  8,  1, 20, 19, 27,
         20,  8,  1, 20,  0,  0,  0],
        [12,  5,  5,  0, 13, 21, 18, 27, 12,  5,  5,  0, 13, 21, 18,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0],
        [26,  5,  5,  0,  2, 15, 15, 27, 26,  5,  5,  0,  2, 15, 15,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0],
        [ 9, 13, 27, 20, 18, 25,  9, 14,  7, 27, 20, 15, 27, 20,  5,  1,  3,  8,
         27, 13,  1,  7,  7,  9,  5],
        [ 9, 20, 19, 27, 12,  9, 11,  5, 27,  1, 14, 27, 15, 24, 27, 15, 14, 12,
         25, 27,  9, 20, 27,  8,  1],
        [25, 15, 21, 27, 11, 14, 15, 23, 27,  8,  9, 19, 27,  2, 12, 15, 15,  4,
         27, 20, 25, 16,  5, 27,  8],
        [15,  8, 27, 25,  5,  1,  8, 27, 23,  8,  1, 20, 19, 27, 13, 25, 27, 19,
          8, 15,  5, 27, 19,  9, 26],
        [18,  9, 14,  7,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0],
        [25,  5, 19, 27,  4,  1,  4,  0,  0,  0,

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
class TextGenerationNetwork(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TextGenerationNetwork, self).__init__()
        self.embedding = torch.nn.Embedding(vocab_size, embedding_dim)
        self.rnn = torch.nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.dense = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, train=True, state=None):
        with torch.set_grad_enabled(train):
            out = self.embedding(x)
            out, state = self.rnn(out)
            out = self.dense(out)
            out = out.squeeze(0)
        return out, state


In [16]:
def generate_sentence(model):
    sent = 'hello'
    id = -1
    while id != 0 and len(sent) < MAX_LEN:
        sent_as_tensor = [torch.as_tensor(
            np.array([[CHAR_TO_INDEX[char]]]),
            dtype=torch.long) for char in sent]
        state = None
        for char in sent_as_tensor:
            out, state = model.forward(char, train=False, state=state)
        id = np.argmax(out.numpy().flatten())
        if id == 27:
            sent += ' '
        else:
            sent += ascii_lowercase[id - 1]
    return sent

# Обучение и генерация предложений на каждой эпохе
vocab_size = 28
embedding_dim = 28
hidden_dim = 128

model = TextGenerationNetwork(vocab_size, embedding_dim, hidden_dim)

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)
batch_size = 100
n_epochs = 100

train_losses = []

for epoch in tqdm(range(1, n_epochs + 1)):
    start = time.time()
    train_loss = 0.0
    n_batches = int(np.ceil(len(X_train) / batch_size))
    for i in range(n_batches):
        batch = X[i * batch_size: (i + 1) * batch_size]
        X_batch = batch[:, :-1]
        y_batch = batch[:, 1:].flatten()
        optimizer.zero_grad()
        y_pred, _ = model.forward(X_batch)
        loss = criterion(y_pred.view(-1, 28), y_batch)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    train_loss /= n_batches
    train_losses.append(train_loss)
    sec = time.time() - start
    print(f'Epoch: {epoch}, time: {sec:.1f}s, train loss: {train_loss:.3f}')
    print(generate_sentence(model) + '\n')


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 1, time: 1.8s, train loss: 2.184
hello t t t t t t t t t t

Epoch: 2, time: 1.8s, train loss: 1.878
hello t t t t t t t t t t

Epoch: 3, time: 1.7s, train loss: 1.803
hellou b b b b b b b b b 

Epoch: 4, time: 1.8s, train loss: 1.755
hellout b b b b b b b b b

Epoch: 5, time: 1.8s, train loss: 1.719
hellouthe b b b b b b b b

Epoch: 6, time: 1.7s, train loss: 1.691
hellouthe b b b b b b b b

Epoch: 7, time: 1.7s, train loss: 1.667
hellouthe b b b b b b b b

Epoch: 8, time: 1.7s, train loss: 1.648
hellouthe b b b b b b b b

Epoch: 9, time: 1.7s, train loss: 1.631
hellouthe b b b b b b b b

Epoch: 10, time: 1.7s, train loss: 1.617
hellouthe b b b b b b b b

Epoch: 11, time: 1.7s, train loss: 1.604
hellouthe he he he he he 

Epoch: 12, time: 1.7s, train loss: 1.592
hellouthe he he he he he 

Epoch: 13, time: 1.7s, train loss: 1.581
hellouthe he he he he he 

Epoch: 14, time: 1.7s, train loss: 1.571
hellouthe he he he he he 

Epoch: 15, time: 1.7s, train loss: 1.562
hellouthe he he 

В целом, код работает корректно и реализует задачу генерации текста на основе обученной RNN модели.