In [1]:
# Yernar Shambayev, DL-2
# Обучите нейронную сеть решать шифр цезаря.
# Что надо сделать:
# 1.Написать алгоритм шифра цезаря для генерации выборки (сдвиг на К каждой буквы.
# Например, при сдвиге на 2 буква “А” переходит в букву “В” и тп)
# 2.Сделать нейронную сеть
# 3.Обучить ее (вход - зашифрованная фраза, выход - дешифрованная фраза)
# 4.Проверить качество

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
import time

In [2]:
# Шифр Цезаря
def caesar(input, K):
    output = ''
    for c in input.lower():
        if c.isalpha():
            new_num = ord(c) + K
            if new_num > ord('z'):
                new_num -= 26
            output += chr(new_num)
        else:
            output += c
    return output

In [3]:
# Берем базу фраз из Симпсонов
df = pd.read_csv('data.csv')
# Удаляем строки с пустым значением normalized_text
df = df.dropna(subset=['normalized_text'])
# Оставляем только normalized_text и столбец с зашифрованным значением
df['caesar_txt'] = df['normalized_text'].apply(lambda x: caesar(str(x), 2))
df = df.drop(['id', 'episode_id', 'number', 'raw_text', 'timestamp_in_ms', 'speaking_line', 'character_id', 'location_id',
              'raw_character_text', 'raw_location_text', 'spoken_words', 'word_count'], axis = 1)
df.head()

Unnamed: 0.1,Unnamed: 0,normalized_text,caesar_txt
0,0,maggie look whats that,ociikg nqqm yjcvu vjcv
1,1,lee-mur lee-mur,ngg-owt ngg-owt
2,2,zee-boo zee-boo,bgg-dqq bgg-dqq
3,3,im trying to teach maggie that nature doesnt e...,ko vtakpi vq vgcej ociikg vjcv pcvwtg fqgupv g...
4,4,its like an ox only it has a hump and a dewlap...,kvu nkmg cp qz qpna kv jcu c jwor cpf c fgyncr...


In [4]:
train, test = train_test_split(df, test_size=0.2)
train_list = train['normalized_text'].tolist()
train_caesar = train['caesar_txt'].tolist()
test_list = test['normalized_text'].tolist()
test_caesar = test['caesar_txt'].tolist()

In [5]:
# Переводим в тензоры Torch

train_text = [[c for c in ph] for ph in train_list if type(ph) is str]
train_label = [[c for c in ph] for ph in train_caesar if type(ph) is str]
test_text = [[c for c in ph] for ph in test_list if type(ph) is str]
test_label = [[c for c in ph] for ph in test_caesar if type(ph) is str]

CHARS = set('abcdefghijklmnopqrstuvwxyz ')
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}
MAX_LEN = 50

def convert_to_torch(text):
    output = torch.zeros((len(text), MAX_LEN), dtype=int)
    for i in range(len(text)):
        for j, w in enumerate(text[i]):
            if j >= MAX_LEN:
                break
            output[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

    return output

X = convert_to_torch(train_text)
Y = convert_to_torch(train_label)
X_test = convert_to_torch(test_text)
Y_test = convert_to_torch(test_label)

In [6]:
class RNN_Network(torch.nn.Module):
    def __init__(self):
        super(RNN_Network, self).__init__()
        self.embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)
        self.rnn = torch.nn.RNN(28, 128, batch_first=True)
        self.linear = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        embds = self.embeddings(sentences)
        out, new_state = self.rnn(embds, state)
        result = self.linear(out)
        return result, new_state

model = RNN_Network()
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
epochs = 5

In [7]:
# Обучение
for epoch in range(epochs):
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        X_batch = X[i * 100:(i + 1) * 100]
        Y_batch = Y[i * 100:(i + 1) * 100].flatten()

        optimizer.zero_grad()
        answers, _ = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print(f"Epoch {epoch + 1}. Time: {time.time() - start:.3f}, Train loss: {train_loss / train_passed:.3f}")

Epoch 1. Time: 2.825, Train loss: 2.372
Epoch 2. Time: 2.645, Train loss: 1.648
Epoch 3. Time: 2.851, Train loss: 1.314
Epoch 4. Time: 3.138, Train loss: 1.067
Epoch 5. Time: 2.000, Train loss: 0.879


In [8]:
# Проверяем качество
test_loss = 0
with torch.no_grad():
    model.eval()

    answers, _ = model.forward(X_test)
    answers = answers.view(-1, len(INDEX_TO_CHAR))
    loss = criterion(answers, Y_test.flatten())
    test_loss += loss.item()
    print(f'Loss: {test_loss / len(X_test):.4f}')

Loss: 0.0004
