In [1]:
import re
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import time

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [3]:
shift = 2

Напишем сам алгоритм дешифровки

In [4]:
def caesar_cipher(text, shift):
    result = ''
    for char in text:
        if char.isalpha():
            start = ord('а') if char.islower() else ord('А')
            result += chr((ord(char) - start + shift) % 33 + start)
        else:
            result += char
    return result


Загрузим данные

In [5]:
file_path = 'C:\\Users\\79169\\Desktop\\voyna-i-mir-tom-1.txt'
with open(file_path, 'rb') as file:
    lines = []
    for line in file:
        try:
            line = line.strip().decode(encoding='utf-8')
        except UnicodeDecodeError:
           
            line = line.strip().decode(encoding='windows-1251', errors='ignore')
        if len(line) == 0:
            continue
        lines.append(line)

text = " ".join(lines)

In [None]:
text

Почистим их

In [7]:
cleaned_text = re.sub(r'[^а-яА-ЯёЁ\s]', '',text)
cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
cleaned_text = cleaned_text.lower()

In [None]:
print(cleaned_text)

Возьмем для обучения первые 10000 слов

In [9]:
cleaned_text = cleaned_text[:10000]

Закодируем наш текст

In [11]:
enc_text = [caesar_cipher(text, shift) for text in cleaned_text]
enc_text

['н',
 'з',
 'д',
 ' ',
 'п',
 'к',
 'м',
 'р',
 'н',
 'в',
 'з',
 'д',
 'к',
 'щ',
 ' ',
 'ф',
 'р',
 'н',
 'у',
 'ф',
 'р',
 'л',
 ' ',
 'д',
 'р',
 'л',
 'п',
 'в',
 ' ',
 'к',
 ' ',
 'о',
 'к',
 'т',
 ' ',
 'ф',
 'р',
 'о',
 ' ',
 'н',
 'з',
 'д',
 ' ',
 'п',
 'к',
 'м',
 'р',
 'н',
 'в',
 'з',
 'д',
 'к',
 'щ',
 ' ',
 'ф',
 'р',
 'н',
 'у',
 'ф',
 'р',
 'л',
 ' ',
 'д',
 'р',
 'л',
 'п',
 'в',
 ' ',
 'к',
 ' ',
 'о',
 'к',
 'т',
 ' ',
 'ф',
 'р',
 'о',
 ' ',
 'щ',
 'в',
 'у',
 'ф',
 'ю',
 ' ',
 'с',
 'з',
 'т',
 'д',
 'в',
 'а',
 ' ',
 'з',
 ' ',
 'с',
 'р',
 'о',
 'з',
 'у',
 'ф',
 'ю',
 'а',
 ' ',
 'о',
 'р',
 'л',
 ' ',
 'д',
 'з',
 'т',
 'п',
 'э',
 'л',
 ' ',
 'т',
 'в',
 'г',
 ' ',
 'п',
 'х',
 ' ',
 'щ',
 'ф',
 'р',
 ' ',
 'м',
 'п',
 'а',
 'й',
 'ю',
 ' ',
 'е',
 'з',
 'п',
 'х',
 'в',
 ' ',
 'к',
 ' ',
 'н',
 'х',
 'м',
 'м',
 'в',
 ' ',
 'у',
 'ф',
 'в',
 'н',
 'к',
 ' ',
 'п',
 'з',
 ' ',
 'г',
 'р',
 'н',
 'ю',
 'ъ',
 'з',
 ' ',
 'м',
 'в',
 'м',
 ' ',
 'с',
 'р',
 'о'

Создадим словарь

In [12]:
CHARS = set('абвгдеѐжзийклмнопрстуфхцчшщъыьэюя ')

INDEX_TO_CHAR = ['none'] + [w for w in CHARS]
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}

In [13]:
CHAR_TO_INDEX

{'none': 0,
 'г': 1,
 'е': 2,
 'й': 3,
 'л': 4,
 'б': 5,
 'ѐ': 6,
 'к': 7,
 'ю': 8,
 'э': 9,
 'а': 10,
 'м': 11,
 'ь': 12,
 'у': 13,
 'д': 14,
 'с': 15,
 'ф': 16,
 'н': 17,
 'ч': 18,
 ' ': 19,
 'р': 20,
 'з': 21,
 'ы': 22,
 'о': 23,
 'и': 24,
 'ш': 25,
 'п': 26,
 'х': 27,
 'ц': 28,
 'щ': 29,
 'т': 30,
 'я': 31,
 'ъ': 32,
 'в': 33,
 'ж': 34}

In [14]:
len(CHAR_TO_INDEX)

35

Оформим наши данные в тензоры pytorch

In [15]:
MAX_LEN = 72
X = torch.zeros((len(enc_text), MAX_LEN), dtype=int)
for i in range(len(enc_text)):  # для каждого предложения
    for j, w in enumerate(enc_text[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [16]:
X[0:5]

tensor([[17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [21,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [14,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [19,  0,  0,  0, 

In [17]:
ctext = list(cleaned_text)
ctext

['л',
 'е',
 'в',
 ' ',
 'н',
 'и',
 'к',
 'о',
 'л',
 'а',
 'е',
 'в',
 'и',
 'ч',
 ' ',
 'т',
 'о',
 'л',
 'с',
 'т',
 'о',
 'й',
 ' ',
 'в',
 'о',
 'й',
 'н',
 'а',
 ' ',
 'и',
 ' ',
 'м',
 'и',
 'р',
 ' ',
 'т',
 'о',
 'м',
 ' ',
 'л',
 'е',
 'в',
 ' ',
 'н',
 'и',
 'к',
 'о',
 'л',
 'а',
 'е',
 'в',
 'и',
 'ч',
 ' ',
 'т',
 'о',
 'л',
 'с',
 'т',
 'о',
 'й',
 ' ',
 'в',
 'о',
 'й',
 'н',
 'а',
 ' ',
 'и',
 ' ',
 'м',
 'и',
 'р',
 ' ',
 'т',
 'о',
 'м',
 ' ',
 'ч',
 'а',
 'с',
 'т',
 'ь',
 ' ',
 'п',
 'е',
 'р',
 'в',
 'а',
 'я',
 ' ',
 'е',
 ' ',
 'п',
 'о',
 'м',
 'е',
 'с',
 'т',
 'ь',
 'я',
 ' ',
 'м',
 'о',
 'й',
 ' ',
 'в',
 'е',
 'р',
 'н',
 'ы',
 'й',
 ' ',
 'р',
 'а',
 'б',
 ' ',
 'н',
 'у',
 ' ',
 'ч',
 'т',
 'о',
 ' ',
 'к',
 'н',
 'я',
 'з',
 'ь',
 ' ',
 'г',
 'е',
 'н',
 'у',
 'а',
 ' ',
 'и',
 ' ',
 'л',
 'у',
 'к',
 'к',
 'а',
 ' ',
 'с',
 'т',
 'а',
 'л',
 'и',
 ' ',
 'н',
 'е',
 ' ',
 'б',
 'о',
 'л',
 'ь',
 'ш',
 'е',
 ' ',
 'к',
 'а',
 'к',
 ' ',
 'п',
 'о',
 'м'

In [18]:
MAX_LEN = 72
Y = torch.zeros((len(ctext), MAX_LEN), dtype=int)

for i in range(len(ctext)):  # для каждого дешифрованного предложения
    for j, w in enumerate(ctext[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        Y[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [19]:
Y[0:5]

tensor([[ 4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 2,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [33,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [19,  0,  0,  0, 

Наша модель

In [30]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(len(CHAR_TO_INDEX), 72)  
        self.rnn = torch.nn.RNN(72, 256, batch_first=True)
        self.out = torch.nn.Linear(256, len(CHAR_TO_INDEX))

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x)
        return self.out(x)

model = Network()

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)



In [31]:
model.to(device)
X = X.to(device)
Y = Y.to(device)


Обучим ее

In [32]:
for ep in range(20):
    model.train()
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        # берём батч в 100 элементов
        batch_X= X[i * 10:(i + 1) * 10]
        batch_Y= Y[i * 10:(i + 1) * 10]
        X_batch = batch_X
        Y_batch = batch_Y.flatten()

        optimizer.zero_grad()
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(CHAR_TO_INDEX))
        loss = criterion(answers, Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

   

Epoch 0. Time: 0.184, Train loss: 0.137
Epoch 1. Time: 0.162, Train loss: 0.021
Epoch 2. Time: 0.152, Train loss: 0.009
Epoch 3. Time: 0.166, Train loss: 0.005
Epoch 4. Time: 0.149, Train loss: 0.003
Epoch 5. Time: 0.149, Train loss: 0.002
Epoch 6. Time: 0.151, Train loss: 0.001
Epoch 7. Time: 0.150, Train loss: 0.001
Epoch 8. Time: 0.152, Train loss: 0.001
Epoch 9. Time: 0.150, Train loss: 0.001
Epoch 10. Time: 0.152, Train loss: 0.000
Epoch 11. Time: 0.154, Train loss: 0.000
Epoch 12. Time: 0.152, Train loss: 0.000
Epoch 13. Time: 0.153, Train loss: 0.000
Epoch 14. Time: 0.152, Train loss: 0.000
Epoch 15. Time: 0.155, Train loss: 0.000
Epoch 16. Time: 0.156, Train loss: 0.000
Epoch 17. Time: 0.154, Train loss: 0.000
Epoch 18. Time: 0.155, Train loss: 0.000
Epoch 19. Time: 0.154, Train loss: 0.000


In [33]:
test = 'привет'
test = caesar_cipher(test, shift)
test

'сткдзф'

Проверим качество дешифровки

In [34]:
def decode_sequence(model, input_sequence):
    model.eval()
    with torch.no_grad():
        input_tensor = torch.tensor([[CHAR_TO_INDEX[char] for char in input_sequence]], dtype=torch.long).to(device)
        output = model(input_tensor)
        _, predicted_indices = torch.max(output, 2)
        decoded_sequence = ''.join([INDEX_TO_CHAR[idx.item()] for idx in predicted_indices[0]])
    return decoded_sequence

# Пример использования
input_sequence = "сткдзф"
decoded_sequence = decode_sequence(model, input_sequence)
print(f"Вход: {input_sequence}")
print(f"Дешифровка: {decoded_sequence}")

Вход: сткдзф
Дешифровка: привет


# Датасет Симпсоны

In [3]:
import pandas as pd

In [4]:
df = pd.read_csv(r'C:\Users\79169\Desktop\домашка\DLL\simpsons_script_lines.csv')
df

  df = pd.read_csv(r'C:\Users\79169\Desktop\домашка\DLL\simpsons_script_lines.csv')


Unnamed: 0,id,episode_id,number,raw_text,timestamp_in_ms,speaking_line,character_id,location_id,raw_character_text,raw_location_text,spoken_words,normalized_text,word_count
0,9549,32,209,"Miss Hoover: No, actually, it was a little of ...",848000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,"No, actually, it was a little of both. Sometim...",no actually it was a little of both sometimes ...,31
1,9550,32,210,Lisa Simpson: (NEAR TEARS) Where's Mr. Bergstrom?,856000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,Where's Mr. Bergstrom?,wheres mr bergstrom,3
2,9551,32,211,Miss Hoover: I don't know. Although I'd sure l...,856000,True,464.0,3.0,Miss Hoover,Springfield Elementary School,I don't know. Although I'd sure like to talk t...,i dont know although id sure like to talk to h...,22
3,9552,32,212,Lisa Simpson: That life is worth living.,864000,True,9.0,3.0,Lisa Simpson,Springfield Elementary School,That life is worth living.,that life is worth living,5
4,9553,32,213,Edna Krabappel-Flanders: The polls will be ope...,864000,True,40.0,3.0,Edna Krabappel-Flanders,Springfield Elementary School,The polls will be open from now until the end ...,the polls will be open from now until the end ...,33
...,...,...,...,...,...,...,...,...,...,...,...,...,...
158266,9544,32,204,Miss Hoover: (OFF LISA'S REACTION) I'm back.,831000,true,464,3.0,Miss Hoover,Springfield Elementary School,I'm back.,im back,2
158267,9545,32,205,"Miss Hoover: You see, class, my Lyme disease t...",839000,true,464,3.0,Miss Hoover,Springfield Elementary School,"You see, class, my Lyme disease turned out to ...",you see class my lyme disease turned out to be,10
158268,9546,32,206,Miss Hoover: Psy-cho-so-ma-tic.,842000,true,464,3.0,Miss Hoover,Springfield Elementary School,Psy-cho-so-ma-tic.,psy-cho-so-ma-tic,1
158269,9547,32,207,Ralph Wiggum: Does that mean you were crazy?,844000,true,119,3.0,Ralph Wiggum,Springfield Elementary School,Does that mean you were crazy?,does that mean you were crazy,6


In [5]:
phrases = df['normalized_text'].tolist()  # колонка с предобработанными текстами
phrases[:10]

['no actually it was a little of both sometimes when a disease is in all the magazines and all the news shows its only natural that you think you have it',
 'wheres mr bergstrom',
 'i dont know although id sure like to talk to him he didnt touch my lesson plan what did he teach you',
 'that life is worth living',
 'the polls will be open from now until the end of recess now just in case any of you have decided to put any thought into this well have our final statements martin',
 'i dont think theres anything left to say',
 'bart',
 'victory party under the slide',
 nan,
 'mr bergstrom mr bergstrom']

In [6]:
text = [[c for c in ph] for ph in phrases if type(ph) is str]
len(text)

132087

In [7]:
CHARS = set('abcdefghijklmnopqrstuvwxyz ')  # все символы, которые мы хотим использовать для кодировки = наш словарь
INDEX_TO_CHAR = ['none'] + [w for w in CHARS]  # все неизвестные символы будут получать тег none
CHAR_TO_INDEX = {w: i for i, w in enumerate(INDEX_TO_CHAR)}  # словарь токен-индекс

In [8]:
MAX_LEN = 50  # мы хотим ограничить максимальную длину ввода
X = torch.zeros((len(text), MAX_LEN), dtype=int)  # создаём пустой вектор для текста, чтобы класть в него индексы токенов
for i in range(len(text)):  # для каждого предложения
    for j, w in enumerate(text[i]):  # для каждого токена
        if j >= MAX_LEN:
            break
        X[i, j] = CHAR_TO_INDEX.get(w, CHAR_TO_INDEX['none'])

In [9]:
X.size()

torch.Size([132087, 50])

In [9]:
embeddings = torch.nn.Embedding(len(INDEX_TO_CHAR), 28)  # размер словаря * размер вектора для кодировки каждого слова


Построим RNN-ячейку на основе полносвязных слоев

In [38]:
class CustomRNN(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(CustomRNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Зададим веса для входа
        self.W_ih = nn.Linear(input_size, hidden_size)
        # Зададим веса для скрытого состояния
        self.W_hh = nn.Linear(hidden_size, hidden_size)
        # Веса для выхода
        self.W_ho = nn.Linear(hidden_size, hidden_size)
    
    def forward(self, x, prev_hidden=None):
        batch_size = x.size(0)
        if prev_hidden is None:
                prev_hidden = torch.zeros(batch_size, self.hidden_size, device=x.device)
        x = x.view(batch_size, -1, self.input_size)
            # Рассчитываем скрытое состояние
        prev_hidden = prev_hidden.view(batch_size, -1, self.hidden_size)
        hidden = torch.tanh(self.W_ih(x) + self.W_hh(prev_hidden))
        hidden = hidden.view(batch_size, -1, self.hidden_size)
            # Вычисляем выход
        output = self.W_ho(hidden)
        return output, hidden



Обновим сеть из лекции нашей ячейкой

In [39]:
class Network(torch.nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.embedding = torch.nn.Embedding(28, 30)
        self.rnn = CustomRNN(30, 128).to(device)  # Используем вашу реализацию RNN-ячейки
        self.out = torch.nn.Linear(128, 28)

    def forward(self, sentences, state=None):
        x = self.embedding(sentences)
        x, s = self.rnn(x) # берём выход с последнего слоя для всех токенов, а не скрытое состояние
        return self.out(x)


In [40]:
model = Network()

In [41]:
criterion = torch.nn.CrossEntropyLoss()  # типичный лосс многоклассовой классификации
optimizer = torch.optim.SGD(model.parameters(), lr=.05)

In [42]:
model = model.to(device)
X = X.to(device)

In [43]:
for ep in range(20):
    model.train()
    start = time.time()
    train_loss = 0.
    train_passed = 0

    for i in range(int(len(X) / 100)):
        # берём батч в 100 элементов
        batch = X[i * 100:(i + 1) * 100]
        X_batch = batch[:, :-1]
        Y_batch = batch[:, 1:].flatten()

        optimizer.zero_grad()
        X_batch = X_batch.to(device)
        Y_batch = Y_batch.to(device)
        answers = model.forward(X_batch)
        answers = answers.view(-1, len(INDEX_TO_CHAR))
        loss = criterion(answers , Y_batch)
        train_loss += loss.item()

        loss.backward()
        optimizer.step()
        train_passed += 1

    print("Epoch {}. Time: {:.3f}, Train loss: {:.3f}".format(ep, time.time() - start, train_loss / train_passed))

Epoch 0. Time: 2.364, Train loss: 1.811
Epoch 1. Time: 2.245, Train loss: 1.714
Epoch 2. Time: 2.153, Train loss: 1.702
Epoch 3. Time: 2.212, Train loss: 1.696
Epoch 4. Time: 2.161, Train loss: 1.693
Epoch 5. Time: 2.169, Train loss: 1.691
Epoch 6. Time: 2.120, Train loss: 1.689
Epoch 7. Time: 2.244, Train loss: 1.688
Epoch 8. Time: 2.602, Train loss: 1.688
Epoch 9. Time: 2.173, Train loss: 1.687
Epoch 10. Time: 2.174, Train loss: 1.687
Epoch 11. Time: 2.144, Train loss: 1.687
Epoch 12. Time: 2.157, Train loss: 1.686
Epoch 13. Time: 2.131, Train loss: 1.686
Epoch 14. Time: 2.165, Train loss: 1.686
Epoch 15. Time: 2.154, Train loss: 1.686
Epoch 16. Time: 2.168, Train loss: 1.686
Epoch 17. Time: 2.189, Train loss: 1.686
Epoch 18. Time: 2.243, Train loss: 1.686
Epoch 19. Time: 2.160, Train loss: 1.686


In [44]:
CHAR_TO_INDEX['none']

0

In [47]:
def generate_sentence(word):
    sentence = list(word)
    sentence = [CHAR_TO_INDEX.get(s, 0) for s in sentence]
    input_tensor = torch.tensor(sentence).to(device)
    model.eval()
    answers = model.forward(torch.tensor(input_tensor))
    probas, indices = answers.topk(1)
    return ''.join([INDEX_TO_CHAR[ind.item()] for ind in indices.flatten()])

In [48]:
generate_sentence('dog')

  answers = model.forward(torch.tensor(input_tensor))


' u '

In [49]:
generate_sentence('It is')

  answers = model.forward(torch.tensor(input_tensor))


'none tn '