# Языковое моделирование

## Загрузка и очистка данных

In [1]:
import numpy as np
import io

In [3]:
# Read the whole book into memory; the encoding is given explicitly so the
# result does not depend on the platform default.
with io.open('book.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Keep only the span between the title line and the closing marker.
# str.index (unlike str.find) raises ValueError when a marker is missing,
# instead of returning -1 and silently slicing the wrong part of the file.
start_idx = text.index('THE MYSTERIOUS ISLAND')
end_idx = text.index('End of the Project Gutenberg')
text = text[start_idx:end_idx]

# Distinct characters of the cleaned text: this is the model vocabulary.
char_set = set(text)

# NOTE: the label below says "words", but the value is the number of unique
# characters (len(char_set)).
f'Total text len = {len(text)}', f'Unique words = {len(char_set)}'

('Total text len = 1112310', 'Unique words = 80')

In [6]:
# Build lookup tables that map characters to integer ids and back.

# Sorting makes the id assignment deterministic for a given character set.
chars_sorted = sorted(char_set)
# char -> id, ids follow the sorted character order.
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
# id -> char: indexing this array with ids decodes them back to characters.
char_array = np.array(chars_sorted)
# The whole text as a 1-D array of int32 ids.
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

# The last item decodes the ids through char_array (rather than slicing the
# original text), so the reverse mapping is actually exercised.
f'Encoded text length = {len(text_encoded)}', f'{text[:15]} == coding ==> {text_encoded[:15]}', f"{text_encoded[15:21]} == encoding ==> {''.join(char_array[text_encoded[15:21]])}"

('Encoded text length = 1112310',
 'THE MYSTERIOUS  == coding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]',
 '[33 43 36 25 38 28] == encoding ==> ISLAND')

## Преобразование

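Мы разбиваем текст на последовательности по 40 символов (вход, `input / x`). Целевая последовательность (`target / y`) — это те же 40 символов, сдвинутые на один символ вправо, то есть для каждой позиции модель учится предсказывать следующий символ.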

In [7]:
import torch
from torch.utils.data import Dataset

In [8]:
# Each window holds seq_length input characters plus one extra character, so
# the input and the target shifted by one position both fit in a single chunk.
seq_length = 40
chunk_size = 1 + seq_length

# One window per start offset (stride 1); only full-length windows are kept.
text_chunks = [
    text_encoded[start:start + chunk_size]
    for start in range(len(text_encoded) - chunk_size + 1)
]

In [25]:
# Notebook display cell: show the list of overlapping windows of encoded ids.
text_chunks

[array([44, 32, 29,  1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43,
        36, 25, 38, 28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54,
        67, 63, 54,  0,  0, 12, 19], dtype=int32),
 array([32, 29,  1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36,
        25, 38, 28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67,
        63, 54,  0,  0, 12, 19, 18], dtype=int32),
 array([29,  1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36, 25,
        38, 28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67, 63,
        54,  0,  0, 12, 19, 18, 15], dtype=int32),
 array([ 1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36, 25, 38,
        28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67, 63, 54,
         0,  0, 12, 19, 18, 15,  0], dtype=int32),
 array([37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36, 25, 38, 28,
         0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67, 63, 54,  0,
         0, 12, 19, 18, 15,  0,  0],

In [10]:
class TextDataset(Dataset):
    """Dataset of fixed-length id windows for next-character prediction.

    Every item is split into an (input, target) pair where the target is the
    input shifted one position to the right.
    """

    def __init__(self, text_chunks):
        """Store the windows; indexing one must yield a 1-D tensor."""
        self.text_chunks = text_chunks

    def __len__(self):
        """Return the number of stored windows."""
        return len(self.text_chunks)

    def __getitem__(self, index):
        """Return ``(inputs, targets)`` for the window at ``index``.

        ``inputs`` is the window without its last element, ``targets`` is the
        window without its first element; both are converted to int64.
        """
        window = self.text_chunks[index]
        inputs = window[:-1]
        targets = window[1:]
        return inputs.long(), targets.long()
    
# Stack the windows into one 2-D numpy array before handing them to torch:
# torch.tensor() on a Python list of numpy arrays converts element by element,
# which is extremely slow for a list this long (PyTorch warns about it).
seq_ds = TextDataset(torch.tensor(np.array(text_chunks)))

In [11]:
# Preview the first two (input, target) pairs, decoded back to text.
# zip with range(2) stops after two items without fetching a third.
for i, (seq, target) in zip(range(2), seq_ds):
    print('Input (x): ', repr(''.join(char_array[seq])))
    print('Target (y): ', repr(''.join(char_array[target])))
    print()

Input (x):  'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'
Target (y):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'

Input (x):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'
Target (y):  'E MYSTERIOUS ISLAND\n\nby Jules Verne\n\n187'



In [None]:
from torch.utils.data import DataLoader
# Seed torch's global RNG before building the loader (presumably so the
# shuffled batch order is reproducible between runs).
torch.manual_seed(1)
batch_size = 64
# drop_last=True discards a final batch smaller than batch_size.
seq_dl = DataLoader(
    dataset=seq_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

## Построение модели символьного уровня

In [29]:
import torch.nn as nn

In [None]:
class RNN(nn.Module):
    """Character-level language model: embedding -> single-layer LSTM -> linear.

    The model is stepped one token at a time: ``forward`` takes one token id
    per batch element together with the current LSTM state and returns the
    logits for the next token plus the updated state.
    """

    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        """Create the layers.

        Args:
            vocab_size: number of distinct tokens (embedding rows and output
                logits).
            embed_dim: size of each token embedding.
            rnn_hidden_size: size of the LSTM hidden and cell states.
        """
        # super() must be called; the bare ``super.__init__()`` raises
        # TypeError and leaves nn.Module uninitialised.
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        """Run one time step.

        Args:
            x: token ids, one per batch element.
            hidden: LSTM hidden state, as returned by ``init_hidden`` or by a
                previous call.
            cell: LSTM cell state, same convention as ``hidden``.

        Returns:
            ``(logits, hidden, cell)`` where ``logits`` has one row per batch
            element and the states are the updated LSTM states.
        """
        # Insert a sequence axis of length 1 (the LSTM is batch_first).
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        # Flatten the length-1 sequence axis away: one logit row per element.
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        """Return zero-initialised ``(hidden, cell)`` states for ``batch_size``.

        The states are created on the same device and with the same dtype as
        the model parameters, so they can be fed to ``forward`` directly even
        after the model has been moved or cast.
        """
        ref = self.fc.weight
        shape = (1, batch_size, self.rnn_hidden_size)
        hidden = torch.zeros(shape, device=ref.device, dtype=ref.dtype)
        cell = torch.zeros(shape, device=ref.device, dtype=ref.dtype)
        return hidden, cell