# Языковое моделирование

## Загрузка и очистка данных

In [1]:
import numpy as np
import io

In [2]:
with io.open('book.txt', 'r', encoding='utf-8') as f:
    text = f.read()
    
start_idx = text.find('THE MYSTERIOUS ISLAND')
end_idx = text.find('End of the Project Gutenberg')
text = text[start_idx:end_idx]
char_set = set(text)

f'Total text len = {len(text)}', f'Unique words = {len(char_set)}'

('Total text len = 1112310', 'Unique words = 80')

In [3]:
# Создание словаря для сопоставления символов с целыми числами и наоборот

chars_sorted = sorted(char_set)
char2int = {ch:i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array([char2int[ch] for ch in text], dtype=np.int32)

f'Encoded text length = {len(text_encoded)}', f'{text[:15]} == coding ==> {text_encoded[:15]}', f'{text_encoded[15:21]} == encoding ==> {text[15:21]}'

('Encoded text length = 1112310',
 'THE MYSTERIOUS  == coding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]',
 '[33 43 36 25 38 28] == encoding ==> ISLAND')

## Преобразование

Мы преобразуем текст в последовательности по 40 символов (`input / x`), для того, чтобы предсказывать оставшиеся 40 символов

In [4]:
import torch
from torch.utils.data import Dataset

In [5]:
seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded)-chunk_size+1)]

In [6]:
text_chunks

[array([44, 32, 29,  1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43,
        36, 25, 38, 28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54,
        67, 63, 54,  0,  0, 12, 19], dtype=int32),
 array([32, 29,  1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36,
        25, 38, 28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67,
        63, 54,  0,  0, 12, 19, 18], dtype=int32),
 array([29,  1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36, 25,
        38, 28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67, 63,
        54,  0,  0, 12, 19, 18, 15], dtype=int32),
 array([ 1, 37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36, 25, 38,
        28,  0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67, 63, 54,
         0,  0, 12, 19, 18, 15,  0], dtype=int32),
 array([37, 48, 43, 44, 29, 42, 33, 39, 45, 43,  1, 33, 43, 36, 25, 38, 28,
         0,  0, 51, 74,  1, 34, 70, 61, 54, 68,  1, 46, 54, 67, 63, 54,  0,
         0, 12, 19, 18, 15,  0,  0],

In [7]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks
        
    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, index):
        text_chunk = self.text_chunks[index]
        return text_chunk[:-1].long(), text_chunk[1:].long()
    
seq_ds = TextDataset(torch.tensor(text_chunks))

  seq_ds = TextDataset(torch.tensor(text_chunks))


In [8]:
for i, (seq, target) in enumerate(seq_ds):
    print('Input (x): ', repr(''.join(char_array[seq])))
    print('Target (y): ', repr(''.join(char_array[target])))
    print()
    
    if i == 1: break

Input (x):  'THE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n1'
Target (y):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'

Input (x):  'HE MYSTERIOUS ISLAND\n\nby Jules Verne\n\n18'
Target (y):  'E MYSTERIOUS ISLAND\n\nby Jules Verne\n\n187'



In [9]:
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_ds, batch_size, shuffle=True, drop_last=True)

## Построение модели символьного уровня

In [10]:
import torch.nn as nn

In [21]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [23]:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_size, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)
        
    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell
    
    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell

In [24]:
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)

model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model = model.to(device)

model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [25]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=.005)

In [28]:
num_epochs = 10000
torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    hidden = hidden.to(device)
    cell = cell.to(device)
    
    seq_batch, target_batch = next(iter(seq_dl))
    seq_batch, target_batch = seq_batch.to(device), target_batch.to(device)
    
    optimizer.zero_grad()
    loss = 0
    
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
        
    loss.backward()
    optimizer.step()
    loss = loss.item() / seq_length
    
    if epoch % 500 == 0: print(f'Epoch: {epoch}/{num_epochs}, loss: {loss:.4f}')
    

Epoch: 0/10000, loss: 4.3712
Epoch: 500/10000, loss: 1.3983
Epoch: 1000/10000, loss: 1.3306
Epoch: 1500/10000, loss: 1.2842
Epoch: 2000/10000, loss: 1.1935
Epoch: 2500/10000, loss: 1.1917
Epoch: 3000/10000, loss: 1.1602
Epoch: 3500/10000, loss: 1.1695
Epoch: 4000/10000, loss: 1.1620
Epoch: 4500/10000, loss: 1.1190
Epoch: 5000/10000, loss: 1.1682
Epoch: 5500/10000, loss: 1.1410
Epoch: 6000/10000, loss: 1.1142
Epoch: 6500/10000, loss: 1.1546
Epoch: 7000/10000, loss: 1.1213
Epoch: 7500/10000, loss: 1.1893
Epoch: 8000/10000, loss: 1.1817
Epoch: 8500/10000, loss: 1.1636
Epoch: 9000/10000, loss: 1.0965
Epoch: 9500/10000, loss: 1.1478


## Оценка модели

### torch Categorial

In [36]:
from torch.distributions.categorical import Categorical
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 1.0]])
f'Probs = {nn.functional.softmax(logits, dim=1)[0]}'

'Probs = tensor([0.3333, 0.3333, 0.3333])'

In [35]:
m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples)

tensor([[1],
        [0],
        [2],
        [1],
        [0],
        [0],
        [1],
        [0],
        [1],
        [2]])


In [40]:
torch.manual_seed(1)
logits = torch.tensor([[1.0, 1.0, 3.0]])
f'Probs = {nn.functional.softmax(logits, dim=1)[0]}'

'Probs = tensor([0.1065, 0.1065, 0.7870])'

In [41]:
m = Categorical(logits=logits)
samples = m.sample((10,))
print(samples)

tensor([[0],
        [2],
        [2],
        [1],
        [2],
        [1],
        [2],
        [2],
        [2],
        [2]])


In [56]:
def sample(model, starting_str, len_generated_text=500, scale_factor=1):
    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    
    encoded_input = torch.reshape(encoded_input, (1, -1))
    generated_str = starting_str
    
    encoded_input = encoded_input.to(device)
    # generated_str = generated_str.to(device)
    
    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to(device); cell = cell.to(device)
    
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)
        _ = _.to(device); hidden = hidden.to(device); cell = cell.to(device)
        
        last_char = encoded_input[:, -1]
        for i in range(len_generated_text):
            logits, hidden, cell = model(last_char.view(1), hidden, cell)
            logits = torch.squeeze(logits, 0)
            scaled_logits = logits * scale_factor
            m = Categorical(logits=scaled_logits)
            last_char = m.sample()
            
            generated_str += str(char_array[last_char])
            
    return generated_str

In [58]:
torch.manual_seed(1)
print(sample(model, starting_str='The island'))

The islander he was with which brought himself from the wards or accomplisher, it was found deferace occupied off and way that!”

“Mr. Spilett louding can do all that I will try the finding, of which is
to behind the hunters
ascertained just without a
great Boblues, easy to come from the craft
a telegraph, when bright did fifteen
strength of the side of more intentionable. The last peals Room
of more carefully standing in favor was extended by him, when new side. I must thus
place there. The settlers thenen to parts of that shot from Grafit to him, and that is, tours accustominary as the
left bank of
the bable’s grounts of wood, Pencroft, a couples of steps, on the same!”

“Never!”

At the road the landonics and seeds, without a more! It could also, among these! Wount,” answered Cyrus Harding.

“They must soon, Mr. Harding.”

“Jup hast indican be merium.

His companions listened.
The “butmon!” said Herbert, drawing their eyes. They would no longer have
relative, and who penetrated
away

In [59]:
# alpha = 2, более предсказуемо

torch.manual_seed(1)
print(sample(model, starting_str='The island', scale_factor=2.0))

The islander he was watched the reporter’s instruments, which frequented here, and the sailor was consisted, and the colonists of the horrible words and the produced was about to him, as always was extended by the shore. The ascent of the island of a
month of the mountain, and the colonists had been all the plateau, and did not fail. Had the bottom of the colonists was still both was already fell of the profound of the lake, when the traces of game was struck in the passage.

The sailor’s hands were founded by the sea and a stream of the stranger, when the atmosphere was completely struck the corral when the land of a ship was not a cart or country. The sailor’s hands of the southwest of his domain and only had already given out of the volcano, was not about thirty-five or six hundred feet of these way of steam, and consequently was still have been resolved to be attached the bodies of the atmosphere, which distinctly repasts and severe on the shore. The latter, after having taken away

In [60]:
# alpha = 0.5, более случайно

torch.manual_seed(1)
print(sample(model, starting_str='The island', scale_factor=0.5))

The islandE; he unbowded with.
Or coundenemity furomb-librided what had byetwixially, foublound off Cyrus From ceraci way Newo!”.

***h GleNbvelouened call which was layle, at once thins,’; howderedeying opening
he hive bord
ascertable ovtill downly 4
gilizymopeopmakesu Gidain, whom Pencroft
Peny” foaligre, pebiatialend def.
Hopfish.
However, I comman never, intentoms
knew and lay.

RAfy Replaciny Height, whick
gun waythqualuy
thamse.
S, don’adgzm,
wxybensersion.

After thund, oup any returning fevilltiplo.
On; a f8elap. alproots
rivern
fit tower,
withstanding?”

us he
ouchrit, fie dry?”

It’s’
estacks of lake’s grainss Lincold
Suph!
”

Is wisua-incex,
Mr.s, one ope sho!pidence ago, butselvow of, ovalisons
eggs as farn,
intifoor, durid voy, evenly
turraghes clear,
and interstern, tappensirarientyly laigh; so very, iscove unbearls.

As sSinquenibly’, A blag jeights Roystel. Natur.
 A febr; on wild Wheil ne pedular
residica And. Lly; ‘Monef Clantanor-sal
it
grov blared, dardn.
In the wav