# Project two – character-level language modeling in PyTorch

In the model that we will build now, the input is a text document, and our goal is to develop a model that can generate new text that is similar in style to the input document. Examples of such input are a book or a computer program in a specific programming language.

## Preprocessing the dataset

In [2]:
import numpy as np


In [3]:
# Reading and processing text
with open('book.txt', 'r', encoding='utf8') as fp:
    text = fp.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_indx = text.find('End of the Project Gutenberg')
text = text[start_indx:end_indx]
char_set = set(text)
print('Total length:', len(text))
print('Unique characters:', len(char_set))


Total length: 1112350
Unique characters: 80


In [4]:
# Building the dictionary to map characters to integers, and reverse mapping via indexing a NumPy array
chars_sorted = sorted(char_set)
char2int = {ch: i for i, ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32
)
print('text encoded shape:', text_encoded.shape)
print(text[:15], '== Encoding ==>', text_encoded[:15])
print(text_encoded[15: 21], '== Reverse ===>',
      ''.join(char_array[text_encoded[15:21]]))


text encoded shape: (1112350,)
THE MYSTERIOUS  == Encoding ==> [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28] == Reverse ===> ISLAND


In [5]:
# print out the mappings of the first five characters from this array
for c in text_encoded[:5]:
    print('{} -> {}'.format(c, char_array[c]))


44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [6]:
import torch

seq_length = 40
chunk_size = seq_length + 1
text_chunks = [text_encoded[i: i+chunk_size]
               for i in range(len(text_encoded) - chunk_size)]


In [7]:
for i in range(5):
    print(text_chunks[i], '--->', char_array[text_chunks[i]])
    print()


[44 32 29  1 37 48 43 44 29 42 33 39 45 43  1 33 43 36 25 38 28  1  6  6
  6  0  0  0  0  0 40 67 64 53 70 52 54 53  1 51 74] ---> ['T' 'H' 'E' ' ' 'M' 'Y' 'S' 'T' 'E' 'R' 'I' 'O' 'U' 'S' ' ' 'I' 'S' 'L'
 'A' 'N' 'D' ' ' '*' '*' '*' '\n' '\n' '\n' '\n' '\n' 'P' 'r' 'o' 'd' 'u'
 'c' 'e' 'd' ' ' 'b' 'y']

[32 29  1 37 48 43 44 29 42 33 39 45 43  1 33 43 36 25 38 28  1  6  6  6
  0  0  0  0  0 40 67 64 53 70 52 54 53  1 51 74  1] ---> ['H' 'E' ' ' 'M' 'Y' 'S' 'T' 'E' 'R' 'I' 'O' 'U' 'S' ' ' 'I' 'S' 'L' 'A'
 'N' 'D' ' ' '*' '*' '*' '\n' '\n' '\n' '\n' '\n' 'P' 'r' 'o' 'd' 'u' 'c'
 'e' 'd' ' ' 'b' 'y' ' ']

[29  1 37 48 43 44 29 42 33 39 45 43  1 33 43 36 25 38 28  1  6  6  6  0
  0  0  0  0 40 67 64 53 70 52 54 53  1 51 74  1 25] ---> ['E' ' ' 'M' 'Y' 'S' 'T' 'E' 'R' 'I' 'O' 'U' 'S' ' ' 'I' 'S' 'L' 'A' 'N'
 'D' ' ' '*' '*' '*' '\n' '\n' '\n' '\n' '\n' 'P' 'r' 'o' 'd' 'u' 'c' 'e'
 'd' ' ' 'b' 'y' ' ' 'A']

[ 1 37 48 43 44 29 42 33 39 45 43  1 33 43 36 25 38 28  1  6  6  6  0  0
  0  0  0 40

In [8]:
from torch.utils.data import Dataset


class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)

    def __getitem__(self, index):
        text_chunk = self.text_chunks[index]
        return text_chunk[:-1].long(), text_chunk[1:].long()


seq_dataset = TextDataset(torch.tensor(text_chunks))


  seq_dataset = TextDataset(torch.tensor(text_chunks))


In [9]:
# Let’s take a look at some example sequences from this transformed dataset:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x): ', repr(''.join(char_array[seq])))
    print('Target (y): ', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break


 Input (x):  'THE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced b'
Target (y):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'

 Input (x):  'HE MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by'
Target (y):  'E MYSTERIOUS ISLAND ***\n\n\n\n\nProduced by '



In [10]:
# transform this dataset into mini-batches
from torch.utils.data import DataLoader
batch_size = 64
torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size,
                    shuffle=True, drop_last=True)


## Building a character-level RNN model

In [11]:
import torch.nn as nn


class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, rnn_hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.rnn_hidden_size = rnn_hidden_size
        self.rnn = nn.LSTM(embed_dim, self.rnn_hidden_size, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_size, vocab_size)

    def forward(self, x, hidden, cell):
        out = self.embedding(x).unsqueeze(1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell

    def init_hidden(self, batch_size):
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden, cell


In [12]:
# specify the model parameters and create an RNN model
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512
torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model


RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

In [13]:
loss_fn = nn.CrossEntropyLoss()  # Multi-class classification
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


In [14]:
num_epochs = 2000
torch.manual_seed(1)
for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size)
    seq_batch, target_batch = next(iter(seq_dl))
    optimizer.zero_grad()
    loss = 0
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])

    loss.backward()
    optimizer.step()
    loss = loss.item() / seq_length
    if epoch % 500 == 0:
        print(f'Epoch {epoch} loss: {loss:.4f}')


Epoch 0 loss: 4.3719
Epoch 500 loss: 1.5092
Epoch 1000 loss: 1.3639
Epoch 1500 loss: 1.3214


## Evaluation phase – generating new text passages

In [19]:
from torch.distributions.categorical import Categorical


def sample(model, starting_str, len_generated_text=500, scale_factor=1.0):
    encoded_input = torch.tensor(
        [char2int[s] for s in starting_str]
    ).reshape((1, -1))
    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell)

    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell)
        logits = torch.squeeze(logits, 0)
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])

    return generated_str


In [20]:
# generate new text
torch.manual_seed(1)
print(sample(model, starting_str='The island'))


The island is necessary fire, wrutilay?”

“Yough!” cried Herbert.

“Yes; and it had happen out sown yourselves?”

“They would then twelve great several some day,” replied Herbert.
The engineer calculated by their trees. There could
exactle tree would let his eyes had to see the fiber but of course superous
two sincure, when better that coal, and which he touched for surveyed accose the
facatives, and in the matters, by a hear the cavern, and the sailor lad?”

“Were shoulded, any signonists!”

No, for the 


The scaling factor, 𝛼, can be interpreted as an analog to the temperature in physics. Higher temperatures result in more
entropy or randomness versus more predictable behavior at lower temperatures. By scaling the logits
with 𝛼<1, the probabilities computed by the softmax function become more uniform.

In [23]:
torch.manual_seed(1)
logits = torch.tensor([[1.0, 1.0, 3.0]])
print('Probabilities before scaling:        ',
      nn.functional.softmax(logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.5:',
      nn.functional.softmax(0.5*logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.1:',
      nn.functional.softmax(0.1*logits, dim=1).numpy()[0])


Probabilities before scaling:         [0.10650698 0.10650698 0.78698605]
Probabilities after scaling with 0.5: [0.21194156 0.21194156 0.57611686]
Probabilities after scaling with 0.1: [0.3104238  0.3104238  0.37915248]


large 𝛼 --> more predictabel

small 𝛼 --> more randomness

In [24]:
torch.manual_seed(1)
print(sample(model, starting_str='The island', scale_factor=2.0))


The island of the plateau of this season would be a leader of the southeast, and the sailor was not yet the surface of the balloon, which resture the depth of the forest, and became merculation of the beach, and it was not to be no doubted to devote them the forest, and the settlers had been brought the summit of the mountain was not time his master of the single of the balloon, captain,
and the pastaways, which would have been stones, he will not be a part of the colonists, and the sailor was only the po


In [25]:
torch.manual_seed(1)
print(sample(model, starting_str='The island', scale_factor=0.5))


The island
call is, helf and my”lnea-”ime.5’”
thed Skiletcain, to,--w rub.
He, Separent ov at they! ”e loodryswelling,” woudJ?”

This enclosmoy, Nock Nemied Thip’s recook! Wid fat
will heetpienc? Con
Palk, hy my
Ovide?” osand.
2xouait fur dutic wlickingly, it was in
arraughirn. He
regars nearctscriets
ton’&lupus, Propnivic.

Iveryath quap, roes, xitch. Withouth wash lrove.., Enewable
objac. Horure my, Tippstahs, huspeye clear rippects. No statts sfacbed!

Dubchify. The ify was,” saies off?, Ebbuired, perp
