In [1]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

import torch
from torch import nn

torch.__version__

'1.13.1'

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

## Character-Level Language Generator

The input is broken down into a sequence of characters that are fed into our network one character at a time.

Thus the model will process each new character in conjunction with the memory of previous seen characters to predict the next.

<b>Steps:</b>
1. Preparing the data
2. Building the RNN
3. Performing next-character prediction and sampling to generate new text

### Preprocessing the dataset

In [3]:
!curl -O https://www.gutenberg.org/files/1268/1268-0.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1143k  100 1143k    0     0   337k      0  0:00:03  0:00:03 --:--:--  337k


In [4]:
#Reading and processing text 
with open('1268-0.txt', 'r', encoding='utf-8') as f:
    text = f.read()

start_indx = text.find('THE MYSTERIOUS ISLAND')
end_index = text.find('END OF THE PROJECT GUTENBERG')

text = text[start_indx: end_index]

char_set = set(text)
print('Total Length:', len(text))
print('Unique Characters:', len(char_set))

Total Length: 1112300
Unique Characters: 80


In [5]:
print(text[:100])

THE MYSTERIOUS ISLAND ***




THE MYSTERIOUS ISLAND

by Jules Verne

1874




PART 1--DROPPED FROM T


### Creating dic for word to int format

In [6]:
chars_sorted = sorted(char_set)
char2int = {ch:i for i,ch in enumerate(chars_sorted)}
char_array = np.array(chars_sorted)

# Contains the encoded values of all the chracters in the text
text_encoded = np.array(
    [char2int[ch] for ch in text],
    dtype=np.int32)

print('Text encoded shape: ', text_encoded.shape)

print(text[:15], '     == Encoding ==> ', text_encoded[:15])
print(text_encoded[15:21], ' == Reverse  ==> ', ''.join(char_array[text_encoded[15:21]]))

Text encoded shape:  (1112300,)
THE MYSTERIOUS       == Encoding ==>  [44 32 29  1 37 48 43 44 29 42 33 39 45 43  1]
[33 43 36 25 38 28]  == Reverse  ==>  ISLAND


In [7]:
for ex in text_encoded[:5]:
    print(f'{ex} -> {char_array[ex]}')

44 -> T
32 -> H
29 -> E
1 ->  
37 -> M


In [8]:
#Total input sequence length = 40
#The inputs and the outputs are offset by 1 character
from torch.utils.data import Dataset

seq_length = 40
chunk_size = seq_length + 1

text_chunks = [text_encoded[i:i+chunk_size] for i in range(len(text_encoded) - chunk_size + 1)]

# text_chunks[:2]
for seq in text_chunks[:1]:
    input_seq = seq[:seq_length]
    target = seq[seq_length] 
    print(input_seq, ' -> ', target)
    print(repr(''.join(char_array[input_seq])), 
          ' -> ', repr(''.join(char_array[target])))

[44 32 29  1 37 48 43 44 29 42 33 39 45 43  1 33 43 36 25 38 28  1  6  6
  6  0  0  0  0  0 44 32 29  1 37 48 43 44 29 42]  ->  33
'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'  ->  'I'


In [9]:
class TextDataset(Dataset):
    def __init__(self, text_chunks):
        self.text_chunks = text_chunks

    def __len__(self):
        return len(self.text_chunks)
    
    def __getitem__(self, idx):
        text_chunk = self.text_chunks[idx]

        return text_chunk[:-1].long(), text_chunk[1:].long()

        # return torch.tensor(text_chunk[:-1]).long(), torch.tensor(text_chunk[1:]).long()
    
seq_dataset = TextDataset(torch.tensor(np.array(text_chunks)))

In [10]:
for i, (seq, target) in enumerate(seq_dataset):
    print(' Input (x):', repr(''.join(char_array[seq])))
    print('Target (y):', repr(''.join(char_array[target])))
    print()
    if i == 1:
        break

 Input (x): 'THE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTER'
Target (y): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'

 Input (x): 'HE MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERI'
Target (y): 'E MYSTERIOUS ISLAND ***\n\n\n\n\nTHE MYSTERIO'



In [11]:
from torch.utils.data import DataLoader

batch_size = 64

torch.manual_seed(1)
seq_dl = DataLoader(seq_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

### Building a Character-Level RNN model

In [12]:
class RNN(nn.Module):

    def __init__(self, vocab_size, embed_dim, rnn_hidden_state):
        super().__init__()

        self.rnn_hidden_size = rnn_hidden_state

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.LSTM(embed_dim, rnn_hidden_state, batch_first=True)
        self.fc = nn.Linear(rnn_hidden_state, vocab_size)

    def forward(self, x, hidden, cell):
        #unsqueeze is done to get the expected input of an RNN (batch_size, 1, sequence_length, embed_dim) -> if its bi-directional or not
        out = self.embedding(x).unsqueeze(dim=1)
        out, (hidden, cell) = self.rnn(out, (hidden, cell))
        out = self.fc(out).reshape(out.size(0), -1)
        return out, hidden, cell
    
    #Initialisation of hidden state
    def init_hidden(self, batch_size):
        #Reason why its batch_size x rnn_hidden_size is because for each sequence in the batch, the rnn will process it in parallel
        hidden = torch.zeros(1, batch_size, self.rnn_hidden_size)
        cell = torch.zeros(1, batch_size, self.rnn_hidden_size)
        return hidden.to(device), cell.to(device)
    
vocab_size = len(char_array)
embed_dim = 256
rnn_hidden_size = 512

torch.manual_seed(1)
model = RNN(vocab_size, embed_dim, rnn_hidden_size)
model = model.to(device)
model

RNN(
  (embedding): Embedding(80, 256)
  (rnn): LSTM(256, 512, batch_first=True)
  (fc): Linear(in_features=512, out_features=80, bias=True)
)

### Training loop

In [13]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10000

torch.manual_seed(1)

for epoch in range(num_epochs):
    hidden, cell = model.init_hidden(batch_size=batch_size)
    #Choose a random batch in DataLoader
    seq_batch, target_batch = next(iter(seq_dl))
    
    seq_batch = seq_batch.to(device=device)
    target_batch = target_batch.to(device=device)
    
    optimizer.zero_grad()
    
    loss = 0
    
    #RNN loop
    for c in range(seq_length):
        pred, hidden, cell = model(seq_batch[:, c], hidden, cell)
        loss += loss_fn(pred, target_batch[:, c])
        
    loss.backward()
    optimizer.step()
    
    loss = loss.item() / seq_length
    
    if epoch % 500 == 0:
        print(f"Epoch {epoch} loss: {loss:.4f}")

Epoch 0 loss: 4.3720
Epoch 500 loss: 1.5515
Epoch 1000 loss: 1.3341
Epoch 1500 loss: 1.2545
Epoch 2000 loss: 1.2063
Epoch 2500 loss: 1.1517
Epoch 3000 loss: 1.2063
Epoch 3500 loss: 1.1560
Epoch 4000 loss: 1.0666
Epoch 4500 loss: 1.1005
Epoch 5000 loss: 1.1027
Epoch 5500 loss: 1.0969
Epoch 6000 loss: 1.1341
Epoch 6500 loss: 1.0935
Epoch 7000 loss: 1.0429
Epoch 7500 loss: 1.0514
Epoch 8000 loss: 1.0266
Epoch 8500 loss: 1.0507
Epoch 9000 loss: 1.0259
Epoch 9500 loss: 1.0404


### Evaluation

Instead of selecting the character with the highest probabiliy from the softmax all the time, We can randomly sample from the probabilities. More randomous to the output.

This can be done using `torch.distributions.categorical.Categorical`, which we can use to draw random samples from a categorical distributions.

In [21]:
# Example
from torch.distributions.categorical import Categorical

torch.manual_seed(1)
logits = torch.tensor([[1.0, 1.0, 1.0]])

print('Probabilities:', torch.softmax(logits, dim=1).numpy()[0])

# Generate a disturbiton based on the logits and outputs values with that probabilies
# If using a large sample size, the output of the highest probability will be more compared to the rest
m = Categorical(logits=logits)
samples = m.sample((10,))

print(samples.numpy())

Probabilities: [0.33333334 0.33333334 0.33333334]
[[0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [1]
 [2]
 [1]
 [1]]


In [22]:
torch.manual_seed(1)

logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities:', nn.functional.softmax(logits, dim=1).numpy()[0])

m = Categorical(logits=logits)
samples = m.sample((10,))
 
print(samples.numpy())
#More of 2 than the other categories

Probabilities: [0.10650698 0.10650698 0.78698605]
[[0]
 [2]
 [2]
 [1]
 [2]
 [1]
 [2]
 [2]
 [2]
 [2]]


### Generating

The process of consuming the generated sequence as input for generating new elements is called autoregression

In [23]:
def sample(model, starting_str, 
           len_generated_text=500, 
           scale_factor=1.0):

    encoded_input = torch.tensor([char2int[s] for s in starting_str])
    encoded_input = torch.reshape(encoded_input, (1, -1))

    generated_str = starting_str

    model.eval()
    hidden, cell = model.init_hidden(1)
    hidden = hidden.to('cpu')
    cell = cell.to('cpu')
    
    #Updating the current hidden state from starting_str
    for c in range(len(starting_str)-1):
        _, hidden, cell = model(encoded_input[:, c].view(1), hidden, cell) 
    
    #Generation
    last_char = encoded_input[:, -1]
    for i in range(len_generated_text):
        logits, hidden, cell = model(last_char.view(1), hidden, cell) 
        logits = torch.squeeze(logits, 0)
        # To have more control, higher the scaling factor, results in more entropy or randomness versus more predictable behavior at lower temperature.
        # a < 1, probabilities computed by  a softmax becomes more uniform
        scaled_logits = logits * scale_factor
        m = Categorical(logits=scaled_logits)
        last_char = m.sample()
        generated_str += str(char_array[last_char])
        
    return generated_str

torch.manual_seed(1)
model.to('cpu')
print(sample(model, starting_str='The island'))

The island had been manifested,” said the reporter, “do you think the mystery then managed to opprody one in the ore, which
bent outside requiving. If it had completely delighted at least two or seven feet, were
attentively on the Pacific as possible, and he heard had taken to the
part of the streample. Twinhing passages and fulling for the passage.

For the night they were impossible cold round, and he struggled without deep six feet long, and friend, had taken with parates covered with one elements, and


In [27]:
print(sample(model, starting_str='Happy '))

Happy man had employed, and no obstacle and gazing and himself, and that part of their
height of despatch, formed of cornara the stone-put the point of six
coming again, he carried on a series. From this bottle one
of Captain Grant’s under the trees at the island, and in by means, as soon as those of the balloon, fell on the beach of the banks of
zinc, Lincoln Island to him his suffered.

“The ‘Duncan’.” The engineer made to known the colonists who had the blocks of the terrible storm: full-lower pard


In [30]:
print(sample(model, starting_str='Merry Christmas '))

Merry Christmas were not doing to the
mist pirate, from thence the absence of the Mercy, the round had slightly dainer, dempatienced!

The corral: which are heard. It might have landed he had into the hundred feet from his winghbot. Provisions were graid along
the vessel on the ship of March, and if an hour, and oncamers, stone has found at the distance of eruption? No! I crus Harding, deposition,” he exclaimed, “might now about
the beach.

It was discovered away in the boat, and suddenly hands of the transmire


### Scaling

In natural language processing, when generating text using a language model, there is often a trade-off between producing text that is highly fluent and text that is highly creative. One way to control this trade-off is to adjust the "temperature" of the sampling process.

The temperature parameter essentially controls the level of randomness in the text generation process. A higher temperature will result in more creative but less fluent text, while a lower temperature will result in less creative but more fluent text.

The scale_factor parameter in this function is used to adjust the temperature of the sampling process by scaling the logits (the output of the model before the softmax activation function is applied) before sampling from them. Specifically, if the scale_factor is set to a value greater than 1, it will increase the temperature, and if it is set to a value less than 1, it will decrease the temperature.

So, for example, if scale_factor=0.5, then the logits are multiplied by 0.5, which makes the softmax distribution "sharper", resulting in a lower temperature and more conservative text generation. On the other hand, if scale_factor=1.5, then the logits are multiplied by 1.5, which makes the softmax distribution "softer", resulting in a higher temperature and more creative text generation.

In [31]:
logits = torch.tensor([[1.0, 1.0, 3.0]])

print('Probabilities before scaling:        ', nn.functional.softmax(logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.5:', nn.functional.softmax(0.5*logits, dim=1).numpy()[0])
print('Probabilities after scaling with 0.1:', nn.functional.softmax(0.1*logits, dim=1).numpy()[0])


Probabilities before scaling:         [0.10650698 0.10650698 0.78698605]
Probabilities after scaling with 0.5: [0.21194156 0.21194156 0.57611686]
Probabilities after scaling with 0.1: [0.3104238  0.3104238  0.37915248]


In [32]:
print(sample(model, starting_str='Happy ', scale_factor=2))

Happy many part of the coast, they were obliged to be seen.

At this moment the colonists showed him at a distance of the opinion, and as they could not be more than a few steps had been discovered at the state of the forest. An immense movement
to the beach and animals and stranger, and proceeded to the stranger, who had thrown at the situation of the surface of the world, and became
surprised the engineer.

All three miles were obstacles which he had not at the same time also the sand, and the sailo


In [33]:
print(sample(model, starting_str='Happy ', scale_factor=1))

Happy and I
ammedotical presentiment?”

“Consequently, he began to run us!”

Pencroft and the rising passenies, 3 apes from the fat, which he wished to destroy as for a few days would have exposed from
the shore.

From the name of Captain Nemo, and also the needles returned Neb.”

“The occause of Curable part of this favor of a short bank, because had become kabor. I hope may arrived for impressed forests. It was closed with wetted for the end.

It was of everything.”

“Pencroft?”

As some week of Apr
