In [2]:
import random
import re
import numpy as np
import tqdm
import time
import pexpect
import string
import json

In [3]:
punctuation = string.punctuation

# Cleaning Function

In [4]:
def merge_spaces(text):
    text = re.sub("\s{2,}", " ", text)
    return text.strip()

In [5]:
def remove_punctuation(text):
    for p in punctuation:
        text = text.replace(p, " ")
    return text

In [6]:
def clean_data(text):
    text = remove_punctuation(text)
    text = merge_spaces(text)
    text = text.lower()
    return text

# Load Data

To simplify our lives we remove everything thanks a code from https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string

In [7]:
import unicodedata

In [8]:
def strip_accents(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if unicodedata.category(c) != 'Mn')

In [9]:
#data_file = "./data/sherlock/input.txt"
#data_file = "./data/names/French.txt"
#data_file = "./data/shakespear.txt"
#data_file = "./data/french_debats.txt"
data_file = "./data/kaamelot/input.txt"

In [10]:
data_text = []
with open(data_file, "r") as f:
    for line in f:
        line = line.replace("\n", "").strip()
        if len(line) > 0:
            data_text.append(line)
        """
            data_text.append(clean_data(line))
        """

In [11]:
print("Length of Data: {} \n".format(len(data_text)))
print("Random Text: {}".format(data_text[random.randint(0, len(data_text))]))

Length of Data: 520 

Random Text: Arthur : Bon, ben alors, OK, on reprend depuis le début ; donc, Calogrenant est posté depuis hier soir au Nord-Est de la zone d’attaque…


## Words to Vectors

To feed any Neural Network, we need vectors.

An Embedding Module is available on [Pytorch](http://pytorch.org/docs/master/nn.html#sparse-layers).

Here, I decided to encode by myself characters. To do this, I use [one-hot-encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f). 
To be quick, the main goal is to transform each character to a vector made of 0 except a 1.

In [12]:
import torch
import torch.nn as nn
from torch.autograd import Variable

We start by getting all characters that are in the text loaded.

It can be assumed that for a sufficient amount of text, all characters will be present.

In [13]:
def GetAllCharacters(list_text: list):
    text = ''.join(list_text)
    return list(set(text))

In [14]:
data_text_join = "\n".join(data_text)

In [15]:
list_characters = GetAllCharacters(data_text_join)
n_characters = len(list_characters)
print("{} characters".format(n_characters))

94 characters


### Get Data

In [16]:
chunk_len = 100

We get the biggest sentence. It will be the maximum size, for the smallest sentence, we will pad by adding padding element.

In [17]:
def random_chunk():
    start_index = random.randint(0, len(data_text_join) - chunk_len - 1)
    end_index = start_index + chunk_len + 1
    return data_text_join[start_index:end_index]

In [18]:
print(random_chunk())

 et puis c'est parti en vrille.
Arthur : Vous voulez que je l'oblige à vous accueillir ?
Perceval : Ç


In [19]:
def GetIndexCharacter(c):
    if c not in list_characters:
        c = " "
    return list_characters.index(c)

In [20]:
def text_to_character_index_tensor(text):
    tensor = torch.zeros(len(text)).long()
    for i, c in enumerate(text):
        tensor[i] = GetIndexCharacter(c)
    return Variable(tensor)

## Get Input

The RNN will have to predict the next character.
In input, it will get a one-hot tensor as explanied above.
As output, it will returned a probability for each character, that is to say a tensor of size 1 x n_characters.

This output will be compared to the index expected.

In [21]:
def TextToInput(text):
    y = text_to_character_index_tensor(text[1:])
    x = text_to_character_index_tensor(text[:-1])
    return x, y

In [22]:
n_element = 500

In [23]:
data_vectors = []
for i in tqdm.tqdm(range(n_element)):
    text = random_chunk()
    x, y = TextToInput(text)
    data_vectors.append({
        'index_text': i,
        'x': x,
        'y': y
    })

100%|██████████| 500/500 [00:00<00:00, 2102.09it/s]


In [24]:
t = ""
for elem in data_vectors[20]["x"]:
    #values, indices = torch.max(elem, 0)
    t += list_characters[int(elem.data.tolist()[0])]
print(t)

 par Alexandre Astier.
Léodagan : Dites, j’me demandais, là… Il est venu, Ketchatar ?
Arthur : Il es


## Our Class Model

In [25]:
class RNN(nn.Module):
    def __init__(self, 
                 input_size, 
                 hidden_size, 
                 output_size, 
                 num_layers=1,
                 bidirectional=False,
                 model_type="RNN",
                 dropout=0.2):
        
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.model_type = model_type
        self.dropout = dropout
        
        self.encoder = nn.Embedding(input_size, hidden_size)
        self.rnn = getattr(nn, model_type)(hidden_size, 
                                           hidden_size, 
                                           num_layers=num_layers,
                                           dropout=dropout,
                                           bidirectional=bidirectional,
                                           batch_first=True)
        
        self.decoder = nn.Linear(hidden_size * (int(bidirectional) + 1), output_size)
    
    def forward(self, x, hidden):
        x = self.encoder(x)
        output, hidden = self.rnn(x, hidden)
        output = self.decoder(output)
        return output, hidden

    def init_hidden(self, batch):
        return Variable(torch.zeros(self.num_layers * (int(self.bidirectional) + 1), batch, self.hidden_size))

## Functions Training

The accuracy here is if the next character predicted is the sma eas expected

In [57]:
def get_accuracy(y, y_pred):
    accurate = 0
    for i, o in enumerate(y_pred.squeeze(0).data.numpy()):
        if np.where(o == max(o))[0][0] == y.data[i]:
            accurate += 1
    return accurate / y.size()[0]

## Generate Words

In [78]:
def generate(prime_str='A', predict_len=100, temperature=0.6):
    hidden = rnn.init_hidden(1)
    prime_input, _ = TextToInput(prime_str)
    predicted = prime_str

    # Use priming string to "build up" hidden state
    for p in range(len(prime_str) - 1):
        _, hidden = rnn(prime_input[p].view(1, -1), hidden)
    inp = prime_input
    
    for p in range(predict_len):
        hidden = rnn.init_hidden(1)
        output, hidden = rnn(inp.view(1, -1), hidden)
        last_char = output.squeeze(0)[-1].data.numpy()
        #top_i = np.where(last_char == max(last_char))[0][0]
        # Sample from the network as a multinomial distribution
        output_dist = output.squeeze(0)[-1].div(temperature).exp()
        top_i = torch.multinomial(output_dist, 1)[0]
        
        # Add predicted character to string and use as next input
        predicted_char = list_characters[top_i]
        predicted += predicted_char
        inp = text_to_character_index_tensor(predicted[-20:])
    return predicted

# Training

Plot and Generation are saved and [Tensorboard](https://github.com/lanpa/tensorboard-pytorch) Writer. It gathered all the information and the Grpah model.

To run it:
* Install TensorboadX (Tensorboard for Pytorch): pip install tensorboardX
* Run the folder where you saved your tensorboad (here runs): tensorboard --logdir runs
* Open your local port 6006 (Tip: 6006 --> GOOG for Google because, it's originally a Tensorflow tool ;) )

In [74]:
from tensorboardX import SummaryWriter

In [80]:
rnn = RNN(input_size=n_characters, 
          hidden_size=100, 
          output_size=n_characters, 
          num_layers=1,
          bidirectional=True,
          model_type="RNN",
          dropout=0.2)

In [81]:
rnn.parameters

<bound method RNN.parameters of RNN(
  (encoder): Embedding(94, 100)
  (rnn): RNN(100, 100, batch_first=True, dropout=0.2, bidirectional=True)
  (decoder): Linear(in_features=200, out_features=94, bias=True)
)>

Let's Go !!! RUN !!!!!

In [82]:
n_epochs = 10
update_loss_every = 50
print_every = 50

all_losses = []
all_accuracy = []
loss_avg = 0
lr = 0.001
writer = SummaryWriter()
writer.add_graph(rnn, Variable(torch.zeros(1, chunk_len).long()))
optimizer = torch.optim.Adam(rnn.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
list_random = list(np.random.randint(0, len(data_vectors), n_epochs))

for epoch in range(0, n_epochs):
    for i, elem in tqdm.tqdm(enumerate(data_vectors)):
        x = elem["x"]
        y = elem["y"]

        optimizer.zero_grad()
        hidden = rnn.init_hidden(1)
        output, hidden = rnn(x.view(1, -1), hidden)
        all_accuracy.append(get_accuracy(y, output.squeeze(0)))
        loss = criterion(output.squeeze(0), y)

        loss.backward()
        optimizer.step()
        all_losses.append(loss.data[0])
    writer.add_scalar('loss', np.mean(all_losses[-len(data_vectors):]), epoch)
    writer.add_scalar('accuracy', np.mean(all_accuracy[-len(data_vectors):]), epoch)
    print('Epochs: {}'.format(epoch))
    writer.add_text('Generation ' + str(epoch), generate('Arthur', 300, 0.6).encode("ascii", "ignore").decode("ascii"), epoch)
    writer.add_text('Generation ' + str(epoch), generate('Perceval', 300, 0.6).encode("ascii", "ignore").decode("ascii"), epoch)
    writer.add_text('Generation ' + str(epoch), generate('Leod', 300, 0.6).encode("ascii", "ignore").decode("ascii"), epoch)
writer.close()


0it [00:00, ?it/s][A
2it [00:00, 18.75it/s][A
4it [00:00, 19.06it/s][A
6it [00:00, 19.26it/s][A
9it [00:00, 19.56it/s][A
11it [00:00, 18.69it/s][A
13it [00:00, 17.73it/s][A
15it [00:00, 17.82it/s][A
17it [00:00, 17.81it/s][A
19it [00:01, 17.63it/s][A
21it [00:01, 17.41it/s][A
23it [00:01, 17.62it/s][A
25it [00:01, 17.79it/s][A
27it [00:01, 16.78it/s][A
29it [00:01, 17.59it/s][A
31it [00:01, 18.13it/s][A
33it [00:01, 17.55it/s][A
35it [00:01, 17.24it/s][A
37it [00:02, 16.34it/s][A
39it [00:02, 16.99it/s][A
41it [00:02, 16.92it/s][A
44it [00:02, 17.88it/s][A
46it [00:02, 18.15it/s][A
48it [00:02, 18.25it/s][A
51it [00:02, 18.48it/s][A
54it [00:02, 19.13it/s][A
56it [00:03, 18.60it/s][A
58it [00:03, 18.02it/s][A
60it [00:03, 18.35it/s][A
62it [00:03, 17.70it/s][A
64it [00:03, 18.23it/s][A
66it [00:03, 17.51it/s][A
68it [00:03, 18.02it/s][A
350it [00:30, 11.67it/s][A
500it [00:26, 19.50it/s]


Epochs: 0


500it [00:24, 20.04it/s]


Epochs: 1


500it [00:26, 19.02it/s]


Epochs: 2


500it [00:25, 19.64it/s]


Epochs: 3


500it [00:25, 19.35it/s]


Epochs: 4


500it [00:25, 19.84it/s]


Epochs: 5


500it [00:24, 21.09it/s]


Epochs: 6


500it [00:25, 20.31it/s]


Epochs: 7


500it [00:24, 20.69it/s]


Epochs: 8


500it [00:24, 20.79it/s]


Epochs: 9
