In [None]:
import random
import re
import numpy as np
import tqdm

# Load Data

To keep the character set small, we define a helper that strips accents, based on this snippet: https://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string

In [None]:
import unicodedata

In [None]:
def strip_accents(s):
    """Return ``s`` with its combining accent marks removed.

    The string is first decomposed with Unicode NFD normalisation, which
    splits an accented letter into its base letter followed by combining
    marks; every character whose Unicode category is ``Mn`` (nonspacing mark)
    is then dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

In [None]:
# Alternative corpus (disabled): "./data/sherlock/input.txt"
# Active corpus: a plain-text file that is read line by line below.
data_file = "./data/names/French.txt"

In [None]:
# Read the corpus: one sample per non-blank line.
data_text = []
# Decode explicitly as UTF-8 so that accented characters do not depend on the
# platform's default encoding.
with open(data_file, "r", encoding="utf-8") as f:
    for raw_line in f:
        # str.strip() already removes the trailing newline together with any
        # surrounding whitespace.
        line = raw_line.strip()
        if line:
            data_text.append(line)

In [None]:
# Quick sanity check of the loaded corpus.
print("Length of Data: {} \n".format(len(data_text)))
# random.choice always picks a valid element; random.randint(0, len(data_text))
# includes its upper bound and could therefore index one past the end.
print("Random Text: {}".format(random.choice(data_text)))

## Words to Vectors

To feed any Neural Network, we need vectors.

An Embedding Module is available on [Pytorch](http://pytorch.org/docs/master/nn.html#sparse-layers).

Here, I decided to encode the characters myself. To do this, I use [one-hot-encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f).
In short, each character is turned into a vector of zeros with a single 1 at the index of that character.

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.utils.data import DataLoader, TensorDataset

We start by getting all characters that are in the text loaded.

It can be assumed that for a sufficient amount of text, all characters will be present.

In [None]:
def GetAllCharacters(list_text: list):
    """Return the distinct characters found in ``list_text`` as a sorted list.

    Args:
        list_text: list of strings to scan.

    Returns:
        A list holding each distinct character once, in sorted order. Sorting
        makes the character -> index mapping reproducible across interpreter
        runs; iterating a bare ``set`` of strings gives an order that can
        change from one run to the next.
    """
    return sorted(set(''.join(list_text)))

We add an *End of String* element, which tells the model when to stop.

In [None]:
# Marker appended to the vocabulary to denote the end of a string.
EOS = "EOS"

In [None]:
# Vocabulary: every distinct character of the corpus, with the EOS marker
# stored as the last entry.
list_characters = [*GetAllCharacters(data_text), EOS]
n_characters = len(list_characters)
print("{} characters".format(n_characters))

In [None]:
def GetIndexCharacter(c):
    """Return the position of ``c`` in the global ``list_characters``.

    Raises:
        ValueError: if ``c`` is not part of the vocabulary.
    """
    if c in list_characters:
        return list_characters.index(c)
    raise ValueError("{} is not a character available !".format(c))

In [None]:
def text_to_one_hot_vector(text):
    """Encode ``text`` as a list of one-hot rows, one row per character.

    Only the first ``max_length`` characters are encoded. Each row has
    ``n_characters`` entries, all 0 except a single 1 at the index returned by
    ``GetIndexCharacter`` for that character.

    Raises:
        ValueError: propagated from ``GetIndexCharacter`` for an unknown
            character.
    """
    text_vector = []
    for c in text[:max_length]:
        # A fresh row is required for every character: reusing one shared
        # list would make all rows the same object and accumulate every 1.
        v = [0] * n_characters
        v[GetIndexCharacter(c)] = 1
        text_vector.append(v)
    return text_vector

## Get Input

The RNN has to predict the next character.
As input, it gets a one-hot tensor as explained above.
As output, it returns a probability for each character, that is to say a tensor of size 1 x n_characters.

This output will be compared to the index expected.

In [None]:
# Maximum number of characters kept per text; shorter texts are padded up to
# this length when zero padding is enabled.
max_length = 500

In [None]:
def text_to_input(text, zero_padding=True):
    """Build the (input, target) pair used to train next-character prediction.

    Args:
        text: string to encode; only the first ``max_length`` characters are
            used.
        zero_padding: when True and ``text`` is shorter than ``max_length``,
            pad both sequences up to ``max_length``.

    Returns:
        ``(x, y)`` where ``x`` is the list of one-hot rows produced by
        ``text_to_one_hot_vector`` and ``y`` is the list of target indices:
        the index of each following character, then the End of String index.
        Padding appends all-zero rows to ``x`` and End of String indices
        to ``y``.

    Note:
        For an empty ``text``, ``y`` holds one more element than ``x``.
    """
    # The EOS marker is stored as the last vocabulary entry.
    eos_index = n_characters - 1

    # Targets start at the second character: the first one is never predicted.
    y = [GetIndexCharacter(c) for c in text[1:max_length]]
    y.append(eos_index)
    x = text_to_one_hot_vector(text)

    if zero_padding and len(text) < max_length:
        n_pad = max_length - len(text)
        y.extend([eos_index] * n_pad)
        # Build each padding row separately so that the rows do not all refer
        # to one shared list.
        x.extend([0] * n_characters for _ in range(n_pad))
    return x, y

In [None]:
def data_to_batch(data, batch_size):
    """Encode every text of ``data`` and wrap the result in a DataLoader.

    Each text goes through ``text_to_input``; the inputs are stacked into a
    float tensor and the targets into a long tensor, which are paired in a
    ``TensorDataset`` served in shuffled batches of ``batch_size``.
    """
    encoded = [text_to_input(text) for text in data]
    inputs = [pair[0] for pair in encoded]
    targets = [pair[1] for pair in encoded]

    dataset = TensorDataset(torch.Tensor(inputs), torch.LongTensor(targets))
    return DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Mini-batch size, used for the DataLoader here and passed to the model later.
batch = 32
# Encode the whole corpus once and serve it in shuffled mini-batches.
train_loader = data_to_batch(data=data_text, batch_size=batch)

## Our Class Model

In [None]:
class RNN(nn.Module):
    """Recurrent core followed by a linear decoder and a softmax.

    Args:
        input_size: size of each input vector.
        hidden_size: number of hidden units per layer and direction.
        output_size: number of output classes.
        num_layers: number of stacked recurrent layers.
        bidirectional: whether the recurrent core runs in both directions.
        model_type: name of the ``torch.nn`` recurrent class to instantiate
            (looked up with ``getattr(nn, model_type)``), e.g. ``"RNN"``.
        batch_size: default batch size used by ``init_hidden``.
        dropout: dropout probability between stacked recurrent layers.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 output_size,
                 num_layers=1,
                 bidirectional=False,
                 model_type="RNN",
                 batch_size=32,
                 dropout=0.5):

        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        self.model_type = model_type
        self.dropout = dropout
        self.batch_size = batch_size
        # A bidirectional core doubles both the feature size of its output
        # and the leading dimension of its hidden state.
        self.num_directions = 2 if bidirectional else 1

        # Dropout is applied between stacked layers only; with one layer it
        # has no effect and PyTorch emits a warning, so disable it there.
        rnn_dropout = dropout if num_layers > 1 else 0
        self.rnn = getattr(nn, model_type)(input_size,
                                           hidden_size,
                                           num_layers=num_layers,
                                           dropout=rnn_dropout,
                                           bidirectional=bidirectional,
                                           batch_first=True)
        self.decoder = nn.Linear(hidden_size * self.num_directions, output_size)

    def forward(self, x, hidden=None):
        """Run the recurrent core and decode every time step.

        Args:
            x: batch-first input of shape (batch, seq_len, input_size).
            hidden: initial hidden state, or None to let the recurrent core
                start from zeros sized for the batch in ``x``.

        Returns:
            ``(output, hidden)`` where ``output`` holds, for every time step,
            probabilities over the ``output_size`` classes (they sum to 1
            along the last dimension) and ``hidden`` is the final state.

        Note:
            ``nn.CrossEntropyLoss`` expects unnormalised scores; to train on
            these probabilities, take their logarithm and use ``nn.NLLLoss``.
        """
        output, hidden = self.rnn(x, hidden)
        output = self.decoder(output)
        # Normalise over the class dimension explicitly: without ``dim`` the
        # legacy default for a 3-D input is dimension 0, i.e. the batch.
        output = torch.nn.functional.softmax(output, dim=-1)
        return output, hidden

    def init_hidden(self, batch_size=None):
        """Return a zero initial hidden state.

        Args:
            batch_size: batch size of the state; defaults to the
                ``batch_size`` given at construction.

        Returns:
            A tensor of shape (num_layers * num_directions, batch_size,
            hidden_size), or a pair of such tensors when ``model_type`` is
            ``"LSTM"`` (hidden state and cell state).
        """
        if batch_size is None:
            batch_size = self.batch_size
        shape = (self.num_layers * self.num_directions, batch_size, self.hidden_size)
        if self.model_type == "LSTM":
            return (Variable(torch.zeros(*shape)), Variable(torch.zeros(*shape)))
        return Variable(torch.zeros(*shape))

## Training Functions

In [None]:
def int_to_one_hot_vectors(i, length):
    """Return a 1 x ``length`` float tensor that is 0 everywhere except at ``i``.

    The index is applied with regular list indexing, so an out-of-range ``i``
    raises ``IndexError``.
    """
    row = [0] * length
    row[i] = 1
    return Variable(torch.Tensor([row]))

In [None]:
def train(x, y):
    """Run one optimisation step over ``x`` and return a normalised loss.

    Relies on the module-level ``rnn``, ``hidden_init``, ``criterion`` and
    ``optimizer``. Every element of ``x`` is passed through ``rnn`` in turn,
    carrying the hidden state forward, and the per-element losses are summed
    before a single backward pass and optimiser step.

    Returns:
        The summed loss divided by the size of the first dimension of ``x``.

    NOTE(review): no call to this helper appears in the cells shown here.
    NOTE(review): the expected shapes of ``x`` and ``y`` are not visible from
    this cell; confirm them against the caller before relying on it.
    """
    # Every call starts from the shared initial hidden state.
    hidden = hidden_init
    rnn.zero_grad()
    loss = 0

    for i, elem in enumerate(x):
        output, hidden = rnn(elem, hidden)
        # Only the first entry of y[i] is used to build the one-hot target.
        # NOTE(review): confirm that `criterion` accepts a one-hot float
        # target rather than a class index.
        y_tensor = int_to_one_hot_vectors(y[i].data.tolist()[0], n_characters)
        loss += criterion(output, y_tensor)
        
    loss.backward()
    optimizer.step()
    # NOTE(review): `loss.data[0]` assumes a one-element, one-dimensional loss;
    # recent PyTorch versions return a 0-dim loss and expose `loss.item()`.
    return loss.data[0] / x.size()[0]

## Generate Words

In [None]:
def generate(text_start, predict_len=100):
    """Greedily extend ``text_start`` with the global ``rnn``.

    The model first reads ``text_start`` (at most ``max_length`` characters),
    then the most probable next character is appended repeatedly, each
    prediction being fed back as a one-hot input.

    Args:
        text_start: non-empty priming string; every character must belong to
            the vocabulary.
        predict_len: maximum number of characters to generate.

    Returns:
        ``text_start`` followed by the generated characters. Generation stops
        early when the End of String marker is predicted; the marker itself
        is not added to the result.

    Raises:
        ValueError: if ``text_start`` is empty or contains a character that
            is not in the vocabulary.
    """
    if len(text_start) == 0:
        raise ValueError("text_start must contain at least one character")

    # One (1, n_characters) one-hot row per priming character, stacked and
    # given a batch dimension: (1, seq_len, n_characters).
    prime_rows = [int_to_one_hot_vectors(GetIndexCharacter(c), n_characters)
                  for c in text_start[:max_length]]
    step_input = torch.cat(prime_rows, 0).unsqueeze(0)

    # Sample in evaluation mode so dropout is disabled, then restore the
    # previous mode so a surrounding training loop is unaffected.
    was_training = rnn.training
    rnn.eval()
    try:
        # None lets the recurrent core create a zero state with the right
        # number of layers for a batch of one.
        hidden = None
        predicted = text_start
        for _ in range(predict_len):
            output, hidden = rnn(step_input, hidden)
            # The prediction for the next character is the one at the LAST
            # time step of the only batch element.
            top_i = torch.topk(output, 1)[1].data.tolist()[0][-1][0]
            predicted_char = list_characters[top_i]
            if predicted_char == EOS:
                break
            predicted += predicted_char
            # Feed the chosen character back as a (1, 1, n_characters) input.
            step_input = int_to_one_hot_vectors(top_i, n_characters).unsqueeze(0)
    finally:
        rnn.train(was_training)

    return predicted

## Start training

In [None]:
# Three stacked vanilla RNN layers of 200 hidden units each; the input and
# output sizes both equal the vocabulary size.
rnn = RNN(input_size=n_characters, 
          hidden_size=200, 
          output_size=n_characters, 
          num_layers=3,
          bidirectional=False,
          model_type="RNN",
          batch_size=batch,
          dropout=0.5)

In [None]:
# Zero initial hidden state, sized with the batch size the model was built with.
hidden_init = rnn.init_hidden()

In [None]:
n_epochs = 10
# print_every and plot_every are kept for configuration; this cell does not
# read them.
print_every = 1000
plot_every = 100

all_losses = []
loss_avg = 0

# NOTE(review): 0.1 is far above Adam's default learning rate (1e-3); lower it
# if the loss does not decrease.
optimizer = torch.optim.Adam(rnn.parameters(), lr=0.1)
# The model already outputs softmax probabilities, so the loss is the negative
# log-likelihood of their logarithm. CrossEntropyLoss would apply a second
# softmax on top of them.
criterion = nn.NLLLoss()

for epoch in range(0, n_epochs):
    print('Epochs: {}'.format(epoch))
    rnn.train()
    for data, target in tqdm.tqdm(train_loader):
        data = Variable(data)
        target = Variable(target)
        optimizer.zero_grad()
        # Passing None starts from a zero hidden state sized for this batch,
        # so the last batch may be smaller than `batch`.
        y_pred, hidden = rnn(data, None)
        # Clamp before the log so a probability of exactly 0 cannot give -inf.
        log_probs = torch.log(y_pred.clamp(min=1e-12))

        # Sum over the batch of each sample's loss averaged over time steps.
        loss = 0
        for sample_log_probs, sample_target in zip(log_probs, target):
            loss += criterion(sample_log_probs, sample_target)

        loss.backward()
        optimizer.step()
        # Store a plain float: keeping the tensor would keep its whole
        # autograd graph alive and cannot be plotted directly.
        all_losses.append(loss.item() / data.size(0))

    # Sampling is only a progress indicator: report a failure instead of
    # hiding it, and never let it interrupt training.
    try:
        print(generate("je"))
    except Exception as err:
        print("Sample generation failed: {!r}".format(err))

# Plotting the Training Losses

Plotting the historical loss from all_losses shows the network learning:

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
%matplotlib inline

# Convert to plain floats first: the entries may be tensors that are still
# attached to the autograd graph, which cannot be converted to an array as is.
loss_values = [float(value) for value in all_losses]

fig, ax = plt.subplots()
ax.plot(loss_values)
ax.set_xlabel("Batch")
ax.set_ylabel("Loss")
ax.set_title("Training loss");

In [None]:
# Stand-alone MSELoss example on random 3 x 5 tensors; it does not use the
# model or the corpus defined above.
# NOTE(review): this cell rebinds `loss` and `target`, which the training loop
# also assigns, and shadows the builtin `input`.
loss = nn.MSELoss()
input = Variable(torch.randn(3, 5), requires_grad=True)
target = Variable(torch.randn(3, 5))
output = loss(input, target)
# Back-propagate the scalar loss to populate `input.grad`.
output.backward()

# Exercises

* Train with your own dataset, e.g.
    * Text from another author
    * Blog posts
    * Code
* Increase number of layers and network size to get better results