In [8]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os

def findFiles(path): return glob.glob(path)
files = findFiles('data/*.txt')
print(files)
import re
def readLines(filename):
    lines = open(filename, encoding='utf-8').read().strip().split('\n')
    lines = [s.lower() for s in lines]
    lines = [re.sub("[\t\s]", '', s) for s in lines]
    return lines

for filename in files:
    lines = readLines(filename)

['data/straattaal.txt']


In [9]:
letters = set(" ".join(lines))
all_letters = ""
for s in letters:
	all_letters += all_letters.join(s)

n_letters = len(all_letters) + 1 # EOS
all_letters, len(all_letters), n_letters

('vtdlcg bnuhrzmjskpefiçxoway', 27, 28)

In [10]:
def inputTensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)
    for li in range(len(line)):
        letter = line[li]
        tensor[li][0][all_letters.find(letter)] = 1
    return tensor

In [11]:
def targetTensor(line):
    letter_indexes = [all_letters.find(line[li]) for li in range(1, len(line))]
    letter_indexes.append(n_letters - 1) # EOS
    return torch.LongTensor(letter_indexes)

In [12]:
def randomChoice(l):
	return l[torch.randint(len(l), size=[1])]
def randomTrain():
	line = randomChoice(lines)
	input_line_tensor = inputTensor(line)
	target_line_tensor = targetTensor(line)
	return input_line_tensor, target_line_tensor

In [13]:
inputs, target = randomTrain()
inputs.size(), target.size()

(torch.Size([5, 1, 28]), torch.Size([5]))

In [14]:
import torch

import torch.nn as nn

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()

        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
        self.i2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.1)

    def forward(self, input, hidden):
        combined = torch.cat((input, hidden), 1)
        hidden = self.tanh(self.i2h(combined))
        output = self.i2o(hidden)
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        return torch.zeros(1, self.hidden_size)

n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_letters)

In [15]:
criterion = nn.NLLLoss()
learning_rate = 0.0001

def train(input_line_tensor, target_line_tensor):
    target_line_tensor.unsqueeze_(-1)
    hidden = rnn.initHidden()
    rnn.zero_grad()
    loss = 0

    for i in range(input_line_tensor.size(0)):
        output, hidden = rnn(input_line_tensor[i], hidden)
        l = criterion(output, target_line_tensor[i])
        loss += l

    loss.backward()

    for p in rnn.parameters():
        p.data.add_(p.grad.data, alpha=-learning_rate)

    return output, loss.item() / input_line_tensor.size(0)

In [16]:
import time
import math

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

In [17]:
rnn = RNN(n_letters, 128, n_letters)

In [18]:
n_iters = 50000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0 # Reset every plot_every iters

start = time.time()

for iter in range(1, n_iters + 1):
    output, loss = train(*randomTrain())
    total_loss += loss

    if iter % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iter, iter / n_iters * 100, loss))

    if iter % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

0m 8s (5000 10%) 2.6905
0m 17s (10000 20%) 2.4666
0m 27s (15000 30%) 2.6794
0m 36s (20000 40%) 2.2552
0m 46s (25000 50%) 2.2539
0m 55s (30000 60%) 2.0132
1m 5s (35000 70%) 2.5100
1m 14s (40000 80%) 2.3528
1m 23s (45000 90%) 2.4330
1m 32s (50000 100%) 2.6964


In [19]:
import matplotlib.pyplot as plt

plt.figure()
plt.plot(all_losses)
plt.show()

<Figure size 640x480 with 1 Axes>

In [20]:
def sample(start_letter='a', maxn=20, temp=0):
	with torch.no_grad():
		input = inputTensor(start_letter)
		hidden = rnn.initHidden()
		output_name = start_letter
		for i in range(maxn):
			output, hidden = rnn(input[0], hidden)
			if temp != 1:
				# print("use temp")
				probs = torch.softmax(output, 1) / temp
				dist = torch.distributions.Categorical(probs)
				pick = dist.sample()
			else:
				topv, topi = output.topk(1)
				pick = topi[0][0]
			if pick == n_letters - 1:
				break
			else:
				letter = all_letters[pick]
				output_name += letter
			input = inputTensor(letter)
		return output_name

In [23]:
for i in range(30):
	print(sample(start_letter=randomChoice(all_letters), temp=1))

 aaa
eaaaa
kaaaa
baaa
taa
iaaa
çaaa
laie
laaa
saa
laaa
saaa
foee
caaa
haaa
uaa
caaa
gaa
kanae
çaaa
paa
naaa
zaaa
faaa
paa
iaaa
ioee
faaa
daaa
caaaa


In [2]:
print(sorted("vtdlcg bnuhrzmjskpefiçxoway"))

[' ', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ç']
