In [1]:
import numpy as np
import pandas as pd

In [2]:
from io import open
import glob
import os
import unicodedata
import string

# Character vocabulary: ASCII letters followed by space and a handful of
# punctuation characters.
all_letters = "".join((string.ascii_letters, " .,;'-"))
# One additional index (the last one) is reserved for the EOS marker.
n_letters = 1 + len(all_letters)

In [3]:
import torch
import torch.nn as nn

class RNN(nn.Module):
    """Category-conditioned character-level recurrent network.

    At every step the category vector, the current input vector and the
    previous hidden state are concatenated and fed through two linear layers
    (``i2h`` and ``i2o``); their results are concatenated again and mapped by
    ``o2o`` to the output size, followed by dropout and log-softmax.

    Args:
        input_size: Width of the per-step input vector.
        hidden_size: Width of the hidden state.
        output_size: Number of output classes.
        category_size: Width of the category vector. When omitted, the
            notebook-level global ``n_categories`` is read at construction
            time, which is what the original three-argument call relied on.
    """

    def __init__(self, input_size, hidden_size, output_size, category_size=None):
        super(RNN, self).__init__()
        if category_size is None:
            # Backward-compatible fallback for ``RNN(input, hidden, output)``:
            # the global must already be defined when the model is built.
            category_size = n_categories
        self.hidden_size = hidden_size
        self.category_size = category_size

        combined_size = category_size + input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        """Run one time step.

        Args:
            category: 2-D tensor whose second dimension is the category width.
            input: 2-D tensor whose second dimension is ``input_size``.
            hidden: 2-D tensor whose second dimension is ``hidden_size``.

        Returns:
            ``(output, hidden)``: log-probabilities over the output classes
            (log-softmax along dim 1) and the new hidden state.
        """
        input_combined = torch.cat((category, input, hidden), 1)
        hidden = self.i2h(input_combined)
        output = self.i2o(input_combined)
        output_combined = torch.cat((hidden, output), 1)
        output = self.o2o(output_combined)
        # Dropout is only active while the module is in training mode.
        output = self.dropout(output)
        output = self.softmax(output)
        return output, hidden

    def initHidden(self):
        """Return an all-zero initial hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1, self.hidden_size)

In [14]:
# Load the names dataset from a CSV file located relative to the notebook.
df = pd.read_csv("../db/names.csv")
# Show the distinct values of the 'race' column.
df['race'].unique()

array(['Altmer', 'Argonian', 'Bosmer', 'Breton', 'Dunmer', 'Imperial',
       'Khajit', 'Nord', 'Orc', 'Redguard'], dtype=object)

In [15]:
# Number of rows in the dataset.
df.shape[0]

22200

In [16]:
# Peek at the 'name' field of the first row.
df.iloc[0]['name']

'Coredalf'

In [17]:
# Distinct race labels; these become the categories used for training.
# NOTE(review): this repeats the expression already shown right after loading the CSV.
df['race'].unique()

array(['Altmer', 'Argonian', 'Bosmer', 'Breton', 'Dunmer', 'Imperial',
       'Khajit', 'Nord', 'Orc', 'Redguard'], dtype=object)

In [18]:
# Categories are the distinct races, in order of first appearance in `df`.
all_categories = list(df['race'].unique())

# Map every race to the list of its names, preserving row order. The keys are
# derived from the data instead of being typed out by hand, so a race that is
# present in the CSV can never raise KeyError below.
category_lines = {category: [] for category in all_categories}

# Walk the two columns directly rather than doing two `df.iloc[i]` lookups per
# row, each of which materialises a whole row Series.
for category, name in zip(df['race'], df['name']):
    category_lines[category].append(name)

n_categories = len(all_categories)

In [19]:
# Display the race -> list-of-names mapping.
# NOTE(review): this renders every name in the dataset; showing a short slice
# per race would keep the cell output readable.
category_lines

{'Altmer': ['Coredalf',
  'Corelas',
  'Corelian',
  'Corellon',
  'Coreman',
  'Coremon',
  'Corenar',
  'Corera',
  'Coreriil',
  'Coreril',
  'Coreron',
  'Coretar',
  'Corridalf',
  'Corrilas',
  'Corrilian',
  'Corrillon',
  'Corriman',
  'Corrimon',
  'Corrinar',
  'Corrira',
  'Corririil',
  'Corriril',
  'Corriron',
  'Corritar',
  'Cyredalf',
  'Cyrelas',
  'Cyrelian',
  'Cyrellon',
  'Cyreman',
  'Cyremon',
  'Cyrenar',
  'Cyrera',
  'Cyreriil',
  'Cyreril',
  'Cyreron',
  'Cyretar',
  'Gandalf',
  'Ganlas',
  'Ganlian',
  'Ganllon',
  'Ganman',
  'Ganmon',
  'Gannar',
  'Ganra',
  'Ganriil',
  'Ganril',
  'Ganron',
  'Gantar',
  'Kaladalf',
  'Kalalas',
  'Kalalian',
  'Kalallon',
  'Kalaman',
  'Kalamon',
  'Kalanar',
  'Kalara',
  'Kalariil',
  'Kalaril',
  'Kalaron',
  'Kalatar',
  'Kelkemmedalf',
  'Kelkemmelas',
  'Kelkemmelian',
  'Kelkemmellon',
  'Kelkemmeman',
  'Kelkemmemon',
  'Kelkemmenar',
  'Kelkemmera',
  'Kelkemmeriil',
  'Kelkemmeril',
  'Kelkemmeron',
  'Ke

In [20]:
# Sanity check: wrapping all_categories in list() yields a plain Python list.
type(list(all_categories))

list

In [21]:
import random

# Uniformly random element of an indexable sequence
def randomChoice(l):
    """Return one element of ``l`` picked uniformly at random."""
    position = random.randrange(len(l))
    return l[position]

# Draw a category at random, then a random line belonging to it
def randomTrainingPair():
    """Return a ``(category, line)`` pair chosen at random."""
    chosen_category = randomChoice(all_categories)
    return chosen_category, randomChoice(category_lines[chosen_category])

In [22]:
# Encode a category as a one-hot row vector
def categoryTensor(category):
    """Return a ``(1, n_categories)`` tensor with a 1 at the category's index."""
    one_hot = torch.zeros(1, n_categories)
    one_hot[0, all_categories.index(category)] = 1
    return one_hot

# One-hot matrix of first to last letters (not including EOS) for input
def inputTensor(line):
    """Encode ``line`` as a ``(len(line), 1, n_letters)`` one-hot tensor.

    Args:
        line: String whose characters must all occur in ``all_letters``.

    Returns:
        Float tensor with one one-hot row per character of ``line``.

    Raises:
        ValueError: If a character is not part of ``all_letters``.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        letter_index = all_letters.find(letter)
        if letter_index < 0:
            # str.find signals a miss with -1; used as an index that would
            # silently set the last slot, which is reserved for EOS.
            raise ValueError(
                "character %r at position %d of %r is not in all_letters"
                % (letter, position, line)
            )
        tensor[position][0][letter_index] = 1
    return tensor

# ``LongTensor`` of second letter to end (EOS) for target
def targetTensor(line):
    """Build the target class indices for ``line``.

    The targets are the indices (into ``all_letters``) of every character
    after the first, followed by the EOS index ``n_letters - 1``.

    Args:
        line: String whose characters after the first must all occur in
            ``all_letters``.

    Returns:
        1-D ``int64`` tensor of length ``max(len(line), 1)``.

    Raises:
        ValueError: If a character after the first is not in ``all_letters``.
    """
    letter_indexes = []
    for letter in line[1:]:
        letter_index = all_letters.find(letter)
        if letter_index < 0:
            # str.find returns -1 on a miss, which is not a valid class index.
            raise ValueError(
                "character %r of %r is not in all_letters" % (letter, line)
            )
        letter_indexes.append(letter_index)
    letter_indexes.append(n_letters - 1) # EOS
    return torch.tensor(letter_indexes, dtype=torch.long)

In [23]:
# Build the (category, input, target) tensors for one randomly drawn example
def randomTrainingExample():
    """Return ``(category_tensor, input_line_tensor, target_line_tensor)``."""
    category, line = randomTrainingPair()
    return categoryTensor(category), inputTensor(line), targetTensor(line)

In [24]:
# Negative log-likelihood loss; it pairs with the LogSoftmax output of RNN.forward.
criterion = nn.NLLLoss()

# Step size of the manual gradient-descent update applied in train().
learning_rate = 0.0005

def train(category_tensor, input_line_tensor, target_line_tensor):
    """Run one gradient-descent step on a single training example.

    Uses the module-level ``rnn``, ``criterion`` and ``learning_rate``.

    Args:
        category_tensor: Category encoding passed to ``rnn`` at every step.
        input_line_tensor: Tensor whose first dimension indexes time steps.
        target_line_tensor: 1-D tensor with one target class index per step.
            It is not modified.

    Returns:
        ``(output, mean_loss)``: the model output of the last step and the
        summed loss divided by the number of steps.

    Raises:
        ValueError: If ``input_line_tensor`` has no time steps.
    """
    n_steps = input_line_tensor.size(0)
    if n_steps == 0:
        # Without a step there is no output and nothing to backpropagate.
        raise ValueError("input_line_tensor must contain at least one time step")

    # Out-of-place unsqueeze: the caller's tensor keeps its shape, so the same
    # example can safely be passed to train() more than once.
    targets = target_line_tensor.unsqueeze(-1)
    hidden = rnn.initHidden()

    rnn.zero_grad()

    loss = torch.zeros(1)

    for i in range(n_steps):
        output, hidden = rnn(category_tensor, input_line_tensor[i], hidden)
        loss += criterion(output, targets[i])

    loss.backward()

    # Plain SGD update; no_grad keeps the update itself out of autograd
    # without reaching for the legacy ``.data`` attribute.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item() / n_steps

In [25]:
import time
import math

def timeSince(since):
    """Format the wall-clock time elapsed since ``since`` as ``'<m>m <s>s'``.

    ``since`` is a timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)

In [26]:
# Model: one-hot letters in, scores over letters (plus EOS) out, 128 hidden units.
rnn = RNN(n_letters, 128, n_letters)

n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0 # Reset every ``plot_every`` ``iters``

start = time.time()

# The loop variable is `iteration`, not `iter`: a module-level `iter` would
# shadow the builtin for every later cell of the notebook.
for iteration in range(1, n_iters + 1):
    output, loss = train(*randomTrainingExample())
    total_loss += loss

    # Progress line: elapsed time, iteration count, percent done, latest loss.
    if iteration % print_every == 0:
        print('%s (%d %d%%) %.4f' % (timeSince(start), iteration, iteration / n_iters * 100, loss))

    # Record the mean loss of the last `plot_every` iterations.
    if iteration % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0

0m 10s (5000 5%) 2.7710
0m 20s (10000 10%) 3.6281
0m 31s (15000 15%) 2.3235
0m 42s (20000 20%) 2.4783
0m 52s (25000 25%) 2.8476
1m 3s (30000 30%) 2.8594
1m 13s (35000 35%) 2.1642
1m 24s (40000 40%) 2.3653
1m 34s (45000 45%) 2.0282
1m 45s (50000 50%) 2.5857
1m 55s (55000 55%) 2.1739
2m 6s (60000 60%) 2.3062
2m 17s (65000 65%) 2.1633
2m 28s (70000 70%) 2.9871
2m 39s (75000 75%) 1.5931
2m 50s (80000 80%) 2.8785
3m 0s (85000 85%) 2.7542
3m 10s (90000 90%) 2.4445
3m 21s (95000 95%) 1.8548
3m 32s (100000 100%) 2.3436


In [29]:
# Upper bound on the number of sampling steps, i.e. letters appended after the start letter.
max_length = 20

# Sample from a category and starting letter
def sample(category, start_letter='A'):
    """Generate a name for ``category`` that begins with ``start_letter``.

    The most likely next symbol is appended greedily until the EOS index is
    predicted or ``max_length`` steps have been taken. Uses the module-level
    ``rnn``.

    Args:
        category: Category name accepted by ``categoryTensor``.
        start_letter: First letter of the generated name.

    Returns:
        The generated name as a string, including ``start_letter``.
    """
    # Sampling must run in eval mode, otherwise the dropout layer keeps
    # randomly zeroing scores. The previous mode is restored afterwards so a
    # training run that continues later is unaffected.
    was_training = rnn.training
    rnn.eval()
    try:
        with torch.no_grad():  # no need to track history in sampling
            category_tensor = categoryTensor(category)
            input = inputTensor(start_letter)
            hidden = rnn.initHidden()

            output_name = start_letter

            for _ in range(max_length):
                output, hidden = rnn(category_tensor, input[0], hidden)
                # Index of the highest-scoring symbol, as a plain int.
                next_index = output.topk(1)[1][0][0].item()
                if next_index == n_letters - 1:  # EOS
                    break
                letter = all_letters[next_index]
                output_name += letter
                input = inputTensor(letter)

            return output_name
    finally:
        rnn.train(was_training)

# Print one sampled name per starting letter for a single category
def samples(category, start_letters='ABC'):
    """Print ``sample(category, letter)`` for each letter of ``start_letters``."""
    # The generator is lazy, so each name is printed right after it is sampled.
    for generated_name in (sample(category, first) for first in start_letters):
        print(generated_name)

# Print one sample for each (race, starting letter) pair, in this order.
for race, initial in (
    ('Nord', 'S'),
    ('Imperial', 'Z'),
    ('Breton', 'S'),
    ('Orc', 'Z'),
):
    samples(race, initial)

Sarand
Zarinas
Seelen
Zagara
