## Classifying Names With A Char-level RNN


In [8]:
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import pprint

import unicodedata
import string

In [10]:
def find_files(path):
    """Return the list of filesystem paths matching the glob pattern *path*."""
    matches = glob.glob(path)
    return matches


pprint.pprint(find_files('./data/names/*.txt'))

['./data/names/Czech.txt',
 './data/names/German.txt',
 './data/names/Arabic.txt',
 './data/names/Japanese.txt',
 './data/names/Chinese.txt',
 './data/names/Vietnamese.txt',
 './data/names/Russian.txt',
 './data/names/French.txt',
 './data/names/Irish.txt',
 './data/names/English.txt',
 './data/names/Spanish.txt',
 './data/names/Greek.txt',
 './data/names/Italian.txt',
 './data/names/Portuguese.txt',
 './data/names/Scottish.txt',
 './data/names/Dutch.txt',
 './data/names/Korean.txt',
 './data/names/Polish.txt']


In [11]:
# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427

# Alphabet of characters kept after normalization: the ASCII letters plus
# the punctuation characters . , ; '
all_letters = string.ascii_letters + ".,;'"
n_letters = len(all_letters)


def unicode_to_ascii(s):
    """Return *s* with accents stripped and non-alphabet characters dropped.

    The string is NFD-normalized so that accented characters split into a
    base character followed by combining marks. Combining marks (Unicode
    category ``Mn``) are discarded, and of the remaining characters only
    those present in ``all_letters`` are kept.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        is_combining_mark = unicodedata.category(ch) == 'Mn'
        if not is_combining_mark and ch in all_letters:
            kept.append(ch)
    return ''.join(kept)


print(unicode_to_ascii('Ślusàrski'))

Slusarski


In [22]:
# Build the category_lines dictionary, a list of names per language

# Maps each category name to the list of lines read from that category's file.
category_lines = {}
# Category names, in the order their files are processed below.
all_categories = []

# Read a file and split it into ASCII-normalized lines
def read_lines(filename):
    """Return the lines of *filename* as a list of ASCII-normalized strings.

    The file is decoded as UTF-8, surrounding whitespace of the whole file
    content is stripped, the text is split on newlines, and every resulting
    line is passed through ``unicode_to_ascii``.
    """
    # Use a context manager so the file handle is closed as soon as the
    # content has been read, rather than being left open until garbage
    # collection.
    with open(filename, encoding='utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

# Register one category per data file, named after the file's base name
# without its extension, and store that file's lines under it.
for filename in find_files('./data/names/*.txt'):
    base_name = os.path.basename(filename)
    category, _extension = os.path.splitext(base_name)
    all_categories.append(category)
    lines = read_lines(filename)
    category_lines[category] = lines

# Number of categories collected by the loop above.
n_categories = len(all_categories)

In [23]:
all_categories

['Czech', 'German', 'Arabic', 'Japanese', 'Chinese', 'Vietnamese', 'Russian', 'French', 'Irish', 'English', 'Spanish', 'Greek', 'Italian', 'Portuguese', 'Scottish', 'Dutch', 'Korean', 'Polish']

In [26]:
print(category_lines['Italian'][:5])

['Abandonato', 'Abatangelo', 'Abatantuono', 'Abate', 'Abategiovanni']


## Turning Names into Tensors 

In [29]:
import torch

# Look up a letter's position in all_letters, e.g. "a" = 0
def letter_to_idx(letter):
    """Return the index of *letter* within ``all_letters``.

    The lookup uses ``str.find``, so a letter that does not occur in
    ``all_letters`` yields -1 rather than raising.
    """
    position = all_letters.find(letter)
    return position


# Demonstration helper: encode a single letter as a <1 x n_letters> Tensor
def letter_to_tensor(letter):
    """Return a ``1 x n_letters`` tensor of zeros with a 1 at the letter's index."""
    one_hot = torch.zeros(1, n_letters)
    one_hot[0, letter_to_idx(letter)] = 1
    return one_hot

# Turn a line into a <line_length x 1 x n_letters> Tensor,
# i.e. an array of one-hot letter vectors
def line_to_tensor(line):
    """Return a ``len(line) x 1 x n_letters`` tensor one-hot encoding *line*.

    Row ``i`` holds a 1 at the index of ``line[i]`` (as given by
    ``letter_to_idx``) and zeros elsewhere. An empty *line* yields a tensor
    whose first dimension is 0.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for li, letter in enumerate(line):
        tensor[li][0][letter_to_idx(letter)] = 1
    # Return only after the loop has finished: returning from inside the loop
    # would leave every row after the first all-zero and would return None
    # for an empty line.
    return tensor
    
print(letter_to_tensor('J'))
print(letter_to_tensor('J').size())

print(line_to_tensor('Jones').size())

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])
torch.Size([1, 56])
torch.Size([5, 1, 56])


##  Creating the Network

In [31]:
import torch.nn as nn

class RNN(nn.Module):
    """Single-step recurrent cell built from two linear layers.

    At each step the input and the previous hidden state are concatenated
    along dimension 1. One linear layer maps that to the next hidden state,
    another maps it to output scores, which are passed through a log-softmax
    over dimension 1.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers for the given input, hidden and output widths."""
        super(RNN, self).__init__()

        self.hidden_size = hidden_size

        # Both layers consume the concatenation of input and hidden state.
        combined_size = input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input, hidden):
        """Run one step; return ``(log-softmax output, next hidden state)``."""
        joined = torch.cat((input, hidden), 1)
        next_hidden = self.i2h(joined)
        log_probs = self.softmax(self.i2o(joined))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``1 x hidden_size``."""
        zero_state = torch.zeros(1, self.hidden_size)
        return zero_state
        
n_hidden = 128
rnn = RNN(n_letters, n_hidden, n_categories)

In [32]:
input = letter_to_tensor('A')
print(input)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])


In [33]:
hidden =torch.zeros(1, n_hidden)

In [34]:
hidden

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0.]])

In [35]:
output, next_hidden = rnn(input, hidden)

In [36]:
output

tensor([[-2.9472, -3.0144, -2.7960, -2.8999, -2.8677, -2.9812, -2.7809, -2.8793,
         -2.8915, -2.8886, -2.8537, -2.9157, -2.8375, -3.0177, -2.9163, -2.8292,
         -2.8307, -2.9181]], grad_fn=<LogSoftmaxBackward>)

In [37]:
next_hidden

tensor([[-0.0227, -0.1076, -0.1244,  0.0470,  0.0159,  0.0124, -0.0289, -0.0637,
         -0.0359,  0.0542, -0.0905,  0.1135,  0.0190,  0.0016,  0.0814, -0.0424,
         -0.0817,  0.0191, -0.0487, -0.0067, -0.0896,  0.0598, -0.0274,  0.0142,
          0.1197,  0.0250, -0.0072,  0.0242, -0.0550,  0.1133, -0.0900,  0.0053,
          0.0712, -0.0231,  0.1016,  0.0200,  0.0022,  0.0988,  0.0031, -0.0303,
         -0.0397, -0.0643,  0.0663,  0.0577, -0.0184,  0.0013, -0.0414,  0.0595,
          0.0002,  0.0727, -0.0673, -0.0741,  0.0399,  0.0151, -0.0201, -0.0422,
         -0.0108, -0.0420, -0.0687, -0.0226,  0.0632,  0.0475,  0.0972, -0.1391,
          0.0205, -0.0384, -0.1061,  0.0561,  0.0806,  0.0047,  0.0367,  0.0402,
          0.0047, -0.1035,  0.0223,  0.0152,  0.1089, -0.1121, -0.0460, -0.0449,
         -0.1033,  0.0364, -0.0190, -0.0611,  0.0409,  0.0905, -0.0449, -0.0313,
         -0.1265,  0.0260, -0.0826,  0.0053, -0.0254, -0.0492, -0.0629,  0.0052,
         -0.0230, -0.0934, -

In [39]:
input = line_to_tensor('Albert')
hidden = torch.zeros(1, n_hidden)

output, next_hidden = rnn(input[0], hidden)
print(output)

tensor([[-2.9472, -3.0144, -2.7960, -2.8999, -2.8677, -2.9812, -2.7809, -2.8793,
         -2.8915, -2.8886, -2.8537, -2.9157, -2.8375, -3.0177, -2.9163, -2.8292,
         -2.8307, -2.9181]], grad_fn=<LogSoftmaxBackward>)


## Training 