Data preparation for character-level name classification, following the PyTorch tutorial: http://pytorch.org/tutorials/intermediate/char_rnn_classification_tutorial.html

In [98]:
from __future__ import unicode_literals, print_function, division

import glob
import os
import string
import unicodedata
from io import open
from itertools import chain

import pandas as pd
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# List the per-language name files. glob returns matches in a
# filesystem-dependent order, so sort for a reproducible listing.
sorted(glob.glob('data/names/*.txt'))

['data/names/Arabic.txt',
 'data/names/Chinese.txt',
 'data/names/Czech.txt',
 'data/names/Dutch.txt',
 'data/names/English.txt',
 'data/names/French.txt',
 'data/names/German.txt',
 'data/names/Greek.txt',
 'data/names/Irish.txt',
 'data/names/Italian.txt',
 'data/names/Japanese.txt',
 'data/names/Korean.txt',
 'data/names/Polish.txt',
 'data/names/Portuguese.txt',
 'data/names/Russian.txt',
 'data/names/Scottish.txt',
 'data/names/Spanish.txt',
 'data/names/Vietnamese.txt']

In [69]:
# Load every per-language file into one frame with two columns:
# 'name' (one row per line of the file) and 'lang' (file name without extension).
name_files = sorted(glob.glob('data/names/*.txt'))  # sorted: glob order is filesystem-dependent
if not name_files:
    # Without this guard pd.concat([]) fails with an opaque
    # "No objects to concatenate" message.
    raise ValueError("no name files matched 'data/names/*.txt'")

frames = []
for fn in name_files:
    # keep_default_na=False: a name such as "NA" or "null" must stay a string
    # instead of being parsed as a missing value (NaN is a float and would
    # break the string processing applied to this column later).
    frame = pd.read_csv(fn, header=None, names=['name'], keep_default_na=False)
    # os.path copes with both '/' and '\\' separators, unlike fn.split('/').
    frame['lang'] = os.path.splitext(os.path.basename(fn))[0]
    frames.append(frame)

# The per-file row index is kept as-is (it restarts at 0 for every language).
names = pd.concat(frames)

In [70]:
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427

# Characters allowed to survive the conversion. This name was never defined in
# the notebook, so the function only worked off leftover kernel state and
# raised NameError on a fresh kernel. The value is the one used by the PyTorch
# tutorial this notebook follows.
# NOTE(review): confirm this is the set the saved outputs were produced with
# (they show only letters, space and apostrophe surviving).
all_letters = string.ascii_letters + " .,;'"


def unicodeToAscii(s):
    """Return `s` reduced to the characters listed in `all_letters`.

    The string is NFD-normalized, which splits an accented letter into its
    base letter followed by combining marks. Combining marks (Unicode
    category 'Mn') are dropped, as is any other character that is not in
    `all_letters`.

    Args:
        s: Unicode text to convert.

    Returns:
        A new string containing only characters from `all_letters`; empty if
        nothing survives.
    """
    decomposed = unicodedata.normalize('NFD', s)
    return ''.join(
        c for c in decomposed
        if unicodedata.category(c) != 'Mn' and c in all_letters
    )

unicodeToAscii('Ślusàrski')

'Slusarski'

In [82]:
# Normalize every entry of the 'name' column to plain ASCII with
# unicodeToAscii, then display the updated frame.
names['name'] = list(map(unicodeToAscii, names['name']))
names

Unnamed: 0,name,lang
0,Khoury,Arabic
1,Nahas,Arabic
2,Daher,Arabic
3,Gerges,Arabic
4,Nazari,Arabic
5,Maalouf,Arabic
6,Gerges,Arabic
7,Naifeh,Arabic
8,Guirguis,Arabic
9,Baba,Arabic


In [83]:
# Inputs are the name strings; targets are the language label of each name.
X = names['name']
y = names['lang']

In [85]:
# Separate input strings into (lower-cased) characters.
# Lower-casing is done per character so the cell can be re-run safely: the
# previous `list(x.lower())` raised AttributeError on a second run, because by
# then X already held lists rather than strings. Per-character lower-casing
# matches whole-string lower-casing for the ASCII-only names used here.
X = [[c.lower() for c in x] for x in X]
X[:10]

[['k', 'h', 'o', 'u', 'r', 'y'],
 ['n', 'a', 'h', 'a', 's'],
 ['d', 'a', 'h', 'e', 'r'],
 ['g', 'e', 'r', 'g', 'e', 's'],
 ['n', 'a', 'z', 'a', 'r', 'i'],
 ['m', 'a', 'a', 'l', 'o', 'u', 'f'],
 ['g', 'e', 'r', 'g', 'e', 's'],
 ['n', 'a', 'i', 'f', 'e', 'h'],
 ['g', 'u', 'i', 'r', 'g', 'u', 'i', 's'],
 ['b', 'a', 'b', 'a']]

In [93]:
# Build the vocabulary: every distinct character found in X, in sorted order,
# preceded by the padding token 'PAD' so that it lands on index 0.
chars = ['PAD'] + sorted(set(chain.from_iterable(X)))

# Lookup tables in both directions between vocabulary entries and indexes.
ix_to_char = dict(enumerate(chars))
char_to_ix = {c: i for i, c in ix_to_char.items()}

In [94]:
# Display the character -> index table ('PAD' is the entry mapped to 0).
char_to_ix

{' ': 1,
 "'": 2,
 'PAD': 0,
 'a': 3,
 'b': 4,
 'c': 5,
 'd': 6,
 'e': 7,
 'f': 8,
 'g': 9,
 'h': 10,
 'i': 11,
 'j': 12,
 'k': 13,
 'l': 14,
 'm': 15,
 'n': 16,
 'o': 17,
 'p': 18,
 'q': 19,
 'r': 20,
 's': 21,
 't': 22,
 'u': 23,
 'v': 24,
 'w': 25,
 'x': 26,
 'y': 27,
 'z': 28}

In [95]:
# Display the inverse index -> character table.
ix_to_char

{0: 'PAD',
 1: ' ',
 2: "'",
 3: 'a',
 4: 'b',
 5: 'c',
 6: 'd',
 7: 'e',
 8: 'f',
 9: 'g',
 10: 'h',
 11: 'i',
 12: 'j',
 13: 'k',
 14: 'l',
 15: 'm',
 16: 'n',
 17: 'o',
 18: 'p',
 19: 'q',
 20: 'r',
 21: 's',
 22: 't',
 23: 'u',
 24: 'v',
 25: 'w',
 26: 'x',
 27: 'y',
 28: 'z'}

In [96]:
# Encode each name as the list of vocabulary indexes of its characters.
X_ix = [
    [char_to_ix[ch] for ch in name_chars]
    for name_chars in X
]
X_ix[:10]

[[13, 10, 17, 23, 20, 27],
 [16, 3, 10, 3, 21],
 [6, 3, 10, 7, 20],
 [9, 7, 20, 9, 7, 21],
 [16, 3, 28, 3, 20, 11],
 [15, 3, 3, 14, 17, 23, 8],
 [9, 7, 20, 9, 7, 21],
 [16, 3, 11, 8, 7, 10],
 [9, 23, 11, 20, 9, 23, 11, 21],
 [4, 3, 4, 3]]

In [97]:
# Pad to the same length: the target length is that of the longest encoded name.
max_len = max(map(len, X_ix))
max_len

19

In [99]:
# Bring every index sequence to max_len entries, padding at the end
# ('post'); the saved output shows the filler value is 0, the 'PAD' index.
X_ix = pad_sequences(
    X_ix,
    maxlen=max_len,
    padding='post',
)
X_ix[:10]

array([[13, 10, 17, 23, 20, 27,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [16,  3, 10,  3, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 6,  3, 10,  7, 20,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 9,  7, 20,  9,  7, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [16,  3, 28,  3, 20, 11,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [15,  3,  3, 14, 17, 23,  8,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 9,  7, 20,  9,  7, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [16,  3, 11,  8,  7, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 9, 23, 11, 20,  9, 23, 11, 21,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [ 4,  3,  4,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]], dtype=int32)