In [35]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
# Load the name/nationality table. The first CSV column is a saved row index
# (it shows up as "Unnamed: 0" when read as data), so use it as the index
# instead of carrying it along as a meaningless feature column.
df = pd.read_csv('name-nationality.csv', index_col=0)
df.head()

Unnamed: 0,name,nationality
0,Alain Connes,french
1,Arthur Schopenhauer,german
2,Alfred Nobel,swedish
3,A. E. van Vogt,canadian
4,Alfons Maria Jakob,german


In [36]:
# Label vocabulary: starts empty and is filled in by get_id as labels are seen.
nationality_to_id = {}


def get_id(x, d):
    """Return the integer id stored for ``x`` in ``d``, registering it if new.

    Args:
        x: Hashable key to look up.
        d: Dict mapping keys to integer ids. Mutated in place when ``x`` has
            not been seen before.

    Returns:
        The existing id of ``x``, or ``len(d)`` (measured before insertion)
        when ``x`` is added by this call.
    """
    # len(d) is evaluated before setdefault inserts, so a new key receives
    # the next unused id; an existing key keeps its stored id.
    return d.setdefault(x, len(d))

# Encode each row's nationality as an integer id; ids are assigned in order
# of first appearance while the column is traversed.
df['nationality_id'] = df['nationality'].apply(
    lambda label: get_id(label, nationality_to_id)
)

# Inverse lookup: position i holds the nationality whose id is i. Ids are
# the dense range 0..n-1, so ordering the labels by id gives that list.
id_to_nationality = sorted(nationality_to_id, key=nationality_to_id.get)

In [37]:
# Shuffle once with a fixed seed, then cut into 80% train / 10% dev / 10% test.
# Plain positional slices replace np.split on a DataFrame, which goes through
# the deprecated DataFrame.swapaxes path in newer pandas releases.
shuffled_df = df.sample(frac=1, random_state=42)
train_end = int(.8 * len(df))
dev_end = int(.9 * len(df))

train_df = shuffled_df.iloc[:train_end]
dev_df = shuffled_df.iloc[train_end:dev_end]
test_df = shuffled_df.iloc[dev_end:]

In [38]:
# Distinct characters across every name in the dataset (raw characters,
# no case folding is applied here).
vocab = {ch for full_name in df.name for ch in full_name}
vocab_size = len(vocab)
print("Saw %d unique characters in all names" % vocab_size)

Saw 205 unique characters in all names


In [39]:
# Character-level tokenizer, fitted on all names (train, dev and test alike).
tokenizer = Tokenizer(num_words=vocab_size, char_level=True)
tokenizer.fit_on_texts(df.name)

# Training inputs (one list of character indices per name) and integer labels.
x_train = tokenizer.texts_to_sequences(train_df.name)
y_train = train_df.nationality_id

# Fixed typo: this was assigned to `x_text`, leaving `x_test` undefined for
# the padding step that follows.
# NOTE(review): the held-out inputs are built from dev_df, not test_df —
# confirm which split is intended.
x_test = tokenizer.texts_to_sequences(dev_df.name)

In [40]:
# Length, in characters, of the longest name in the dataset.
max_len = max(map(len, df.name))
print(max_len)
def pad(name, l):
    """Right-pad ``name`` with spaces until it is at least ``l`` characters long.

    Args:
        name: The string to pad.
        l: Minimum length of the result.

    Returns:
        ``name`` followed by as many spaces as needed to reach length ``l``;
        ``name`` unchanged when it is already ``l`` characters or longer.
    """
    # str.ljust pads in a single step instead of appending one space per
    # loop iteration, and never truncates.
    return name.ljust(l)
# Space-pad every name in the frame to the common length max_len.
df.name = df.name.apply(pad, args=(max_len,))

83


In [41]:
# Distinct nationality labels; the size of this set is the number of classes.
nationalities = set(df['nationality'])
print(
    "Saw %d total nationalities in all names" % (len(nationalities),)
)

Saw 92 total nationalities in all names


In [42]:
# Bring every index sequence to the fixed length max_len by appending
# zeros on the right ('post' padding).
x_train = pad_sequences(x_train, maxlen=max_len, padding='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post')

In [43]:
# Character-level classifier: Embedding -> LSTM -> Dropout -> Dense.
embedding_dim = 10

model = Sequential()
# Each character index becomes an embedding_dim-sized vector.
model.add(layers.Embedding(input_dim=vocab_size,
                           output_dim=embedding_dim,
                           input_length=max_len))
# The LSTM reduces the whole character sequence to one 50-dimensional vector.
model.add(layers.LSTM(units=50))
model.add(layers.Dropout(0.2))
# One output unit per nationality. Each name has exactly one label, so the
# outputs must form a probability distribution over classes: softmax, not the
# per-unit sigmoid used previously, is the activation that pairs with
# sparse_categorical_crossentropy on integer class ids.
model.add(layers.Dense(len(nationalities), activation="softmax"))
model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=['accuracy'])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 83, 10)            2050      
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dropout_4 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 92)                4692      
Total params: 18,942
Trainable params: 18,942
Non-trainable params: 0
_________________________________________________________________


In [44]:
# Train for two epochs in mini-batches of 32, with progress output enabled.
model.fit(
    x_train,
    y_train,
    epochs=2,
    batch_size=32,
    verbose=True,
)

Epoch 1/2
 328/6000 [>.............................] - ETA: 1:56 - loss: 4.0203 - accuracy: 0.1691

KeyboardInterrupt: 

In [None]:
# Classify a single unseen name with the trained model.
new_name = ['Wei Xu']
seq = tokenizer.texts_to_sequences(new_name)

# Pad on the right ('post'), matching how the training sequences were padded;
# the pad_sequences default ('pre') would put the characters at the other end
# of the input window from what the model was trained on.
padded = pad_sequences(seq, padding='post', maxlen=max_len)

# argmax over the class axis replaces Sequential.predict_classes, which newer
# Keras/TensorFlow releases no longer provide.
pred = np.argmax(model.predict(padded), axis=-1)
print(id_to_nationality[pred[0]])