# Predicting Country from Names

In [2]:
import numpy as np
import pandas as pd
# Let pandas render up to 1000 rows before it starts truncating displayed frames.
pd.options.display.max_rows = 1000
names = pd.read_csv(filepath_or_buffer='data/every-politician/names.csv')

## Load Data

In [3]:
# Peek at ten randomly drawn rows of the raw table.
names.sample(n=10)

Unnamed: 0,id,name,country,legislature
24753,7424e432-77c5-40b9-b883-eb6f52fbfa15,Μαύρου Ελένη,Cyprus,House-of-Representatives
31502,a2e2e170-ed3d-4f27-baaa-5d2efa7970ed,Nadia Ramassamy,France,National-Assembly
41124,dc66046e-2d61-4650-ae45-c648cc7bdba4,모니카 그뤼터스,Germany,Bundestag
71855,b995d209-00e5-4b24-a4be-db667b0b8b11,"Нигматулин, Нурлан Зайруллаевич",Kazakhstan,Assembly
68434,f16fa51a-5871-44cb-acbc-c68d2dd0891d,Shigeki Kobayashi,Japan,House-of-Representatives
31602,ebe94d0a-a539-4b9a-926b-4c0d3af3a3c4,Patrick Labaune,France,National-Assembly
89866,9f92aedc-2a2b-4caf-9f6b-85a2809c5e25,Sin Pyong-kang,North-Korea,National-Assembly
30427,30a0f93b-e548-4ea5-8a4d-c43e594504ed,Bertrand Bouyx,France,National-Assembly
92506,73a0292f-2032-409d-863e-7715e1aca704,گرو هارلم بروندلاند,Norway,Storting
57768,a6e8e3d8-efd6-40b0-a072-1e0855440424,Niall Collins,Ireland,Dail


## Subset Countries

In [5]:
# Countries kept for the classification task; every other row is dropped.
countries = [
    'China', 'Thailand', 'Russia',
    'Brazil', 'Japan', 'Ukraine',
    'UK', 'Spain', 'Iceland',
]
# Re-number the surviving rows from 0 so positions and index labels agree.
customer_names = (
    names.loc[names['country'].isin(countries)]
    .reset_index(drop=True)
)

In [6]:
# Peek at ten randomly drawn rows of the country subset.
customer_names.sample(n=10)

Unnamed: 0,id,name,country,legislature
7320,859fc3c6-c8a4-428e-9ed0-db9677cbfc35,おぶち ゆうこ,Japan,House-of-Representatives
14449,433f8d8f-0879-4ecd-a088-d1c8f2b0bd84,Juhims Zvjahiļskis,Ukraine,Verkhovna-Rada
2561,c7505ece-1453-4b37-947d-655f6a8fe5ee,张忠民,China,Congress
5288,e67c0c7d-2986-4ed4-a73b-aae6b41edda2,AKABA Kazuyoshi,Japan,House-of-Representatives
8538,6cf1b73c-574f-4a67-8c92-fc14a4bb42a8,田所嘉徳,Japan,House-of-Representatives
4860,956f67aa-9fba-4a05-83b3-eeb850731b3f,Guðjón Arnar Kristjánsson,Iceland,Assembly
17322,27352be9-c266-498d-8303-f3ce61c30d59,Lynda Clark,UK,Commons
20133,14573aa9-0e99-4ab8-b7ba-5443e59df132,スティーブン・メトカーフ,UK,Commons
8423,e8842709-b508-4404-94b4-51f2d6124f24,武部新,Japan,House-of-Representatives
11880,8af09397-3e3e-433d-ad38-afca040677d5,نيكولاي فالويف,Russia,Duma


In [215]:
# Number of names per country; shows how unevenly the classes are represented.
customer_names['country'].value_counts()

UK          5593
Japan       3756
China       3435
Russia      3083
Ukraine     1898
Spain       1749
Brazil      1341
Iceland      502
Thailand     364
Name: country, dtype: int64

## Encode Names

In [7]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, Dropout
from keras.layers import LSTM
from keras.utils import to_categorical
from keras.preprocessing.text import one_hot

Using TensorFlow backend.


In [8]:
from keras.preprocessing.text import Tokenizer
# Tokenize at character level: each character of a name becomes one token id.
char_tokenizer = Tokenizer(char_level=True)

In [9]:
# Build the character vocabulary from every name in the subset.
char_tokenizer.fit_on_texts(customer_names['name'])

In [10]:
# Turn each name into its list of character ids.
customer_names_encoded = char_tokenizer.texts_to_sequences(
    customer_names['name']
)

In [12]:
# Show two encoded names (rows 0 and 20) before padding.
tuple(customer_names_encoded[row] for row in (0, 20))

([2, 35, 6, 12, 1, 18, 6, 10, 110, 19, 3, 15, 2, 1, 30, 5, 312],
 [2, 48, 4, 7, 10, 4, 1, 18, 4, 15, 15, 2])

In [14]:
# Length of the longest encoded name; every sequence is padded to this length.
len_longest_name = max(map(len, customer_names_encoded))
customer_names_encoded = sequence.pad_sequences(
    customer_names_encoded,
    maxlen=len_longest_name,
)

In [15]:
# Show the same two names (rows 0 and 20) after padding.
tuple(customer_names_encoded[row] for row in (0, 20))

(array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   2,  35,   6,  12,   1,  18,   6,  10, 110,
         19,   3,  15,   2,   1,  30,   5, 312]),
 array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  2, 48,  4,
         7, 10,  4,  1, 18,  4, 15, 15,  2]))

## One Hot Encode Countries

In [17]:
# One indicator column per country; the frame keeps the column labels so a
# class index can be mapped back to a country name later.
countries_one_hot_df = pd.get_dummies(customer_names['country'])
# Plain array view of the indicators, used as the training target.
countries_one_hot = countries_one_hot_df.values

In [21]:
# Peek at ten randomly drawn one-hot rows.
countries_one_hot_df.sample(n=10)

Unnamed: 0,Brazil,China,Iceland,Japan,Russia,Spain,Thailand,UK,Ukraine
5353,0,0,0,1,0,0,0,0,0
4976,0,0,1,0,0,0,0,0,0
8120,0,0,0,1,0,0,0,0,0
19194,0,0,0,0,0,0,0,1,0
3334,0,1,0,0,0,0,0,0,0
3392,0,1,0,0,0,0,0,0,0
16334,0,0,0,0,0,0,0,1,0
8443,0,0,0,1,0,0,0,0,0
13838,0,0,0,0,0,1,0,0,0
6109,0,0,0,1,0,0,0,0,0


In [22]:
def get_country(country_one_hot, columns=None):
    """Return the country label selected by a one-hot (or score) vector.

    The previous implementation ignored its argument and returned nothing.
    This version decodes the vector it is given.

    Parameters
    ----------
    country_one_hot : sequence of numbers
        A 1-D vector with one entry per country column. It may be a strict
        one-hot row or a vector of class scores; the position of the largest
        entry is decoded (the first one wins on ties).
    columns : sequence of labels, optional
        Labels aligned with ``country_one_hot``. Defaults to the columns of
        ``countries_one_hot_df``.

    Returns
    -------
    The label found at the position of the largest entry.

    Raises
    ------
    ValueError
        If the vector is empty or its length does not match ``columns``.
    """
    if columns is None:
        # Looked up lazily so the function follows the current one-hot frame.
        columns = countries_one_hot_df.columns

    scores = list(country_one_hot)
    if not scores or len(scores) != len(columns):
        raise ValueError(
            'expected a vector of length {}, got length {}'.format(
                len(columns), len(scores)))

    best_position = max(range(len(scores)), key=scores.__getitem__)
    return columns[best_position]

In [23]:
# The column labels map a class index back to its country name.
countries_one_hot_df.columns[3]

'Japan'

## Verify Padding to Length of Longest Name

In [24]:
# The shortest row should now be as long as the longest original name.
min(map(len, customer_names_encoded))

60

## Prepare X and y

In [25]:
# Features are the padded character ids; targets are the one-hot country rows.
X, y = customer_names_encoded, countries_one_hot

## Split into Train and Test Set

In [26]:
from sklearn.model_selection import train_test_split

In [27]:
 # Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, random_state=42, test_size=0.2)

In [28]:
# Number of samples in each split.
X_train.shape[0], X_test.shape[0]

(17376, 4345)

In [29]:
# Report the shape of every split, one per line.
print(
    *(
        '{} {}'.format(caption, split.shape)
        for caption, split in (
            ('x_train shape:', X_train),
            ('x_test shape:', X_test),
            ('y_train shape:', y_train),
            ('y_test shape:', y_test),
        )
    ),
    sep='\n'
)

x_train shape: (17376, 60)
x_test shape: (4345, 60)
y_train shape: (17376, 9)
y_test shape: (4345, 9)


## Build Model

In [30]:
batch_size = 64
# Number of rows in the embedding lookup table (the Embedding layer's input_dim).
# It must be larger than the biggest token id the tokenizer can emit. Token ids
# start at 1 and 0 is the padding value, hence the +1. The former fixed value of
# 256 was smaller than ids that already occur in the encoded names (e.g. 312),
# so those characters were looked up outside the table.
embedding_size = len(char_tokenizer.word_index) + 1
# One output unit per one-hot country column.
num_classes = y_train.shape[1]

model = Sequential()
# Map every character id to a 128-dimensional vector.
model.add(Embedding(embedding_size, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.5))
# Softmax over the countries, matching the categorical cross-entropy loss below.
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# NOTE(review): the test split is also used as validation data here, so the
# per-epoch validation numbers and the final test score come from the same rows.
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Train...
Instructions for updating:
Use tf.cast instead.
Train on 17376 samples, validate on 4345 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 0.7084899139623785
Test accuracy: 0.7620253162362084


In [32]:
# Persist the trained model (architecture, weights and optimizer state) to disk.
model.save(filepath='models/NameToCountryModelKeras.mdl')

## Prediction Functions

In [204]:
def name_to_tensor(name:str):
    """Encode one name exactly like the training data was encoded.

    The name is converted to character ids with ``char_tokenizer`` and then
    padded to ``len_longest_name`` entries.

    Parameters
    ----------
    name : str
        The name to encode.

    Returns
    -------
    A 1-D array of ``len_longest_name`` character ids.
    """
    # The tokenizer and pad_sequences both work on batches, so wrap the single
    # name in a one-element list and unwrap the single row afterwards.
    padded_batch = sequence.pad_sequences(
        char_tokenizer.texts_to_sequences([name]),
        maxlen=len_longest_name,
    )
    return padded_batch[0]

In [205]:
def predict_class(input):
    """Return the index of the most probable class for one encoded sample.

    Parameters
    ----------
    input : array-like
        A single encoded sample. It is wrapped in a batch of size one before
        being handed to ``model.predict``. The parameter name shadows the
        builtin but is kept so existing keyword callers keep working.

    Returns
    -------
    The integer index of the output unit with the highest score.
    """
    batch = np.array([input])
    # Taking the argmax over the last axis is what Sequential.predict_classes
    # did for a multi-unit softmax output. That helper no longer exists in
    # newer Keras/TensorFlow releases, whereas predict + argmax works everywhere.
    class_scores = model.predict(batch, verbose=0)
    return np.argmax(class_scores, axis=-1)[0]

In [206]:
# Sanity check: classify the first training sample.
predict_class(X_train[0])

8

In [210]:
def predict_country(name:str):
    """Predict the country label for a raw name string.

    The name is encoded with ``name_to_tensor``, classified with
    ``predict_class``, and the resulting class index is translated back to a
    label through the columns of ``countries_one_hot_df``.

    Parameters
    ----------
    name : str
        The name to classify.

    Returns
    -------
    The label of the predicted country column.
    """
    class_index = predict_class(name_to_tensor(name))
    return countries_one_hot_df.columns[class_index]

'China'

In [211]:
# Example: a name written in Chinese characters.
predict_country('刘雪荣')

'China'

In [212]:
# Example: a Latin-script name in mixed case.
predict_country('Dave Smith')

'UK'

In [213]:
# Example: a Latin-script name written in upper case.
predict_country('ADELSON BARRETO')

'Brazil'