# Every Politician

In [184]:
import numpy as np
import pandas as pd
# Let pandas render up to 1000 rows before it truncates displayed output.
pd.options.display.max_rows = 1000
# Read the names table from the local CSV file.
names = pd.read_csv('data/every-politician/names.csv')

## Preview the Loaded Data

In [185]:
# Peek at ten randomly drawn rows of the raw table.
names.sample(10)

Unnamed: 0,id,name,country,legislature
32785,80019eef-f54b-4def-baa4-0b031629661d,فيليب مارتن,France,National-Assembly
106764,afe706e6-4089-47bc-912c-11a75c95f2b6,Tommy Sheridan,Scotland,Parliament
117813,32288e2a-302e-45af-95f9-40b2ff8854fa,Mona Jönsson,Sweden,Riksdag
101884,3c5d86e3-418f-4ec0-ad6d-79d7b211a3f6,Marian NEACŞU,Romania,Deputies
22512,a5ca90de-a1b0-4c78-b213-486cef282661,Ciro Alejandro Ramírez Cortés,Colombia,Representatives
69982,58b602d2-8456-4875-8718-58c7f766764d,みちした だいき,Japan,House-of-Representatives
43999,68792164-579b-436b-8206-aa1a4f355f39,Panayótis Kouroumplís,Greece,Parliament
146078,81e4094a-38f8-4726-95fe-d35000774e90,Дэвид Ву,United-States-of-America,House
12540,3c18cc91-5275-4495-b9c2-ac60d648a773,Tanju Kirjakow,Bulgaria,National-Assembly
2192,81d6c824-49a3-4448-8805-c51092b29caf,JOSE MARIA DIAZ BANCALARI,Argentina,Diputados


## Subset Countries

In [186]:
# Countries whose rows are kept for the classifier.
countries = [
    'China', 'Thailand', 'Russia', 'Brazil', 'Japan',
    'Ukraine', 'UK', 'Spain', 'Iceland',
]
# Keep only rows from those countries; reset_index() renumbers the rows and
# preserves the original row labels in a new 'index' column.
customer_names = names.loc[names['country'].isin(countries)].reset_index()

In [216]:
# Spot-check ten random rows of the filtered table.
customer_names.sample(10)

Unnamed: 0,index,id,name,country,legislature
10862,104178,1f980ed0-8566-4d03-8058-50ce80d40736,"Клинцевич, Франц Адамович",Russia,Duma
9980,103296,0d29247b-bf18-4bab-91d0-778f892526de,Valery Gazzaev,Russia,Duma
7184,69492,f8cde35f-6935-46a1-9d89-2dabe63bc9c7,あおやぎ よういちろう,Japan,House-of-Representatives
3840,21516,24123f81-eefe-465b-83b4-196b76353e13,苏文金,China,Congress
54,9790,ce18aeb6-612e-474a-9f40-eb749b11e1f2,AMAURI TEIXEIRA,Brazil,Deputies
15778,136698,14ac080a-231b-45f3-a9c6-b6ad05b0e683,Семенченко Семен Ігорович,Ukraine,Verkhovna-Rada
7431,69739,ef433dce-85ba-4002-ac8f-9a86394ceae6,さとう あきら,Japan,House-of-Representatives
16533,137533,a9e9e4f0-50db-46ab-948f-548eb9eb2455,Dan Jarvis,UK,Commons
13890,123647,ff06086d-c7f7-48b3-9f05-7d0f23bfd26d,Pongsapat Pongcharoen,Thailand,National-Legislative-Assembly
14925,135845,2625a3ca-0524-4cb1-959e-82e6eced1a3a,Yaroslav Vasilyevich Dubnevich,Ukraine,Verkhovna-Rada


In [215]:
# Rows per country, to see how balanced the classes are.
customer_names.country.value_counts()

UK          5593
Japan       3756
China       3435
Russia      3083
Ukraine     1898
Spain       1749
Brazil      1341
Iceland      502
Thailand     364
Name: country, dtype: int64

## Encode Names

In [189]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Activation, Dropout
from keras.layers import LSTM
from keras.utils import to_categorical
from keras.preprocessing.text import one_hot

In [190]:
from keras.preprocessing.text import Tokenizer
# char_level=True makes the tokenizer treat every character as a token.
char_tokenizer = Tokenizer(char_level=True)

In [191]:
# Build the character vocabulary from every name in the filtered table.
char_tokenizer.fit_on_texts(customer_names.name)

In [192]:
# Turn each name into a sequence of integer character indices.
customer_names_encoded = char_tokenizer.texts_to_sequences(customer_names.name)

## One Hot Encode Countries

In [193]:
# One indicator column per country; the column order defines the class indices.
# The dtype is pinned explicitly because pandas >= 2.0 returns boolean dummy
# columns by default, whereas the model's loss expects numeric 0/1 labels.
countries_one_hot_df = pd.get_dummies(customer_names.country, dtype='uint8')
# Plain NumPy array of shape (n_names, n_countries) used as the training target.
countries_one_hot = countries_one_hot_df.values

In [194]:
def get_country(country_one_hot):
    """Return the country name encoded by a one-hot (or probability) vector.

    Parameters
    ----------
    country_one_hot : sequence of numbers
        One entry per column of ``countries_one_hot_df``; the position of the
        largest entry selects the country.

    Returns
    -------
    str
        The label of the matching column of ``countries_one_hot_df``.
    """
    # argmax works for strict one-hot rows as well as for softmax outputs.
    return countries_one_hot_df.columns[int(np.argmax(country_one_hot))]

In [195]:
# Column position doubles as the class index; look up the country at index 3.
countries_one_hot_df.columns[3]

'Japan'

## Pad up to length of Longest Name

In [196]:
# Length of the longest encoded name; every sequence is padded to this length.
len_longest_name = max(map(len, customer_names_encoded))
# Pad all sequences to one common length so they form a rectangular array.
customer_names_encoded = sequence.pad_sequences(
    customer_names_encoded,
    maxlen=len_longest_name,
)

In [197]:
# Sanity check: after padding, even the shortest row has the padded length.
len(min(customer_names_encoded, key=len))

60

## Prepare X and y

In [198]:
# Features are the padded character sequences; targets are the one-hot countries.
X, y = customer_names_encoded, countries_one_hot

## Split into Train and Test Set

In [199]:
from sklearn.model_selection import train_test_split

In [200]:
 # Hold out 20% of the rows for testing; random_state fixes the shuffle so the split is repeatable.
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [201]:
# Number of rows in the training and test splits.
len(X_train), len(X_test)

(17376, 4345)

In [202]:
# Confirm that features and labels have matching row counts in each split.
print('x_train shape:', X_train.shape)
print('x_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

x_train shape: (17376, 60)
x_test shape: (4345, 60)
y_train shape: (17376, 9)
y_test shape: (4345, 9)


## Build Model

In [203]:
batch_size = 64
# The Keras Tokenizer numbers characters 1..len(word_index) and 0 is used for
# padding, so the embedding table needs len(word_index) + 1 rows. A fixed size
# of 256 is too small as soon as the names contain more than 255 distinct
# characters, which is likely with mixed Latin/Cyrillic/CJK names.
vocab_size = len(char_tokenizer.word_index) + 1
# Width of each learned character vector.
embedding_dim = 128
# Derive the number of classes from the labels instead of hard-coding it.
num_classes = y_train.shape[1]

# Character embedding -> single LSTM layer -> softmax over the countries.
model = Sequential()
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation numbers and the final test score use the same rows.
model.fit(X_train, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 17376 samples, validate on 4345 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test score: 0.7028344789089076
Test accuracy: 0.751668584552541


In [80]:
# Show only the first few names instead of dumping the whole column.
customer_names.name.head(20)

9736                             ABEL MESQUITA JR.
9737                          Abel Mesquita Júnior
9738                            ABELARDO CAMARINHA
9739                               ABELARDO LUPION
9740                                  ACELINO POPÓ
9741                                ADAIL CARNEIRO
9742                          ADALBERTO CAVALCANTI
9743                          ADELMO CARNEIRO LEÃO
9744                               ADELSON BARRETO
9745                                 ADEMIR CAMILO
9746                Ademir Camilo Prates Rodrigues
9747                              ADILTON SACHETTI
9748                                        ADRIAN
9749                                ADÉRMIS MARINI
9750                                Adílson Soares
9751                                AELTON FREITAS
9752                        Aelton José de Freitas
9753                       Afonso Antunes da Motta
9754                               AFONSO FLORENCE
9755                           

## Prediction Functions

In [204]:
def name_to_tensor(name:str):
    """Encode a single name the same way the training data was encoded.

    The name is converted to character indices with ``char_tokenizer`` and
    padded to ``len_longest_name``; the single padded row is returned.
    """
    encoded = char_tokenizer.texts_to_sequences([name])
    # pad_sequences works on a batch, so take the only row back out.
    return sequence.pad_sequences(encoded, maxlen=len_longest_name)[0]

In [205]:
def predict_class(input):
    """Return the index of the most probable class for one encoded name.

    Parameters
    ----------
    input : sequence of int
        A single padded character-index sequence (one row of ``X``). The
        parameter name shadows the builtin but is kept for compatibility.

    Returns
    -------
    int
        Position of the largest model output, i.e. the predicted class index.
    """
    # The model expects a batch, so wrap the single example in a batch of one.
    batch = np.array([input])
    # Sequential.predict_classes was removed from newer Keras releases;
    # taking the argmax over predict() is the equivalent for a softmax output.
    probabilities = model.predict(batch)
    return int(np.argmax(probabilities, axis=-1)[0])

In [206]:
# Smoke test: classify the first training example (returns a class index).
predict_class(X_train[0])

8

In [210]:
def predict_country(name:str):
    """Predict the country label for a raw name string.

    The name is encoded with ``name_to_tensor``, classified with
    ``predict_class``, and the resulting class index is mapped back to the
    matching column label of ``countries_one_hot_df``.
    """
    predicted_index = predict_class(name_to_tensor(name))
    return countries_one_hot_df.columns[predicted_index]

'China'

In [211]:
# Try a name written in Chinese characters.
predict_country('刘雪荣')

'China'

In [212]:
# Try a made-up Latin-script name.
predict_country('Dave Smith')

'UK'

In [213]:
# Try a name that also appears in the dataset listing shown earlier.
predict_country('ADELSON BARRETO')

'Brazil'