In [83]:
import os
import pandas as pd
import requests as req
import json
import numpy as np
import re
from unidecode import unidecode
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import MinMaxScaler

# Maximum number of characters kept per name; also used as the padded
# sequence length and as the Embedding layer's input_length.
MAX_NAME_SIZE = 50
# Size of the integer code space for characters: passed as `n` to one_hot
# and as the Embedding layer's input dimension.
DICT_SIZE = 26

In [17]:
DATA_URL = 'https://api.got.show/api/characters/'

# Download the raw character list. A timeout keeps the cell from hanging
# forever on an unresponsive server, and raise_for_status() turns an HTTP
# error response into an exception instead of silently passing an error
# page on to the JSON parser.
response = req.get(DATA_URL, timeout=30)
response.raise_for_status()
data = response.content

In [21]:
# OPTIONAL: replace `data` with the contents of the local file
# data/characters.json instead of the downloaded payload.

characters_path = os.path.join('data', 'characters.json')
with open(characters_path, 'r') as json_file:
    data = json_file.read()

In [22]:
# Parse the JSON payload (a list of records) into a DataFrame and show
# how many rows were loaded.
df = pd.read_json(data, orient='records')
df.shape[0]

2028

In [23]:
# Keep only the columns used below: the name and the birth/death dates.
df = df.loc[:, ['name', 'dateOfBirth', 'dateOfDeath']]

In [24]:
# Keep only the rows that have a birth date, then show how many remain.
df_only_born = df[df['dateOfBirth'].notna()]
df_only_born.shape[0]

478

In [84]:
def preprocess_name(name):
    """Normalise a raw name into a compact string.

    The name is lower-cased, passed through ``unidecode``, stripped of every
    character matching the regex ``\\W`` and finally cut to at most
    ``MAX_NAME_SIZE`` characters.

    Args:
        name: the raw name string.

    Returns:
        The cleaned string, never longer than ``MAX_NAME_SIZE``.
    """
    ascii_lower = unidecode(name.lower())
    word_chars_only = re.sub(r'\W', '', ascii_lower)
    # Slicing leaves shorter strings untouched, so no length check is needed.
    return word_chars_only[:MAX_NAME_SIZE]

def preprocess_age(birth, death):
    """Compute the difference between a death and a birth value.

    Args:
        birth: numeric birth value.
        death: numeric death value, or NaN when there is none.

    Returns:
        ``-1`` when ``death`` is NaN, otherwise ``death - birth`` converted
        to ``int``.
    """
    no_death_recorded = np.isnan(death)
    return -1 if no_death_recorded else int(death - birth)

In [26]:
# Clean every name with preprocess_name.
df_names = df_only_born['name'].map(preprocess_name)

In [31]:
# One preprocess_age result per row, pairing each dateOfBirth value with
# the dateOfDeath value of the same row.
list_ages = [
    preprocess_age(birth, death)
    for birth, death in zip(df_only_born['dateOfBirth'], df_only_born['dateOfDeath'])
]

In [52]:
# Characters that receive an integer code; anything else is skipped.
_NAME_ALPHABET = 'abcdefghijklmnopqrstuvwxyz0123456789'


def encode_name(name, dict_size=None):
    """Encode a name as a list of integer character codes.

    The previous implementation relied on Keras ``one_hot``, which hashes
    tokens with Python's built-in ``hash``. String hashes are randomised per
    process, so the same name could get different codes after a kernel
    restart. This version uses a fixed alphabet lookup instead, so the
    encoding is reproducible.

    Codes stay in the range ``[1, dict_size - 1]``; ``0`` is never produced
    and therefore remains free for padding. When the alphabet is larger than
    ``dict_size - 1`` the codes wrap around, so some characters share a code
    (with ``dict_size == 26``, 'z' shares a code with 'a' and digits share
    codes with letters). A ``dict_size`` of at least 37 gives every
    character of the alphabet its own code.

    Args:
        name: the (already cleaned) name string. Matching is
            case-insensitive; characters outside a-z / 0-9 are ignored.
        dict_size: size of the code space. Defaults to the module-level
            ``DICT_SIZE``.

    Returns:
        A list of ints, one per encoded character, in input order.

    Raises:
        ValueError: if ``dict_size`` is smaller than 2.
    """
    if dict_size is None:
        dict_size = DICT_SIZE
    if dict_size < 2:
        raise ValueError('dict_size must be at least 2')

    n_codes = dict_size - 1
    codes = []
    for char in name.lower():
        position = _NAME_ALPHABET.find(char)
        if position >= 0:
            # Shift by one so that 0 is never used as a character code.
            codes.append(position % n_codes + 1)
    return codes

In [56]:
# Encode every cleaned name as a list of integer codes.
name_codes = [encode_name(name) for name in df_names]

# Bring all sequences to MAX_NAME_SIZE entries (padding added at the end)
# and show the resulting array shape.
encoded_names = np.array(
    pad_sequences(name_codes, maxlen=MAX_NAME_SIZE, padding='post')
)
encoded_names.shape

(478, 50)

In [57]:
# Fit a min-max scaler on the ages, presented as a single-feature column.
ages_column = [[age] for age in list_ages]
mm_scaler = MinMaxScaler()
scaler = mm_scaler.fit(ages_column)

In [58]:
# Scale the ages with the fitted scaler and show the resulting array shape.
ages_column = [[age] for age in list_ages]
ages_scaled = np.array(scaler.transform(ages_column))
ages_scaled.shape

(478, 1)

# Machine Learning

In [146]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten

# Character embedding -> flatten -> two sigmoid hidden layers -> one
# sigmoid output unit.
model = Sequential([
    Embedding(input_dim=DICT_SIZE, output_dim=50, input_length=MAX_NAME_SIZE),
    Flatten(),
    Dense(25, activation='sigmoid'),
    Dense(10, activation='sigmoid'),
    Dense(1, activation='sigmoid'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 50, 50)            1300      
_________________________________________________________________
flatten_6 (Flatten)          (None, 2500)              0         
_________________________________________________________________
dense_19 (Dense)             (None, 25)                62525     
_________________________________________________________________
dense_20 (Dense)             (None, 10)                260       
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 11        
Total params: 64,096
Trainable params: 64,096
Non-trainable params: 0
_________________________________________________________________


In [147]:
# The target is a continuous value scaled to [0, 1], not a class label, so
# 'accuracy' (which checks rounded predictions for equality with the target)
# carries no information here. Mean absolute error is reported instead.
# The loss is left unchanged.
model.compile(optimizer="adam", loss='binary_crossentropy', metrics=['mae'])

history = model.fit(x=encoded_names, y=ages_scaled, epochs=200, verbose=1, shuffle=True)

# TODO: no held-out split exists in this notebook, so the model is not
# evaluated on unseen data; add a train/test split before calling
# model.evaluate.

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/200
Epoch 154/200
Epoch 155/200
Epoch 15

Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [148]:
# Run a sample name through the same clean -> encode -> pad pipeline used
# for the training names, and show the resulting shape.
cleaned_name = preprocess_name('Vinicius Matheus Veríssimo da Silva')
my_name = pad_sequences([encode_name(cleaned_name)], maxlen=MAX_NAME_SIZE, padding='post')
my_name.shape

(1, 50)

In [149]:
# Predict the scaled age for the sample name.
res = model.predict(np.array(my_name))
print(res[0][0])
# inverse_transform expects a 2-D (n_samples, n_features) array, so pass the
# whole (1, 1) prediction array rather than the bare scalar res[0][0].
scaler.inverse_transform(res)

0.120866


array([[ 11.44918156]], dtype=float32)