In [2]:
import io
import json
import os
import re

import numpy as np
import pandas as pd
import requests as req
from unidecode import unidecode

# Maximum number of characters kept per cleaned name; also used as the fixed
# length of every encoded name and as the model's input length.
MAX_NAME_SIZE = 50

In [3]:
# Endpoint the character records are downloaded from.
DATA_URL = 'https://api.got.show/api/characters/'
# Seconds to wait for the server; without a timeout `requests` can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT_S = 30

response = req.get(DATA_URL, timeout=REQUEST_TIMEOUT_S)
# Fail loudly on 4xx/5xx instead of handing an error body to the JSON parser.
response.raise_for_status()
# Raw response body (bytes); parsed into a DataFrame in a later cell.
data = response.content

In [4]:
# Parse the downloaded payload into one row per record. The bytes are wrapped
# in a file-like object because `read_json` documents paths and file-like
# objects as its inputs; passing literal JSON directly is deprecated in
# recent pandas releases.
df = pd.read_json(io.BytesIO(data), orient='records')
len(df)

2028

In [5]:
# Narrow the frame to the three fields used below: the name plus the
# birth and death values.
df = df.loc[:, ['name', 'dateOfBirth', 'dateOfDeath']]

In [6]:
# Keep only the rows whose birth value is present.
df_only_born = df[df['dateOfBirth'].notnull()]
len(df_only_born)

478

In [7]:
def build_dict(names):
    """Return the sorted list of distinct characters used across ``names``.

    Parameters
    ----------
    names : iterable of str
        Strings whose characters make up the alphabet.

    Returns
    -------
    list of str
        Every character that occurs in at least one name, each listed once,
        in ascending order.
    """
    alphabet = {letter for name in names for letter in name}
    return sorted(alphabet)

def preprocess_name(name):
    """Normalise a raw name into the compact form used for encoding.

    The name is lower-cased, transliterated to ASCII with ``unidecode``,
    stripped of every character matched by ``\\W`` (so letters, digits and
    underscores survive), and cut to at most ``MAX_NAME_SIZE`` characters.

    Parameters
    ----------
    name : str
        Raw name.

    Returns
    -------
    str
        The cleaned name, no longer than ``MAX_NAME_SIZE``.
    """
    ascii_lower = unidecode(name.lower())
    cleaned = re.sub(r'\W', '', ascii_lower)
    # Slicing leaves shorter strings untouched, so no length check is needed.
    return cleaned[:MAX_NAME_SIZE]

def preprocess_age(birth, death):
    """Return the difference between ``death`` and ``birth`` as an int.

    Parameters
    ----------
    birth : float
        Birth value (units are not shown here; presumably a year - TODO confirm).
    death : float
        Death value on the same scale, or NaN when it is missing.

    Returns
    -------
    int
        ``-1`` as a sentinel when ``death`` is NaN; otherwise ``death - birth``
        truncated towards zero by ``int``.
    """
    return -1 if np.isnan(death) else int(death - birth)

def names_to_letters_list(names):
    """Split every name into a list of its individual characters.

    Parameters
    ----------
    names : iterable of str
        Names to split.

    Returns
    -------
    list of list of str
        One inner list per name, in input order.
    """
    return [list(name) for name in names]

In [8]:
# Clean every name of the characters with a known birth value.
df_names = df_only_born['name'].map(preprocess_name)

In [9]:
# Sorted alphabet of every character that occurs in the cleaned names; a
# character's position in this list is its integer code.
LETTERS_DICT = build_dict(df_names.tolist())

In [10]:
# One age per row, paired positionally from the birth and death columns.
# Rows without a death value get the -1 sentinel from preprocess_age.
list_ages = [
    preprocess_age(birth, death)
    for birth, death in zip(df_only_born['dateOfBirth'], df_only_born['dateOfDeath'])
]

In [11]:
def onehot_encoder(name):
    """One-hot encode ``name`` into a fixed-size integer matrix.

    Parameters
    ----------
    name : str
        Cleaned name. Every character must be present in ``LETTERS_DICT`` and
        the name must not be longer than ``MAX_NAME_SIZE``.

    Returns
    -------
    numpy.ndarray
        Integer array of shape ``(MAX_NAME_SIZE, len(LETTERS_DICT))``. Row
        ``i`` has a single 1 in the column of the i-th character; rows past
        the end of the name stay all zeros.

    Raises
    ------
    ValueError
        If a character is not in ``LETTERS_DICT``.
    IndexError
        If the name has more than ``MAX_NAME_SIZE`` characters.
    """
    n_symbols = len(LETTERS_DICT)
    matrix = np.zeros((MAX_NAME_SIZE, n_symbols), dtype='int')
    for position, symbol in enumerate(name):
        matrix[position, LETTERS_DICT.index(symbol)] = 1
    return matrix

In [12]:
# One-hot encode every cleaned name and stack the matrices into one array
# whose first axis runs over the names.
onehot_names = np.array([onehot_encoder(name) for name in df_names])
onehot_names.shape

(478, 50, 26)

In [70]:
def encode_vect(name):
    """Encode ``name`` as a fixed-length vector of alphabet indices.

    Parameters
    ----------
    name : str
        Cleaned name. Every character must be present in ``LETTERS_DICT`` and
        the name must not be longer than ``MAX_NAME_SIZE``.

    Returns
    -------
    numpy.ndarray
        Integer array of length ``MAX_NAME_SIZE``. Position ``i`` holds the
        index of the i-th character in ``LETTERS_DICT``; positions past the
        end of the name stay 0.

    Raises
    ------
    ValueError
        If a character is not in ``LETTERS_DICT``.
    IndexError
        If the name has more than ``MAX_NAME_SIZE`` characters.
    """
    # NOTE(review): 0 is both the padding value and the index of the first
    # entry of LETTERS_DICT, so padding cannot be told apart from that
    # character. Reserving index 0 would also require a larger embedding
    # vocabulary wherever these vectors are consumed.
    codes = np.zeros((MAX_NAME_SIZE), dtype='int')
    for position, symbol in enumerate(name):
        codes[position] = LETTERS_DICT.index(symbol)
    return codes

In [71]:
# Index-encode every cleaned name and stack the vectors into a 2-D array
# with one row per name.
encoded_names = np.array([encode_vect(name) for name in df_names])
encoded_names.shape

(478, 50)

In [13]:
from sklearn.preprocessing import MinMaxScaler

# Fit a min-max scaler on the ages, presented as a single-feature column.
# NOTE(review): list_ages contains the -1 sentinel for rows without a death
# value, so the sentinel takes part in the fitted range.
mm_scaler = MinMaxScaler()
scaler = mm_scaler.fit(np.reshape(list_ages, (-1, 1)))

In [28]:
# Apply the fitted scaler to the same single-feature column of ages; the
# result keeps one row per name and one column.
ages_scaled = np.asarray(scaler.transform(np.reshape(list_ages, (-1, 1))))
ages_scaled.shape

(478, 1)

# Machine Learning

In [107]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

# Feed-forward network over learned character embeddings: each of the
# MAX_NAME_SIZE positions is mapped to a 30-dimensional vector, the sequence
# is flattened, and two sigmoid hidden layers feed a single sigmoid output.
model = Sequential([
    Embedding(len(LETTERS_DICT), 30, input_length=MAX_NAME_SIZE),
    Flatten(),
    Dense(units=20, activation='sigmoid'),
    Dense(units=10, activation='sigmoid'),
    Dense(units=1, activation='sigmoid'),
])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 50, 30)            780       
_________________________________________________________________
flatten_16 (Flatten)         (None, 1500)              0         
_________________________________________________________________
dense_38 (Dense)             (None, 20)                30020     
_________________________________________________________________
dense_39 (Dense)             (None, 10)                210       
_________________________________________________________________
dense_40 (Dense)             (None, 1)                 11        
Total params: 31,021
Trainable params: 31,021
Non-trainable params: 0
_________________________________________________________________


In [110]:
# The target is a continuous value in [0, 1] (the min-max scaled age), so
# this is a regression fit: optimise mean squared error and report mean
# absolute error. 'accuracy' compares thresholded predictions with the
# labels, which says nothing useful about non-binary targets.
model.compile(optimizer="adam", loss='mean_squared_error', metrics=['mae'])

history = model.fit(x=encoded_names, y=ages_scaled, epochs=100, verbose=1, shuffle=True)

# TODO: hold out a test split and score it with model.evaluate; the model is
# currently fit on every sample, so there is no generalisation estimate.

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [131]:
# Run a new name through the same cleaning and index encoding that was
# applied to the training names.
my_name = encode_vect(preprocess_name('Vinícius'))
my_name

array([21,  8, 13,  8,  2,  8, 20, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [132]:
# predict() works on batches, so the single encoded name is wrapped in an
# array; the result has one row per input and one column (the output unit).
res = model.predict(np.array([my_name]))
print(res[0][0])
# Map the scaled prediction back to the original age scale. The scaler
# operates on 2-D (n_samples, n_features) arrays, so the whole batch is
# passed rather than the bare scalar.
scaler.inverse_transform(res)

0.0540818


array([[ 4.57042074]], dtype=float32)