# Dinosaur Names Generator with RNNs 

First let's import the libraries we'll use:

In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

In [2]:
dino_file = open('datasets/dino.txt', 'r')
dinos = dino_file.read()
dino_file.close()
print(dinos[:99])

aachenosaurus
aardonyx
abelisaurus
abrictosaurus
abrosaurus
abydosaurus
acantholipan
acanthopholis



## Data Preprocessing:

In [3]:
dino_list = dinos.split('\n')
print(dino_list[3:6])
print(f'The dataset contains {len(dino_list)} exemples')

['abrictosaurus', 'abrosaurus', 'abydosaurus']
The dataset contains 1533 exemples


First let's see all the chars that figures in the names:

In [4]:
all_chars = []
for dino in dino_list:
    splitted = list(dino)
    for c in splitted:
        if c not in all_chars:
            all_chars.append(c)
print(all_chars)

['a', 'c', 'h', 'e', 'n', 'o', 's', 'u', 'r', 'd', 'y', 'x', 'b', 'l', 'i', 't', 'p', 'v', 'm', 'k', 'g', 'f', 'j', 'w', 'z', 'q', '_']


So in addition to alphabet chars, the underscore '_' does also figure in and should be counted later, next we'll append each training sample with a start and end character padding to a maximum length:

First let's see the longest dinosaur name in our dataset:

In [5]:
max_len = 0
for dino in dino_list:
    lng = len(dino)
    if lng > max_len:
        max_len = lng
print(f'longest name contains {max_len} characters')

longest name contains 23 characters


Now let's pad the names and add the start/end chars:
But first let's create a simple padding function:

In [6]:
def padname(name, pad_char, maxl):
    to_add = maxl - len(name)
    if to_add <= 0:
        return name
    to_append = ''.join([pad_char] * to_add)
    return name + to_append

padname('Dino', '$', 7)

'Dino$$$'

In [7]:
start_char = '~'
end_char = '$'
dino_list = [start_char + padname(dino_name.lower(), end_char, max_len) + end_char for dino_name in dino_list]
print(dino_list[0:5])

['~aachenosaurus$$$$$$$$$$$', '~aardonyx$$$$$$$$$$$$$$$$', '~abelisaurus$$$$$$$$$$$$$', '~abrictosaurus$$$$$$$$$$$', '~abrosaurus$$$$$$$$$$$$$$']


Next, we'll split dino names into charcaters to form our dataset: 

In [24]:
dataset_X = [list(dino) for dino in dino_list]
dataset_y = []
for l in dataset_X:
    y_label = l[1:] + [end_char]
    dataset_y.append(y_label)
    
print(dataset_X[0]) # first sample
print(dataset_y[0]) # first label

['~', 'a', 'a', 'c', 'h', 'e', 'n', 'o', 's', 'a', 'u', 'r', 'u', 's', '$', '$', '$', '$', '$', '$', '$', '$', '$', '$', '$']
['a', 'a', 'c', 'h', 'e', 'n', 'o', 's', 'a', 'u', 'r', 'u', 's', '$', '$', '$', '$', '$', '$', '$', '$', '$', '$', '$', '$']


We'll feed the RNN with one-hot representation of characters and to do so, we'll need to create a dictionnary mapping each char in the dataset to an integer which will be converted to a one hot later:

In [25]:
char_to_int = {chr(x): x - ord('a') for x in range(ord('a'), ord('z') + 1)}
char_to_int[start_char] = 26
char_to_int[end_char] = 27
char_to_int['_'] = 28
print(char_to_int)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '~': 26, '$': 27, '_': 28}


Now let's get the inverted dictionnay:

In [26]:
int_to_char = {v: k for k, v in char_to_int.items()}
print(int_to_char)

{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z', 26: '~', 27: '$', 28: '_'}


Using the dictionary above, we can now implement a fucntion that returns the one-hot vector of a character:

In [27]:
def one_hot(c):
    order = char_to_int[c]
    return keras.utils.to_categorical(order, num_classes = len(char_to_int))

def one_hot_list(l): # ret: numpy array
    ret = []
    for c in l:
        ret.append(one_hot(c))
    return np.array(ret)

print(one_hot('f'))
print('-------------------------------')
print(one_hot_list(['a', 'b', 'c']))

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0.]
-------------------------------
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]]


In [30]:
Tx = len(dino_list[0])
n_samples = len(dino_list)
input_vec_shape = (len(one_hot('a')), )

print(f'MAXLENGTH = {Tx}')
print(f'Number of samples = {n_samples}')
print(f'Input vector shape = {input_vec_shape}')


X, y = np.zeros((n_samples, Tx, input_vec_shape[0])), np.zeros((n_samples, Tx, input_vec_shape[0]))

for i in range(n_samples):
    X_hot = one_hot_list(dataset_X[i])
    y_hot = one_hot_list(dataset_y[i])
    X[i, :, :] = X_hot
    y[i, :, :] = X_hot
    
print(f'data shape is {data.shape}')
print('exemple: ')
print(X[0, :, :])

MAXLENGTH = 25
Number of samples = 1533
Input vector shape = (29,)
data shape is (1533, 25, 29)
exemple: 
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 1. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
  0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0

Now our dataset is preprocessed, let's create our model:

In [36]:
model = keras.models.Sequential([
    keras.layers.Input(shape=(25, 29)),
    keras.layers.SimpleRNN(32, return_sequences = True),
    keras.layers.Dense(29, activation = 'softmax')
])

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_4 (SimpleRNN)     (None, 25, 32)            1984      
_________________________________________________________________
dense_3 (Dense)              (None, 25, 29)            957       
Total params: 2,941
Trainable params: 2,941
Non-trainable params: 0
_________________________________________________________________


Compile the model :

In [39]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])

model.fit(X, y, epochs=12)

Epoch 1/12
Epoch 2/12
Epoch 3/12
Epoch 4/12
Epoch 5/12
Epoch 6/12
Epoch 7/12
Epoch 8/12
Epoch 9/12
Epoch 10/12
Epoch 11/12
Epoch 12/12


<tensorflow.python.keras.callbacks.History at 0x7f2d51ebb700>

In [47]:
print(X[0, :, :].shape)
pred = model.predict(np.array([X[0, :, :]]))[0]
print(pred.shape)

(25, 29)
(25, 29)


In [50]:
str_pred = []
for c in pred:
    int_c = np.argmax(c)
    real_c = int_to_char[int_c]
    str_pred.append(real_c)
    
str_pred = ''.join(str_pred)

print(str_pred)

~aachenosaurus$$$$$$$$$$$
