# Dinosaur Names Generator with LSTMs 

First let's import the libraries we'll use:

In [None]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np

In [7]:
dino_file = open('dino.txt', 'r')
dinos = dino_file.read()
dino_file.close()
print(dinos[:99])

aachenosaurus
aardonyx
abelisaurus
abrictosaurus
abrosaurus
abydosaurus
acantholipan
acanthopholis



## Data Preprocessing:

In [10]:
dino_list = dinos.split('\n')
print(dino_list[3:6])
print(f'The dataset contains {len(dino_list)} exemples')

['abrictosaurus', 'abrosaurus', 'abydosaurus']
The dataset contains 1533 exemples


We'll append each training sample with a start and end characters:

In [17]:
start_char = '~'
end_char = '$'
dino_list = [start_char + dino_name.lower() + end_char for dino_name in dino_list]
print(dino_list[0:5])

['~~aachenosaurus$$', '~~aardonyx$$', '~~abelisaurus$$', '~~abrictosaurus$$', '~~abrosaurus$$']


Next, we'll split dino names into charcaters to form our dataset: 

In [18]:
dataset = [list(dino) for dino in dino_list]
print(dataset[:4])

[['~', '~', 'a', 'a', 'c', 'h', 'e', 'n', 'o', 's', 'a', 'u', 'r', 'u', 's', '$', '$'], ['~', '~', 'a', 'a', 'r', 'd', 'o', 'n', 'y', 'x', '$', '$'], ['~', '~', 'a', 'b', 'e', 'l', 'i', 's', 'a', 'u', 'r', 'u', 's', '$', '$'], ['~', '~', 'a', 'b', 'r', 'i', 'c', 't', 'o', 's', 'a', 'u', 'r', 'u', 's', '$', '$']]


We'll feed the RNN with one-hot representation of characters and to do so, we'll need to create a dictionnary mapping each char in the dataset to an integer which will be converted to a one hot later:

In [39]:
char_to_int = {chr(x): x - ord('a') for x in range(ord('a'), ord('z') + 1)}
char_to_int[start_char] = 26
char_to_int[end_char] = 27
print(char_to_int)

{'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4, 'f': 5, 'g': 6, 'h': 7, 'i': 8, 'j': 9, 'k': 10, 'l': 11, 'm': 12, 'n': 13, 'o': 14, 'p': 15, 'q': 16, 'r': 17, 's': 18, 't': 19, 'u': 20, 'v': 21, 'w': 22, 'x': 23, 'y': 24, 'z': 25, '~': 26, '$': 27}


Now let's get the inverted dictionnay:

In [38]:
int_to_char = {v: k for k, v in char_to_int.items()}
print(int_to_char)

{0: 'a', 1: 'b', 2: 'c', 3: 'd', 4: 'e', 5: 'f', 6: 'g', 7: 'h', 8: 'i', 9: 'j', 10: 'k', 11: 'l', 12: 'm', 13: 'n', 14: 'o', 15: 'p', 16: 'q', 17: 'r', 18: 's', 19: 't', 20: 'u', 21: 'v', 22: 'w', 23: 'x', 24: 'y', 25: 'z', 26: '~', 27: '$'}
