In [7]:
from tensorflow.keras.callbacks import LambdaCallback
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
import numpy as np
import random
import sys
import io
import os

In [8]:
path = '/content/words_alpha.txt'
with io.open(path, encoding='utf-8') as f:
    text = f.read().lower()

In [9]:
text



In [10]:
print(len(text))

3864812


In [11]:
chars = sorted(list(set(text)))
print('total chars:', len(chars))
print(chars)

total chars: 27
['\n', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [12]:
char_indices = dict((char,i) for i,char in enumerate(chars))
indices_char = dict((i,char) for i,char in enumerate(chars))

In [13]:
print(char_indices)

{'\n': 0, 'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5, 'f': 6, 'g': 7, 'h': 8, 'i': 9, 'j': 10, 'k': 11, 'l': 12, 'm': 13, 'n': 14, 'o': 15, 'p': 16, 'q': 17, 'r': 18, 's': 19, 't': 20, 'u': 21, 'v': 22, 'w': 23, 'x': 24, 'y': 25, 'z': 26}


In [14]:
lines = text.split('\n')
lines = [line for line in lines if len(line)!=0]
print("number of lines:", len(lines))

number of lines: 370105


In [15]:
maxlen = len(max(lines,key=len))
minlen = len(min(lines,key=len))
print(maxlen)
print(minlen)

31
1


In [16]:
steps = 1
sequences = []
next_chars = []

for line in lines:
    # pre-padding with zeros
    s = (maxlen - len(line))*'0' + line
    sequences.append(s)
    next_chars.append('\n')
    for it,j in enumerate(line):
        if (it >= len(line)-1):
            continue
        s = (maxlen - len(line[:-1-it]))*'0' + line[:-1-it]
        sequences.append(s)
        next_chars.append(line[-1-it])

In [17]:
print('total sequences:', len(sequences))

total sequences: 3494707


In [18]:
print(sequences[0])

000000000000000000000000000000a


In [24]:
print(next_chars)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [20]:
x = np.zeros((len(sequences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sequences), len(chars)), dtype=np.bool)
for i, seq in enumerate(sequences):
    for t, char in enumerate(seq):
        if char != '0':
            x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  """Entry point for launching an IPython kernel.
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  


In [21]:
x.shape

(3494707, 31, 27)

In [26]:
y.shape

(3494707, 27)

In [34]:
prefix = ""
max_names = 10

def sample(preds):
    """ function that sample an index from a probability array """
    preds = np.asarray(preds).astype('float64')
    preds = preds / np.sum(preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.random.choice(range(len(chars)), p = probas.ravel())

def print_name_generated(name):
    print(name, flush=True)
def print_list_generated(lst):
    print(lst, flush=True)
    
    
def generate_new_names(*args):
    print("----------Generatinig names----------")

    # Add pre-padding of zeros in the input.
    sequence = ('{0:0>' + str(maxlen) + '}').format(prefix).lower()

    # tmp variables
    tmp_generated = prefix
    list_outputs = list()

    while (len(list_outputs) < max_names):

        # Vectorize the input of the model.
        x_pred = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(sequence):
            if char != '0':
                x_pred[0, t, char_indices[char]] = 1

        # Predict the probabilities of the next char.
        preds = model.predict(x_pred, verbose=0)[0]

        # Chose one based on the distribution obtained in the output of the model.
        next_index = sample(preds)
        # Get the corresponding char.
        next_char = indices_char[next_index]

        # If the char is a new line character or the name start to be bigger than the longest word, 
        # try to add it to the list and reset temp variables.
        if next_char == '\n' or len(tmp_generated) > maxlen:
            
            # If the name generated is not in the list, append it and print it.
            if tmp_generated not in list_outputs:
                list_outputs.append(tmp_generated)
                print_name_generated(tmp_generated)
            # Reset tmp variables
            sequence = ('{0:0>' + str(maxlen) + '}').format(prefix).lower()
            tmp_generated = prefix
        else:
    
            # Append the char to the sequence that we're generating.
            tmp_generated += next_char
            # Add pre-padding of zeros to the sequence generated and continue.
            sequence = ('{0:0>' + str(maxlen) + '}').format(tmp_generated).lower()
            
    # Show the intersection of the words generated and your dataset. . 
    print("Set of words already in the dataset:")
    print_list_generated(set(lines).intersection(list_outputs))
    
    # Show the rate of how many repeated words you've created.
    total_repited = len(set(lines).intersection(list_outputs))
    total = len(list_outputs)
    print("Rate of total invented words: " + "{:.2f}".format((total-total_repited)/total))
    print("-----------------End-----------------")
    
# Function invoked at the end of each epoch. Prints generated names.
callback = LambdaCallback(on_epoch_end=generate_new_names)

In [36]:
model = Sequential()
model.add(LSTM(64, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars), activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.01))
history = model.fit(x, y, batch_size=128, epochs=2, verbose=2,callbacks=[callback])

Epoch 1/2


  super(RMSprop, self).__init__(name, **kwargs)


----------Generatinig names----------
quinism
forkets
quarithymon
philistlorinon
shemlorman
wactlindeer
furite
jufpsicaria
queramalycer
nonomelity
Set of words already in the dataset:
{'quinism'}
Rate of total invented words: 0.90
-----------------End-----------------
27303/27303 - 800s - loss: 1.9612 - 800s/epoch - 29ms/step
Epoch 2/2
----------Generatinig names----------
jified
weatterpri
jobuoited
quierss
wh
oversychonal
flafformy
iquibiting
jashningness
jutolikely
Set of words already in the dataset:
{'wh'}
Rate of total invented words: 0.90
-----------------End-----------------
27303/27303 - 801s - loss: 1.8952 - 801s/epoch - 29ms/step


In [37]:
model.save('model.h5')