# Generador de texto con arquitectura One-to-Many

## Introducción
Un generador de texto es una API que utiliza un modelo de lenguaje no supervisado, capaz de generar párrafos de texto.

A continuación veremos un ejemplo de esto.

Importamos las librerías

In [1]:
import os
import numpy as np
import re
import shutil
import tensorflow as tf

En la variable **CHECKPOINT_DIR** se guardarán los pesos del modelo cada 10 épocas

In [2]:
# Base directory; in this section it is only used to locate the checkpoint directory.
DATA_DIR = "./"
# Directory (under DATA_DIR) where the training loop writes model weights.
CHECKPOINT_DIR = os.path.join(DATA_DIR, "chekpoints")

Preparar los datos

In [3]:
def download_and_read(urls):
    """Download every URL and return the cleaned text as a flat list of characters.

    Each URL is fetched with ``tf.keras.utils.get_file`` into a local file named
    ``ex1-<index>.txt`` (cache_dir="."). The file is read, the byte order mark
    is removed, and every run of whitespace (including newlines) is collapsed
    into a single space.

    Args:
        urls: iterable of URL strings pointing at plain-text files.

    Returns:
        A list of single-character strings: the cleaned contents of all files,
        concatenated in the order the URLs were given. Empty if ``urls`` is empty.
    """
    texts = []
    for i, url in enumerate(urls):
        p = tf.keras.utils.get_file("ex1-{:d}.txt".format(i), url, cache_dir=".")
        # Read inside the loop so that every downloaded file contributes, not
        # only the last one; the context manager closes the handle promptly.
        with open(p, "r", encoding="utf-8") as f:
            text = f.read()
        # remove the byte order mark
        text = text.replace("\ufeff", "")
        # turn newlines into spaces, then collapse whitespace runs
        text = text.replace('\n', ' ')
        text = re.sub(r'\s+', " ", text)
        # list.extend over a str appends its characters one by one
        texts.extend(text)
    return texts
  
# Fetch the Project Gutenberg text files listed below; `texts` receives
# whatever download_and_read returns for them.
texts = download_and_read([
"http://www.gutenberg.org/cache/epub/28885/pg28885.txt",
"https://www.gutenberg.org/files/12/12-0.txt"
])

Downloading data from http://www.gutenberg.org/cache/epub/28885/pg28885.txt
Downloading data from https://www.gutenberg.org/files/12/12-0.txt


Crear el vocabulario

In [4]:
# Vocabulary: the distinct elements of `texts`, in sorted order.
vocab = sorted(set(texts))
# Report how many distinct symbols the vocabulary holds.
print('vocav size: {:d}'.format(len(vocab)))

vocav size: 86


Mapeo de caracteres a enteros (int)

In [5]:
# Forward lookup: symbol -> integer id (its position in `vocab`).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> symbol.
idx2char = {index: char for char, index in char2idx.items()}

Numerizar el texto

In [6]:
# Encode every element of `texts` as its integer id from char2idx.
texts_as_ints = np.array([char2idx[c] for c in texts])
# Wrap the id array in a tf.data.Dataset sliced along its first axis.
data = tf.data.Dataset.from_tensor_slices(texts_as_ints)

In [7]:
# Number of input positions per training example.
seq_length = 100
# Group the ids into chunks of seq_length + 1: the extra element lets each
# chunk be split into an input and a target shifted by one position (see
# split_train_labels). drop_remainder=True discards a shorter final chunk.
sequences = data.batch(seq_length +1, drop_remainder=True)

In [8]:
def split_train_labels(sequence):
    """Split one chunk into an (input, target) pair offset by one position.

    Args:
        sequence: a sliceable sequence of ids.

    Returns:
        A 2-tuple: everything except the last element, and everything except
        the first element.
    """
    return sequence[:-1], sequence[1:]

# Turn every chunk into an (input, target) pair.
sequences = sequences.map(split_train_labels)
batch_size = 64
# Steps per epoch, estimated from the text length with integer division.
# NOTE(review): chunks are seq_length + 1 long, but this divides by
# seq_length — confirm the estimate is intended.
steps_per_epoch = len(texts) // seq_length // batch_size
# Shuffle with a 10000-element buffer, then form batches of exactly
# batch_size examples (a smaller final batch is dropped).
dataset = sequences.shuffle(10000).batch(
batch_size, drop_remainder=True)

Definir el modelo

In [9]:
class CharGenModel(tf.keras.Model):
    """Sequence model made of an Embedding, a stateful GRU and a Dense layer.

    Args:
        vocab_size: number of distinct ids; used as the Embedding input size
            and as the Dense output size.
        num_timesteps: passed to the GRU layer as its first argument (the
            number of units).
            NOTE(review): the name suggests a sequence length, but here it
            sets the GRU width — confirm this is intended.
        embedding_dim: size of each embedding vector.
        **kwargs: forwarded to ``tf.keras.Model.__init__``.
    """

    def __init__(self, vocab_size, num_timesteps,
            embedding_dim, **kwargs):
        super().__init__(**kwargs)
        # ids -> dense vectors
        self.embedding_layer = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        # stateful=True keeps the recurrent state between calls until it is
        # reset; return_sequences=True emits an output for every position.
        self.rnn_layer = tf.keras.layers.GRU(
            num_timesteps,
            recurrent_initializer="glorot_uniform",
            recurrent_activation="sigmoid",
            stateful=True,
            return_sequences=True,
        )
        # No activation: the layer outputs one raw score per vocabulary entry.
        self.dense_layer = tf.keras.layers.Dense(vocab_size)

    def call(self, x):
        """Run ids through embedding, GRU and dense layers; return the scores."""
        embedded = self.embedding_layer(x)
        hidden = self.rnn_layer(embedded)
        scores = self.dense_layer(hidden)
        return scores

Construir el modelo

In [10]:
vocab_size = len(vocab)
embedding_dim = 256
# NOTE(review): rnn_output_dim is not referenced anywhere in the visible code;
# the model below receives seq_length as its second argument instead — confirm.
rnn_output_dim = 1024

# Instantiate the model, build it for inputs of shape (batch_size, seq_length)
# and print the layer/parameter summary.
model = CharGenModel(vocab_size, seq_length, embedding_dim)
model.build(input_shape=(batch_size, seq_length))
model.summary()

Model: "char_gen_model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        multiple                  22016     
_________________________________________________________________
gru (GRU)                    multiple                  107400    
_________________________________________________________________
dense (Dense)                multiple                  8686      
Total params: 138,102
Trainable params: 138,102
Non-trainable params: 0
_________________________________________________________________


Definir la **loss function** y compilar el modelo

In [11]:
def loss(labels, predictions):
  """Sparse categorical cross-entropy computed on unnormalized scores.

  Args:
    labels: integer class ids (the targets).
    predictions: model outputs, treated as logits (from_logits=True).

  Returns:
    The result of tf.losses.sparse_categorical_crossentropy for the inputs.
  """
  crossentropy = tf.losses.sparse_categorical_crossentropy
  return crossentropy(labels, predictions, from_logits=True)
# Compile with an Adam optimizer (default arguments) and the `loss` function.
model.compile(optimizer=tf.optimizers.Adam(), loss=loss)

Generar el texto

In [12]:
def generate_text(model, prefix_string, char2idx, idx2char,
        num_chars_to_generate=1000, temperature=1.0):
    """Sample text from `model`, starting from `prefix_string`.

    Args:
        model: callable model that also exposes ``reset_states()``; it is
            called on a batch containing a single sequence of ids.
        prefix_string: seed text; each of its characters is looked up in
            `char2idx`.
        char2idx: mapping from character to integer id.
        idx2char: mapping from integer id back to character.
        num_chars_to_generate: how many characters to sample.
        temperature: divisor applied to the model scores before sampling.

    Returns:
        `prefix_string` followed by the sampled characters.
    """
    seed_ids = [char2idx[ch] for ch in prefix_string]
    # Add a leading batch axis of size 1.
    current_ids = tf.expand_dims(seed_ids, 0)
    generated_chars = []
    model.reset_states()
    for _ in range(num_chars_to_generate):
        # Drop the batch axis and rescale the scores by the temperature.
        scores = tf.squeeze(model(current_ids), 0) / temperature
        # One sample per position; keep the sample for the last position.
        samples = tf.random.categorical(scores, num_samples=1)
        next_id = samples[-1, 0].numpy()
        generated_chars.append(idx2char[next_id])
        # From now on only the newly sampled id is fed back into the model.
        current_ids = tf.expand_dims([next_id], 0)

    return prefix_string + "".join(generated_chars)

Correr y evaluar el modelo

Para no imprimir warnings y ver el texto más limpio

In [13]:
# Set the TensorFlow (compat.v1) logging threshold to ERROR.
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

In [14]:
# Train for num_epochs in total, in rounds of 10 epochs; after each round save
# the weights and print a generated text sample.
num_epochs = 200
for i in range(num_epochs // 10):
    # The dataset is repeated, so steps_per_epoch decides where an epoch ends.
    model.fit(
        dataset.repeat(),
        epochs=10,
        steps_per_epoch=steps_per_epoch,
        verbose = 0
    )
    # The file-name suffix is the round number (i+1), not the epoch count.
    checkpoint_file = os.path.join(
        CHECKPOINT_DIR, "model_epoch_{:d}".format(i+1))
    model.save_weights(checkpoint_file)
    # Separate model instance for sampling: it loads the weights just saved
    # and is built for a batch size of 1 instead of batch_size.
    gen_model = CharGenModel(vocab_size, seq_length, embedding_dim)
    gen_model.load_weights(checkpoint_file)
    gen_model.build(input_shape=(1, seq_length))
    
    # NOTE(review): disabled progress print. As written, the `*10` applies to
    # the formatted string (repeating it 10 times), not to (i+1).
    #print("after epoch: {:d}".format(i+1)*10)
    print(generate_text(gen_model, "Alice ", char2idx, idx2char))
    print("---")

Alice both ut I on’t cou tiek the lof ict yout, “Itles was UBfexprsikk!” “Hullf diw and’t on dowser edf cof toulf wave thatked to ! the waid.”’s breer piy, Het ot she woming lith a dis an candey. “I wingad in I, as to u wangwain fnounzn. Thooullly. s%oje_vingre tisice as oflbete the utling laut: hing as as thabpe—ard, as bail, and a thid ieden wevert ive com—the herly andpled hico fere nom at sayze.” I wer, heady pur, I you ty eascas hen waste. Sums, bullve main’s_inghy hant the in the in atind at on tisery’t hinger.” whant..” O Gu,—himike avery becres, an liGly,” in HerPpat wiow. “Ald it Why horde!” jis Chan; a tweed mraigry tas fome and of she pus’s sonee bar ente hare nat.” xind, and hund in tig of reasme ing in ig I mamly aslcrecubpaid, am atter arase sullous noled Forny in If hare’th of griss wigg bpar licerk, cto kscot aving!” And ritked fne tewlyichee to of at Durtly Ducelte, _fow youch?” “"nmised QueterFer’t wanked platr a ith sorer a Quo vead! wheed. “lo the allishs, hod the_ 