# Text generation test using dilated convolutional networks

In [1]:
%matplotlib inline

## Global config

Name of corpus file (without txt extension)

In [2]:
# Base name of the corpus; the corpus path and the encoder/model file names are all derived from it
corpusname = "lapeceramicro"

Number of past input tokens to use for generation

In [3]:
# Passed to the hyperparameter search and also used as the length of the seed text (corpus[:inputtokens])
inputtokens = 128

Network architecture to use

In [4]:
# Name that is resolved to a model class through neurowriter.models.modelbyname
architecture = "dilatedconv"

### Process config

Get all relevant file names

In [5]:
# Every file name is derived from the corpus name so that the corpus,
# its encoder and its trained model stay paired.
corpusfile = 'corpus/{}.txt'.format(corpusname)
encodername = '{}.enc'.format(corpusname)
modelname = '{}.h5'.format(corpusname)

Obtain model class

In [6]:
from neurowriter.models import modelbyname
# Resolve the configured architecture name to the model class handed to the training step
modelclass = modelbyname(architecture)

Using TensorFlow backend.


## Load corpus

In [7]:
# Read the whole corpus into memory as a single string.
# The encoding is given explicitly: the corpus contains accented characters and
# emoji, so relying on the platform default encoding would make decoding fail or
# produce garbage on systems whose locale is not UTF-8.
with open(corpusfile, encoding='utf-8') as f:
    corpus = f.read()

In [8]:
# Preview the first 1000 characters; slicing clamps to the string length,
# so a shorter corpus is simply shown whole.
corpus[:1000]

'2/11/2014, 11:54 - Sergio Nabil Khayyat creó el grupo “La pecera”\n2/11/2014, 11:55 - Sergio Nabil Khayyat te añadió\n2/11/2014, 11:54 - Sergio Nabil Khayyat cambió el icono de este grupo\n2/11/2014, 12:04 - Carmen Torrijos cambió el asunto a “La pecera \ue522”\n2/11/2014, 12:06 - Álvaro Barbero Jiménez: Ola k aseis\n2/11/2014, 12:07 - Álvaro Barbero Jiménez: Bamonos dE peshka\n2/11/2014, 12:07 - Álvaro Barbero Jiménez: Al bershka\n2/11/2014, 12:07 - Carmen Torrijos: Buena inauguracion del grupo jajaj \ue00e\n2/11/2014, 23:33 - Jorge López Lázaro: Capitalism Explained. This Is So Accurate It Hurts. - http://m.tickld.com/x/capitalism-explained-this-is-so-accurate-it-hurts\n3/11/2014, 0:37 - Alicia González: \ue412\n3/11/2014, 10:06 - Sergio Nabil Khayyat añadió a David Díaz Vico\n4/11/2014, 8:34 - Carmen Torrijos: Pastas en el office! 🍥🍥\n4/11/2014, 8:45 - Alicia González: llego mazo tarde \ue058\n4/11/2014, 8:46 - Carmen Torrijos: \ue115\ue230\n4/11/2014, 11:47 - Sergio Nabil Khayyat 

## Encoding

In [None]:
from neurowriter.encoding import Encoder, loadencoding

# Reuse a previously saved encoder when one can be loaded; otherwise build one
# from the corpus and save it under the same name for later runs.
# NOTE(review): any exception raised while loading (not only a missing file)
# triggers a rebuild that overwrites the saved encoder - confirm this is intended.
try:
    encoder = loadencoding(encodername)
except Exception as e:
    print("Encoder not found, creating new encoder:", e)
    encoder = Encoder(corpus)
    encoder.save(encodername)
else:
    print("Loaded encoder", encodername)

Loaded encoder lapeceramicro.enc


## Model training

In [None]:
from neurowriter.optimizer import hypertrain

# Run the hyperparameter search; it yields a model together with a training history.
# NOTE(review): n_calls presumably bounds the number of configurations tried
# (the output logs one "Params ... loss" line per trial) - confirm against hypertrain.
searchresult = hypertrain(modelclass, inputtokens, encoder, corpus, n_calls=100)
model, train_history = searchresult

# Persist the resulting model under the name derived from the corpus
model.save(modelname)

Params: [4, 32, 0.60276337607164387, 2, 64, 0.64589411306665612, 'rmsprop'] , loss:  3.60489155451
Params: [5, 64, 0.38344151882577771, 3, 64, 0.56804456109393231, 'adam'] , loss:  3.58911536535
Params: [2, 4, 0.020218397440325719, 3, 128, 0.87001214824681916, 'adam'] , loss:  3.59204134941
Params: [5, 16, 0.78052917628645546, 0, 128, 0.1433532874090464, 'adam'] , loss:  3.73669484456
Params: [4, 16, 0.26455561210462697, 3, 64, 0.56843394886864851, 'sgd'] , loss:  3.61186521848
Params: [4, 32, 0.61693399687475692, 3, 128, 0.35950790057378601, 'rmsprop'] , loss:  3.61531596184
Params: [4, 4, 0.66676671544566768, 2, 32, 0.12892629765485331, 'sgd'] , loss:  3.65036411285
Params: [3, 16, 0.43860151346232035, 3, 16, 0.20887675609483469, 'sgd'] , loss:  3.60718886058
Params: [4, 8, 0.46631077285630629, 0, 16, 0.11037514116430513, 'rmsprop'] , loss:  3.74341654778
Params: [2, 4, 0.36872517066096411, 3, 16, 0.8379449074988039, 'sgd'] , loss:  3.65118765831
Params: [4, 64, 0.53074244144301741, 

## Generation test

In [None]:
from neurowriter.writer import Writer

# Seed the generation with the first inputtokens characters of the corpus
generationseed = corpus[:inputtokens]
writer = Writer(model, encoder, creativity=0.1)

# Show the seed, then display the generated text as the cell result
print(generationseed)
''.join(writer.write(seed=generationseed))

### Manual generation test with 0 creativity

In [None]:
import numpy as np

# Number of tokens to generate after the seed
generatedchars = 1000

seed = corpus[:inputtokens]
print("Seed:", seed)
print("Generated")
print(seed, end='')
for _ in range(generatedchars):
    seedcoded = encoder.encodetext(seed)
    # Greedy decoding: always take the highest-scoring entry of the prediction.
    # verbose=0 silences any progress output from predict so it cannot
    # interleave with the text being printed.
    prediction = model.predict(np.array([seedcoded]), verbose=0)
    tokenindex = np.argmax(prediction)
    char = encoder.index2char[tokenindex]
    print(char, end='')
    # Slide the window: drop the oldest character and append the new one
    seed = seed[1:] + char

## Possible improvements

* Try training with SGD and the full pecera corpus for a large number of iterations
* Add more residual blocks

From Facebook's convolutional translation paper:
* Tokens are represented with embeddings instead of a one-hot encoding.
* The position of each token is also added as a parallel embedding.
* Dropout is applied to the embeddings and to the input of each convolutional block.

## References

* WaveNet paper: https://arxiv.org/pdf/1609.03499.pdf
* A Keras implementation of WaveNet: https://github.com/usernaamee/keras-wavenet/blob/master/simple-generative-model.py
* Another one: https://github.com/basveeling/wavenet/blob/master/wavenet.py
* Facebook's convolutional translation paper: https://arxiv.org/pdf/1705.03122.pdf

## Scrapyard

def sampletext(logs):
    """Print the seed text followed by a sample generated by the model.

    Intended to be used as a Keras callback; the logs argument is accepted
    to match the callback signature but is not used.
    """
    seedtext = corpus[:inputtokens]
    writer = Writer(model, encoder, creativity=0.1)
    print(seedtext)
    generated = writer.write(seed=seedtext)
    print(''.join(generated))

# NOTE(review): leftover training code. modelkind, bestparams, patience, val,
# batchsize, maxepochs, the two generators and the Keras callback classes are
# not defined anywhere in the visible notebook (the active cells use modelclass).

# Build model with input parameters
model = modelkind(inputtokens, encoder, *bestparams)

# Prepare callbacks: sample text when training ends, checkpoint the best
# model to disk, and stop early after `patience` epochs
callbacks = [
    LambdaCallback(on_train_end=sampletext),
    ModelCheckpoint(filepath=modelname, save_best_only=True),
    EarlyStopping(patience=patience),
]

# Number of windows of inputtokens characters that fit in the corpus,
# split between training and validation by the fraction `val`
npatterns = len(corpus) - inputtokens + 1
trainsteps = int((1 - val) * npatterns / batchsize)
valsteps = int(val * npatterns / batchsize)

# Train model
model.fit_generator(
    traingenerator,
    steps_per_epoch=trainsteps,
    validation_data=valgenerator,
    validation_steps=valsteps,
    epochs=maxepochs,
    verbose=2,
    callbacks=callbacks
)