### Model training

This notebook trains a meme generation model based on: https://github.com/dylanwenzlau/ml-scripts/blob/master/meme_text_gen_convnet/train.py

In [None]:
import json
import random
import argparse
import numpy as np

from keras import layers
from keras import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint
from keras.models import load_model

In [0]:
# --- Paths -------------------------------------------------------------
# Directory that holds the training data and receives params.json.
BASE_PATH = "."
# JSON file with the memes to train on.
TRAINING_DATA_PATH = BASE_PATH + "/drake_hotline_bling.json"

# --- Model / input geometry --------------------------------------------
# Length every encoded sample is padded or truncated to.
SEQUENCE_LENGTH = 128
# Width of the embedding vectors.
EMBEDDING_DIM = 16

# --- Training schedule -------------------------------------------------
NUM_EPOCHS = 60
BATCH_SIZE = 256
# Fraction of the samples held out for validation.
VAL_RATIO = 0.25

In [0]:
# Load the list of memes stored under the top-level 'memes' key.
# The context manager closes the file handle as soon as parsing finishes,
# and the explicit encoding avoids depending on the platform default
# (JSON text is UTF-8 by specification).
with open(TRAINING_DATA_PATH, encoding="utf-8") as training_data_file:
    memes = json.load(training_data_file)['memes']

In [0]:
# Parallel accumulators: texts[k] is an input prompt and labels[k] is the
# integer id of the character that follows it.
texts, labels = [], []

# Maps each target character to its integer id; ids are handed out in
# order of first appearance using the counter below.
labels_vocabulary = dict()
current_labels_vocabulary_key = 0

def is_ascii(s):
    """Return True when every character of `s` has a code point below 128.

    An empty input yields True because there is no offending character.
    """
    for symbol in s:
        # Stop at the first character outside the 7-bit ASCII range.
        if ord(symbol) >= 128:
            return False
    return True

# The id prefix is the same zero-padded constant for every sample, so it is
# built once here rather than on each iteration.
meme_id = "1".zfill(8)

for meme in memes:
    # A trailing ';' is appended so the text always ends with the character
    # that advances box_index below.
    meme_text = f"{meme['text']};"

    # Memes containing any non-ASCII character are dropped entirely.
    if not is_ascii(meme_text):
        continue

    box_index = 0

    # One training sample per position j: the input is the id, the current
    # box index and the text seen so far; the label is the character at j.
    # NOTE(review): j starts at 1, so the first character of a meme is never
    # used as a prediction target -- confirm this is intended.
    for j in range(1, len(meme_text)):
        character = meme_text[j]
        texts.append(f"{meme_id} {box_index} {meme_text[0:j]}")

        # Assign the next free integer id to characters not seen before.
        if character not in labels_vocabulary:
            labels_vocabulary[character] = current_labels_vocabulary_key
            current_labels_vocabulary_key += 1

        character_label = labels_vocabulary[character]
        labels.append(character_label)

        # ';' closes the current text box; later prompts carry the next index.
        if character == ";":
            box_index += 1

In [0]:
"""
    Tokenization
"""
# Character-level tokenizer fitted on the generated prompts.
# NOTE(review): num_words=0 is presumably treated as "no vocabulary cap"
# by this Keras version -- confirm.
tk = Tokenizer(
    char_level=True,
    num_words=0,
)
tk.fit_on_texts(texts)

# Encode every prompt as a list of integer ids, then expose the fitted
# character -> index mapping for the later cells.
sequences = tk.texts_to_sequences(texts)
vocabulary = tk.word_index

In [0]:
"""
    Prepare the sequences and split the dataset in training and validation
"""
# Bring every encoded prompt to exactly SEQUENCE_LENGTH ids and turn the
# labels into an array so both can be indexed with one permutation.
data = pad_sequences(sequences, maxlen=SEQUENCE_LENGTH)
labels = np.asarray(labels)

# Shuffle samples and labels with a single shared permutation so that each
# input stays paired with its label. A locally seeded generator keeps the
# train/validation split identical across re-runs without touching the
# global NumPy random state.
split_rng = np.random.RandomState(42)
indices = split_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

val_samples_count = int(VAL_RATIO * data.shape[0])
training_samples_count = data.shape[0] - val_samples_count

X_train = data[:training_samples_count]
y_train = labels[:training_samples_count]

# Slice from the training boundary instead of using a negative index:
# data[-0:] would return every row when val_samples_count is 0, silently
# making the validation set a copy of the whole dataset.
X_val = data[training_samples_count:]
y_val = labels[training_samples_count:]

In [0]:
# Collect the hyperparameters and both vocabularies that are saved next to
# the model in params.json.
training_params = {
    'sequence_length': SEQUENCE_LENGTH,
    'embedding_dim': EMBEDDING_DIM,
    # Number of memes read from the training file (not the number of
    # generated samples).
    'samples_count': len(memes),
    'num_epochs': NUM_EPOCHS,
    'batch_size': BATCH_SIZE,
    'vocabulary': vocabulary,
    'labels_vocabulary': labels_vocabulary,
}
serialized_params = json.dumps(training_params)

with open(f"{BASE_PATH}/params.json", "w") as params_file:
    params_file.write(serialized_params)

In [0]:
"""
    Setup the model
"""
model = Sequential()

# Input ids -> dense vectors.
# NOTE(review): the +1 presumably reserves index 0 for padding -- confirm.
model.add(layers.Embedding(len(vocabulary) + 1, EMBEDDING_DIM, input_length=SEQUENCE_LENGTH))

# Three identical convolution stages. The first two halve the sequence
# length with MaxPooling1D(2); the last one reduces over the whole sequence
# with GlobalMaxPooling1D. Layers are created in the same order as if they
# were added one by one.
for stage in range(3):
    model.add(layers.Conv1D(1024, 5, activation='relu', padding='same'))
    model.add(layers.BatchNormalization())
    if stage < 2:
        model.add(layers.MaxPooling1D(2))
    else:
        model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dropout(0.25))

# Dense head: one hidden layer, then a softmax over the label characters.
model.add(layers.Dense(1024, activation='relu'))
model.add(layers.BatchNormalization())
model.add(layers.Dropout(0.25))
model.add(layers.Dense(len(labels_vocabulary), activation='softmax'))

# The sparse loss is used because the labels are integer ids rather than
# one-hot vectors.
model.compile(
    optimizer='rmsprop',
    loss='sparse_categorical_crossentropy',
    metrics=['acc'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 16)           1008      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 1024)         82944     
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 1024)         4096      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 64, 1024)          0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 64, 1024)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 64, 1024)          5243904   
_________________________________________________________________
batch_normalization_2 (Batch (None, 64, 1024)         

In [0]:
# Create a directory for saved models
!mkdir out_models

In [0]:
"""
    Setup checkpoints
"""
# Write the model to out_models/model.h5 during training; save_best_only
# limits writes to the best result seen so far, and verbose=1 logs each save.
checkpointer = ModelCheckpoint(
    filepath='out_models/model.h5',
    save_best_only=True,
    verbose=1,
)

In [0]:
"""
    Train
"""
# Fit on the training split, evaluate on the held-out split after each
# epoch, and let the checkpoint callback save the model along the way.
validation_pair = (X_val, y_val)
history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=NUM_EPOCHS,
    validation_data=validation_pair,
    callbacks=[checkpointer],
)