In [1]:
import json
import pandas as pd

In [2]:
# Location of the scraped transcript dump, relative to the notebook.
file_path = 'explain_xkcd/xkcd_transcript3.json'
with open(file_path) as transcript_file:
    d = json.load(transcript_file)

# Flatten the parsed JSON: every value of every record becomes one entry.
# Presumably each value is a (comic, text) pair, since the next cell loads
# these entries into a two-column DataFrame -- verify against the JSON.
data = [row[k] for row in d for k in row]

In [3]:
# One row per flattened transcript entry, with named columns.
df = pd.DataFrame(
    data,
    columns=['Comic', 'Text'],
)

In [4]:
# Collapse each row's 'Text' entry (presumably a list of transcript lines)
# into a single space-separated string.
df['Text2'] = df['Text'].str.join(' ')

In [5]:
# Peek at the first joined transcript; a single .loc lookup replaces the
# chained df[...][...] indexing.
df.loc[0, 'Text2']

"[A boy sits in a barrel which is floating in an ocean.]\n Boy: i wonder where i'll float next?\n"

In [6]:
# Build the training corpus: all transcripts concatenated into one
# space-separated string.
data = ' '.join(text for text in df['Text2'])
# Alternatives left disabled (they join within each entry / per comic instead):
#data = df.groupby('Comic')['Text2'].apply(lambda x: " ".join(x))
#data = df['Text2'].apply(lambda x: " ".join(x))

In [7]:
from __future__ import print_function
import numpy as np
import random
# method for generating text
def generate_text(model, length, vocab_size, ix_to_char):
    """Greedily generate text, seeding the sequence with a left bracket.

    Each step feeds the sequence so far to ``model.predict`` and appends the
    arg-max character of the last timestep. Every character fed to the model
    is also printed as it is consumed.

    Args:
        model: object whose ``predict(X)`` returns an array indexable as
            ``[0]`` to give per-timestep scores of shape
            (timesteps, vocab_size).
        length: number of prediction steps to run.
        vocab_size: width of the one-hot encoding.
        ix_to_char: mapping from integer index to character.

    Returns:
        The seed character followed by the ``length`` predicted characters.

    Raises:
        ValueError: if "[" is not present in ``ix_to_char`` (previously this
            surfaced as an unbound-local error).
    """
    # starting with left bracket; if several indices map to it the last wins,
    # matching the original scan.
    seed_candidates = [ix for ix, char in ix_to_char.items() if char == "["]
    if not seed_candidates:
        raise ValueError('generate_text() needs "[" in the vocabulary to seed generation')
    last_ix = seed_candidates[-1]

    y_char = [ix_to_char[last_ix]]
    X = np.zeros((1, length, vocab_size))
    for i in range(length):
        # appending the last predicted character to sequence
        X[0, i, last_ix] = 1
        print(ix_to_char[last_ix], end="")
        step_scores = model.predict(X[:, :i + 1, :])[0]
        last_ix = np.argmax(step_scores, 1)[-1]
        y_char.append(ix_to_char[last_ix])
    return ''.join(y_char)

def weighted_choice(choices, threshold=.1):
    """Sample an index from ``choices`` in proportion to its weight.

    Weights below ``threshold`` are zeroed before sampling, so unlikely
    entries are never drawn.

    Args:
        choices: sequence of non-negative weights (e.g. softmax output).
        threshold: minimum weight an entry needs to stay eligible.

    Returns:
        The integer index of the selected entry. If no weight reaches
        ``threshold``, the index of the largest weight is returned instead of
        silently defaulting to index 0.

    Raises:
        ValueError: if ``choices`` is empty.
    """
    if len(choices) == 0:
        raise ValueError("weighted_choice() requires at least one weight")

    weights = [w if w >= threshold else 0 for w in choices]
    total = sum(weights)
    if total <= 0:
        # Nothing is confident enough to sample from: fall back to the single
        # most probable entry rather than an arbitrary index.
        return max(range(len(choices)), key=lambda ix: choices[ix])

    r = random.uniform(0, total)
    upto = 0
    last_eligible = None
    for ix, w in enumerate(weights):
        if w <= 0:
            # Zero-weight entries must never be returned, even when r == 0.
            continue
        last_eligible = ix
        if upto + w >= r:
            return ix
        upto += w
    # Only reachable through floating-point rounding at the upper end.
    return last_eligible

# This is mostly used for the website generator. See flask code.
def generate_text_prompt(model, length, vocab_size, ix_to_char, prompt):
    """Generate ``length`` characters that continue ``prompt``.

    The prompt is one-hot encoded and echoed, then each new character is
    sampled with ``weighted_choice`` from the model's scores for the last
    timestep and appended to the input sequence.

    Prompt characters that are not in ``ix_to_char`` are dropped. Previously
    such a character silently reused the preceding character's index, or
    raised an unbound-local error when it was the first character.

    Args:
        model: object whose ``predict(X)[0][-1]`` yields the scores for the
            next character.
        length: number of characters to generate after the prompt.
        vocab_size: width of the one-hot encoding.
        ix_to_char: mapping from integer index to character.
        prompt: seed text.

    Returns:
        The supported prompt characters followed by the generated characters.

    Raises:
        ValueError: if characters must be generated but no prompt character
            is in the vocabulary.
    """
    # Reverse lookup; on duplicate characters the last index wins, matching
    # the original linear scan.
    char_to_ix = {char: ix for ix, char in ix_to_char.items()}
    prompt_ixs = [char_to_ix[p] for p in prompt if p in char_to_ix]
    if length > 0 and not prompt_ixs:
        raise ValueError("prompt contains no characters from the model vocabulary")

    n_prompt = len(prompt_ixs)
    X = np.zeros((1, n_prompt + length, vocab_size))

    y_char = []
    for i, ix in enumerate(prompt_ixs):
        X[0, i, ix] = 1
        print(ix_to_char[ix], end="")
        y_char.append(ix_to_char[ix])

    for step in range(length):
        # appending the last predicted character to sequence
        pos = n_prompt + step
        next_scores = model.predict(X[:, :pos, :])[0][-1]
        ix = weighted_choice(next_scores)
        X[0, pos, ix] = 1
        print(ix_to_char[ix], end="")
        y_char.append(ix_to_char[ix])
    return ''.join(y_char)

# method for preparing the training data
def load_data(data_file, seq_length):
    """One-hot encode a text corpus into next-character training pairs.

    Characters occurring fewer than 10 times are removed, the remaining text
    is cut into consecutive windows of ``seq_length`` characters, and each
    target window is its input window shifted one character ahead.

    Args:
        data_file: the corpus text itself (a string, not a path).
        seq_length: number of characters per training sequence.

    Returns:
        Tuple ``(X, y, VOCAB_SIZE, ix_to_char)`` where ``X`` and ``y`` are
        float arrays of shape (n_sequences, seq_length, VOCAB_SIZE) and
        ``ix_to_char`` maps an index to its character.
    """
    # Local import so this block does not depend on edits to the import cell.
    from collections import Counter

    data_string = data_file

    # One counting pass instead of a full str.count() scan per character.
    occurrences = Counter(data_string)
    rare_chars = {ch for ch, n in occurrences.items() if n < 10}
    data = [ch for ch in data_string if ch not in rare_chars]

    chars = np.unique(data)
    VOCAB_SIZE = len(chars)

    print('Data length: {} characters'.format(len(data)))
    print('Vocabulary size: {} characters'.format(VOCAB_SIZE))

    ix_to_char = {ix: char for ix, char in enumerate(chars)}
    char_to_ix = {char: ix for ix, char in enumerate(chars)}

    # Every window needs one extra trailing character for its shifted target,
    # so count windows over len(data) - 1. Using len(data) // seq_length
    # raised IndexError whenever the length was an exact multiple.
    n_sequences = max((len(data) - 1) // seq_length, 0)

    encoded = np.array(
        [char_to_ix[ch] for ch in data[:n_sequences * seq_length + 1]],
        dtype=np.intp,
    )
    x_ix = encoded[:-1].reshape(n_sequences, seq_length)
    y_ix = encoded[1:].reshape(n_sequences, seq_length)

    X = np.zeros((n_sequences, seq_length, VOCAB_SIZE))
    y = np.zeros((n_sequences, seq_length, VOCAB_SIZE))
    # Broadcast (sequence, position) against the character index to set
    # exactly one 1 per timestep.
    seq_idx = np.arange(n_sequences)[:, None]
    pos_idx = np.arange(seq_length)[None, :]
    X[seq_idx, pos_idx, x_ix] = 1.
    y[seq_idx, pos_idx, y_ix] = 1.
    return X, y, VOCAB_SIZE, ix_to_char

In [8]:
from __future__ import print_function
import numpy as np
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.recurrent import LSTM
from keras.layers.wrappers import TimeDistributed

# Run configuration and hyper-parameters.
BATCH_SIZE = 40
HIDDEN_DIM = 700
SEQ_LENGTH = 100
# Checkpoint to load; an empty string means start from scratch (forces training).
WEIGHTS ='checkpoint_layer_3_hidden_700_epoch_350.hdf5'
MODE = 'gen' # Or 'train'
prompt = "[Cueball"

GENERATE_LENGTH = 400
LAYER_NUM = 3

# Creating training data (generation mode still needs VOCAB_SIZE and ix_to_char)
X, y, VOCAB_SIZE, ix_to_char = load_data(data, SEQ_LENGTH)

# Creating and compiling the Network: LAYER_NUM stacked LSTMs, dropout, then a
# per-timestep softmax over the vocabulary.
model = Sequential()
model.add(LSTM(HIDDEN_DIM, input_shape=(None, VOCAB_SIZE), return_sequences=True))
for i in range(LAYER_NUM - 1):
    model.add(LSTM(HIDDEN_DIM, return_sequences=True))
model.add(Dropout(0.3))
model.add(TimeDistributed(Dense(VOCAB_SIZE)))
model.add(Activation('softmax'))
model.compile(loss="categorical_crossentropy", optimizer="rmsprop")

if WEIGHTS:
    model.load_weights(WEIGHTS)
    # The epoch number sits between the last '_' and the last '.' of the file
    # name; rfind for the dot keeps this working when the path has an earlier
    # dot (e.g. './checkpoints/..._epoch_350.hdf5').
    nb_epoch = int(WEIGHTS[WEIGHTS.rfind('_') + 1:WEIGHTS.rfind('.')])
else:
    nb_epoch = 0

# Training if requested or if there is no trained weights specified
if MODE == 'train' or not WEIGHTS:
    # NOTE: loops until the kernel is interrupted; a checkpoint is written
    # after every epoch.
    while True:
        print('\n\nEpoch: {}\n'.format(nb_epoch))
        model.fit(X, y, batch_size=BATCH_SIZE, verbose=1, nb_epoch=1)
        nb_epoch += 1
        generate_text(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char)
        model.save_weights('checkpoint_layer_{}_hidden_{}_epoch_{}.hdf5'.format(LAYER_NUM, HIDDEN_DIM, nb_epoch))

# Else, generation only: the weights were already loaded above, so they are
# not reloaded on every iteration.
else:
    # NOTE: generates samples until the kernel is interrupted.
    while True: #Generating a bunch of examples
        generate_text_prompt(model, GENERATE_LENGTH, VOCAB_SIZE, ix_to_char, prompt)
        print('\n\n')

Using Theano backend.


Data length: 1370731 characters
Vocabulary size: 112 characters
[Cueball stands with a bow and arrow drawn tightly, aiming off-screen.]
 [He fires the arrow points to the friend and Cueball stand in front of two small sections, Cueball and Megan holding hands.]
 Cueball: But controls the true. The came back is nice.
 [A map of the gulf comes can be seen in the background, and then it back

KeyboardInterrupt: 

# Additional ideas
Possibly have chatbots talking to each other?
Train a CNN on images of the top characters:
'White Hat', 'Cueball', 'Ponytail', 'Megan', 'Black Hat' - too many characters...

Add images: pick topics out of images to use for generating?

Increase the sequence size. - Done.
Remove weird symbols. - Removed characters with fewer than 10 occurrences; 10-second speed increase per epoch.
Section the text for training. - Didn't work correctly; some transcripts are too long.

Make an API for a comic: Cueball talking to Megan, or something similar. Use each other's responses as prompts, with 2 responses back and forth. - Test by downloading the latest epoch's weights, loading them, and generating text of a set length.

In [17]:
# Need to pickle ix_to_char so the index -> character vocabulary can be reused
# outside this notebook (presumably by the Flask generator, given the path).
import pickle
# The context manager guarantees the file is flushed and closed.
with open("Flask/vocab.p", "wb") as vocab_file:
    pickle.dump(ix_to_char, vocab_file)

In [None]:
from IPython.display import SVG
from keras.utils.vis_utils import model_to_dot

# Render the network architecture inline: convert the model to a dot graph,
# lay it out with the 'dot' program and display the resulting SVG.
model_graph = model_to_dot(model)
SVG(model_graph.create(prog='dot', format='svg'))