In [1]:
# Load LSTM network and generate text
import sys
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
import pandas as pd

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
docs = pd.read_pickle('training_set.pkl')


# for clarity, rename numbered essay topics to one-word topic summary 

topic_dict = {'topic':{1: 'computers', 
                       2: 'censorship', 
                       3: 'cyclist', 
                       4: 'hibiscus', 
                       5: 'mood', 
                       6: 'dirigibles', 
                       7: 'patience', 
                       8: 'laughter'}}

docs.replace(topic_dict, inplace=True)

docs.head()

Unnamed: 0,essay_id,topic,essay,rater1_domain1,rater2_domain1,rater3_domain1,target_score,rater1_domain2,rater2_domain2,topic2_target,...,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,char_len,word_count,tokens,lemma,pos
0,1,computers,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,1875,351,"[Dear, local, newspaper, ,, I, think, effects,...","[dear, local, newspaper, ,, -PRON-, think, eff...","[ADJ, ADJ, NOUN, PUNCT, PRON, VERB, NOUN, NOUN..."
1,2,computers,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,2288,424,"[Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...","[dear, @caps1, @caps2, ,, -PRON-, believe, tha...","[ADJ, PROPN, PUNCT, PUNCT, PRON, VERB, ADP, VE..."
2,3,computers,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,1541,284,"[Dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...","[dear, ,, @caps1, @caps2, @caps3, more, and, m...","[ADJ, PUNCT, PROPN, PUNCT, PROPN, ADJ, CCONJ, ..."
3,4,computers,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,3165,531,"[Dear, Local, Newspaper, ,, @CAPS1, I, have, f...","[dear, local, newspaper, ,, @caps1, -PRON-, ha...","[ADJ, PROPN, PROPN, PUNCT, PROPN, PRON, VERB, ..."
4,5,computers,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,2569,474,"[Dear, @LOCATION1, ,, I, know, having, compute...","[dear, @location1, ,, -PRON-, know, have, comp...","[ADJ, ADP, PUNCT, PRON, VERB, VERB, NOUN, VERB..."


In [3]:
# load ascii text and covert to lowercase
raw_text = docs[((docs.topic == 'computers') &
            (docs.target_score > 6)) |
            ((docs.topic == 'censorship') & 
            (docs.target_score > 2))]\
            ['essay'].to_string().lower()

In [4]:
# create mapping of unique chars to integers, and a reverse mapping
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))

# summarize the loaded data
n_chars = len(raw_text)
n_vocab = len(chars)
print("Total Characters: ", n_chars)
print("Total Vocab: ", n_vocab)

Total Characters:  188557
Total Vocab:  56


In [5]:
# prepare the dataset of input to output pairs encoded as integers
seq_length = 100
dataX = []
dataY = []
for i in range(0, n_chars - seq_length, 1):
    seq_in = raw_text[i:i + seq_length]
    seq_out = raw_text[i + seq_length]
    dataX.append([char_to_int[char] for char in seq_in])
    dataY.append(char_to_int[seq_out])
n_patterns = len(dataX)
print("Total Patterns: ", n_patterns)

Total Patterns:  188457


In [6]:
# reshape X to be [samples, time steps, features]
X = np.reshape(dataX, (n_patterns, seq_length, 1))
# normalize
X = X / float(n_vocab)
# one hot encode the output variable
y = np_utils.to_categorical(dataY)

In [7]:
# define the LSTM model
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2, seed=42))
model.add(Dense(y.shape[1], activation='softmax'))

In [8]:
filepath="weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
callbacks_list = [checkpoint]

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [56]:
# fit the model
model.fit(X, y, epochs=20, batch_size=128, callbacks=callbacks_list)

Epoch 1/20

Epoch 00001: loss improved from inf to 2.29283, saving model to weights-improvement-01-2.2928.hdf5
Epoch 2/20

Epoch 00002: loss improved from 2.29283 to 1.72952, saving model to weights-improvement-02-1.7295.hdf5
Epoch 3/20

Epoch 00003: loss improved from 1.72952 to 1.49683, saving model to weights-improvement-03-1.4968.hdf5
Epoch 4/20

Epoch 00004: loss improved from 1.49683 to 1.40586, saving model to weights-improvement-04-1.4059.hdf5
Epoch 5/20

Epoch 00005: loss improved from 1.40586 to 1.34094, saving model to weights-improvement-05-1.3409.hdf5
Epoch 6/20

Epoch 00006: loss improved from 1.34094 to 1.29806, saving model to weights-improvement-06-1.2981.hdf5
Epoch 7/20

Epoch 00007: loss improved from 1.29806 to 1.26667, saving model to weights-improvement-07-1.2667.hdf5
Epoch 8/20
  4608/188457 [..............................] - ETA: 12:14 - loss: 1.2235

KeyboardInterrupt: 

In [9]:
# Load weights from most improved
filename = "weights-improvement-07-1.2667.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [10]:
# pick a random seed
start = np.random.randint(0, len(dataX)-1)
pattern = dataX[start]
print("Seed:")
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

Seed:
" ..
500     dear local newspaper, i believe that there is ...
501     click, click, click, "@caps1!"  "


In [11]:
def sample_prediction(prediction):
    """Get rand index from preds based on its prob distribution.

    Params
    ——
    prediction (array (array)): array of length 1 containing array of probs that sums to 1

    Returns
    ——-
    rnd_idx (int): random index from prediction[0]

    Notes
    —–
    Helps to solve problem of repeated outputs.

    len(prediction) = 1
    len(prediction[0]) >> 1
    """
    X = prediction[0] # sum(X) is approx 1
    rnd_idx = np.random.choice(len(X), p=X)
    return rnd_idx

In [17]:
generated = ''
for i in range(400):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)
    prediction = model.predict(x, verbose=0)
    index = sample_prediction(prediction)
    result = int_to_char[index]
#     sys.stdout.write(result)
    generated += result
    pattern.append(index)
    pattern = pattern[1:len(pattern)]
print('\nDone.')
print(generated)


Done.
 io...
3769    loardnien th aln naad thatn oooedss on iaseri...
3756    neaen of bla wo hir an wei iase awo aia nerplew...
3464    'ala oaipns late npro ne avehngl wod vhia?a ...
2020    enool, alc wu suehkog toaying tlec cdgt l .m.. 3855    dv yaiki eeo oeopon mggin oo doc ypui o?..
2063    oele yhu enefrs thebokn to tienefta kive ms...
2442    iave you ever todii teak c bloct dnocd ten t ...
235
