In [37]:
import numpy as np
import gensim
import string
import pandas as pd

from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file

In [46]:
# Replacement table in the nested form DataFrame.replace accepts:
# {column: {old_value: new_value}}. For clarity, each numbered essay topic
# is renamed to a one-word topic summary.
topic_dict = {
    'topic': {
        1: 'computers',
        2: 'censorship',
        3: 'cyclist',
        4: 'hibiscus',
        5: 'mood',
        6: 'dirigibles',
        7: 'patience',
        8: 'laughter',
    }
}

# NOTE(review): read_pickle unpickles arbitrary objects, so only point this at
# a file you produced yourself or otherwise trust.
docs = pd.read_pickle('training_set.pkl').replace(topic_dict)

# Preview the first rows to confirm the topic column now holds the names.
docs.head()

Unnamed: 0,essay_id,topic,essay,rater1_domain1,rater2_domain1,rater3_domain1,target_score,rater1_domain2,rater2_domain2,topic2_target,...,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,char_len,word_count,tokens,lemma,pos
0,1,computers,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,1875,351,"[Dear, local, newspaper, ,, I, think, effects,...","[dear, local, newspaper, ,, -PRON-, think, eff...","[ADJ, ADJ, NOUN, PUNCT, PRON, VERB, NOUN, NOUN..."
1,2,computers,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,2288,424,"[Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...","[dear, @caps1, @caps2, ,, -PRON-, believe, tha...","[ADJ, PROPN, PUNCT, PUNCT, PRON, VERB, ADP, VE..."
2,3,computers,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,1541,284,"[Dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...","[dear, ,, @caps1, @caps2, @caps3, more, and, m...","[ADJ, PUNCT, PROPN, PUNCT, PROPN, ADJ, CCONJ, ..."
3,4,computers,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,3165,531,"[Dear, Local, Newspaper, ,, @CAPS1, I, have, f...","[dear, local, newspaper, ,, @caps1, -PRON-, ha...","[ADJ, PROPN, PROPN, PUNCT, PROPN, PRON, VERB, ..."
4,5,computers,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,2569,474,"[Dear, @LOCATION1, ,, I, know, having, compute...","[dear, @location1, ,, -PRON-, know, have, comp...","[ADJ, ADP, PUNCT, PRON, VERB, VERB, NOUN, VERB..."


In [19]:
# Download (or reuse the cached copy of) the arXiv abstracts file and turn each
# line into a list of at most `max_sentence_len` lowercase, punctuation-free words.
print('\nFetching the text...')
url = 'https://raw.githubusercontent.com/maxim5/stanford-tensorflow-tutorials/master/data/arxiv_abstracts.txt'
path = get_file('arxiv_abstracts.txt', origin=url)

print('\nPreparing the sentences...')
max_sentence_len = 40
# The raw lines get their own name. Binding them to `docs` would overwrite the
# essay DataFrame that the essay-selection cell reads via `docs.topic` when the
# notebook is run top to bottom.
with open(path) as file_:
  abstract_lines = file_.readlines()
# str.translate needs a translation table. Passing string.punctuation itself
# uses it as a lookup sequence indexed by code point, which leaves punctuation
# untouched and rewrites control characters instead (e.g. '\n' -> '+').
punctuation_remover = str.maketrans('', '', string.punctuation)
sentences = [
    line.lower().translate(punctuation_remover).split()[:max_sentence_len]
    for line in abstract_lines
]
print('Num sentences:', len(sentences))


Fetching the text...

Preparing the sentences...
Num sentences: 7200


In [76]:
# Select the token lists of essays above a per-topic score threshold:
# 'computers' essays with target_score > 6, or 'censorship' essays with
# target_score > 2.
computers_above_cutoff = (docs.topic == 'computers') & (docs.target_score > 6)
censorship_above_cutoff = (docs.topic == 'censorship') & (docs.target_score > 2)
essays = docs.loc[computers_above_cutoff | censorship_above_cutoff, 'tokens']

In [78]:
# Each essay contributes one training "sentence": its first
# `max_sentence_len` tokens, lowercased.
max_sentence_len = 40
sentences = [
    [token.lower() for token in essay_tokens[:max_sentence_len]]
    for essay_tokens in essays
]
print('Num sentences:', len(sentences))

Num sentences: 3251


In [79]:
# Train word embeddings on the essay sentences and sanity-check a few
# nearest neighbours.
print('\nTraining word2vec...')
# NOTE(review): `size=` and `iter=` are the gensim 3.x keyword names; newer
# gensim releases renamed them, so revisit this call when upgrading.
word_model = gensim.models.Word2Vec(sentences, size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0
# The misspelled name `emdedding_size` is kept on purpose: the model-building
# cell reads it under exactly this spelling.
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)
print('Checking similar words:')
for word in ['computer', 'library', 'book', 'learn']:
  # A probe word missing from the corpus would otherwise abort the whole cell.
  if word not in word_model.wv.vocab:
    print('  %s -> (not in vocabulary)' % word)
    continue
  # Query through `wv`: the model-level `most_similar` is deprecated in
  # gensim 3.x and emits a warning. `topn=8` replaces slicing the default
  # ten results down to eight.
  neighbours = word_model.wv.most_similar(word, topn=8)
  most_similar = ', '.join('%s (%.2f)' % (similar, score) for similar, score in neighbours)
  print('  %s -> %s' % (word, most_similar))

def word2idx(word):
  """Look up `word` in the word2vec vocabulary and return its `.index`.

  Fails with the lookup error of `word_model.wv.vocab` if the word is unknown.
  """
  vocab_entry = word_model.wv.vocab[word]
  return vocab_entry.index
def idx2word(idx):
  """Return the word stored at position `idx` of `word_model.wv.index2word`."""
  index_to_word = word_model.wv.index2word
  return index_to_word[idx]

# Build the supervised pairs for the LSTM: for every sentence, the ids of all
# words but the last are the input row, and the id of the last word is the label.
print('\nPreparing the data for LSTM...')
num_sentences = len(sentences)
train_x = np.zeros((num_sentences, max_sentence_len), dtype=np.int32)
train_y = np.zeros((num_sentences,), dtype=np.int32)
for row, tokens in enumerate(sentences):
  context, target = tokens[:-1], tokens[-1]
  # Positions past the end of `context` keep their initial value of 0.
  for col, token in enumerate(context):
    train_x[row, col] = word2idx(token)
  train_y[row] = word2idx(target)
print('train_x shape:', train_x.shape)
print('train_y shape:', train_y.shape)


Training word2vec...
Result embedding shape: (5065, 100)
Checking similar words:
  computer -> computers (0.69), internet (0.44), effect (0.35), computors (0.33), biggest (0.32), site (0.32), time (0.31), technology (0.31)
  library -> libraries (0.67), libary (0.63), librarie (0.39), book (0.39), libray (0.35), walk (0.35), parent (0.35), stacked (0.34)
  book -> movie (0.51), material (0.50), magazine (0.48), books (0.46), story (0.43), somthing (0.42), something (0.41), song (0.41)
  learn -> talk (0.61), connect (0.51), communicate (0.47), teach (0.46), explore (0.44), give (0.43), faraway (0.42), informs (0.41)

Preparing the data for LSTM...
train_x shape: (3251, 40)
train_y shape: (3251,)


  app.launch_new_instance()


In [80]:
print('\nTraining LSTM...')
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[pretrained_weights]))
model.add(LSTM(units=emdedding_size))
model.add(Dense(units=vocab_size))
model.add(Activation('softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')


Training LSTM...


In [81]:
def sample(preds, temperature=1.0):
  """Pick an index from the probability vector `preds`.

  With `temperature <= 0` the choice is greedy (argmax of `preds`). Otherwise
  the log-probabilities are divided by `temperature`, re-normalised, and a
  single index is drawn at random from the resulting distribution.
  """
  if temperature <= 0:
    return np.argmax(preds)
  # float64 keeps the re-normalised probabilities precise enough for the draw.
  scaled_log_probs = np.log(np.asarray(preds).astype('float64')) / temperature
  weights = np.exp(scaled_log_probs)
  distribution = weights / np.sum(weights)
  # One multinomial trial yields a one-hot row; its argmax is the drawn index.
  one_hot_draw = np.random.multinomial(1, distribution, 1)
  return np.argmax(one_hot_draw)

def generate_next(text, num_generated=20):
  """Extend `text` with `num_generated` sampled words and return the result.

  `text` is lowercased, split on whitespace and mapped to word ids with
  `word2idx`, so every prompt word must be in the word2vec vocabulary. Each
  new word is sampled (temperature 0.7) from the model's prediction for the
  whole sequence so far, then appended to that sequence.

  Returns the prompt words followed by the generated words, space-joined.
  Raises ValueError if `text` contains no words.
  """
  word_idxs = [word2idx(word) for word in text.lower().split()]
  if not word_idxs:
    raise ValueError('text must contain at least one word')
  for _ in range(num_generated):
    # Wrap the ids in an outer list so the input is ONE sequence of shape
    # (1, seq_len). Keras treats the first axis as the batch axis, so a bare
    # 1-D array would be read as seq_len independent one-word sequences and
    # the LSTM would never see the preceding context.
    prediction = model.predict(x=np.array([word_idxs]))
    idx = sample(prediction[0], temperature=0.7)
    word_idxs.append(idx)
  return ' '.join(idx2word(idx) for idx in word_idxs)

def on_epoch_end(epoch, _):
  """Print a generated continuation for a few fixed prompts.

  Has the `(epoch, logs)` signature expected by
  `LambdaCallback(on_epoch_end=...)`; the second argument is ignored.
  """
  print('\nGenerating text after epoch: %d' % epoch)
  seed_prompts = (
    'dear local newspaper every library should have these books',
    'books in library',
    'a computer',
    'a',
  )
  for prompt in seed_prompts:
    print('%s... -> %s' % (prompt, generate_next(prompt)))

In [82]:
# Train for 5 epochs; after each one the callback prints sample generations
# so progress can be judged by eye.
text_preview_callback = LambdaCallback(on_epoch_end=on_epoch_end)
model.fit(
    train_x,
    train_y,
    batch_size=128,
    epochs=5,
    callbacks=[text_preview_callback],
)

Epoch 1/5

Generating text after epoch: 0
dear local newspaper every library should have these books... -> dear local newspaper every library should have these books next sir precious shealf of a think stimulate migrated els overheard humor blaring dust older offisive sister happen wish faults
books in library... -> books in library cyberspace are yes interesting someone hope days neggative doesnt't lines puplic carefully horrendous co thake discussion involve distribution less appauling
a computer... -> a computer wirters magzine await obese upheld @organization3 beleave faw documents constitution harm centainly mails ete projected opportunities unnecessary chairs wirting certainly
a... -> a swell world hassle incorporated offeneded active displayed portrails notify consern els crude violance conversations beauty support @location2 appauling tortured recieving
Epoch 2/5

Generating text after epoch: 1
dear local newspaper every library should have these books... -> dear local newspa

<keras.callbacks.History at 0x205417dc828>