### Imports

In [15]:
import itertools
from time import time
from collections import Counter
import gzip
import logging
 
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
 

import numpy as np
import pandas as pd
import random
import sys
import io

import spacy

from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from gensim.models.phrases import Phrases, Phraser
from gensim.utils import simple_preprocess
from nltk.tokenize import sent_tokenize

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, LSTM
from keras.callbacks import ModelCheckpoint, LambdaCallback
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.optimizers import RMSprop



### Settings

In [4]:
pd.set_option('float_format', '{:,.2f}'.format)
np.random.seed(42)

In [5]:
def format_time(t):
    m, s = divmod(t, 60)
    h, m = divmod(m, 60)
    return '{:02.0f}:{:02.0f}:{:02.0f}'.format(h, m, s)

### Extract Data

In [6]:
docs = pd.read_pickle('training_set.pkl')


# for clarity, rename numbered essay topics to one-word topic summary 

topic_dict = {'topic':{1: 'computers', 
                       2: 'censorship', 
                       3: 'cyclist', 
                       4: 'hibiscus', 
                       5: 'mood', 
                       6: 'dirigibles', 
                       7: 'patience', 
                       8: 'laughter'}}

docs.replace(topic_dict, inplace=True)

essay_2 = docs[docs.topic == 'censorship']

docs.head()

Unnamed: 0,essay_id,topic,essay,rater1_domain1,rater2_domain1,rater3_domain1,target_score,rater1_domain2,rater2_domain2,topic2_target,...,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,char_len,word_count,tokens,lemma,pos
0,1,computers,"Dear local newspaper, I think effects computer...",4,4,,8,,,,...,,,,,,1875,351,"[Dear, local, newspaper, ,, I, think, effects,...","[dear, local, newspaper, ,, -PRON-, think, eff...","[ADJ, ADJ, NOUN, PUNCT, PRON, VERB, NOUN, NOUN..."
1,2,computers,"Dear @CAPS1 @CAPS2, I believe that using compu...",5,4,,9,,,,...,,,,,,2288,424,"[Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...","[dear, @caps1, @caps2, ,, -PRON-, believe, tha...","[ADJ, PROPN, PUNCT, PUNCT, PRON, VERB, ADP, VE..."
2,3,computers,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4,3,,7,,,,...,,,,,,1541,284,"[Dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...","[dear, ,, @caps1, @caps2, @caps3, more, and, m...","[ADJ, PUNCT, PROPN, PUNCT, PROPN, ADJ, CCONJ, ..."
3,4,computers,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,,,,,,3165,531,"[Dear, Local, Newspaper, ,, @CAPS1, I, have, f...","[dear, local, newspaper, ,, @caps1, -PRON-, ha...","[ADJ, PROPN, PROPN, PUNCT, PROPN, PRON, VERB, ..."
4,5,computers,"Dear @LOCATION1, I know having computers has a...",4,4,,8,,,,...,,,,,,2569,474,"[Dear, @LOCATION1, ,, I, know, having, compute...","[dear, @location1, ,, -PRON-, know, have, comp...","[ADJ, ADP, PUNCT, PRON, VERB, VERB, NOUN, VERB..."


## Preprocess Data

### Tokenize & Clean Sentences

Models expect data provided as a single sentence per line. First we'll extract sentences using NLTK's sent_tokenizer, then we'll remove punctuation after using `spaCy`'s parser to tokenize the input text.

In [11]:
essay_list = []

for i, essay in enumerate(docs.essay):
 
    if (i % 1000 == 0):
        logging.info("read {0} essays".format(i))
    # do some pre-processing and return list of words for each review
    # text
    essay_list.append(simple_preprocess(essay))
print(i, 'essays preprocessed.')

2018-06-14 08:28:51,853 : INFO : read 0 essays
2018-06-14 08:28:52,273 : INFO : read 1000 essays
2018-06-14 08:28:52,680 : INFO : read 2000 essays
2018-06-14 08:28:53,112 : INFO : read 3000 essays
2018-06-14 08:28:53,430 : INFO : read 4000 essays
2018-06-14 08:28:53,566 : INFO : read 5000 essays
2018-06-14 08:28:53,677 : INFO : read 6000 essays
2018-06-14 08:28:53,803 : INFO : read 7000 essays
2018-06-14 08:28:53,940 : INFO : read 8000 essays
2018-06-14 08:28:54,094 : INFO : read 9000 essays
2018-06-14 08:28:54,276 : INFO : read 10000 essays
2018-06-14 08:28:54,471 : INFO : read 11000 essays
2018-06-14 08:28:54,672 : INFO : read 12000 essays


12975 essays preprocessed.


In [12]:
essay_list

[['dear',
  'local',
  'newspaper',
  'think',
  'effects',
  'computers',
  'have',
  'on',
  'people',
  'are',
  'great',
  'learning',
  'skills',
  'affects',
  'because',
  'they',
  'give',
  'us',
  'time',
  'to',
  'chat',
  'with',
  'friends',
  'new',
  'people',
  'helps',
  'us',
  'learn',
  'about',
  'the',
  'globe',
  'astronomy',
  'and',
  'keeps',
  'us',
  'out',
  'of',
  'troble',
  'thing',
  'about',
  'dont',
  'you',
  'think',
  'so',
  'how',
  'would',
  'you',
  'feel',
  'if',
  'your',
  'teenager',
  'is',
  'always',
  'on',
  'the',
  'phone',
  'with',
  'friends',
  'do',
  'you',
  'ever',
  'time',
  'to',
  'chat',
  'with',
  'your',
  'friends',
  'or',
  'buisness',
  'partner',
  'about',
  'things',
  'well',
  'now',
  'there',
  'new',
  'way',
  'to',
  'chat',
  'the',
  'computer',
  'theirs',
  'plenty',
  'of',
  'sites',
  'on',
  'the',
  'internet',
  'to',
  'do',
  'so',
  'organization',
  'organization',
  'caps',
  'facebo

In [17]:
# build vocabulary and train model
model = Word2Vec(
    essay_list,
    size=50,
    window=10,
    min_count=3,
    workers=10)
model.train(essay_list, total_examples=len(essay_list), epochs=10)
 

2018-06-14 08:38:28,260 : INFO : collecting all words and their counts
2018-06-14 08:38:28,261 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-14 08:38:28,577 : INFO : PROGRESS: at sentence #10000, processed 2019911 words, keeping 29889 word types
2018-06-14 08:38:28,708 : INFO : collected 37965 word types from a corpus of 2781414 raw words and 12976 sentences
2018-06-14 08:38:28,709 : INFO : Loading a fresh vocabulary
2018-06-14 08:38:28,753 : INFO : min_count=2 retains 17674 unique words (46% of original 37965, drops 20291)
2018-06-14 08:38:28,755 : INFO : min_count=2 leaves 2761123 word corpus (99% of original 2781414, drops 20291)
2018-06-14 08:38:28,808 : INFO : deleting the raw counts dictionary of 37965 items
2018-06-14 08:38:28,810 : INFO : sample=0.001 downsamples 71 most-common words
2018-06-14 08:38:28,810 : INFO : downsampling leaves estimated 2029216 word corpus (73.5% of prior 2761123)
2018-06-14 08:38:28,865 : INFO : estimated required 

2018-06-14 08:38:41,226 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-06-14 08:38:41,227 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-06-14 08:38:41,232 : INFO : worker thread finished; awaiting finish of 6 more threads
2018-06-14 08:38:41,242 : INFO : worker thread finished; awaiting finish of 5 more threads
2018-06-14 08:38:41,244 : INFO : worker thread finished; awaiting finish of 4 more threads
2018-06-14 08:38:41,250 : INFO : worker thread finished; awaiting finish of 3 more threads
2018-06-14 08:38:41,255 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-14 08:38:41,263 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-14 08:38:41,266 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-14 08:38:41,267 : INFO : EPOCH - 1 : training on 2781414 raw words (2029140 effective words) took 2.0s, 1008429 effective words/s
2018-06-14 08:38:42,286 : INFO : EPOCH 2 - 

2018-06-14 08:38:54,524 : INFO : worker thread finished; awaiting finish of 2 more threads
2018-06-14 08:38:54,526 : INFO : worker thread finished; awaiting finish of 1 more threads
2018-06-14 08:38:54,540 : INFO : worker thread finished; awaiting finish of 0 more threads
2018-06-14 08:38:54,542 : INFO : EPOCH - 7 : training on 2781414 raw words (2029566 effective words) took 2.3s, 865217 effective words/s
2018-06-14 08:38:55,573 : INFO : EPOCH 8 - PROGRESS: at 24.73% examples, 821114 words/s, in_qsize 19, out_qsize 0
2018-06-14 08:38:56,584 : INFO : EPOCH 8 - PROGRESS: at 95.00% examples, 862175 words/s, in_qsize 19, out_qsize 0
2018-06-14 08:38:56,815 : INFO : worker thread finished; awaiting finish of 9 more threads
2018-06-14 08:38:56,816 : INFO : worker thread finished; awaiting finish of 8 more threads
2018-06-14 08:38:56,818 : INFO : worker thread finished; awaiting finish of 7 more threads
2018-06-14 08:38:56,820 : INFO : worker thread finished; awaiting finish of 6 more thread

(20290388, 27814140)

In [18]:
model.wv.most_similar ('computer')

2018-06-14 08:41:03,850 : INFO : precomputing L2-norms of word weight vectors


[('computers', 0.6832188963890076),
 ('internet', 0.6756477952003479),
 ('internent', 0.5187956690788269),
 ('keyboard', 0.430398166179657),
 ('computor', 0.38912853598594666),
 ('comptuer', 0.36814573407173157),
 ('compuer', 0.36317992210388184),
 ('patients', 0.3618055284023285),
 ('computors', 0.3612821400165558),
 ('web', 0.3601936101913452)]

In [21]:
model.wv.doesnt_match(['computer', 'internet', 'hibiscus', 'patients'])

'hibiscus'

In [None]:
# cut the text in semi-redundant sequences of maxlen characters
maxlen = 40
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=np.bool)
y = np.zeros((len(sentences), len(chars)), dtype=np.bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: a single LSTM
print('Build model...')
model = Sequential()
model.add(LSTM(128, input_shape=(maxlen, len(chars))))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)


def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(400):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

print_callback = LambdaCallback(on_epoch_end=on_epoch_end)

model.fit(x, y,
          batch_size=128,
          epochs=60,
          callbacks=[print_callback])

In [5]:
def read_sentences(essays, min_sent_length=3):
    """ read strings from a pandas dataframe series
        and prepare for cleaning using NLTK sent_tokenizer
    """
    stats = pd.DataFrame()
    sentences = []
    skipped, word_count = 0, 0
    
    # get sentences from essays
    data = [sent_tokenize(s) for s in essays]

    # combine essays/sentences to single list
    t_sents = list(itertools.chain.from_iterable(data))

    for sentence in t_sents:
        # remove short sentences 
        n_words = len(sentence.split())
        if n_words < min_sent_length:
            skipped += 1
        else:
            word_count += n_words
            sentences.append(sentence.strip())

    stats = pd.Series({'Sentences': len(sentences),
                       '# Words': word_count,
                       'Skipped': skipped})
    return sentences, stats

In [6]:
def clean_sentences(sents, nlp):
    exclude = ['PUNCT', 'SYM', 'X']
    start = time()
    vocab = Counter()
    sents = nlp.pipe(sents)
    d = []
    with open('ngrams_1.txt', 'a', encoding='utf8') as f:
        for i, sent in enumerate(sents):
            if i % 20000 == 0 and i > 0:
                print(i, end=' ')
            d.extend([[i, w.text, w.pos_] for w in sent])
            clean_sentence = [w.text.lower() for w in sent if w.pos_ not in exclude]
            vocab.update(clean_sentence)
            f.write(' '.join(clean_sentence) + '\n')

    vocab = pd.Series(vocab).sort_values(ascending=False).to_frame('count')
    with pd.HDFStore('vocab.h5') as store:
        store.put('vocab', vocab)
        store.put('tokens', pd.DataFrame(d, columns=['sent_id', 'token', 'pos']))
    duration = time() - start
    print('\n\tDuration: ', format_time(duration))

In [7]:
sentences, stats = {}, pd.DataFrame()

sentences, stats = read_sentences(docs.essay)

print('en_core_web_sm', end=': ')

clean_sentences(sentences, spacy.load('en_core_web_sm')) 

stats

en_core_web_sm: 20000 40000 60000 80000 100000 120000 140000 160000 
	Duration:  00:12:08


Sentences     163100
# Words      2887454
Skipped         1904
dtype: int64

### Corpus Summary Stats

In [8]:
with pd.HDFStore('vocab.h5') as store:
    store.put('stats', stats)

### Inspect Result

In [9]:
sentences[:3]

['Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble!',
 'Dont you think so?',
 'How would you feel if your teenager is always on the phone with friends!']

### Create n-grams

In [10]:
def create_ngrams(max_length=3):
    """Using gensim to create ngrams"""
    
    n_grams = pd.DataFrame()
    start = time()
    for n in range(2, max_length + 1):
        print(n, end=' ')
        
        sentences = LineSentence(str('ngrams_{}.txt'.format(n-1)))
        phrases = Phrases(sentences, threshold=100, min_count=10)

        s = pd.Series({k.decode('utf-8'): v for k,
                       v in phrases.export_phrases(sentences)}) 
        s = s.to_frame('score').reset_index().rename(
            columns={'index': 'phrase'}).assign(length=n)
        
        n_grams = pd.concat([n_grams, s])
        grams = Phraser(phrases)
        sentences = grams[sentences]
        
        with open('ngrams_{}.txt'.format(n), 'w', encoding='utf8') as f:
            for sentence in sentences:
                f.write(' '.join(sentence) + '\n')
                
    n_grams = n_grams.sort_values('score', ascending=False)
    n_grams.phrase = n_grams.phrase.str.replace('_', ' ')
    n_grams['ngram'] = n_grams.phrase.str.replace(' ', '_')
    
    with pd.HDFStore('vocab.h5') as store:
        store.put('ngrams', n_grams)
        
    print('\n\tDuration: ', format_time(time() - start))
    print('\tngrams: {:,d}\n'.format(len(n_grams)))
    print(n_grams.groupby('length').size())

In [11]:
print('\n', 'processing', end=' ')
create_ngrams()


 processing 2 3 
	Duration:  00:01:06
	ngrams: 570

length
2    420
3    150
dtype: int64


In [13]:
""" read strings from a pandas dataframe series
    and prepare for cleaning using NLTK sent_tokenizer
"""
sentences = []

# get sentences from essays
data = [sent_tokenize(s) for s in docs.essay]

# combine essays/sentences to single list
t_sents = list(itertools.chain.from_iterable(data))

for sentence in t_sents:
    # remove short sentences 
    n_words = len(sentence.split())
    if n_words >= 3:
        sentences.append(sentence.strip())

In [20]:
from nltk.corpus import brown
brown.sents()

[['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', 'Friday', 'an', 'investigation', 'of', "Atlanta's", 'recent', 'primary', 'election', 'produced', '``', 'no', 'evidence', "''", 'that', 'any', 'irregularities', 'took', 'place', '.'], ['The', 'jury', 'further', 'said', 'in', 'term-end', 'presentments', 'that', 'the', 'City', 'Executive', 'Committee', ',', 'which', 'had', 'over-all', 'charge', 'of', 'the', 'election', ',', '``', 'deserves', 'the', 'praise', 'and', 'thanks', 'of', 'the', 'City', 'of', 'Atlanta', "''", 'for', 'the', 'manner', 'in', 'which', 'the', 'election', 'was', 'conducted', '.'], ...]

In [None]:
from gensim.models import Word2Vec

model = Word2Vec(docs.tokens, min_count=5)

In [29]:
#words most similar to mother
print(model.wv.most_similar('laughter'))

#find the odd one out
print(model.wv.doesnt_match('spring snow plant geese'.split()))
# print(model.doesnt_match('cat dog table'.split()))


[('joy', 0.628595232963562), ('relationships', 0.6222796440124512), ('happiness', 0.6136453151702881), ('Laughter', 0.6105511784553528), ('humor', 0.589509129524231), ('relationship', 0.5732378959655762), ('patience', 0.5652826428413391), ('technology', 0.5648206472396851), ('laughters', 0.5565989017486572), ('friendship', 0.5425175428390503)]
plant


In [30]:
sentences = LineSentence(docs.tokens)

In [31]:
start = time()

model = Word2Vec(sentences,
                 sg=1,
                 size=300,
                 window=5,
                 min_count=10,
                 negative=10,
                 workers=8,
                 iter=20,
                 alpha=0.05)

model.wv.save('word_vectors.bin')
print('Duration: {:,.1f}s'.format(time() - start))

# gensim computes accuracy based on source text files
# detailed_accuracy = model.wv.accuracy(str(ANALOGIES_PATH), case_insensitive=True)

# get accuracy per category
# summary = accuracy_by_category(detailed_accuracy)
# print('Base Accuracy: Correct {:,.0f} | Wrong {:,.0f} | Avg {:,.2%}\n'.format(*summary))

TypeError: don't know how to handle uri 0        [Dear, local, newspaper, ,, I, think, effects,...
1        [Dear, @CAPS1, @CAPS2, ,, I, believe, that, us...
2        [Dear, ,, @CAPS1, @CAPS2, @CAPS3, More, and, m...
3        [Dear, Local, Newspaper, ,, @CAPS1, I, have, f...
4        [Dear, @LOCATION1, ,, I, know, having, compute...
5        [Dear, @LOCATION1, ,, I, think, that, computer...
6        [Did, you, know, that, more, and, more, people...
7        [@PERCENT1, of, people, agree, that, computers...
8        [Dear, reader, ,, @ORGANIZATION1, has, had, a,...
9        [In, the, @LOCATION1, we, have, the, technolog...
10       [Dear, @LOCATION1, ,, @CAPS1, people, acknowle...
11       [Dear, @CAPS1, @CAPS2, I, feel, that, computer...
12       [Dear, local, newspaper, I, raed, ur, argument...
13       [My, three, detaileds, for, this, news, paper,...
14       [Dear, ,, In, this, world, today, we, should, ...
15       [Dear, @ORGANIZATION1, ,, The, computer, blink...
16       [Dear, Local, Newspaper, ,, I, belive, that, c...
17       [Dear, Local, Newspaper, ,, I, must, admit, th...
18       [I, aegre, waf, the, evansmant, ov, tnachnolag...
19       [Well, computers, can, be, a, good, or, a, bad...
20       [Dear, @CAPS1, of, the, @CAPS2, @CAPS3, daily,...
21       [Dear, local, Newspaper, @CAPS1, a, take, all,...
22       [Dear, local, newspaper, ,, @CAPS1, you, ever,...
23       [Dear, local, newspaper, ,, I, 've, heard, tha...
24       [Dear, @CAPS1, ,, @CAPS2, off, ,, I, beileve, ...
25       [Do, you, think, that, computers, are, useless...
26       [Computers, a, good, because, you, can, get, i...
27       [Dear, Newspaper, ,, Computers, are, high, tec...
28       [Dear, local, newspaper, ,, @CAPS1, people, th...
29       [Dear, Newspaper, People, ,, I, think, that, c...
                               ...                        
12946    [ , We, all, understand, the, benefits, of, la...
12947    [      , It, was, midsummer, ,, and, i, could,...
12948    [ , Have, you, ever, experienced, a, time, wit...
12949    [ , I, woke, up, just, like, any, other, day, ...
12950    [ , Laughter, is, an, important, part, of, my,...
12951    [ , I, sat, at, the, table, ,, speechless, ,, ...
12952    [ , As, I, remember, back, ,, it, was, @DATE1,...
12953    [ , Those, eyes, ,, it, was, like, I, was, loo...
12954    [Some, say, that, laugh, is, the, common, lang...
12955    [ , Laughter, is, an, integral, element, to, m...
12956    [One, time, I, was, at, my, friend, @PERSON1, ...
12957    [ , LAUGHTER, @CAPS1, knows, that, laughter, i...
12958    [One, thing, that, people, in, the, world, lov...
12959    [ , Laughter, ,, to, me, ,, is, an, important,...
12960    [ , People, always, say, that, the, worst, par...
12961    [ , Why, is, it, that, people, can, look, back...
12962    [ , Before, my, best, friend, moved, away, ,, ...
12963    [                              , @ORGANIZATION...
12964    [ , Morose, and, somnolent, ,, I, woke, up, .,...
12965    [ , A, while, back, my, mom, had, decided, to,...
12966                         [I, do, nt, like, computers]
12967    [ , Everyone, knows, how, important, a, laugh,...
12968    [ , Laughter, is, an, important, part, of, my,...
12969    [ , laughter, is, an, important, part, of, any...
12970    [Sometime, ago, on, a, hot, @DATE1, day, my, @...
12971    [ , In, most, stories, mothers, and, daughters...
12972    [ , I, never, understood, the, meaning, laught...
12973    [When, you, laugh, ,, is, @CAPS5, out, of, hab...
12974    [                               , Trippin, ', ...
12975    [ , Many, people, believe, that, laughter, can...
Name: tokens, Length: 12976, dtype: object