In [1]:
import nltk
from collections import defaultdict, Counter
import gensim, logging
import nltk
from gensim.models.keyedvectors import KeyedVectors
from gensim.similarities.index import AnnoyIndexer
from multiprocessing import Process, Pool
import os
import collections

In [2]:
nltk.corpus.gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [3]:
# total_corpus = []
# for file in nltk.corpus.gutenberg.fileids():
#     with open((nltk.corpus.gutenberg.words(file).fileid.path), 'r') as f:
#         for line in f.readlines():
#             try:
#                 total_corpus.append(line.split())
#             except:
#                 print ("hi")
#                 print (line)

In [4]:
# Token sequence of 'austen-emma.txt' from NLTK's Gutenberg corpus reader.
emma = nltk.corpus.gutenberg.words('austen-emma.txt')

In [5]:
# with open(emma.fileid.path) as f:
#     for line in f.readlines():
#         print (line)

In [6]:
# Distinct tokens of the text. Sorting fixes the vocabulary order: iterating a
# bare set of strings is not guaranteed to give the same order on every run,
# which would make the order in which patterns are discovered vary too.
vocab = sorted(set(emma))
len(vocab)

7811

In [7]:
def extract_patterns_in_words(patterns,pattern_counter,word1,word2,max_len):
    """Record the suffix and prefix rewrite patterns that relate word1 to word2.

    A "suffix" pattern is what remains of each word after removing their
    longest common leading part; a "prefix" pattern is what remains after
    removing their longest common trailing part. Each pattern is keyed as
    ``(kind, remainder_of_word1, remainder_of_word2)``.

    Parameters
    ----------
    patterns : dict-like
        Maps a pattern key to the list of ``(word1, word2)`` pairs exhibiting
        it. Mutated in place.
    pattern_counter : Counter-like
        Maps a pattern key to the number of pairs seen; must accept ``+= 1``
        on a key it has not seen yet. Mutated in place.
    word1, word2 : str
        The two words to compare.
    max_len : int
        Exclusive upper bound on the length of the longer remainder.

    Returns
    -------
    The same ``patterns`` object that was passed in.
    """
    # Identical words agree on every prefix and suffix slice, so the scans
    # below would never stop; they also have no rewrite pattern to record.
    if word1 == word2:
        return patterns

    # --- suffix pattern: strip the longest common leading part ------------
    i = 1
    while word1[:i] == word2[:i]:
        i += 1
    stem_len = i - 1
    if stem_len:
        tail1, tail2 = word1[stem_len:], word2[stem_len:]
        longest = max(len(tail1), len(tail2))
        # Same test as the original chained comparison `i > longest < max_len`:
        # the shared stem must be at least as long as the longer ending, and
        # that ending must be shorter than max_len.
        if stem_len >= longest and longest < max_len:
            key = ("suffix", tail1, tail2)
            pattern_counter[key] += 1
            patterns.setdefault(key, []).append((word1, word2))

    # --- prefix pattern: strip the longest common trailing part -----------
    i = 1
    while word1[-i:] == word2[-i:]:
        i += 1
    shared = i - 1
    if shared:
        head1, head2 = word1[:-shared], word2[:-shared]
        # NOTE(review): unlike the suffix branch, there is no requirement that
        # the shared part be at least as long as the heads — confirm whether
        # the asymmetry is intended.
        if max(len(head1), len(head2)) < max_len:
            key = ("prefix", head1, head2)
            pattern_counter[key] += 1
            patterns.setdefault(key, []).append((word1, word2))

    return patterns

In [8]:
def build_pattern_dict(vocab,max_len = 6):
    """Collect affix patterns over every ordered pair of distinct words.

    Parameters
    ----------
    vocab : iterable of str
        Words to compare; it is iterated once per word, so it must be
        re-iterable (e.g. a list).
    max_len : int, default 6
        Forwarded to ``extract_patterns_in_words`` as the affix length bound.

    Returns
    -------
    (patterns, pattern_counter)
        ``patterns`` maps a pattern key to the list of word pairs showing it;
        ``pattern_counter`` maps the same key to how many pairs were seen.

    Every pair is visited in both orders, so the work grows with the square
    of the vocabulary size and each pattern also appears in mirrored form.
    """
    # Stored values are lists of word pairs, so the default factory is `list`
    # (a lookup of an unseen pattern then yields an empty list, not 0).
    patterns = defaultdict(list)
    pattern_counter = Counter()
    for word in vocab:
        for second_word in vocab:
            # A word paired with itself carries no rewrite pattern.
            if word != second_word:
                extract_patterns_in_words(patterns,pattern_counter,word,second_word,max_len)
    return patterns, pattern_counter

In [9]:
%time patterns,pattern_counter = build_pattern_dict(vocab)

CPU times: user 1min 27s, sys: 664 ms, total: 1min 28s
Wall time: 1min 28s


In [10]:
len(patterns)

1177224

## Most Common Patterns

In [11]:
pattern_counter.most_common(10)

[(('suffix', '', 's'), 700),
 (('suffix', 's', ''), 700),
 (('suffix', 'ed', 'ing'), 365),
 (('suffix', 'ing', 'ed'), 365),
 (('suffix', '', 'ed'), 324),
 (('suffix', 'ed', ''), 324),
 (('suffix', '', 'ing'), 305),
 (('suffix', 'ing', ''), 305),
 (('suffix', '', 'ly'), 275),
 (('suffix', 'ly', ''), 275)]

## Least Common Patterns

In [12]:
# Walk the count-sorted list backwards. The negative-step slice stops before
# index -20, so this shows the last 19 entries (the lowest counts).
pattern_counter.most_common()[:-20:-1]

[(('prefix', 'the', 'nur'), 1),
 (('prefix', 'stai', 'a'), 1),
 (('prefix', 'rod', 'Whil'), 1),
 (('prefix', 'usag', 'Non'), 1),
 (('suffix', 'e', 'tled'), 1),
 (('prefix', 'seem', 'soul'), 1),
 (('prefix', 'ha', 'telli'), 1),
 (('prefix', 'Te', 'frow'), 1),
 (('prefix', 'vow', 'prais'), 1),
 (('prefix', 'misse', 'fiel'), 1),
 (('prefix', 'poore', 'you'), 1),
 (('prefix', 'Fin', 'war'), 1),
 (('prefix', 'pull', 'see'), 1),
 (('prefix', 'Tak', 'lat'), 1),
 (('prefix', 'Hodge', 'boy'), 1),
 (('suffix', 're', 'pe'), 1),
 (('prefix', 'hurry', 'Rous'), 1),
 (('prefix', 'stair', 'curl'), 1),
 (('prefix', 'wors', 'undon'), 1)]

In [13]:
# Keep only the patterns that were observed more than 50 times.
common_patterns = Counter({
    pattern: occurrences
    for pattern, occurrences in pattern_counter.items()
    if occurrences > 50
})

In [14]:
common_patterns.most_common(10)

[(('suffix', '', 's'), 700),
 (('suffix', 's', ''), 700),
 (('suffix', 'ed', 'ing'), 365),
 (('suffix', 'ing', 'ed'), 365),
 (('suffix', '', 'ed'), 324),
 (('suffix', 'ed', ''), 324),
 (('suffix', 'ing', ''), 305),
 (('suffix', '', 'ing'), 305),
 (('suffix', '', 'ly'), 275),
 (('suffix', 'ly', ''), 275)]

In [58]:
# Load pretrained word2vec vectors stored in binary format; `limit=10000` caps
# how many vectors are read.
# NOTE(review): the path is absolute and machine-specific — consider pointing
# it at a configurable data directory.
%time word_vectors = KeyedVectors.load_word2vec_format('/home/raja/GoogleNews-vectors-negative300.bin.gz', binary=True, limit=10000)

CPU times: user 268 ms, sys: 8 ms, total: 276 ms
Wall time: 272 ms


In [59]:
len(word_vectors.vocab)

10000

In [60]:
def index_vector(word_vectors, dimensions=300, fname='../data/annoy.index'):
    """Return an AnnoyIndexer for ``word_vectors``, cached on disk at ``fname``.

    If ``fname`` already exists the index is loaded from it and attached to
    ``word_vectors``; otherwise a new index is built and written to ``fname``.

    Parameters
    ----------
    word_vectors
        The vector model to index; also assigned as ``.model`` on a loaded
        index.
    dimensions : int, default 300
        Passed as the second positional argument of ``AnnoyIndexer``.
        NOTE(review): gensim documents that argument as ``num_trees`` (the
        number of Annoy trees), not the vector dimensionality — confirm the
        intended value; the name is kept for backward compatibility.
    fname : str, default '../data/annoy.index'
        Location of the cached index.

    Returns
    -------
    The loaded or newly built ``AnnoyIndexer``.

    NOTE(review): a cached index is reused without checking that it was built
    from the same vectors; delete the file when the model changes.
    """
    if os.path.exists(fname):
        # Cache hit: reuse the stored index. Nothing is written back, since
        # the file on disk is already what was just loaded.
        annoy_index = AnnoyIndexer()
        annoy_index.load(fname)
        annoy_index.model = word_vectors
        return annoy_index

    # Cache miss: build the index, then persist it for the next run.
    annoy_index = AnnoyIndexer(word_vectors, dimensions)
    parent = os.path.dirname(fname)
    if parent and not os.path.isdir(parent):
        # Make sure the target directory exists before saving into it.
        os.makedirs(parent)
    annoy_index.save(fname)
    return annoy_index

In [61]:
# init_sims() is called before the index is built; presumably it precomputes
# the normalized vectors (`syn0norm`) that a later cell reads — confirm against
# the gensim version in use.
word_vectors.init_sims()
# Build the Annoy index, or load it from disk when a cached copy exists.
%time annoy_index = index_vector(word_vectors=word_vectors, dimensions=300)

CPU times: user 15.5 s, sys: 64 ms, total: 15.6 s
Wall time: 15.6 s


In [73]:
# Query vector: the normalized embedding stored at row 1119. The
# nearest-neighbour results in the following cells rank 'successful' first
# with similarity ~1.0, i.e. that is the word this row belongs to.
vector = word_vectors.wv.syn0norm[1119]

In [74]:
# Top-5 neighbours of the query vector, looked up through the Annoy index.
word_vectors.most_similar([vector], topn=5,indexer=annoy_index)

[('successful', 1.0),
 ('success', 0.5622546076774597),
 ('profitable', 0.4990977644920349),
 ('successes', 0.49772655963897705),
 ('accomplished', 0.49642592668533325)]

In [76]:
# Same query without an indexer, for comparison with the Annoy-backed result
# (the recorded outputs differ in ranking and scores).
word_vectors.most_similar([vector], topn=5)

[('successful', 0.9999998807907104),
 ('success', 0.6167577505111694),
 ('unsuccessful', 0.501818060874939),
 ('profitable', 0.4981939196586609),
 ('successes', 0.49544286727905273)]