In [1]:
import collections
import json
import logging
import os
import pickle as pkl
import random
from pprint import pprint

import gensim
import numpy as np
import pytrec_eval
from gensim.test.utils import get_tmpfile
from tqdm import tqdm

import download_ap
import read_ap

[nltk_data] Downloading package stopwords to /home/klm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/klm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Show INFO-level log records (gensim reports its progress through `logging`)
# with a timestamp and level prefix.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# ensure dataset is downloaded
download_ap.download_dataset()
# pre-process the text
# NOTE(review): presumably a mapping of document id -> list of processed tokens;
# the cells below iterate its keys and feed the values to gensim — confirm in read_ap.
docs_by_id = read_ap.get_processed_docs()

Docs already processed. Loading from disk


In [3]:
def divide_test_train(docs, doc_train="train_docs", doc_test="test_docs"):
    """Split ``docs`` into train/test dictionaries and cache both as pickles.

    The first ``len(docs) - len(docs) // 10`` entries, in the mapping's
    iteration order, form the training set; the remaining entries form the
    test set. No shuffling is performed.

    The two splits are written to ``./pickles/<doc_train>.pkl`` and
    ``./pickles/<doc_test>.pkl``. When both files already exist they are loaded
    and returned as-is, and ``docs`` is not consulted at all, so the cache
    must be deleted by hand whenever the underlying documents change.

    Args:
        docs: Mapping of document id to document content.
        doc_train: File stem of the training-split pickle.
        doc_test: File stem of the test-split pickle.

    Returns:
        A ``(train_docs, test_docs)`` tuple of dictionaries.
    """
    path_train = f"./pickles/{doc_train}.pkl"
    path_test = f"./pickles/{doc_test}.pkl"

    if os.path.exists(path_train) and os.path.exists(path_test):
        print("Docs already processed. Loading from disk")

        # NOTE: unpickling can execute arbitrary code; only load cache files
        # that this notebook produced itself.
        with open(path_train, 'rb') as reader_train, open(path_test, 'rb') as reader_test:
            return pkl.load(reader_train), pkl.load(reader_test)

    train_docs_len = len(docs) - len(docs) // 10
    train_docs, test_docs = {}, {}
    for i, doc_id in enumerate(docs):
        target = train_docs if i < train_docs_len else test_docs
        target[doc_id] = docs[doc_id]

    for path, split in ((path_train, train_docs), (path_test, test_docs)):
        # The cache directory may not exist yet on a fresh checkout.
        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, "wb") as writer:
            pkl.dump(split, writer)

    return train_docs, test_docs

        
def read_corpus(docs, keys_tags_dict, tokens_only=False):
    """Lazily yield the documents of ``docs`` while recording their ids.

    ``keys_tags_dict`` is updated in place as the generator is consumed:

    * ``tokens_only=False`` (training data): document number ``n`` is stored
      under key ``n`` and yielded as a ``TaggedDocument`` tagged ``[n]``.
    * ``tokens_only=True``: the document content is yielded unchanged and its
      id is stored under ``n`` plus the number of entries ``keys_tags_dict``
      already held when iteration began.
    """
    offset = len(keys_tags_dict)
    for position, doc_key in enumerate(docs):
        if not tokens_only:
            # Training documents are tagged with their position in `docs`.
            keys_tags_dict[position] = doc_key
            yield gensim.models.doc2vec.TaggedDocument(docs[doc_key], [position])
            continue
        keys_tags_dict[offset + position] = doc_key
        yield docs[doc_key]

In [None]:
# Split the processed documents (or reload a previously pickled split).
train_docs, test_docs = divide_test_train(docs_by_id)
# Filled in by read_corpus: integer position/tag -> original document id.
keys_tags_dict = {}
# Training documents become TaggedDocument objects tagged 0..len(train_docs)-1.
train_corpus = list(read_corpus(train_docs, keys_tags_dict))
# Test documents are kept untagged; their ids are recorded after the training tags.
test_corpus = list(read_corpus(test_docs, keys_tags_dict, tokens_only=True))

Docs already processed. Loading from disk


In [5]:
# Learning-rate schedule: Doc2Vec's constructor takes `alpha` (initial rate) and
# `min_alpha` (final rate). `start_alpha` / `end_alpha` are arguments of
# `train()`, not of the constructor, so passing them here never configured the
# model's schedule.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=100,
    window=2,
    min_count=50,
    workers=4,
    alpha=0.025,
    min_alpha=0.005,
    epochs=10,
)

In [6]:
# Scan the training corpus to build the model's vocabulary before training.
model.build_vocab(train_corpus)

2020-02-26 16:24:01,729 : INFO : collecting all words and their counts
2020-02-26 16:24:01,730 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-02-26 16:24:02,260 : INFO : PROGRESS: at example #10000, processed 2789090 words (5264391/s), 71506 word types, 10000 tags
2020-02-26 16:24:02,810 : INFO : PROGRESS: at example #20000, processed 5586885 words (5094640/s), 104372 word types, 20000 tags
2020-02-26 16:24:03,363 : INFO : PROGRESS: at example #30000, processed 8373626 words (5045785/s), 131265 word types, 30000 tags
2020-02-26 16:24:03,921 : INFO : PROGRESS: at example #40000, processed 11178544 words (5030523/s), 154289 word types, 40000 tags
2020-02-26 16:24:04,482 : INFO : PROGRESS: at example #50000, processed 13972902 words (4994456/s), 173747 word types, 50000 tags
2020-02-26 16:24:05,051 : INFO : PROGRESS: at example #60000, processed 16773284 words (4922940/s), 191603 word types, 60000 tags
2020-02-26 16:24:05,612 : INFO : PROGRESS: at exa

In [7]:
# Train on the tagged training documents, reusing the example count gathered by
# build_vocab and the epoch count configured on the model.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

2020-02-26 16:24:44,597 : INFO : training model with 4 workers on 24196 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=2
2020-02-26 16:24:45,603 : INFO : EPOCH 1 - PROGRESS: at 2.30% examples, 859890 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:24:46,610 : INFO : EPOCH 1 - PROGRESS: at 4.67% examples, 876717 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:24:47,624 : INFO : EPOCH 1 - PROGRESS: at 7.10% examples, 880352 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:24:48,639 : INFO : EPOCH 1 - PROGRESS: at 9.46% examples, 875642 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:24:49,651 : INFO : EPOCH 1 - PROGRESS: at 11.77% examples, 873317 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:24:50,658 : INFO : EPOCH 1 - PROGRESS: at 14.10% examples, 875682 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:24:51,677 : INFO : EPOCH 1 - PROGRESS: at 16.46% examples, 874515 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:24:52,682 : INFO : EPOCH 1 - PROGRESS: at 18.80% 

2020-02-26 16:25:54,014 : INFO : EPOCH 2 - PROGRESS: at 60.93% examples, 874995 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:25:55,033 : INFO : EPOCH 2 - PROGRESS: at 63.28% examples, 874688 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:25:56,061 : INFO : EPOCH 2 - PROGRESS: at 65.67% examples, 875019 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:25:57,091 : INFO : EPOCH 2 - PROGRESS: at 68.01% examples, 874714 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:25:58,097 : INFO : EPOCH 2 - PROGRESS: at 70.39% examples, 874870 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:25:59,113 : INFO : EPOCH 2 - PROGRESS: at 72.69% examples, 874584 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:26:00,117 : INFO : EPOCH 2 - PROGRESS: at 75.01% examples, 874731 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:26:01,127 : INFO : EPOCH 2 - PROGRESS: at 77.37% examples, 874950 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:26:02,138 : INFO : EPOCH 2 - PROGRESS: at 79.72% examples, 875214 words/s, in_qsiz

2020-02-26 16:26:59,766 : INFO : EPOCH 4 - PROGRESS: at 11.58% examples, 864296 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:27:00,774 : INFO : EPOCH 4 - PROGRESS: at 13.94% examples, 869474 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:27:01,775 : INFO : EPOCH 4 - PROGRESS: at 16.16% examples, 863822 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:27:02,775 : INFO : EPOCH 4 - PROGRESS: at 18.48% examples, 864884 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:27:03,779 : INFO : EPOCH 4 - PROGRESS: at 20.81% examples, 865509 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:27:04,791 : INFO : EPOCH 4 - PROGRESS: at 23.16% examples, 866402 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:27:05,792 : INFO : EPOCH 4 - PROGRESS: at 25.50% examples, 867842 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:27:06,801 : INFO : EPOCH 4 - PROGRESS: at 27.74% examples, 864052 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:27:07,804 : INFO : EPOCH 4 - PROGRESS: at 30.08% examples, 865265 words/s, in_qsiz

2020-02-26 16:28:09,591 : INFO : EPOCH 5 - PROGRESS: at 72.04% examples, 866365 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:28:10,606 : INFO : EPOCH 5 - PROGRESS: at 74.32% examples, 865897 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:28:11,610 : INFO : EPOCH 5 - PROGRESS: at 76.67% examples, 866304 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:28:12,617 : INFO : EPOCH 5 - PROGRESS: at 78.97% examples, 866344 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:28:13,635 : INFO : EPOCH 5 - PROGRESS: at 81.29% examples, 866118 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:28:14,640 : INFO : EPOCH 5 - PROGRESS: at 83.61% examples, 866398 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:28:15,664 : INFO : EPOCH 5 - PROGRESS: at 85.84% examples, 865601 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:28:16,684 : INFO : EPOCH 5 - PROGRESS: at 88.15% examples, 865625 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:28:17,705 : INFO : EPOCH 5 - PROGRESS: at 90.45% examples, 865814 words/s, in_qsiz

2020-02-26 16:29:14,663 : INFO : EPOCH 7 - PROGRESS: at 20.77% examples, 860705 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:29:15,685 : INFO : EPOCH 7 - PROGRESS: at 23.08% examples, 860272 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:29:16,696 : INFO : EPOCH 7 - PROGRESS: at 25.45% examples, 862377 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:29:17,702 : INFO : EPOCH 7 - PROGRESS: at 27.83% examples, 863670 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:29:18,726 : INFO : EPOCH 7 - PROGRESS: at 30.13% examples, 862153 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:29:19,738 : INFO : EPOCH 7 - PROGRESS: at 32.43% examples, 861644 words/s, in_qsize 8, out_qsize 1
2020-02-26 16:29:20,746 : INFO : EPOCH 7 - PROGRESS: at 34.77% examples, 863147 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:29:21,746 : INFO : EPOCH 7 - PROGRESS: at 37.12% examples, 864744 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:29:22,750 : INFO : EPOCH 7 - PROGRESS: at 39.48% examples, 865680 words/s, in_qsiz

2020-02-26 16:30:24,784 : INFO : EPOCH 8 - PROGRESS: at 80.91% examples, 860898 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:30:25,792 : INFO : EPOCH 8 - PROGRESS: at 83.19% examples, 861000 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:30:26,803 : INFO : EPOCH 8 - PROGRESS: at 85.48% examples, 860895 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:30:27,820 : INFO : EPOCH 8 - PROGRESS: at 87.75% examples, 860875 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:30:28,834 : INFO : EPOCH 8 - PROGRESS: at 90.07% examples, 861548 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:30:29,835 : INFO : EPOCH 8 - PROGRESS: at 92.39% examples, 862088 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:30:30,845 : INFO : EPOCH 8 - PROGRESS: at 94.64% examples, 861906 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:30:31,853 : INFO : EPOCH 8 - PROGRESS: at 96.95% examples, 862420 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:30:32,859 : INFO : EPOCH 8 - PROGRESS: at 99.31% examples, 862978 words/s, in_qsiz

2020-02-26 16:31:30,102 : INFO : EPOCH 10 - PROGRESS: at 29.78% examples, 853540 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:31:31,103 : INFO : EPOCH 10 - PROGRESS: at 31.96% examples, 851042 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:31:32,118 : INFO : EPOCH 10 - PROGRESS: at 34.24% examples, 851082 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:31:33,124 : INFO : EPOCH 10 - PROGRESS: at 36.54% examples, 852586 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:31:34,143 : INFO : EPOCH 10 - PROGRESS: at 38.58% examples, 846673 words/s, in_qsize 7, out_qsize 0
2020-02-26 16:31:35,155 : INFO : EPOCH 10 - PROGRESS: at 40.85% examples, 845785 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:31:36,155 : INFO : EPOCH 10 - PROGRESS: at 43.08% examples, 845792 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:31:37,165 : INFO : EPOCH 10 - PROGRESS: at 45.40% examples, 845921 words/s, in_qsize 8, out_qsize 0
2020-02-26 16:31:38,183 : INFO : EPOCH 10 - PROGRESS: at 47.66% examples, 845225 words/s

In [8]:
# Round-trip the trained model through disk. get_tmpfile (imported with the
# other imports at the top of the notebook) returns a path inside the system
# temp directory, so this copy is not a durable artifact.
fname = get_tmpfile("my_doc2vec_model")
model.save(fname)
model = gensim.models.doc2vec.Doc2Vec.load(fname)

2020-02-26 16:32:01,298 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2020-02-26 16:32:01,299 : INFO : built Dictionary(12 unique tokens: ['computer', 'human', 'interface', 'response', 'survey']...) from 9 documents (total 29 corpus positions)
2020-02-26 16:32:01,301 : INFO : saving Doc2Vec object under /tmp/my_doc2vec_model, separately None
2020-02-26 16:32:01,302 : INFO : storing np array 'vectors_docs' to /tmp/my_doc2vec_model.docvecs.vectors_docs.npy
2020-02-26 16:32:01,562 : INFO : saved /tmp/my_doc2vec_model
2020-02-26 16:32:01,563 : INFO : loading Doc2Vec object from /tmp/my_doc2vec_model
2020-02-26 16:32:03,069 : INFO : loading vocabulary recursively from /tmp/my_doc2vec_model.vocabulary.* with mmap=None
2020-02-26 16:32:03,070 : INFO : loading trainables recursively from /tmp/my_doc2vec_model.trainables.* with mmap=None
2020-02-26 16:32:03,071 : INFO : loading wv recursively from /tmp/my_doc2vec_model.wv.* with mmap=None
2020-02-26 16:32:03,071 : INFO : loadin

In [9]:
# Drop training-only state; the flags ask gensim to keep the document-tag
# vectors and whatever infer_vector needs, both of which are used below.
model.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [10]:
# Load the relevance judgements and the query texts; the benchmark loop below
# iterates the keys of `qrels` and looks each one up in `queries`.
qrels, queries = read_ap.read_qrels()

In [11]:
# Retrieval run, built as {query id: {document id: similarity score}} and passed
# to pytrec_eval in the evaluation cell.
overall_ser = {}

print("Running Doc2Vec Benchmark")
# collect results
for i, qid in enumerate(tqdm(qrels)):
    query_text = queries[qid]
    # Run the query through read_ap's text processing before inference.
    processed_query = read_ap.process_text(query_text)
    inferred_vector = model.infer_vector(processed_query)
    # Rank every document vector stored in the model against the query vector.
    # NOTE(review): the model was built and trained on train_corpus only, so the
    # documents held out in test_corpus have no stored vector and can never be
    # returned here — confirm this is intended for the evaluation.
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    # Translate gensim's integer tags back into the original document ids.
    sims = [(keys_tags_dict[doc_id], np.float64(score)) for (doc_id, score) in sims]
    overall_ser[qid] = dict(sims)

  0%|          | 0/149 [00:00<?, ?it/s]2020-02-26 16:32:03,201 : INFO : precomputing L2-norms of doc weight vectors


Running Doc2Vec Benchmark


100%|██████████| 149/149 [01:24<00:00,  1.77it/s]


In [12]:
# run evaluation with `qrels` as the ground truth relevance judgements
# here, we are measuring MAP and NDCG, but this can be changed to
# whatever you prefer
evaluator = pytrec_eval.RelevanceEvaluator(qrels, {'map', 'ndcg'})
metrics = evaluator.evaluate(overall_ser)

# Dump the evaluation results to JSON.
# *Not* optional - this file is submitted in the assignment!
with open(f"./pickles/doc2vec.json", "w") as writer:
    json.dump(metrics, writer, indent=1)

## Test model

In [13]:
# Sanity check: re-infer a vector for a small sample of training documents and
# record where each document ranks among the stored vectors nearest to it.
# Only the first len(train_corpus) // 1000 documents are checked.
subsample = (len(train_corpus) // 1000)
# Position of each sampled document in its own similarity ranking (0 = top).
ranks = []
# The (tag, similarity) pair found in second place for each sampled document.
second_ranks = []
for doc_id in range(subsample):
    if doc_id % 100 == 0:
        print(doc_id)
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
    # Training tags equal the document's index in train_corpus, so the tag to
    # look for in the ranking is doc_id itself.
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

0
100


In [14]:
# Histogram of the self-similarity ranks collected above: the count under key 0
# is the number of sampled documents that ranked first for their own vector.
# (`collections` is imported with the other imports at the top of the notebook.)
counter = collections.Counter(ranks)
print(counter)

Counter({0: 147, 327: 1})


In [15]:
# Inspect the last document processed by the sanity-check loop: `doc_id` and
# `sims` still hold the values from that loop's final iteration.
print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Each entry of `sims` is a (tag, similarity) pair; the tag indexes train_corpus.
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Document (147): «nation polic agenc ask interpol aid hunt 26 suspect member japanes red armi tri prevent terrorist act 1988 olymp agenc offici said thursday list name 26 sent wednesday paris-bas interpol intern crimin polic organ said offici speak condit anonym said list includ updat portrait seven member base inform obtain recent japanes polic agenc offici said polic fear revolutionari group would carri terrorist act insid outsid japan olymp game neighbor south korea said request would help search red armi member interpol 's 146 member nation nation polic agenc also plan distribut 10,000 poster leaflet describ want member japanes embassi offic local compani abroad includ airport offici said said list includ fusako shigenobu 42 suspect mastermind 1972 machin gun attack tel aviv 's lod airport kill 28 peopl polic napl itali said ms. shigenobu may involv explos car bomb car outsid u.s. militari club napl last week blast kill american servicewoman four italian junzo okudaira anoth red arm

In [16]:
# Pick a random document from the sampled part of the training corpus
# (no new vector is inferred here; the results of the sanity-check loop are reused)
doc_id = random.randint(0, subsample - 1)

# Print it next to the document that came second in its similarity ranking
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
# sim_id is the (tag, similarity) pair stored for this document by the sanity-check loop
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

Train Document (123): «hungari 's first pet cemeteri open fall necessari permit issu privat group run govern newspap report thursday last rest place `` love one '' establish rakosvoelgy cooper farm `` true spirit age engag entrepreneuri activ '' daili magyar hirlap said anim cemeteri locat outsid capit `` cours buri pet cost money '' magyar hirlap said report say expens would»

Similar Document (55589, 0.5703246593475342): «despit promis horse-diap care driver luxuri resort said neigh 100-an-hour horse-and-carriag servic similar oper new york 's central park `` hors love anim think 're beauti someon els 's town '' town councilman paul ilyinski said thursday palm beach public safeti committe turn propos ilyinski said would accid `` lot mangl hors '' allow trot around town darci canevari 24 run horse-drawn carriag servic norwalk conn. ask trial run `` think peopl would love '' said `` 's romant '' anyon object manur street canevari said anim fit diaper said howev town `` like manur stree

In [17]:
# Pick a random document from the test corpus and infer a vector from the model
# (here doc_id is a position in test_corpus, not a training tag)
doc_id = random.randint(0, len(test_corpus) - 1)
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Tags returned in `sims` index train_corpus, since only training documents have stored vectors.
for label, index in [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

Test Document (5315): «justic depart tighten rule bring racket case seiz defend asset trial accord memo releas tuesday polici requir approv senior justic depart offici effort seiz defend 's asset impos respons critic tactic sever highli public case accord one memo sent feder prosecutor racket influenc corrupt organ act allow prosecutor obtain temporari restrain order seiz asset defend ensur fund avail juri order forfeit part verdict assist attorney gener edward s.g. denni jr. said prosecutor make clear court `` govern seek disrupt normal legitim busi activ defend '' `` highli public case ... subject consider critic press percept pre-trial freez asset tantamount seizur properti without due process '' denni wrote june 30 memorandum anoth memo assist attorney gener shirley d. peterson head tax divis sharpli limit use rico law tax-fraud case mail-fraud charg may sustitut tax-fraud charg form basi rico prosecut `` except circumst '' tax return mail promot `` tax-fraud scheme '' wrote separ 