In [None]:
from pathlib import Path

import gensim
from gensim import corpora

In [None]:
# Locate the sample corpora inside the installed gensim package
# (<gensim package dir>/test/test_data).
test_dat_dir = Path(gensim.__path__[0]).joinpath("test", "test_data")
lee_train_file = test_dat_dir.joinpath("lee_background.cor")  # read as the training corpus
lee_test_file = test_dat_dir.joinpath("lee.cor")  # read as the test corpus

In [None]:
import smart_open
def read_corpus(fname, tokens_only=False):
    """Lazily yield one pre-processed document per line of ``fname``.

    The file is opened through ``smart_open`` and decoded as ISO-8859-1; each
    line is tokenised with ``gensim.utils.simple_preprocess``.

    Args:
        fname: Location of the corpus file; every line is treated as one document.
        tokens_only: When true, yield the bare token list of each line.
            Otherwise wrap the tokens in a ``TaggedDocument`` whose single tag
            is the zero-based line number.

    Yields:
        A token list or a ``TaggedDocument`` for each line, in file order.
    """
    with smart_open.open(fname, encoding="iso-8859-1") as handle:
        for line_no, raw_line in enumerate(handle):
            words = gensim.utils.simple_preprocess(raw_line)
            yield words if tokens_only else gensim.models.doc2vec.TaggedDocument(words, [line_no])

# Drain both generators into lists so the corpora can be indexed and re-iterated.
train_corpus = list(read_corpus(lee_train_file))  # TaggedDocument objects, tagged by line number
test_corpus = list(read_corpus(lee_test_file, tokens_only=True))  # plain token lists, no tags

In [None]:
# Create the (still untrained) Doc2Vec model with its hyperparameters.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,  # dimensionality of the learned vectors
    min_count=2,     # ignore words seen fewer than 2 times in the corpus
    epochs=40,       # number of passes over the corpus during training
)

In [None]:
# Build the model's vocabulary from the tagged training documents.
model.build_vocab(train_corpus)

In [None]:
# Look up the stored 'count' attribute of one example word in the vocabulary.
penalty_count = model.wv.get_vecattr('penalty', 'count')
print(f"Word 'penalty' appeared {penalty_count} times in the training data")

In [None]:
# Train on the full training corpus, reusing the document count recorded on the
# model and the epoch count the model was constructed with.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [None]:
# Infer a vector for a short hand-written document given as a list of tokens.
# The call is left as the cell's last expression so the notebook displays the result.
sample_tokens = ["only", "you", "can", "prevent", "forest", "fires"]
model.infer_vector(sample_tokens)

In [None]:
# Sanity check: re-infer a vector for every training document and record where
# the document's own tag lands among all stored document vectors.
ranks = []
second_ranks = []
corpus_size = len(model.dv)  # ask for the complete ordering, not just the top few
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=corpus_size)

    # Position of this document's own tag in the ordering (0 = ranked first).
    ranked_tags = [docid for docid, _ in sims]
    rank = ranked_tags.index(doc_id)
    ranks.append(rank)

    # Also keep the runner-up (tag, score) pair.
    second_ranks.append(sims[1])

In [None]:
import collections

# Tally how many documents ended up at each rank; rank 0 means the document's
# own tag came first in its similarity ordering.
counter = collections.Counter(ranks)
counter  # bare expression so the notebook displays the tally

In [None]:
# Print one document next to its most, second-most, median and least similar
# training documents.
# NOTE: `doc_id` and `sims` are the values left behind by the final iteration of
# the ranking loop, so this cell must run after that loop.
query_text = ' '.join(train_corpus[doc_id].words)
print(f'Document ({doc_id}): «{query_text}»\n')
print(f'SIMILAR/DISSIMILAR DOCS PER MODEL {model!s}:\n')

report_positions = [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in report_positions:
    neighbour = sims[index]  # (tag, score) pair at this position
    neighbour_text = ' '.join(train_corpus[neighbour[0]].words)
    print(f'{label!s} {neighbour!s}: «{neighbour_text}»\n')