In [78]:
import os
import random
import smart_open
import gensim

In [79]:
# The dataset used here is the 'lee' corpus found under gensim's test data directory.
# Documents in this dataset typically consist of one paragraph; there are 350 documents in total:
#   training set => 300 documents (lee_background.cor)
#   test set     => 50 documents  (lee.cor)
# os.path.join inserts the platform separator, so os.sep is not concatenated by hand.
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')
lee_train_file = os.path.join(test_data_dir, 'lee_background.cor')
lee_test_file = os.path.join(test_data_dir, 'lee.cor')

In [80]:
# Preprocess the training data: tokenize each line and strip punctuation
def read_train(fname):
    """Yield one TaggedDocument per line of ``fname``.

    The file is opened with the iso-8859-1 encoding. Each line is run through
    ``gensim.utils.simple_preprocess`` and tagged with its zero-based line index.
    """
    # NOTE(review): newer smart_open releases document ``smart_open.open`` as the
    # replacement for ``smart_open.smart_open`` - confirm the installed version
    # before switching.
    with smart_open.smart_open(fname, encoding="iso-8859-1") as corpus_file:
        for line_no, raw_line in enumerate(corpus_file):
            tokens = gensim.utils.simple_preprocess(raw_line)
            tags = [line_no]
            yield gensim.models.doc2vec.TaggedDocument(tokens, tags)

# Materialize the generator so the corpus can be indexed and iterated repeatedly
train_corpus = list(read_train(lee_train_file))

In [81]:
# Preprocess the test data in the same way as the training data (tokens only, no tags)
def read_test(fname):
    """Yield the token list for each line of ``fname``.

    The file is opened with the iso-8859-1 encoding and each line is run through
    ``gensim.utils.simple_preprocess``. Unlike ``read_train``, lines are not
    wrapped in a TaggedDocument.
    """
    with smart_open.smart_open(fname, encoding="iso-8859-1") as f:
        # No line index is needed here, so iterate over the file directly
        for line in f:
            yield gensim.utils.simple_preprocess(line)

# Materialize the generator so documents can be picked by index later
test_corpus = list(read_test(lee_test_file))

In [82]:
# Sample training document, followed by the number of training documents
print(train_corpus[12], len(train_corpus), sep='\n')

TaggedDocument(['president', 'general', 'pervez', 'musharraf', 'says', 'pakistan', 'wants', 'to', 'defuse', 'the', 'brewing', 'crisis', 'with', 'india', 'but', 'was', 'prepared', 'to', 'respond', 'vigorously', 'to', 'any', 'attack', 'pakistan', 'stands', 'for', 'peace', 'pakistan', 'wants', 'peace', 'pakistan', 'wants', 'to', 'reduce', 'tension', 'he', 'said', 'let', 'the', 'two', 'countries', 'move', 'towards', 'peace', 'and', 'harmony', 'however', 'pakistan', 'has', 'taken', 'all', 'counter', 'measures', 'if', 'any', 'war', 'is', 'thrust', 'on', 'pakistan', 'the', 'pakistan', 'armed', 'forces', 'and', 'the', 'million', 'people', 'of', 'pakistan', 'are', 'fully', 'prepared', 'to', 'face', 'all', 'consequences', 'with', 'all', 'their', 'might', 'the', 'president', 'said', 'he', 'had', 'received', 'the', 'support', 'of', 'all', 'political', 'parties', 'president', 'musharraf', 'also', 'said', 'he', 'welcomed', 'the', 'intervention', 'of', 'the', 'international', 'community', 'in', 'tryi

In [83]:
# Sample test document, followed by the number of test documents
print(test_corpus[12], len(test_corpus), sep='\n')

['drug', 'squad', 'detectives', 'have', 'asked', 'the', 'police', 'ombudsman', 'to', 'investigate', 'the', 'taskforce', 'that', 'is', 'examining', 'allegations', 'of', 'widespread', 'corruption', 'within', 'the', 'squad', 'this', 'coincides', 'with', 'the', 'creation', 'of', 'special', 'unit', 'within', 'the', 'taskforce', 'to', 'track', 'the', 'spending', 'of', 'at', 'least', 'serving', 'and', 'former', 'squad', 'members', 'the', 'corruption', 'taskforce', 'codenamed', 'ceja', 'will', 'check', 'tax', 'records', 'and', 'financial', 'statements', 'in', 'bid', 'to', 'establish', 'if', 'any', 'of', 'the', 'suspects', 'have', 'accrued', 'unexplained', 'wealth', 'over', 'the', 'past', 'seven', 'years', 'but', 'drug', 'squad', 'detectives', 'have', 'countered', 'with', 'their', 'own', 'set', 'of', 'allegations', 'complaining', 'to', 'the', 'ombudsman', 'that', 'the', 'internal', 'investigation', 'is', 'flawed', 'biased', 'and', 'over', 'zealous']
50


In [84]:
# Create the Doc2Vec model, build its vocabulary and train it on the training corpus
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=50,
)
model.build_vocab(train_corpus)
# Reuse the document count and epoch count already stored on the model
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [85]:
# Infer a vector for a short, already tokenized sample sentence
sample_tokens = ['the', 'goals', 'of', 'sunday', 'meeting']
model.infer_vector(sample_tokens)

array([-0.13270625,  0.09512972, -0.08795411, -0.10251645,  0.05757366,
        0.06673998,  0.24308419, -0.12858823, -0.03118136, -0.00537211,
        0.03388078,  0.19685769, -0.32362285, -0.08160591, -0.05812686,
       -0.10060833, -0.19091047, -0.04901925, -0.01709976,  0.24761358,
        0.16995327, -0.13583817, -0.15756229,  0.29302782, -0.18278548,
       -0.14364474,  0.12958069, -0.16061226,  0.00894992, -0.0746446 ,
        0.09672628,  0.16661008,  0.0102212 ,  0.04128732, -0.31543544,
        0.2391256 ,  0.08478929, -0.06208804, -0.02146271, -0.14739004,
        0.00327396,  0.21987358, -0.21758032, -0.02214502,  0.07057989,
        0.21428053,  0.35280928,  0.3091761 ,  0.2844299 ,  0.51703686],
      dtype=float32)

In [86]:
# Select a random document from the test dataset and infer its vector
doc_id = random.randrange(len(test_corpus))
inferred_vector = model.infer_vector(test_corpus[doc_id])
# Request as many results as the model has document vectors, so the list spans
# from the most similar to the least similar entry
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Print the most similar, median similar and least similar training documents
# for the selected test document
ranks = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]
for label, index in ranks:
    match = sims[index]
    matched_words = train_corpus[match[0]].words
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(matched_words)))

Test Document (38): «they dress in black and disguise their identities with bandannas and sunglasses their logo is an image of the southern cross constellation superimposed with pair of crossed boomerangs which resembles swastika the blackshirts are former husbands aggrieved by their treatment at the hands of their ex wives and the courts who regard themselves as the vanguard of men rights movement in australia and say that their actions will be remembered as marking turning point in history»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (169, 0.456496924161911): «united nations panel of judges in east timor has found militia gang members guilty of crimes against humanity they were given jail terms up to years for their part in massacre soon after the territory vote for independence two years ago three judge panel found the men guilty of killing two nuns three priests and an indonesian journalist as well as carrying out other murders in east timor los 

  if np.issubdtype(vec.dtype, np.int):


In [87]:
# Also print the top 5 most similar training documents for the test document
TOP_N = 5
# Bound the range by len(sims) so a shorter result list does not raise IndexError
for index in range(min(TOP_N, len(sims))):
    print(u'%s: «%s»\n' % (sims[index], ' '.join(train_corpus[sims[index][0]].words)))

(169, 0.456496924161911): «united nations panel of judges in east timor has found militia gang members guilty of crimes against humanity they were given jail terms up to years for their part in massacre soon after the territory vote for independence two years ago three judge panel found the men guilty of killing two nuns three priests and an indonesian journalist as well as carrying out other murders in east timor los palos district they also found the gang members guilty of torture persecution and forced deportation the court heard how soon after the vote the militia gang members still loyal to indonesia carried out murders burnt several villages and forced residents of los palos to flee their homes they are the first people to be convicted of crimes against humanity in connection with the violence that surrounded east timor vote for independence»

(51, 0.44735291600227356): «russian authorities have sentenced chechen warlord salman raduyev to life in prison for hostage siege in which