# 1. Doc2Vec Model

In [1]:
import os
from smart_open import open
import random
import collections
import gensim
from gensim.models import doc2vec

## **Prepare the Training and Test Data**

In [2]:
# Locate the sample corpora that ship inside the installed gensim package
test_data_dir = os.path.join(gensim.__path__[0], 'test', 'test_data')

# Training and test corpus paths both live in that directory
lee_train_file, lee_test_file = (
    os.path.join(test_data_dir, corpus_name)
    for corpus_name in ('lee_background.cor', 'lee.cor')
)

In [3]:
# Define a function to read and preprocess text
def read_corpus(fname, tokens_only=False):
  """Lazily read a corpus file, yielding one preprocessed document per line.

  Each line of `fname` is tokenized with `gensim.utils.simple_preprocess`.

  Args:
    fname: Path of the corpus file; it is decoded as ISO-8859-1.
    tokens_only: If True, yield the bare list of tokens for each line.
      If False (default), yield a `TaggedDocument` whose single tag is the
      zero-based line number of the document.

  Yields:
    A list of tokens or a `TaggedDocument`, depending on `tokens_only`.
  """
  # Use the canonical codec spelling "iso-8859-1"; the previous "iso=8859-1"
  # only resolved because Python normalizes punctuation in codec names.
  with open(fname, encoding="iso-8859-1") as f:
    for i, line in enumerate(f):
      tokens = gensim.utils.simple_preprocess(line)

      if tokens_only:
        yield tokens
      else:
        # add tags for training data
        yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [4]:
# Materialise both generators so the corpora can be indexed and re-iterated:
# tagged documents for training, bare token lists for testing.
train_corpus = [tagged_doc for tagged_doc in read_corpus(lee_train_file)]
test_corpus = [tokens for tokens in read_corpus(lee_test_file, tokens_only=True)]

In [5]:
# Print the first two entries of the training corpus. Each entry is a
# TaggedDocument (token list plus an integer tag) as yielded by read_corpus.
print(train_corpus[:2])

[TaggedDocument(words=['hundreds', 'of', 'people', 'have', 'been', 'forced', 'to', 'vacate', 'their', 'homes', 'in', 'the', 'southern', 'highlands', 'of', 'new', 'south', 'wales', 'as', 'strong', 'winds', 'today', 'pushed', 'huge', 'bushfire', 'towards', 'the', 'town', 'of', 'hill', 'top', 'new', 'blaze', 'near', 'goulburn', 'south', 'west', 'of', 'sydney', 'has', 'forced', 'the', 'closure', 'of', 'the', 'hume', 'highway', 'at', 'about', 'pm', 'aedt', 'marked', 'deterioration', 'in', 'the', 'weather', 'as', 'storm', 'cell', 'moved', 'east', 'across', 'the', 'blue', 'mountains', 'forced', 'authorities', 'to', 'make', 'decision', 'to', 'evacuate', 'people', 'from', 'homes', 'in', 'outlying', 'streets', 'at', 'hill', 'top', 'in', 'the', 'new', 'south', 'wales', 'southern', 'highlands', 'an', 'estimated', 'residents', 'have', 'left', 'their', 'homes', 'for', 'nearby', 'mittagong', 'the', 'new', 'south', 'wales', 'rural', 'fire', 'service', 'says', 'the', 'weather', 'conditions', 'which', '

In [6]:
# Print the first two entries of the test corpus. It was read with
# tokens_only=True, so each entry is a plain list of tokens (no tags).
print(test_corpus[:2])

[['the', 'national', 'executive', 'of', 'the', 'strife', 'torn', 'democrats', 'last', 'night', 'appointed', 'little', 'known', 'west', 'australian', 'senator', 'brian', 'greig', 'as', 'interim', 'leader', 'shock', 'move', 'likely', 'to', 'provoke', 'further', 'conflict', 'between', 'the', 'party', 'senators', 'and', 'its', 'organisation', 'in', 'move', 'to', 'reassert', 'control', 'over', 'the', 'party', 'seven', 'senators', 'the', 'national', 'executive', 'last', 'night', 'rejected', 'aden', 'ridgeway', 'bid', 'to', 'become', 'interim', 'leader', 'in', 'favour', 'of', 'senator', 'greig', 'supporter', 'of', 'deposed', 'leader', 'natasha', 'stott', 'despoja', 'and', 'an', 'outspoken', 'gay', 'rights', 'activist'], ['cash', 'strapped', 'financial', 'services', 'group', 'amp', 'has', 'shelved', 'million', 'plan', 'to', 'buy', 'shares', 'back', 'from', 'investors', 'and', 'will', 'raise', 'million', 'in', 'fresh', 'capital', 'after', 'profits', 'crashed', 'in', 'the', 'six', 'months', 'to'

## **Training the model**

In [7]:
# Hyperparameters for the Doc2Vec model, kept together so they are easy to tune
doc2vec_params = dict(
    vector_size=50,  # embedding dimensionality
    min_count=2,     # discard words which appear less than 2 times
    epochs=50,       # no of passes through the training corpus
)

# Create the (still untrained) model
model = doc2vec.Doc2Vec(**doc2vec_params)

# Scan the training corpus once to build the vocabulary
model.build_vocab(train_corpus)

# Fit the model, reusing the document count and epoch setting stored on the model
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

In [8]:
# Display the `count` stored on the vocabulary entry for the word 'hundreds',
# i.e. the number of times that word appears in the training corpus.
# NOTE(review): `wv.vocab` is the gensim 3.x vocabulary API - confirm the
# gensim version this notebook is pinned to before upgrading.
model.wv.vocab['hundreds'].count

18

In [9]:
# Use the trained model to 'infer' the vector embedding for any list of tokens.
# The model was created with vector_size=50, so the displayed array has 50 values.
# NOTE(review): no random seed is set in this notebook, so the exact numbers
# are presumably not reproducible between runs - confirm if that matters.
model.infer_vector(['the', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'])

array([-0.23265894,  0.3393085 , -0.2385683 ,  0.04905006,  0.12513739,
        0.41508305,  0.0079556 ,  0.03333636,  0.04071604, -0.17089702,
       -0.02401664,  0.23534991, -0.05696955,  0.40091512,  0.3124733 ,
       -0.18520485,  0.37017894,  0.01130299, -0.07685146,  0.23932266,
        0.06701266,  0.29256415, -0.19343553, -0.23276073, -0.06889956,
       -0.37002292,  0.00125259,  0.3126959 ,  0.22234789, -0.1490294 ,
       -0.03062874,  0.48139194, -0.08569314, -0.007669  , -0.07289602,
        0.12197638, -0.10752778,  0.26965573,  0.1639427 , -0.05119265,
        0.03738489, -0.00359745,  0.14559452,  0.12811476,  0.18786794,
       -0.0756438 ,  0.2800887 , -0.03182833,  0.34028265,  0.3972578 ],
      dtype=float32)

## **Assessing the Model**

In [10]:
# Sanity check: re-infer a vector for every training document and record where
# the document itself lands in the similarity ranking (0 means it ranked first).
ranks = []
second_ranks = []

for doc_id, tagged_doc in enumerate(train_corpus):
  # Treat the training document as unseen text and infer a fresh vector for it
  inferred_vector = model.infer_vector(tagged_doc.words)

  # Rank every trained document vector by similarity to the inferred vector
  sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

  # Position of the document's own tag within that ranking
  ranked_tags = [tag for tag, _ in sims]
  rank = ranked_tags.index(doc_id)
  ranks.append(rank)

  # Keep the (tag, similarity) pair in second place for later inspection
  second_ranks.append(sims[1])

  if np.issubdtype(vec.dtype, np.int):


In [11]:
# Tally how often each self-similarity rank occurred
counter = collections.Counter()
counter.update(ranks)
print(counter)

Counter({0: 291, 1: 9})


Basically, greater than 95% of the inferred documents are found to be most similar to themselves (291 of 300 in the run above), and in the remaining few percent of cases (9 of 300 here) a document is mistakenly most similar to another document. Checking the inferred vector against a training vector is a sort of ‘sanity check’ as to whether the model is behaving in a usefully consistent manner, though it is not a real ‘accuracy’ value.

In [12]:
# `doc_id` and `sims` still hold the values from the final iteration of the
# assessment loop, so this reports on the last training document.
doc_text = ' '.join(train_corpus[doc_id].words)
print('Document ({}): "{}"\n'.format(doc_id, doc_text))
print('SIMILAR/DISSIMILAR DOCS AS PER MODEL {}:\n'.format(model))

# Positions in the ranking worth showing: best, runner-up, middle and worst
positions = [
    ('MOST', 0),
    ('SECOND-MOST', 1),
    ('MEDIAN', len(sims)//2),
    ('LEAST', len(sims)-1),
]
for label, index in positions:
  hit = sims[index]  # (document tag, similarity score)
  hit_text = ' '.join(train_corpus[hit[0]].words)
  print('{0} {1}: "{2}"\n'.format(label, hit, hit_text))

Document (299): "australia will take on france in the doubles rubber of the davis cup tennis final today with the tie levelled at wayne arthurs and todd woodbridge are scheduled to lead australia in the doubles against cedric pioline and fabrice santoro however changes can be made to the line up up to an hour before the match and australian team captain john fitzgerald suggested he might do just that we ll make team appraisal of the whole situation go over the pros and cons and make decision french team captain guy forget says he will not make changes but does not know what to expect from australia todd is the best doubles player in the world right now so expect him to play he said would probably use wayne arthurs but don know what to expect really pat rafter salvaged australia davis cup campaign yesterday with win in the second singles match rafter overcame an arm injury to defeat french number one sebastien grosjean in three sets the australian says he is happy with his form it not v

Notice above that the most similar document (usually the same text) has a similarity score approaching 1.0. However, the similarity score for the second-ranked document should be significantly lower (assuming the documents are in fact different), and the reason becomes obvious when we examine the text itself.

In [13]:
# Choose a random training document and show its text
doc_id = random.randrange(len(train_corpus))
train_text = ' '.join(train_corpus[doc_id].words)
print('Train document ({}): "{}"\n'.format(doc_id, train_text))

# Look up the second-most-similar (tag, similarity) pair recorded for it
# during the assessment loop and print that document's text
sim_id = second_ranks[doc_id]
runner_up_tag = sim_id[0]
runner_up_text = ' '.join(train_corpus[runner_up_tag].words)
print('Similar document ({}): "{}"\n'.format(sim_id, runner_up_text))


Similar document ((202, 0.4837689697742462)): "an international study has found thousands of australians are involved in the child sex industry but it remains largely invisible problem the report author and national director of child wise bernadette mcmenamin says the year study revealed growing number of australians are involved in paedophilia child pornography and child sex tours it also found growing number of children are working as prostitutes in order to survive ms mcmenamin says the advent of the internet has made it easier for offenders to promote sex tours share images and information and establish international networks the child sex trade report has made recommendations to the federal government on how to improve investigations of child sex offences and provide improved services to affected children ms mcmenamin says australian authorities are ignoring the growing number of children working as prostitutes preferring to call them homeless or children without support ms mcmen

## **Testing the Model**

In [14]:
# Choose a random document from the test corpus and infer a vector for it
doc_id = random.randrange(len(test_corpus))
test_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(test_tokens)

# Rank every trained document vector by similarity to the inferred vector
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Show the test document, then the most/median/least similar training documents
print('Test document ({}): "{}"\n'.format(doc_id, ' '.join(test_tokens)))
print('SIMILAR/DISSIMILAR DOCS AS PER MODEL {}:\n'.format(model))
positions = [('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims)-1)]
for label, index in positions:
  hit = sims[index]  # (document tag, similarity score)
  hit_text = ' '.join(train_corpus[hit[0]].words)
  print('{0} {1}: "{2}"\n'.format(label, hit, hit_text))


SIMILAR/DISSIMILAR DOCS AS PER MODEL Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3):

MOST (1, 0.6656010150909424): "indian security forces have shot dead eight suspected militants in night long encounter in southern kashmir the shootout took place at dora village some kilometers south of the kashmiri summer capital srinagar the deaths came as pakistani police arrested more than two dozen militants from extremist groups accused of staging an attack on india parliament india has accused pakistan based lashkar taiba and jaish mohammad of carrying out the attack on december at the behest of pakistani military intelligence military tensions have soared since the raid with both sides massing troops along their border and trading tit for tat diplomatic sanctions yesterday pakistan announced it had arrested lashkar taiba chief hafiz mohammed saeed police in karachi say it is likely more raids will be launched against the two groups as well as other militant organisations accused of targetting india m

  if np.issubdtype(vec.dtype, np.int):
