# Doc2Vec:
    
Doc2vec is an extension of Word2vec

Concepts:
1. PV-DM(Distributed model)
2. PV-DBOW(Distributed Bag of words)


In [1]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
#sample text
documents=['Word embeddings capture semantic relationships.',
           'Word2vec is a popular technique in nlp',
           'Word embedding model in a continuos vector space']
#Tokenize & tag documents
tagged_data=[TaggedDocument(words=word_tokenize(doc.lower()),tags=[str(i)]) for i,doc in enumerate(documents)]
print(tagged_data)

[TaggedDocument(words=['word', 'embeddings', 'capture', 'semantic', 'relationships', '.'], tags=['0']), TaggedDocument(words=['word2vec', 'is', 'a', 'popular', 'technique', 'in', 'nlp'], tags=['1']), TaggedDocument(words=['word', 'embedding', 'model', 'in', 'a', 'continuos', 'vector', 'space'], tags=['2'])]


In [2]:
#Train Doc2vec model
model=Doc2Vec(vector_size=100,window=2,min_count=1,workers=5,epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data,total_examples=model.corpus_count,epochs=model.epochs)

In [3]:
vector_doc_1=model.infer_vector(word_tokenize("Word embeddings capture semantic relation"))

In [4]:
vector_doc_1

array([ 2.5279885e-03, -9.9850690e-04,  2.7232795e-04, -2.9513596e-03,
       -4.6969024e-03,  3.2790299e-03, -3.3828181e-03, -3.1424188e-03,
       -3.0997166e-04, -4.0712836e-03, -4.1918140e-03,  4.2506307e-03,
       -1.6440435e-03,  6.6861376e-04, -1.7682081e-03,  3.5665294e-03,
        1.3013239e-03,  3.1250678e-03,  4.4974820e-03, -2.9628291e-03,
        3.2059434e-03,  2.6109407e-03, -4.4054952e-03,  4.1160844e-03,
       -3.0168113e-03,  4.8914626e-03, -3.8569081e-03, -1.5252280e-03,
        4.2748991e-03,  4.0028063e-03,  4.5701526e-03,  3.8155555e-04,
        3.6424282e-03,  2.9265536e-03,  2.8427190e-03, -6.0105795e-04,
        1.3413741e-03,  9.0766890e-04,  2.3489713e-03, -9.4353553e-04,
       -3.6188847e-04, -1.2603049e-03, -9.3865994e-04,  3.9095711e-03,
        2.8401553e-03,  1.7476442e-03, -2.1629834e-03,  1.4895867e-03,
        3.7823638e-03, -1.2908645e-03,  1.8818652e-04, -2.4702464e-04,
       -7.1926083e-04, -3.6576453e-03, -1.7188290e-03, -8.4332016e-05,
      

In [5]:
#find the most similar document
similar_doc=model.docvecs.most_similar(positive=[vector_doc_1])
print(f"vector for 'Word embeddings capture semantic relationships.':{vector_doc_1}")
print()
print(f"Most similar document:{similar_doc}")

  similar_doc=model.docvecs.most_similar(positive=[vector_doc_1])


vector for 'Word embeddings capture semantic relationships.':[ 2.5279885e-03 -9.9850690e-04  2.7232795e-04 -2.9513596e-03
 -4.6969024e-03  3.2790299e-03 -3.3828181e-03 -3.1424188e-03
 -3.0997166e-04 -4.0712836e-03 -4.1918140e-03  4.2506307e-03
 -1.6440435e-03  6.6861376e-04 -1.7682081e-03  3.5665294e-03
  1.3013239e-03  3.1250678e-03  4.4974820e-03 -2.9628291e-03
  3.2059434e-03  2.6109407e-03 -4.4054952e-03  4.1160844e-03
 -3.0168113e-03  4.8914626e-03 -3.8569081e-03 -1.5252280e-03
  4.2748991e-03  4.0028063e-03  4.5701526e-03  3.8155555e-04
  3.6424282e-03  2.9265536e-03  2.8427190e-03 -6.0105795e-04
  1.3413741e-03  9.0766890e-04  2.3489713e-03 -9.4353553e-04
 -3.6188847e-04 -1.2603049e-03 -9.3865994e-04  3.9095711e-03
  2.8401553e-03  1.7476442e-03 -2.1629834e-03  1.4895867e-03
  3.7823638e-03 -1.2908645e-03  1.8818652e-04 -2.4702464e-04
 -7.1926083e-04 -3.6576453e-03 -1.7188290e-03 -8.4332016e-05
 -2.4845442e-03 -1.0411099e-03  1.6309004e-03 -3.6284204e-03
 -2.2176732e-03 -1.18474