In [1]:
# Importing the reuqired library


from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [3]:
# sample text for practical

text = "Doc2Vec is used for creating document embeddings. It captures the context of entire documents."
text

'Doc2Vec is used for creating document embeddings. It captures the context of entire documents.'

# Tokenize into sentences and words 

In [4]:
print("Tokenizing into sentences and words")
sentences = sent_tokenize(text)
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
print("Tokenized sentences:", tokenized_sentences)
print()

Tokenizing into sentences and words
Tokenized sentences: [['doc2vec', 'is', 'used', 'for', 'creating', 'document', 'embeddings', '.'], ['it', 'captures', 'the', 'context', 'of', 'entire', 'documents', '.']]



# Prepare tagged documents 

In [5]:
print("Preparing tagged documents")
tagged_data = [TaggedDocument(words=words, tags=[str(idx)]) for idx, words in enumerate(tokenized_sentences)]

Preparing tagged documents


# Train the Doc2Vec model 

In [6]:
print("Train the Doc2Vec model")
model = Doc2Vec(vector_size=100, window=5, min_count=1, dm=1, epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
print("Doc2Vec model trained successfully")
print()

Train the Doc2Vec model
Doc2Vec model trained successfully



# Infer document vectors 

In [7]:
print("Infering document vectors")
doc_vector = model.infer_vector(word_tokenize("Doc2Vec is a powerful tool for document embeddings."))
print("Inferred document vector:", doc_vector)
print()

Infering document vectors
Inferred document vector: [-3.8413841e-03 -9.6587243e-04  1.2822049e-03  4.2428784e-03
 -3.4659868e-03 -1.5062141e-03  4.1416907e-03  4.1624922e-03
 -2.0395233e-03  2.2970436e-03  1.0200735e-03  1.8484684e-03
  1.4011435e-03  8.8950986e-04 -3.7751931e-03  1.6184825e-03
 -5.0244536e-03  3.8770179e-03 -1.4584443e-03  3.5635170e-03
 -2.4003519e-03 -3.8530671e-03 -1.8980850e-04 -4.7652214e-03
  1.6642825e-03  3.3244414e-03 -4.0195473e-03  4.9300776e-03
 -4.3796326e-04  2.9727047e-05  1.3628080e-03  2.2126269e-03
  1.5748548e-03  1.0884719e-03  3.3459174e-03  2.3015023e-03
  2.7488552e-03  2.1034258e-03  4.4801598e-03 -3.3130862e-03
  4.5728724e-04  4.3028551e-03 -1.0275168e-03 -2.3165785e-03
 -8.9369150e-04  8.7271270e-04  6.5214255e-05 -1.5871155e-03
  1.5119698e-03 -9.4704283e-04 -1.7782578e-03 -4.5106341e-03
 -2.5577673e-03 -3.8290219e-03  2.4180464e-03  1.6273867e-03
 -4.0323180e-03  3.4187252e-03  4.0243650e-03  3.6692361e-03
 -2.0018849e-03  3.2814121e-04 -1