# Importing neccessary libraries 

In [1]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

# Sample text 

In [2]:
text = "Doc2Vec is used for creating document embeddings. It captures the context of entire documents."

# Tokenize into sentences and words 

In [3]:
print("Tokenize into sentences and words")
sentences = sent_tokenize(text)
tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]
print("Tokenized sentences:", tokenized_sentences)
print()

Tokenize into sentences and words
Tokenized sentences: [['doc2vec', 'is', 'used', 'for', 'creating', 'document', 'embeddings', '.'], ['it', 'captures', 'the', 'context', 'of', 'entire', 'documents', '.']]



# Prepare tagged documents 

In [4]:
print("Prepare tagged documents")
tagged_data = [TaggedDocument(words=words, tags=[str(idx)]) for idx, words in enumerate(tokenized_sentences)]

Prepare tagged documents


# Train the Doc2Vec model 

In [5]:
print("Train the Doc2Vec model")
model = Doc2Vec(vector_size=100, window=5, min_count=1, dm=1, epochs=20)
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)
print("Doc2Vec model trained successfully")
print()

Train the Doc2Vec model
Doc2Vec model trained successfully



# Infer document vectors 

In [6]:
print("Infer document vectors")
doc_vector = model.infer_vector(word_tokenize("Doc2Vec is a powerful tool for document embeddings."))
print("Inferred document vector:", doc_vector)
print()

Infer document vectors
Inferred document vector: [-4.1424781e-03 -4.0114441e-04  2.3183709e-03 -4.0177065e-03
 -1.1191838e-03  1.6187805e-03 -1.1995027e-03  1.4420497e-03
 -3.1252012e-03  6.0391880e-04 -2.2552486e-03 -2.6541543e-03
 -2.8061941e-03 -3.8749811e-03  2.4329959e-03  2.7665100e-03
  3.0000391e-03 -2.6117559e-03 -2.9998017e-03 -4.0841778e-03
 -9.7367662e-04 -1.6249843e-03  3.5360865e-03 -3.0385721e-03
 -3.4603369e-04  9.3565596e-04  3.9531058e-03  2.3074539e-03
 -3.1594697e-03 -6.5406627e-04  4.1001802e-03 -1.6420709e-03
 -4.0211721e-04  3.7291690e-04  2.9035779e-03  2.5485682e-03
  3.2693946e-03 -3.7717199e-04 -2.9156294e-03 -2.1361746e-03
 -1.6228079e-03  9.8268050e-05 -5.9050729e-04 -1.2567226e-03
  4.0058882e-04 -3.5795898e-04 -2.5644591e-03  3.7930338e-03
 -1.5285388e-03  4.4577629e-03  4.5979250e-04 -4.2725531e-03
  2.6322978e-03  4.4851108e-03  5.9749582e-04  1.7769572e-03
  2.3198908e-03  3.2828856e-04 -5.1420962e-04  2.6940787e-03
  1.7399180e-03  7.7003997e-04  4.38