In [4]:
from gensim.models import Word2Vec, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
documents = [
    "Natural language processing is a subfield of linguistics, computer science, and artificial intelligence.",
    "Machine learning is a subfield of artificial intelligence that focuses on the development of algorithms.",
    "Artificial intelligence is the simulation of human intelligence processes by machines.",
    "Deep learning is a subset of machine learning that focuses on neural networks.",
]

tokenized_docs = [word_tokenize(doc.lower()) for doc in documents]
word2vec_model = Word2Vec(tokenized_docs, vector_size=100, window=5, min_count=1, workers=4)
tagged_documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_docs)]
doc2vec_model = Doc2Vec(tagged_documents, vector_size=100, window=5, min_count=1, workers=4)

def cosine_similarity_word2vec(doc1, doc2):
    vec1 = sum([word2vec_model.wv[word] for word in doc1 if word in word2vec_model.wv])
    vec2 = sum([word2vec_model.wv[word] for word in doc2 if word in word2vec_model.wv])
    return cosine_similarity([vec1], [vec2])[0][0]

def cosine_similarity_doc2vec(doc1, doc2):
    vec1 = doc2vec_model.infer_vector(doc1)
    vec2 = doc2vec_model.infer_vector(doc2)
    return cosine_similarity([vec1], [vec2])[0][0]

def jaccard_similarity(doc1, doc2):
    intersection = len(set(doc1).intersection(doc2))
    union = len(set(doc1).union(doc2))
    return intersection / union if union != 0 else 0

doc1 = word_tokenize("Natural language processing is a subfield of artificial intelligence.")
doc2 = word_tokenize("Deep learning is a subset of machine learning.")
doc3 = word_tokenize("Machine learning involves the use of algorithms.")

print("Cosine similarity using Word2Vec embeddings:", cosine_similarity_word2vec(doc1, doc2))
print("Cosine similarity using Doc2Vec embeddings:", cosine_similarity_doc2vec(doc1, doc2))
print("Jaccard similarity:", jaccard_similarity(doc1, doc3))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Cosine similarity using Word2Vec embeddings: 0.48333097
Cosine similarity using Doc2Vec embeddings: 0.12993178
Jaccard similarity: 0.125
