## Document Vectors

In [1]:
# Load spaCy's small English pipeline. `nlp` is the pipeline object that is
# called on raw text to produce annotated spaCy documents.
import spacy

SPACY_MODEL = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL)

In [2]:

# Toy corpus: each sentence is treated as a separate document.
documents = ["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]

# Normalise every document: lowercase it and strip the periods.
processed_docs = [doc.lower().replace(".", "") for doc in documents]

print("Document After Pre-Processing:",processed_docs)


# Run every pre-processed document through the spaCy pipeline and print its vectors.
for doc in processed_docs:
    # Calling nlp on a string returns a spaCy "Doc": a container of tokens
    # that gives access to their linguistic annotations.
    doc_nlp = nlp(doc)
    print("\n" + "*"*30)
    # Document-level vector (the average over the document's tokens).
    print("Average Vector of '{}'\n".format(doc),doc_nlp.vector)
    for token in doc_nlp:
        # Surface text of each token followed by that token's own vector.
        print("\n" + token.text,token.vector)

Document After Pre-Processing: ['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

******************************
Average Vector of 'dog bites man'
 [ 1.4044472  -0.4777551   0.5396909   0.00797181 -0.082297   -0.25214553
  0.18398042  0.6799763   0.9407212  -0.06676003 -0.32357794 -0.08495045
  0.29986614 -0.3343738   0.21674366 -0.65523845 -0.0212789   0.3415761
 -0.62740695 -0.25812858  0.76212794  0.11232078  0.60678256  0.4023213
  0.18347915  0.23531693 -0.18476516  0.07432339 -0.10718719 -0.71177834
  1.0242397  -0.59737945 -0.90014166 -0.50639766  0.03797626 -0.6916008
 -0.3508688   0.4520282  -0.08950873 -0.5635819  -0.44965398 -0.38179985
 -0.5832341  -0.5415993   0.00696069 -0.46420357  0.7901923  -1.03137
  0.5052544   0.2813643  -1.1869272   0.11953595 -0.1808639   0.24608605
 -0.05974054  0.49752358  0.8618197   0.3821298  -0.06824117 -0.27646792
 -0.9340474  -0.30925164  0.15117423  0.4797438  -0.5022076  -0.38505545
  0.6072712  -0.49272838 -0.08657108

## Doc2Vec

In [3]:
# Dependencies for the Doc2Vec section.
import warnings
# Installs a global "ignore" filter, so no warnings are shown from here on
# (presumably to keep the cell outputs tidy).
warnings.filterwarnings('ignore')
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
from pprint import pprint
import nltk
# One-time setup: uncomment the next line if the NLTK 'punkt' tokenizer data
# (presumably required by word_tokenize) is not installed yet.
#nltk.download('punkt')

In [4]:
# The four toy documents, already lowercased and free of punctuation.
data = [
    "dog bites man",
    "man bites dog",
    "dog eats meat",
    "man eats food",
]

# Tokenise each document and wrap it in a TaggedDocument whose single tag is
# the document's position in `data`, as a string.
tagged_data = [
    TaggedDocument(words=word_tokenize(sentence.lower()), tags=[str(idx)])
    for idx, sentence in enumerate(data)
]
tagged_data

[TaggedDocument(words=['dog', 'bites', 'man'], tags=['0']),
 TaggedDocument(words=['man', 'bites', 'dog'], tags=['1']),
 TaggedDocument(words=['dog', 'eats', 'meat'], tags=['2']),
 TaggedDocument(words=['man', 'eats', 'food'], tags=['3'])]

In [5]:
# Distributed Bag of Words variant of Doc2Vec (dm=0), trained on the tagged toy corpus.
# seed and workers=1 are passed explicitly so that re-running the notebook gives
# repeatable vectors: gensim documents that multi-threaded training adds ordering
# jitter. (Across interpreter launches PYTHONHASHSEED must be fixed as well.)
# NOTE(review): per the gensim docs, with dm=0 and the default dbow_words=0 only
# document vectors are trained, so the word vectors exposed through model_dbow.wv
# are not learned from the corpus - confirm whether dbow_words=1 is wanted.
model_dbow = Doc2Vec(tagged_data, vector_size=20, min_count=1, epochs=2, dm=0, seed=1, workers=1)

In [6]:
# Infer a feature vector for the already-tokenised document "man eats food".
dbow_vector = model_dbow.infer_vector(['man', 'eats', 'food'])
print(dbow_vector)

[-0.00826636  0.02283587 -0.02087237 -0.00017898 -0.01807472  0.00034097
 -0.02361825  0.00147991  0.01903956  0.01927918  0.00078825 -0.01282219
 -0.00743386  0.01775407  0.00590702  0.01361771  0.01520974 -0.00983114
  0.01750618  0.01895466]


In [7]:
# Top 5 words most similar to "man", returned as (word, similarity) pairs.
model_dbow.wv.most_similar("man",topn=5)

[('meat', 0.39641645550727844),
 ('bites', 0.05595850199460983),
 ('dog', 0.050179000943899155),
 ('food', -0.06502582132816315),
 ('eats', -0.2928891181945801)]

In [8]:
 model_dbow.wv.n_similarity(["dog"],["man"])

0.050179023

In [9]:
# Distributed Memory variant of Doc2Vec (dm=1), trained on the same tagged toy corpus.
# seed and workers=1 are passed explicitly so that re-running the notebook gives
# repeatable vectors: gensim documents that multi-threaded training adds ordering
# jitter. (Across interpreter launches PYTHONHASHSEED must be fixed as well.)
model_dm = Doc2Vec(tagged_data, min_count=1, vector_size=20, epochs=2, dm=1, seed=1, workers=1)

# Inferred vector for the tokenised document "man eats food".
print("Inference Vector of man eats food\n ",model_dm.infer_vector(['man','eats','food']))

# Nearest neighbours of "man" and the dog/man similarity from the model's word vectors.
print("Most similar words to man in our corpus\n",model_dm.wv.most_similar("man",topn=5))
print("Similarity between man and dog: ",model_dm.wv.n_similarity(["dog"],["man"]))

Inference Vector of man eats food
  [-0.00826637  0.02283577 -0.02087227 -0.00017914 -0.01807469  0.0003409
 -0.02361821  0.0014799   0.01903967  0.01927905  0.00078821 -0.01282221
 -0.00743395  0.01775397  0.005907    0.01361787  0.01520972 -0.00983112
  0.01750612  0.0189548 ]
Most similar words to man in our corpus
 [('meat', 0.39641645550727844), ('bites', 0.05595850199460983), ('dog', 0.050179000943899155), ('food', -0.06502582132816315), ('eats', -0.2928891181945801)]
Similarity between man and dog:  0.050179023


What happens when we compare words that are not in the model's vocabulary?

In [10]:
# model_dm.wv.n_similarity(['covid'],['man'])