## Computing Document Similarity using Doc2Vec Model

In [27]:
import warnings
warnings.filterwarnings('ignore')

### EXERCISE-1

### 1.Import dependencies

In [28]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [29]:
from nltk.tokenize import word_tokenize

In [30]:
from sklearn import utils

### 2.Create dataset

In [31]:
data = ["I love machine learning. Its awesome.",
        "I love coding in python",
        "I love building chatbots",
        "they chat amazingly well"]

In [32]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shirl\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### 3.Create TaggedDocument

In [33]:
tagged_data = [TaggedDocument(words=word_tokenize(d.lower()),
                              tags=[str(i)]) for i, d in enumerate(data)]

### 4.Train Model

In [34]:
# model parameters
vec_size = 20
alpha = 0.025

In [35]:
# create model
model = Doc2Vec(vector_size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm=1)

In [36]:
# build vocabulary
model.build_vocab(tagged_data)

In [37]:
# shuffle data
tagged_data = utils.shuffle(tagged_data)

In [38]:
# train Doc2Vec model
model.train(tagged_data,
            total_examples=model.corpus_count,
            epochs=30)

In [39]:
model.save("d2v.model")
print("Model Saved")

Model Saved


### 5.Find Similar documents for the given document

In [40]:
from gensim.models.doc2vec import Doc2Vec

In [41]:
model = Doc2Vec.load('d2v.model')

In [42]:
# to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer",v1)

V1_infer [ 0.01912867 -0.01765504  0.01510629  0.01799483 -0.00892928  0.00998703
 -0.00090134  0.01546164 -0.02002329  0.02377238 -0.00724893 -0.00405359
 -0.00414919  0.02146302 -0.02229237  0.00507969 -0.00810554 -0.00867667
 -0.00991263  0.02451484]


In [43]:
# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

[('2', 0.3256344795227051), ('0', 0.2736329436302185), ('3', 0.2126191258430481)]


In [44]:
# to find vector of doc in training data using tags or
# in other words, printing the vector of document at index 1 in training data
print(model.docvecs['1'])

[-0.01904064  0.01291113 -0.02862327  0.01310186  0.02923953 -0.04078041
 -0.04189338 -0.0500794   0.02472007 -0.04591568  0.02931122  0.03417829
 -0.03288323 -0.02290827 -0.00638708  0.00833787 -0.0074406  -0.04273077
 -0.01832722  0.00885403]


### EXERCISE-2

### Question1. Train the following documents using Doc2Vec model

In [45]:
docs = ["the house had a tiny little mouse",
        "the cat saw the mouse",
        "the mouse ran away from the house",
        "the cat finally ate the mouse",
        "the end of the mouse story"]

### Question2. Find the most similar TWO documents for the query document "cat stayed in the house".

In [46]:
from gensim.models.doc2vec import Doc2Vec
model=Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data=word_tokenize("cat stayed in the house".lower())
v1=model.infer_vector(test_data)
print("v1_infer",v1)
#to find most similar doc using tags
similar_doc=model.dv.most_similar('2')
print(similar_doc)
#to find vector of doc in training data using tags
print(model.dv["2"])

v1_infer [-0.01588836 -0.01788474 -0.0145947   0.01575964 -0.01147471 -0.01840468
 -0.00824837 -0.02024175 -0.0197826  -0.00048647  0.01636968  0.01537332
  0.01926726 -0.00716028 -0.0037462   0.0159885  -0.00883282  0.02255341
  0.00963103 -0.02159291]
[('3', 0.33215874433517456), ('1', 0.32563456892967224), ('0', -0.10534212738275528)]
[-1.0793915e-02 -3.6800183e-02  2.0387791e-02 -4.2787164e-02
  1.4085785e-02 -2.3475422e-02  2.5354682e-03 -1.0975582e-02
  2.6901973e-02 -4.0534161e-02 -1.0368363e-02 -9.6229851e-05
 -3.4025688e-02 -3.3354104e-02 -9.9996375e-03  4.4075567e-02
 -6.2743672e-03  1.8161051e-02 -2.9713351e-02  4.4486530e-02]
