In [26]:
# Use the %pip magic (not !pip) so the pinned gensim version is installed into the
# environment of the running kernel rather than whatever `pip` is first on the shell PATH.
%pip install gensim==4.1.2

Collecting gensim==4.1.2
[?25l  Downloading https://files.pythonhosted.org/packages/ba/b3/668ace2f0517b7fb01f780f93a75cb0592754d6365d808d2adccb2a94b92/gensim-4.1.2-cp36-cp36m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (24.1MB)
[K     |████████████████████████████████| 24.1MB 923kB/s  eta 0:00:01
Installing collected packages: gensim
  Found existing installation: gensim 3.8.0
    Uninstalling gensim-3.8.0:
      Successfully uninstalled gensim-3.8.0
Successfully installed gensim-4.1.2
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [27]:
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from pathlib import Path

from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Dataset

In [28]:
# Path of the tokenized corpus file, relative to the notebook's working directory.
input_dir = Path('../input')
data_fn = input_dir / 'tokenized_lemmatized_paragraphs.txt'

In [29]:
# Load the corpus as a list of paragraphs (one per line), each a list of tokens
# obtained by splitting the line on single spaces.
# The encoding is passed explicitly: without it Path.read_text() uses the platform's
# locale encoding, which is not guaranteed to decode this non-ASCII text correctly.
# NOTE(review): assumes the file is UTF-8 encoded — confirm.
# NOTE(review): if the file ends with a newline, the last element will be [''] —
# confirm whether blank lines should be filtered out.
tokenized_paras = [
    para.split(' ')
    for para in data_fn.read_text(encoding='utf-8').split('\n')
]

In [30]:
# Sanity check: show the first five tokens of the first paragraph.
tokenized_paras[0][:5]

['ན་མོ་', 'གུ་རུ་', 'དེ་བ་', 'ཌཱ་ཀི་', 'ནཱི་']

In [31]:
# Count the occurrences of every token across all paragraphs.
word_freq = defaultdict(int)
for paragraph in tokenized_paras:
    for token in paragraph:
        word_freq[token] += 1

# Number of distinct tokens (displayed as the cell output).
len(word_freq)

48539

In [32]:
# The ten most frequent tokens, highest count first.
[
    token
    for token, _count in sorted(
        word_freq.items(), key=lambda item: item[1], reverse=True
    )[:10]
]

['ལ་', 'གི་', '།_', '།_།', 'དང་', 'ན་', 'གིས་', 'དུ་', 'ཀྱི་', 'དེ་']

In [33]:
# Wrap each paragraph in a TaggedDocument whose only tag is the paragraph's
# index in tokenized_paras.
train_corpus = [
    TaggedDocument(words=tokens, tags=[doc_id])
    for doc_id, tokens in enumerate(tokenized_paras)
]

In [34]:
# Inspect the first tagged document. The corpus built in this notebook is named
# `train_corpus`; the previous `train_data` name is not defined anywhere in the
# notebook and raises NameError on a fresh kernel.
train_corpus[0]

TaggedDocument(words=['ན་མོ་', 'གུ་རུ་', 'དེ་བ་', 'ཌཱ་ཀི་', 'ནཱི་', 'ཡཻ', '།_', 'དགོངས་པ་', 'གི་', 'སྟོབས་', 'དང་', 'ཚུལ་ལྡན་', 'ཆོ་ག་', 'གི་', 'མཐུ་', 'གིས་', '།_།', 'ཐོག་མེད་', 'འཁྲུལ་པ་', 'གི་', 'འཆིང་བ་', 'སྐད་ཅིག་', 'ལ་', '།_།', 'བྲལ་', 'ན་', 'མངོན་སུམ་', 'ཡེ་ཤེས་', 'སད་', 'མཛད་པ་', '།_།', 'དཀྱིལ་འཁོར་', 'དབང་ཕྱུག་', 'དཔལ་ལྡན་', 'བླ་མ་', 'ལ་', 'འདུད་', '།_།', 'རྡོ་རྗེ་', 'ཐེག་པ་', 'གི་', 'རྩ་བ་', 'སྨིན་', 'བྱེད་', 'ཀྱི་', '།_།', 'ཚུལ་', 'འདི་', 'ཟབ་', 'རྒྱ་', 'ཉིད་', 'ཕྱི་', 'རྟོགས་དཀའ་', 'ཡང་', '།_།', 'དང་པོ་', 'གི་', 'ལས་ཅན་', 'ཕྱོགས་', 'ཙམ་', 'ངེས་', 'རྙེད་', 'ཕྱི་', '།_།', 'གོ་', 'བདེ་', 'གི་', 'ངག་', 'གིས་', 'མདོར་བསྡུས་', 'བརྗོད་པ་', 'ལ་', 'བྱ་', '།_།', 'དེ་', 'ཀྱང་', 'རྡོ་རྗེ་', 'ཐེག་པ་', 'གི་', 'ལམ་', 'གྱི་', 'གནད་', 'ཐམས་ཅད་', 'ཚང་', 'ཞིང་', 'ཁྱད་པར་', 'གསང་སྔགས་', 'ཀྱི་', 'རྒྱུད་', 'ལུང་', 'མན་ངག་', 'རྣམས་', 'ལ་', 'ཐོས་བསམ་', 'སྒོམ་པ་', 'གང་', 'བྱེད་', 'ཀྱང་', 'ངེས་པ་', 'ལ་', 'སྔོན་', 'དུ་མ་', 'སོང་', 'ཐབས་མེད་པ་', 'ནི་', 'དབང་', 'བསྐུར་བ་', 'དང་', '།_', 'དེ་ལས་', 'ཐོབ་པ

# Training the Model

In [35]:
# Doc2Vec hyper-parameters, named so they can be tuned in one place.
VECTOR_SIZE = 50  # dimensionality of the learned document vectors
MIN_COUNT = 2     # passed as min_count: frequency threshold for keeping a token in the vocabulary
EPOCHS = 40       # number of training epochs; also read back later as model.epochs

# Create the (still untrained) model; vocabulary building and training happen in later cells.
model = Doc2Vec(vector_size=VECTOR_SIZE, min_count=MIN_COUNT, epochs=EPOCHS)

In [36]:
# Build the model's vocabulary from the tagged training corpus prior to training.
model.build_vocab(train_corpus)

In [39]:
# Train the model on the tagged corpus and report how long it took, in minutes.
train_start = time()
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)
elapsed_mins = round((time() - train_start) / 60, 2)

print('Time to train the model: {} mins'.format(elapsed_mins))

Time to train the model: 9.3 mins


In [48]:
# Infer a document vector for a short, already-tokenized sample passage.
sample_tokens = [
    "རིམ་པ་",
    "བཞིན་",
    "དུ་",
    "སྦྱིན་པ་",
    "ལ་",
    "བྱ་",
    "།_།",
]
model.infer_vector(sample_tokens)

array([ 0.35797277, -0.28207123,  0.18900879,  0.11906499, -0.24782394,
       -0.47151944,  0.2278079 , -0.5385187 , -0.24642773, -0.23693329,
       -0.4404652 ,  0.33178723, -0.2826047 , -0.36461526, -0.11994781,
       -0.3469704 ,  0.00275858, -0.1558539 , -0.266859  , -0.20364438,
        0.11912236,  0.12776877, -0.05577536, -0.09960724,  0.26648524,
        0.00688452, -0.07664044,  0.05464343, -1.1439594 ,  0.07546379,
       -0.30470034, -0.12048067,  0.08598229, -0.24721168, -0.2837789 ,
        0.4743852 , -0.23593839, -0.42313078,  0.09637984,  0.5161193 ,
        0.00335056,  0.3642293 ,  0.17687108,  0.16725211, -0.11284601,
       -0.7571143 , -0.20528845,  0.06559303, -0.30941555, -0.11846107],
      dtype=float32)

# Assessing the Model

To assess our new model, we will:
1. first infer a new vector for each document of the training corpus and compare each inferred vector with the vectors of the training corpus;
1. then record the rank of each document based on self-similarity. In other words, we pretend that the training corpus is new, unseen data and see how it compares with the trained model.

> The expectation is that we’ve likely overfit our model (i.e., all of the ranks will be less than 2) and so we should be able to find similar documents very easily. 

Additionally, we’ll keep track of the second ranks for a comparison of less similar documents.

In [47]:
# NOTE: this self-similarity evaluation is disabled. The last recorded run of the cell
# ended with a KeyboardInterrupt: for every training document it infers a vector and then
# ranks it against all stored document vectors (topn = number of document vectors).
# NOTE(review): `model.docvecs` is the gensim 3.x attribute name; with the gensim 4.1.2
# pinned in this notebook the document vectors are presumably exposed as `model.dv` —
# confirm before re-enabling.
# ranks = []
# second_ranks = []
# for doc_id in range(len(train_corpus)):
#     inferred_vector = model.infer_vector(train_corpus[doc_id].words)
#     sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))
#     rank = [docid for docid, sim in sims].index(doc_id)
#     ranks.append(rank)

#     second_ranks.append(sims[1])

KeyboardInterrupt: 

In [None]:
# NOTE: disabled; this summary counts the values in `ranks`, which is only produced by
# the (currently commented-out) rank evaluation loop.
# import collections

# counter = collections.Counter(ranks)
# print(counter)

# Exploring the model

# Save the doc2vec model

In [52]:
# Persist the trained Doc2Vec model in the current working directory;
# model_path is reused below to load it back.
model_path = "./doc2vec_classical_bo"
model.save(model_path)

In [53]:
# List the working directory to confirm the model file was written.
!ls

__notebook_source__.ipynb  doc2vec_classical_bo


In [55]:
# Load the saved model back from disk to check the save/load round trip.
new_model = Doc2Vec.load(model_path)

In [63]:
# Infer a vector with the reloaded model for the same sample passage used on the
# in-memory model earlier in the notebook.
# NOTE(review): the recorded output differs slightly from the vector inferred before
# saving; presumably this reflects randomness in infer_vector rather than a save/load
# problem — confirm.
reloaded_sample_tokens = [
    "རིམ་པ་",
    "བཞིན་",
    "དུ་",
    "སྦྱིན་པ་",
    "ལ་",
    "བྱ་",
    "།_།",
]
new_model.infer_vector(reloaded_sample_tokens)

array([ 0.33605957, -0.29641768,  0.04734034,  0.04585143, -0.24677654,
       -0.50026655,  0.19406824, -0.51561415, -0.39491773, -0.26678264,
       -0.4449173 ,  0.25829437, -0.38156644, -0.40229952, -0.25292578,
       -0.25280827,  0.08062779, -0.24071622, -0.20443948, -0.19244958,
        0.1038167 ,  0.13420986,  0.07711165, -0.03003781,  0.21018094,
        0.1167739 ,  0.01449351,  0.04121908, -0.9493679 ,  0.20313726,
       -0.30546504, -0.17832182,  0.16809662, -0.13758343, -0.3379911 ,
        0.34955662, -0.24969485, -0.39256075,  0.15194893,  0.51274407,
        0.15116078,  0.41996974,  0.04129816,  0.21703869, -0.08284596,
       -0.8264714 , -0.13513891,  0.01037692, -0.20604113, -0.16772035],
      dtype=float32)