In [None]:
# !pip install -U gensim -q

In [3]:
import collections
import logging
from pathlib import Path
import random
import unicodedata

import gensim
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [4]:
# Emit log records at INFO level and above with a "timestamp : level : message" layout.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

## Prepare Dataset

In [5]:
# Everything lives under a hidden ".models" folder in the current user's home directory.
BASE_DIR = Path.home().joinpath(".models")
DATA_DIR = BASE_DIR.joinpath("data")
MODELS_DIR = BASE_DIR.joinpath("models")

# Corpus root directory, and the resolved location of the trained model as a plain string.
corpus_path = DATA_DIR.joinpath("classical_bo")
model_path = str(MODELS_DIR.joinpath("doc2vec_classical_bo").resolve())

In [7]:
def get_files(path):
    """Yield the tokenized files found one level below ``path``.

    Only sub-directories of ``path`` are searched; inside each of them an entry
    is yielded when its file name contains the substring ``'tokenize'``.
    Entries sitting directly under ``path`` that are not directories are ignored.

    Both directory listings are sorted before iteration. ``Path.iterdir``
    returns entries in arbitrary, filesystem-dependent order, so without the
    sort a seeded split of the resulting list would differ between machines.

    Args:
        path: corpus root directory, as a ``pathlib.Path`` or a path string.

    Yields:
        pathlib.Path: one path per matching entry, in sorted order.
    """
    for pecha_path in sorted(Path(path).iterdir()):
        if not pecha_path.is_dir():
            continue
        for fn in sorted(pecha_path.iterdir()):
            if 'tokenize' not in fn.name:
                continue
            yield fn
            
def is_punt(word):
    """Return True when ``word`` contains any of the punctuation markers.

    This is a substring test, so a token that has a marker attached to other
    characters is reported as punctuation as well.
    """
    markers = ("།", "།།", "༄༅")
    return any(marker in word for marker in markers)
    
def tokenize(text):
    """Split ``text`` on whitespace and drop punctuation tokens.

    Returns:
        list[str]: the non-empty tokens for which ``is_punt`` is false,
        in their original order.
    """
    kept = []
    for piece in text.split():
        if not piece or is_punt(piece):
            continue
        kept.append(piece)
    return kept
    
def get_sentences(fns):
    """Yield one token list per usable line of the given files.

    Each line is treated as a sentence. Lines with fewer than three
    whitespace-separated tokens (counted before punctuation is removed) are
    skipped; the remaining lines are stripped, NFKC-normalized and tokenized.

    Args:
        fns: iterable of ``pathlib.Path``-like objects exposing ``open``.

    Yields:
        list[str]: the tokens of one sentence, as returned by ``tokenize``.
    """
    for fn in fns:
        # The context manager closes every handle; the original left them open.
        # UTF-8 is requested explicitly so reading does not depend on the platform
        # default encoding (assumes the corpus files are UTF-8 - TODO confirm).
        with fn.open('r', encoding='utf-8') as lines:
            for sentence in lines:
                if len(sentence.split()) < 3:
                    continue
                yield tokenize(unicodedata.normalize("NFKC", sentence.strip()))

def create_dataset(sentences, tokens_only=False):
    """Turn already-tokenized sentences into Doc2Vec inputs.

    Args:
        sentences: iterable of token lists.
        tokens_only: when true, each token list is yielded unchanged; otherwise
            it is wrapped in a ``TaggedDocument`` whose single tag is the
            sentence's position in ``sentences``.

    Yields:
        Either the token list itself or a ``TaggedDocument`` built from it.
    """
    for position, tokens in enumerate(sentences):
        # Only the training path needs tags; the other branch is never evaluated then.
        yield tokens if tokens_only else gensim.models.doc2vec.TaggedDocument(tokens, [position])

In [13]:
# get_files is a generator, so it is materialised with list() before being split.
# The split is done at file level: 20% of the files are held out, with a fixed random_state.
files = get_files(corpus_path)
train_files, test_files = train_test_split(list(files), test_size=0.2, random_state=42)

In [14]:
# Lazy generators: no file is read until they are iterated, and each can be consumed only once.
train_sents = get_sentences(train_files)
test_sents = get_sentences(test_files)

In [15]:
# Materialise both corpora in memory: tagged documents for training, bare token lists for testing.
train_corpus = list(create_dataset(train_sents))
test_corpus = list(create_dataset(test_sents, tokens_only=True))

In [20]:
# Spot-check: display three consecutive training documents starting at a random position.
doc_id = random.randint(0, len(train_corpus) - 1)
doc_id, train_corpus[doc_id: doc_id+3]

(495842,
 [TaggedDocument(words=['བདེ་གཤེགས་', 'སྙིང་པོ', 'འང་', 'འོད་གསལ་བ', 'འི་', 'སེམས་', 'འདི་', 'ལ་', 'བརྗོད་པ་', 'ཡིན་', 'ནོ'], tags=[495842]),
  TaggedDocument(words=['རྫོགས་ཆེན་', 'ངེས་དོན་', 'འདུས་པ', 'འི་', 'རྒྱུད་', 'ལ', 'ས', 'སེམས་', 'ལ་དུ', 'ས་', 'གསུམ་', 'ཡོད་པ', 'འི་', 'ཕྱིར', 'ཚེ་', 'ལ་', 'སྔ་ཕྱི་', 'ད་ལྟ་', 'བྱུང་'], tags=[495843]),
  TaggedDocument(words=['སེམས་', 'ལ་', 'འཕོ་འགྱུར་', 'ཡོད་པ', 'འི་', 'ཕྱིར', 'དེ་ཕྱིར་', 'ལུས་', 'ལ་', 'སྐྱེ་འཆི་', 'བྱུང་'], tags=[495844])])

In [21]:
# Sanity check: tags are enumerate() indices starting at 0, so the last tag must equal len - 1.
assert len(train_corpus) == train_corpus[-1].tags[0] + 1

## Training the Model

In [22]:
# Hyperparameters: 50-dimensional vectors, min_count of 2, 40 training epochs.
# All other settings are left at the library defaults.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)

2022-04-29 11:19:13,326 : INFO : Doc2Vec lifecycle event {'params': 'Doc2Vec(dm/m,d50,n5,w5,mc2,s0.001,t3)', 'datetime': '2022-04-29T11:19:13.326610', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:24:37) \n[GCC 9.4.0]', 'platform': 'Linux-4.14.262-200.489.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'created'}


In [None]:
# Build the vocabulary from the training documents; this must run before training.
model.build_vocab(train_corpus)

In [None]:
# Train with the document count and epoch count already stored on the model object.
model.train(train_corpus, total_examples=model.corpus_count, epochs=model.epochs)

## Inference

In [25]:
# Infer a vector for an already-tokenized sentence; the result is displayed as the cell output.
model.infer_vector(["རིམ་པ་", "བཞིན་", "དུ་", "སྦྱིན་པ་", "ལ་", "བྱ་"])

array([-0.13087395, -0.01936051, -0.0087766 ,  0.12466373, -0.5885176 ,
       -0.24993752,  0.00286173,  0.38588482, -0.01656335,  0.17086115,
        0.30103028,  0.33869773, -0.24266653, -0.4151375 , -0.6437144 ,
        0.12202694,  0.3668979 , -0.3083544 ,  0.11743522, -0.49491903,
        0.05449725,  0.00839138,  0.1312454 , -0.28812018,  0.01965531,
        0.53242975, -0.34315082,  0.6922586 , -0.6106972 ,  0.19089296,
        0.9451179 , -0.39846855,  0.27619553, -0.8079054 , -0.21874213,
        0.4931009 , -0.21541193,  0.48782465, -0.15523456,  0.81462157,
       -0.2974267 ,  0.31295067, -0.44193265,  0.24974515,  0.05155938,
       -0.03777373, -0.6853228 , -0.39054087, -0.67567915, -0.25841784],
      dtype=float32)

## Saving and Loading the Model

In [26]:
# Nothing earlier in the notebook creates the models folder, so make sure it exists;
# otherwise saving fails on a machine where the directory has not been created by hand.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
model.save(model_path)

2022-04-29 12:07:44,680 : INFO : Doc2Vec lifecycle event {'fname_or_handle': '/home/studio-lab-user/.models/models/doc2vec_classical_bo', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-04-29T12:07:44.680554', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:24:37) \n[GCC 9.4.0]', 'platform': 'Linux-4.14.262-200.489.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'saving'}
2022-04-29 12:07:44,681 : INFO : storing np array 'vectors' to /home/studio-lab-user/.models/models/doc2vec_classical_bo.dv.vectors.npy
2022-04-29 12:07:44,751 : INFO : not storing attribute cum_table
2022-04-29 12:07:44,842 : INFO : saved /home/studio-lab-user/.models/models/doc2vec_classical_bo


In [27]:
# Reload the model from disk; this rebinds `model` to the freshly loaded object.
model = gensim.models.doc2vec.Doc2Vec.load(model_path)

2022-04-29 12:07:44,847 : INFO : loading Doc2Vec object from /home/studio-lab-user/.models/models/doc2vec_classical_bo
2022-04-29 12:07:44,909 : INFO : loading dv recursively from /home/studio-lab-user/.models/models/doc2vec_classical_bo.dv.* with mmap=None
2022-04-29 12:07:44,910 : INFO : loading vectors from /home/studio-lab-user/.models/models/doc2vec_classical_bo.dv.vectors.npy with mmap=None
2022-04-29 12:07:44,964 : INFO : loading wv recursively from /home/studio-lab-user/.models/models/doc2vec_classical_bo.wv.* with mmap=None
2022-04-29 12:07:44,965 : INFO : setting ignored attribute cum_table to None
2022-04-29 12:07:45,326 : INFO : Doc2Vec lifecycle event {'fname': '/home/studio-lab-user/.models/models/doc2vec_classical_bo', 'datetime': '2022-04-29T12:07:45.325962', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:24:37) \n[GCC 9.4.0]', 'platform': 'Linux-4.14.262-200.489.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'loaded'}


## Assessing the Model

In [None]:
# Self-similarity check: re-infer a vector for every training document and record the
# position at which the document itself appears in the similarity ranking (0 = best match).
# `second_ranks` keeps the (doc id, similarity) pair found at position 1 for each document.
ranks = []
second_ranks = []
n_docs = len(model.dv)  # loop-invariant, so computed once
# The loop touches every training document; tqdm gives progress feedback while it runs.
for doc_id in tqdm(range(len(train_corpus)), desc="ranking"):
    inferred_vector = model.infer_vector(train_corpus[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=n_docs)
    # Stop at the first hit instead of building a full list of ids for every document.
    rank = next(position for position, (docid, _) in enumerate(sims) if docid == doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])

In [None]:
# Distribution of self-similarity ranks. Only the ten most frequent ranks are shown,
# because the full Counter can hold one entry per distinct rank; inspect `counter` for the rest.
counter = collections.Counter(ranks)
counter.most_common(10)

In [None]:
doc_id = 1
# Recompute the similarities for this document. Without this, `sims` would still hold the
# result of the last iteration of the ranking loop, i.e. the neighbours of another document.
inferred_vector = model.infer_vector(train_corpus[doc_id].words)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
# Show the best, runner-up, middle and worst matches from the full ranking.
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

In [None]:
# Pick a random training document. `random` is already imported in the notebook's
# import cell, so it is not imported again here.
doc_id = random.randint(0, len(train_corpus) - 1)

# Compare and print the second-most-similar document.
# second_ranks[doc_id] is the (doc id, similarity) pair recorded for that document.
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

## Testing the Model
Using the same approach as above, we’ll infer the vector for a randomly chosen test document and compare it by eye with the training documents the model ranks as most, median and least similar.

In [None]:
# Draw one held-out document at random and embed it with the trained model.
doc_id = random.randint(0, len(test_corpus) - 1)
test_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(test_tokens)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Print the best, middle and worst matches among the training documents.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
positions = {'MOST': 0, 'MEDIAN': len(sims)//2, 'LEAST': len(sims) - 1}
for label, index in positions.items():
    match = sims[index]
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(train_corpus[match[0]].words)))