In [None]:
# !pip install -U gensim -q

In [None]:
import collections
import logging
from pathlib import Path
import random
import unicodedata

import gensim
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [None]:
# Configure root logging: INFO level, each record rendered as
# "<timestamp> : <level name> : <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

## Prepare Dataset

In [None]:
# Everything lives under ~/.models: inputs in data/, trained models in models/.
BASE_DIR = Path.home().joinpath(".models")
DATA_DIR = BASE_DIR.joinpath("data")
MODELS_DIR = BASE_DIR.joinpath("models")

# Directory holding the corpus that the cells below read.
corpus_path = DATA_DIR.joinpath("classical_bo")
# Absolute location of the saved model, kept as a plain string.
model_path = str(MODELS_DIR.joinpath("doc2vec_classical_bo").resolve())

In [None]:
def get_files(path):
    """Yield the tokenized files found one directory level below ``path``.

    Entries of ``path`` that are not directories are skipped. Inside each
    sub-directory only files whose name contains ``'tokenize'`` are yielded.

    Both directory listings are sorted: ``Path.iterdir`` returns entries in
    arbitrary, filesystem-dependent order, which would make a seeded
    train/test split of the resulting list differ between machines.

    Args:
        path: ``pathlib.Path`` of the corpus root directory.

    Yields:
        ``pathlib.Path`` objects, ordered by sub-directory and then by file.
    """
    for pecha_path in sorted(path.iterdir()):
        if not pecha_path.is_dir():
            continue
        for fn in sorted(pecha_path.iterdir()):
            if 'tokenize' in fn.name:
                yield fn
            
def is_punt(word):
    """Return True when ``word`` contains any of the punctuation markers.

    The check is a substring test, so a marker embedded anywhere in the
    token is enough to flag it.
    """
    markers = ("།", "།།", "༄༅")
    return any(marker in word for marker in markers)
    
def tokenize(text):
    """Split ``text`` on whitespace and drop tokens flagged by ``is_punt``.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for token in text.split():
        # Falsy (empty) tokens are discarded as well as punctuation.
        if not token:
            continue
        if is_punt(token):
            continue
        kept.append(token)
    return kept
    
def get_sentences(fns):
    """Yield one token list per usable line of every file in ``fns``.

    Lines with fewer than three whitespace-separated pieces are skipped.
    The remaining lines are stripped, NFKC-normalized and passed to
    ``tokenize``. The length filter runs before tokenization, so a yielded
    list may hold fewer than three tokens once punctuation is removed.

    Args:
        fns: iterable of path objects exposing ``open``.

    Yields:
        Lists of token strings.
    """
    for fn in fns:
        # Read as UTF-8 explicitly instead of the locale-dependent default
        # (assumes the corpus files are UTF-8 encoded - TODO confirm).
        # The context manager closes each file once it has been consumed.
        with fn.open('r', encoding='utf-8') as lines:
            for sentence in lines:
                if len(sentence.split()) < 3:
                    continue
                yield tokenize(unicodedata.normalize("NFKC", sentence.strip()))

def create_dataset(sentences, tokens_only=False):
    """Turn tokenized sentences into Doc2Vec inputs.

    Args:
        sentences: iterable of token lists (already tokenized).
        tokens_only: when True, yield each token list unchanged; when False,
            wrap it in a ``TaggedDocument`` tagged with its position in
            ``sentences``.

    Yields:
        Token lists, or ``gensim.models.doc2vec.TaggedDocument`` objects.
    """
    # No per-sentence printing here: it would dump the whole corpus into the
    # cell output.
    for i, tokens in enumerate(sentences):
        if tokens_only:
            yield tokens
        else:
            # Training data needs a tag; the running index serves as one.
            yield gensim.models.doc2vec.TaggedDocument(tokens, [i])

In [None]:
# Split at file level (80 % train / 20 % test) with a fixed seed; the
# generator is materialised into a list because it has to be shuffled.
files = get_files(corpus_path)
train_files, test_files = train_test_split(
    list(files),
    test_size=0.2,
    random_state=42,
)

In [None]:
# Both are lazy generators: no file is read until they are iterated.
train_sents, test_sents = get_sentences(train_files), get_sentences(test_files)

In [None]:
# Training sentences become tagged documents; test sentences stay as bare
# token lists because they are only used for inference.
train_corpus = [document for document in create_dataset(train_sents)]
test_corpus = [tokens for tokens in create_dataset(test_sents, tokens_only=True)]

In [None]:
# Peek at a randomly chosen training document and the two that follow it.
doc_id = random.randrange(0, len(train_corpus))
(doc_id, train_corpus[doc_id:doc_id + 3])

In [None]:
# Tags are consecutive indices starting at 0, so the last document's tag plus
# one must equal the corpus size.
assert train_corpus[-1].tags[0] + 1 == len(train_corpus)

## Training the Model

In [None]:
# Hyper-parameters: 50-dimensional vectors, min_count=2, 40 training epochs.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=50,
    min_count=2,
    epochs=40,
)

In [None]:
# Scan the training corpus once to build the vocabulary before training.
model.build_vocab(corpus_iterable=train_corpus)

In [None]:
# Train with the document count recorded by build_vocab and the epoch count
# the model was constructed with.
model.train(
    corpus_iterable=train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

## Inference

In [None]:
# Infer a vector for a hand-written, already tokenized sentence.
model.infer_vector(
    ["རིམ་པ་", "བཞིན་", "དུ་", "སྦྱིན་པ་", "ལ་", "བྱ་"]
)

## Save and Load model

In [None]:
# Nothing earlier in the notebook creates the models directory, and saving
# into a missing directory fails, so make sure it exists first.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
model.save(model_path)

In [None]:
# Reload the model from disk so the cells below run against the saved copy.
model = gensim.models.Doc2Vec.load(model_path)

## Assessing the Model

In [None]:
# Sanity check: re-infer a vector for every training document and record
# where the document itself lands in the similarity ranking over all
# trained document vectors.
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_corpus):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

    ranked_ids = [docid for docid, _ in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)

    # Keep the runner-up (doc id, similarity) pair for later inspection.
    second_ranks.append(sims[1])

In [None]:
# Distribution of self-similarity ranks: rank 0 means the document was the
# closest match to its own inferred vector.
counter = collections.Counter(ranks)
print(counter)

In [None]:
doc_id = 1
# Recompute the similarities for this document. The `sims` left over from
# the assessment loop belongs to the last training document, not doc_id.
inferred_vector = model.infer_vector(train_corpus[doc_id].words)
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

print('Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
for label, index in [('MOST', 0), ('SECOND-MOST', 1), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1)]:
    print(u'%s %s: «%s»\n' % (label, sims[index], ' '.join(train_corpus[sims[index][0]].words)))

In [None]:
# Pick a random training document; `random` is already imported at the top
# of the notebook, so no re-import is needed here.
doc_id = random.randint(0, len(train_corpus) - 1)

# Print it next to its second-most-similar document, looked up from the
# (doc id, similarity) pairs collected in `second_ranks`.
print('Train Document ({}): «{}»\n'.format(doc_id, ' '.join(train_corpus[doc_id].words)))
sim_id = second_ranks[doc_id]
print('Similar Document {}: «{}»\n'.format(sim_id, ' '.join(train_corpus[sim_id[0]].words)))

## Testing the Model
Using the same approach as above, we’ll infer a vector for a randomly chosen test document and compare it by eye with the training documents the model ranks as most, median, and least similar.

In [None]:
# Draw one held-out document at random and infer its vector.
doc_id = random.randrange(0, len(test_corpus))
inferred_vector = model.infer_vector(test_corpus[doc_id])
sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))

# Show the test document, then the training documents ranked first, in the
# middle and last by similarity.
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(test_corpus[doc_id])))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
picks = (('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', len(sims) - 1))
for label, index in picks:
    match = sims[index]
    match_words = train_corpus[match[0]].words
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(match_words)))