In [36]:
from time import time
from collections import defaultdict
import multiprocessing
from pathlib import Path
import logging
import unicodedata

from gensim.models import Word2Vec
from gensim.models import KeyedVectors

import config

# Route library log records to the notebook output at INFO level so that the
# Word2Vec / KeyedVectors lifecycle messages are visible under the cells below.
logging.basicConfig(format="%(levelname)s - %(asctime)s: %(message)s", datefmt= '%H:%M:%S', level=logging.INFO)

## Dataset

In [27]:
# Destination of the trained model; passed to model.save() further down.
model_name = "word2vec_classical_bo.model"
model_path = config.MODELS_DIR / model_name

# Root directory handed to get_sentences(); get_files() walks its sub-directories.
corpus_path = config.DATA_DIR / "classical_bo"

In [5]:
def get_files(path):
    """Yield the tokenized files found exactly one level below ``path``.

    Only immediate sub-directories of ``path`` are visited; entries sitting
    directly in ``path`` are ignored. Inside each sub-directory an entry is
    yielded when its name contains the substring ``'tokenize'``.

    Args:
        path: A ``pathlib.Path``-like directory to scan.

    Yields:
        Path objects, in whatever order ``iterdir()`` produces them.
    """
    sub_dirs = (entry for entry in path.iterdir() if entry.is_dir())
    for sub_dir in sub_dirs:
        yield from (
            candidate
            for candidate in sub_dir.iterdir()
            if 'tokenize' in candidate.name
        )
            
def is_punt(word):
    """Return True when ``word`` contains any of the punctuation marks below.

    The check is a substring test, so a token that merely ends with one of
    the marks is also reported as punctuation.
    """
    punctuation_marks = ("།", "།།", "༄༅")
    return any(mark in word for mark in punctuation_marks)
    
def tokenize(text):
    """Split ``text`` on whitespace and drop every token flagged by ``is_punt``.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept = []
    for piece in text.split():
        if not piece or is_punt(piece):
            continue
        kept.append(piece)
    return kept
    
def get_sentences(corpus_path):
    """Yield one token list per usable line of every tokenized corpus file.

    Each file returned by ``get_files(corpus_path)`` is read line by line.
    Lines with fewer than three whitespace-separated tokens are skipped; the
    rest are stripped, NFKC-normalized and passed through ``tokenize``.

    Args:
        corpus_path: Directory handed to ``get_files``.

    Yields:
        list[str]: The tokens of one line. The three-token threshold is
        applied before ``tokenize`` removes punctuation, so a yielded list
        can be shorter than three items.
    """
    for fn in get_files(corpus_path):
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # default encoding, and use `with` so each file handle is closed as
        # soon as the file has been consumed.
        with fn.open('r', encoding='utf-8') as lines:
            for sentence in lines:
                if len(sentence.split()) < 3:
                    continue
                yield tokenize(unicodedata.normalize("NFKC", sentence.strip()))

In [6]:
# Materialize the generator: the corpus is iterated several times below
# (frequency count, build_vocab, and training), which a generator cannot do.
sentences = list(get_sentences(corpus_path))

In [7]:
len(sentences)

986814

In [8]:
def top_k_highest_freq_word(sentences, k=10):
    """Return the ``k`` most frequent tokens across ``sentences``.

    Args:
        sentences: Iterable of token sequences.
        k: Maximum number of tokens to return (default 10).

    Returns:
        list: Tokens ordered from most to least frequent. Tokens with equal
        counts keep their first-seen order. Fewer than ``k`` items are
        returned when the vocabulary is smaller; an empty list is returned
        when ``k`` is not positive.
    """
    if k <= 0:
        return []
    word_freq = defaultdict(int)
    for sent in sentences:
        for word in sent:
            word_freq[word] += 1
    # Slice by `k`; the previous hard-coded 10 silently ignored the argument.
    return sorted(word_freq, key=word_freq.get, reverse=True)[:k]

top_k_highest_freq_word(sentences)

['འི་', 'ར་', 'དང་', 'ས་', 'ལ་', 'ཀྱི་', 'དུ་', 'ལ', 'གྱི་', 'ནས་']

## Train model

In [10]:
# CPU count reported by the OS; used to size the Word2Vec worker pool below.
cores = multiprocessing.cpu_count()

In [14]:
# Untrained Word2Vec model; the vocabulary and weights are filled in by the
# build_vocab / train cells that follow.
model = Word2Vec(
    vector_size=100,
    window=5,
    # min_count=1 keeps every token, including ones seen only once.
    min_count=1,
    min_alpha=0.0007,
    negative=20,
    # Leave one core free for the notebook, but never go below one worker:
    # a bare `cores - 1` evaluates to 0 on a single-CPU machine.
    workers=max(1, cores - 1)
)

INFO - 12:53:45: Word2Vec lifecycle event {'params': 'Word2Vec(vocab=0, vector_size=100, alpha=0.025)', 'datetime': '2022-05-03T12:53:45.391100', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:24:37) \n[GCC 9.4.0]', 'platform': 'Linux-4.14.262-200.489.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'created'}


In [None]:
# Build the vocabulary from the in-memory sentences and report how long it
# took, in minutes rounded to two decimals.
t = time()
model.build_vocab(sentences, progress_per=10000)
print('Time to build vocab: {} mins'.format(round((time() - t) / 60, 2)))

In [None]:
# Train for 30 epochs over the same sentences used for build_vocab and report
# the elapsed wall-clock time, in minutes rounded to two decimals.
# Requires the build_vocab cell to have run first (model.corpus_count is read here).
t = time()
model.train(sentences, total_examples=model.corpus_count, epochs=30, report_delay=1)
print('Time to train the model: {} mins'.format(round((time() - t) / 60, 2)))

In [28]:
# Persist the trained Word2Vec model object to model_path.
model.save(str(model_path))

INFO - 13:16:18: Word2Vec lifecycle event {'fname_or_handle': '/home/studio-lab-user/.models/models/word2vec_classical_bo.model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-05-03T13:16:18.204689', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:24:37) \n[GCC 9.4.0]', 'platform': 'Linux-4.14.262-200.489.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'saving'}
INFO - 13:16:18: not storing attribute cum_table
INFO - 13:16:18: saved /home/studio-lab-user/.models/models/word2vec_classical_bo.model


## Exploring the model

In [31]:
model.wv.most_similar("སྟོབས་")

[('ཤུགས་', 0.6270895600318909),
 ('སྟོབས་ཤུགས་', 0.5976985096931458),
 ('མཐུ་', 0.5726722478866577),
 ('སྟོབས', 0.537462592124939),
 ('རང་ཤུགས་', 0.5301303863525391),
 ('ནུས་སྟོབས་', 0.5233063697814941),
 ('བརྩོན་འགྲུས་', 0.5218465328216553),
 ('ཤན་སྟོབས་', 0.48222506046295166),
 ('མངོན་ཤེས་', 0.47309374809265137),
 ('ཤེས་རབ་', 0.46450451016426086)]

In [32]:
model.wv.most_similar("མཛད་པ་")

[('མཛད', 0.6354176998138428),
 ('མཛད་པ', 0.6226857900619507),
 ('སྟོན་པ་', 0.5863264203071594),
 ('མཛད་', 0.5766376852989197),
 ('བཞེད་པ་', 0.5742528438568115),
 ('མཛད་པ་པོ་', 0.5446878671646118),
 ('བྱོན་པ་', 0.5159367322921753),
 ('བཀྲལ་བ་', 0.5106030106544495),
 ('གསུང་རབ་', 0.5089299082756042),
 ('གདུལ་བྱ་', 0.502116858959198)]

In [33]:
model.wv.most_similar("བླ་མ་")

[('བླ་མ', 0.6644584536552429),
 ('ཡོངས་འཛིན་', 0.5951176881790161),
 ('སྐྱབས་མགོན་', 0.5435715913772583),
 ('མར་མེ་མཛད', 0.5370640754699707),
 ('ཇོ་བོ་', 0.5248255133628845),
 ('བཀའ་དྲིན་་་་་་', 0.5221753120422363),
 ('སྐྱབས་གནས་', 0.521973192691803),
 ('སྐུ་ཞབས་', 0.5005108714103699),
 ('འབྲུག་པ', 0.4933638274669647),
 ('དྭགས་པོ་', 0.48295798897743225)]

In [34]:
model.wv.most_similar("རྩ་བ་")

[('རྩད་', 0.505094587802887),
 ('བཤེས་གཉེན་པ་', 0.5023690462112427),
 ('སྲོག་རྩ་', 0.499602347612381),
 ('བཤེས', 0.47952309250831604),
 ('གཞི་རྩ་', 0.47937679290771484),
 ('བཤེས་གཉེན་', 0.472102552652359),
 ('གཞི་མ་', 0.46959346532821655),
 ('རྩ་བ', 0.45777443051338196),
 ('གཞི་མ', 0.4474014937877655),
 ('རྟེན་གཞི་', 0.43366846442222595)]

In [35]:
model.wv.most_similar("ཉིད་")

[('དེ་ཉིད་', 0.7768874168395996),
 ('འདི་ཉིད་', 0.5357514023780823),
 ('སྟོང་པ་ཉིད་', 0.5227730870246887),
 ('ཆོས་ཉིད་', 0.49807819724082947),
 ('གཅིག་ཉིད་', 0.4918588399887085),
 ('དེ་བཞིན་ཉིད་', 0.49043431878089905),
 ('མཉམ་པ་ཉིད་', 0.4587320387363434),
 ('མཆོག་ཉིད་', 0.4542542099952698),
 ('རང་ཉིད་', 0.450390487909317),
 ('ཉིད༷་', 0.4407891631126404)]

## Save only the word vectors

In [38]:
word_vectors_path = config.MODELS_DIR / "classical_bo.wordvectors"

In [39]:
# Store just the words + their trained embeddings.
word_vectors = model.wv
word_vectors.save(str(word_vectors_path))

INFO - 13:25:35: KeyedVectors lifecycle event {'fname_or_handle': '/home/studio-lab-user/.models/models/classical_bo.wordvectors', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-05-03T13:25:35.164019', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:24:37) \n[GCC 9.4.0]', 'platform': 'Linux-4.14.262-200.489.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'saving'}
INFO - 13:25:35: saved /home/studio-lab-user/.models/models/classical_bo.wordvectors


In [40]:
# Reload the saved vectors memory-mapped in read-only mode (mmap='r'), so the
# array can be shared across processes instead of being copied into each one.
wv = KeyedVectors.load(str(word_vectors_path), mmap='r')

INFO - 13:26:02: loading KeyedVectors object from /home/studio-lab-user/.models/models/classical_bo.wordvectors
INFO - 13:26:03: KeyedVectors lifecycle event {'fname': '/home/studio-lab-user/.models/models/classical_bo.wordvectors', 'datetime': '2022-05-03T13:26:03.102040', 'gensim': '4.1.2', 'python': '3.9.10 | packaged by conda-forge | (main, Feb  1 2022, 21:24:37) \n[GCC 9.4.0]', 'platform': 'Linux-4.14.262-200.489.amzn2.x86_64-x86_64-with-glibc2.31', 'event': 'loaded'}


In [41]:
# Sanity check: the reloaded vectors should return the same neighbours and
# scores as the model.wv.most_similar query for this word earlier on.
wv.most_similar("སྟོབས་")

[('ཤུགས་', 0.6270895600318909),
 ('སྟོབས་ཤུགས་', 0.5976985096931458),
 ('མཐུ་', 0.5726722478866577),
 ('སྟོབས', 0.537462592124939),
 ('རང་ཤུགས་', 0.5301303863525391),
 ('ནུས་སྟོབས་', 0.5233063697814941),
 ('བརྩོན་འགྲུས་', 0.5218465328216553),
 ('ཤན་སྟོབས་', 0.48222506046295166),
 ('མངོན་ཤེས་', 0.47309374809265137),
 ('ཤེས་རབ་', 0.46450451016426086)]