In [1]:
from time import time  # To time our operations
from collections import defaultdict  # For word frequency
from pathlib import Path

import logging  # Setting up the loggings to monitor gensim
# Send INFO-level log records (gensim's progress messages) to the notebook,
# each prefixed with its level name and an hour:minute:second timestamp.
logging.basicConfig(
    format="%(levelname)s - %(asctime)s: %(message)s",
    datefmt='%H:%M:%S',
    level=logging.INFO,
)

# Dataset

In [2]:
# Path to the tokenized, lemmatized corpus file, relative to the notebook's
# working directory.
data_fn = Path('../input/tokenized_lemmatized_paragraphs.txt')

In [3]:
# Load the corpus: one paragraph per line, tokens separated by single spaces.
# The encoding is passed explicitly because the corpus is Tibetan text and
# read_text() would otherwise decode it with the platform's locale encoding,
# which is not UTF-8 everywhere.
# NOTE(review): assumes the file is stored as UTF-8 — confirm.
corpus_text = data_fn.read_text(encoding='utf-8')
tokenized_paras = [para.split(' ') for para in corpus_text.split('\n')]

In [4]:
# Sanity check: display the token list of the first paragraph.
tokenized_paras[0]

['ན་མོ་',
 'གུ་རུ་',
 'དེ་བ་',
 'ཌཱ་ཀི་',
 'ནཱི་',
 'ཡཻ',
 '།_',
 'དགོངས་པ་',
 'གི་',
 'སྟོབས་',
 'དང་',
 'ཚུལ་ལྡན་',
 'ཆོ་ག་',
 'གི་',
 'མཐུ་',
 'གིས་',
 '།_།',
 'ཐོག་མེད་',
 'འཁྲུལ་པ་',
 'གི་',
 'འཆིང་བ་',
 'སྐད་ཅིག་',
 'ལ་',
 '།_།',
 'བྲལ་',
 'ན་',
 'མངོན་སུམ་',
 'ཡེ་ཤེས་',
 'སད་',
 'མཛད་པ་',
 '།_།',
 'དཀྱིལ་འཁོར་',
 'དབང་ཕྱུག་',
 'དཔལ་ལྡན་',
 'བླ་མ་',
 'ལ་',
 'འདུད་',
 '།_།',
 'རྡོ་རྗེ་',
 'ཐེག་པ་',
 'གི་',
 'རྩ་བ་',
 'སྨིན་',
 'བྱེད་',
 'ཀྱི་',
 '།_།',
 'ཚུལ་',
 'འདི་',
 'ཟབ་',
 'རྒྱ་',
 'ཉིད་',
 'ཕྱི་',
 'རྟོགས་དཀའ་',
 'ཡང་',
 '།_།',
 'དང་པོ་',
 'གི་',
 'ལས་ཅན་',
 'ཕྱོགས་',
 'ཙམ་',
 'ངེས་',
 'རྙེད་',
 'ཕྱི་',
 '།_།',
 'གོ་',
 'བདེ་',
 'གི་',
 'ངག་',
 'གིས་',
 'མདོར་བསྡུས་',
 'བརྗོད་པ་',
 'ལ་',
 'བྱ་',
 '།_།',
 'དེ་',
 'ཀྱང་',
 'རྡོ་རྗེ་',
 'ཐེག་པ་',
 'གི་',
 'ལམ་',
 'གྱི་',
 'གནད་',
 'ཐམས་ཅད་',
 'ཚང་',
 'ཞིང་',
 'ཁྱད་པར་',
 'གསང་སྔགས་',
 'ཀྱི་',
 'རྒྱུད་',
 'ལུང་',
 'མན་ངག་',
 'རྣམས་',
 'ལ་',
 'ཐོས་བསམ་',
 'སྒོམ་པ་',
 'གང་',
 'བྱེད་',
 'ཀྱང་',
 'ངེས་པ་',
 'ལ་',
 'སྔོན་',
 'དུ་མ་'

In [5]:
# Tally how many times each token occurs across all paragraphs.
word_freq = defaultdict(int)
for paragraph in tokenized_paras:
    for token in paragraph:
        word_freq[token] += 1

# Number of distinct tokens in the corpus (shown as the cell output).
len(word_freq)

48539

In [6]:
# Order the tokens from most to least frequent, then show the ten most common.
tokens_by_frequency = sorted(word_freq, key=word_freq.get, reverse=True)
tokens_by_frequency[:10]

['ལ་', 'གི་', '།_', '།_།', 'དང་', 'ན་', 'གིས་', 'དུ་', 'ཀྱི་', 'དེ་']

# Training the Model

In [7]:
import multiprocessing

from gensim.models import Word2Vec

## Why I separate the training of the model into 3 steps:
I prefer to separate the training into 3 distinct steps for clarity and monitoring.
1. `Word2Vec()`: 
>In this first step, I set up the parameters of the model one-by-one. <br>I do not supply the parameter `sentences`, and therefore leave the model uninitialized, purposefully.
2. `.build_vocab()`: 
>Here the model builds the vocabulary from a sequence of sentences, which initializes it. <br>With the log output, I can follow the progress and, even more importantly, the effect of `min_count` and `sample` on the word corpus. I noticed that these two parameters, and in particular `sample`, have a great influence on the performance of a model. Displaying both allows for more accurate and easier management of their influence.
3. `.train()`:
>Finally, this step trains the model.<br>
The log output here is mainly useful for monitoring, making sure that no threads are executed instantaneously.

In [8]:
# Number of CPUs reported for this machine; used below to choose how many
# worker threads the model trains with.
cores = multiprocessing.cpu_count()

## The parameters:

* `min_count` <font color='purple'>=</font> <font color='green'>int</font> - Ignores all words with total absolute frequency lower than this - (2, 100)


* `window` <font color='purple'>=</font> <font color='green'>int</font> - The maximum distance between the current and predicted word within a sentence. E.g. `window` words on the left and `window` words on the right of our target - (2, 10)


* `size` <font color='purple'>=</font> <font color='green'>int</font> - Dimensionality of the feature vectors. - (50, 300)


* `sample` <font color='purple'>=</font> <font color='green'>float</font> - The threshold for configuring which higher-frequency words are randomly downsampled. Highly influential.  - (0, 1e-5)


* `alpha` <font color='purple'>=</font> <font color='green'>float</font> - The initial learning rate - (0.01, 0.05)


* `min_alpha` <font color='purple'>=</font> <font color='green'>float</font> - Learning rate will linearly drop to `min_alpha` as training progresses. To set it: alpha - (min_alpha * epochs) ~ 0.00


* `negative` <font color='purple'>=</font> <font color='green'>int</font> - If > 0, negative sampling will be used, the int for negative specifies how many "noise words" should be drawn. If set to 0, no negative sampling is used. - (5, 20)


* `workers` <font color='purple'>=</font> <font color='green'>int</font> - Use these many worker threads to train the model (=faster training with multicore machines)

### Dimension of word embedding
The optimal dimensionality of word embeddings is mostly task-dependent: a smaller dimensionality works better for more syntactic tasks such as named entity recognition (Melamud et al., 2016) [3] or part-of-speech (POS) tagging (Plank et al., 2016) [4], while a larger dimensionality is more useful for more semantic tasks such as sentiment analysis (Ruder et al., 2016) [5].

- [3] -> http://arxiv.org/abs/1601.00893
- [4] -> Plank, B., Søgaard, A., & Goldberg, Y. (2016). Multilingual Part-of-Speech Tagging with Bidirectional Long Short-Term Memory Models and Auxiliary Loss. In Proceedings of the 54th Annual Meeting of the Association for Computational Linguistics. 
- [5] -> http://arxiv.org/abs/1609.02745

In [9]:
# Configure the model without passing any sentences, so it stays uninitialized
# until build_vocab() is called in a later cell.
w2v_model = Word2Vec(min_count=20,      # ignore tokens seen fewer than 20 times
                     window=5,          # max distance between current and predicted word
                     size=150,          # dimensionality of the word vectors
                     sample=6e-5,       # downsampling threshold for frequent tokens
                     alpha=0.03,        # initial learning rate
                     min_alpha=0.0007,  # learning rate drops linearly to this value
                     negative=20,       # number of "noise words" drawn per sample
                     # Leave one core free, but never request zero workers:
                     # on a single-core machine `cores - 1` would be 0.
                     workers=max(1, cores - 1))

## Building the Vocabulary Table:
Word2Vec requires us to build the vocabulary table (simply digesting all the words and filtering out the unique words, and doing some basic counts on them):

In [10]:
# Build the vocabulary table from the corpus and report how long it took.
vocab_start = time()
w2v_model.build_vocab(tokenized_paras, progress_per=10000)
vocab_mins = round((time() - vocab_start) / 60, 2)

print('Time to build vocab: {} mins'.format(vocab_mins))

Time to build vocab: 0.11 mins


## Training of the model:
_Parameters of the training:_
* `total_examples` <font color='purple'>=</font> <font color='green'>int</font> - Count of sentences;
* `epochs` <font color='purple'>=</font> <font color='green'>int</font> - Number of iterations (epochs) over the corpus - [10, 20, 30]

In [11]:
# Train for 30 epochs over the corpus and report how long it took.
# total_examples is read from the model's own corpus_count attribute.
train_start = time()
w2v_model.train(
    tokenized_paras,
    total_examples=w2v_model.corpus_count,
    epochs=30,
    report_delay=1,
)
train_mins = round((time() - train_start) / 60, 2)

print('Time to train the model: {} mins'.format(train_mins))

Time to train the model: 5.6 mins


In [12]:
# Precompute normalized vectors for the similarity queries below.
# NOTE(review): in gensim 3.x, replace=True is documented to keep only the
# normalized vectors, after which the model cannot be trained further —
# confirm against the installed gensim version.
w2v_model.init_sims(replace=True)

# Exploring the model

In [13]:
# Show the tokens whose vectors are most similar to the query token,
# as (token, similarity score) pairs.
w2v_model.wv.most_similar(positive=["སྟོབས་"])

[('མཐུ་', 0.608696699142456),
 ('ནུས་སྟོབས་', 0.45071011781692505),
 ('ཤུགས་', 0.4431358575820923),
 ('གོམས་པ་', 0.4223802387714386),
 ('སོར་ཆུད་པ་', 0.42098259925842285),
 ('ཡོན་ཏན་', 0.4176880419254303),
 ('སྐྱེས་ཐོབ་', 0.4152085483074188),
 ('ལྡན་པ་', 0.41036728024482727),
 ('མཐུ་ཆེན་པོ་', 0.4098166823387146),
 ('བརྩོན་འགྲུས་', 0.4016813039779663)]

In [14]:
# Nearest neighbours of a second query token, as (token, similarity score) pairs.
w2v_model.wv.most_similar(positive=["མཛད་པ་"])

[('མཛད་', 0.7612118721008301),
 ('ཕྲིན་ལས་', 0.5403375029563904),
 ('མཛད་པ་པོ་', 0.49853041768074036),
 ('སྟོན་པ་', 0.4814871549606323),
 ('འགྲེལ་པ་', 0.47240862250328064),
 ('སྒྲུབ་ཐབས་', 0.4716480076313019),
 ('གྲགས་པ་', 0.46867769956588745),
 ('གདུལ་བྱ་', 0.4681282937526703),
 ('བསྟན་པ་', 0.4617062211036682),
 ('འཕྲིན་ལས་', 0.4564966857433319)]

In [15]:
# Nearest neighbours of a third query token, as (token, similarity score) pairs.
w2v_model.wv.most_similar(positive=["བླ་མ་"])

[('བྱིན་རླབས་', 0.6062471866607666),
 ('མོས་གུས་', 0.564081072807312),
 ('མཚན་ལྡན་', 0.5319912433624268),
 ('རྗེ་', 0.5182524919509888),
 ('བཀའ་དྲིན་', 0.5181214809417725),
 ('བརྒྱུད་པ་', 0.5165523886680603),
 ('ཡི་དམ་', 0.5100193023681641),
 ('དད་གུས་', 0.5081140995025635),
 ('དྲིན་ཅན་', 0.4949682950973511),
 ('སྐྱབས་གནས་', 0.48522794246673584)]

In [16]:
# Nearest neighbours of a fourth query token, as (token, similarity score) pairs.
w2v_model.wv.most_similar(positive=["རྩ་བ་"])

[('ལ་རྩ་བ་', 0.504997730255127),
 ('མདོར་ན་', 0.40671637654304504),
 ('ཐམས་ཅད་', 0.37312084436416626),
 ('བསྐྱེད་པ་', 0.36299213767051697),
 ('རྟེན་གཞི་', 0.3564617335796356),
 ('རྒྱུད་', 0.35559332370758057),
 ('སྡོམ་པ་', 0.35458317399024963),
 ('གཙོ་བོ་', 0.35075682401657104),
 ('གསུམ་པོ་', 0.3506447374820709),
 ('སྤྱིར་', 0.3458601236343384)]

In [17]:
# Nearest neighbours of a fifth query token; the same query is repeated on the
# reloaded vectors at the end of the notebook to check the save/load round trip.
w2v_model.wv.most_similar(positive=["ཉིད་"])

[('དེ་ཉིད་', 0.784591794013977),
 ('རང་བཞིན་', 0.6753857731819153),
 ('ངོ་བོ་', 0.6689523458480835),
 ('བདག་ཉིད་', 0.6479246616363525),
 ('ནི་', 0.636000394821167),
 ('ཕྱི་', 0.6137641668319702),
 ('དེ་བཞིན་ཉིད་', 0.5992171764373779),
 ('ནོ་', 0.5715460181236267),
 ('ཞེ་', 0.5701719522476196),
 ('གི་', 0.5472181439399719)]

# Save the word2vec

In [18]:
# Export the trained vectors as a text-format word2vec file, and write the
# vocabulary to a separate file alongside it.
w2v_model.wv.save_word2vec_format(
    fname="./bo_word2vec_lammatized",
    fvocab="./vocab",
    binary=False,
)

In [19]:
# List the working directory to confirm the vector and vocab files were written.
!ls

__notebook__.ipynb  __output__.json  bo_word2vec_lammatized  vocab


In [20]:
from gensim.models import KeyedVectors

In [21]:
# Reload the exported text-format vectors as a standalone KeyedVectors object.
wv_from_text = KeyedVectors.load_word2vec_format(
    fname='bo_word2vec_lammatized',
    binary=False,
)

In [22]:
# `wv_from_text` is already a KeyedVectors object, so query it directly.
# Going through `.wv` on a KeyedVectors is deprecated in gensim 3.x (it emits a
# DeprecationWarning) and the attribute does not exist in gensim 4.
# The result should match the same query on the in-memory model above.
wv_from_text.most_similar(positive=["ཉིད་"])

  """Entry point for launching an IPython kernel.


[('དེ་ཉིད་', 0.784591794013977),
 ('རང་བཞིན་', 0.6753857731819153),
 ('ངོ་བོ་', 0.6689522862434387),
 ('བདག་ཉིད་', 0.6479246616363525),
 ('ནི་', 0.636000394821167),
 ('ཕྱི་', 0.613764226436615),
 ('དེ་བཞིན་ཉིད་', 0.5992171764373779),
 ('ནོ་', 0.5715460181236267),
 ('ཞེ་', 0.5701720118522644),
 ('གི་', 0.5472180843353271)]