In [2]:
import logging
import multiprocessing
import os
from collections import Counter
from typing import List, Union

from gensim.models import Word2Vec
from indicnlp.tokenize.indic_tokenize import trivial_tokenize_indic
from tqdm import tqdm

# Register tqdm with pandas (this is what enables `.progress_apply`).
# NOTE(review): no cell visible in this notebook uses pandas — confirm this call is still needed.
tqdm.pandas()

In [3]:
# Surface gensim's progress messages in the notebook: everything at INFO and
# above is emitted as "<LEVEL> - <HH:MM:SS>: <message>".
logging.basicConfig(level=logging.INFO, datefmt='%H:%M:%S', format="%(levelname)s - %(asctime)s: %(message)s")

# Notebook-level logger for our own messages.
logger = logging.getLogger(__name__)

## Utilities

In [4]:
def tokenize_text(text: List[str]) -> List[List[str]]:
    """Tokenize every sentence in ``text`` with ``trivial_tokenize_indic``.

    Args:
        text: the sentences to tokenize, one string per sentence.

    Returns:
        One token list per input sentence, in the same order as ``text``.
        A tqdm progress bar labelled ``tokenize`` is shown while running.
    """
    tokenized_sentences = []
    for sentence in tqdm(text, desc='tokenize', unit=' sentences'):
        tokenized_sentences.append(trivial_tokenize_indic(sentence))
    return tokenized_sentences

In [21]:
def train_w2v(tokenized_text: List[List[str]], size: int = 100, window: int = 5, min_count: int = 1, epochs: int = 10,
              random_seed: int = 123, vec_file_path: Union[str, None] = None, ) -> "Word2Vec":
    """Learn word2vec embeddings from pre-tokenized sentences.

    Args:
        tokenized_text: the corpus, one list of tokens per sentence.
        size: forwarded to ``Word2Vec`` as ``size`` (embedding dimensionality).
        window: forwarded to ``Word2Vec`` as ``window``.
        min_count: forwarded to ``Word2Vec`` as ``min_count``.
        epochs: number of training epochs passed to ``model.train``.
        random_seed: forwarded to ``Word2Vec`` as ``seed``.
        vec_file_path: when not ``None``, the learned word vectors are also
            written to this path in the text (non-binary) word2vec format.

    Returns:
        The trained ``Word2Vec`` model.
    """
    # Leave one core free for the rest of the system, but never drop below one
    # worker: on a single-core machine ``cores - 1`` would be 0 and gensim would
    # have no thread to train with.
    cores = multiprocessing.cpu_count()
    workers = max(1, cores - 1)
    # NOTE(review): per the gensim docs, a fixed seed is only fully reproducible
    # with a single worker thread; with several workers results can vary slightly.
    # NOTE(review): ``size=`` (and ``sentences=`` below) are the gensim 3.x keyword
    # names; gensim 4 renamed them — confirm the pinned gensim version.
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=workers,
        seed=random_seed,
    )
    # build vocab
    model.build_vocab(sentences=tokenized_text, progress_per=1000000)  # show progress after processing every 1M words
    # train
    model.train(sentences=tokenized_text, total_examples=model.corpus_count, epochs=epochs,
                report_delay=10)  # show progress after every 10 seconds
    if vec_file_path is not None:
        model.wv.save_word2vec_format(vec_file_path, binary=False)
    return model

## Load data

For learning the Odia word embeddings, we need monolingual Odia text data.
You can possibly scrape data from an online source such as Wikipedia.
For our experiments now, let's take the Odia monolingual text data available as part of the [Indic NLP corpus](https://github.com/AI4Bharat/indicnlp_corpus).

In [8]:
# Location of the monolingual Odia corpus, relative to the notebook's working
# directory. Join the components so the separator is not hard-coded.
filename = os.path.join('data', 'or')
# sanity check: fail early, with a readable message, if the corpus is missing
assert os.path.isfile(filename), f'corpus file not found: {filename}'

In [9]:
# Load the whole corpus into memory: one entry per line of the file, with
# leading/trailing whitespace (including the newline) removed.
with open(filename, mode='r', encoding='utf-8') as f:
    lines = list(map(str.strip, tqdm(f.readlines(), desc='read lines from file')))

read lines from file: 100%|██████████| 3594672/3594672 [00:02<00:00, 1523841.53it/s]


In [10]:
# tokenize every corpus line; tokens[i] is the token list for lines[i]
tokens = tokenize_text(lines)

tokenize: 100%|██████████| 3594672/3594672 [01:21<00:00, 44039.58 sentences/s]


In [11]:
# NOTE(review): these two counters are initialised but never updated or read in
# the visible cells — confirm whether they can be dropped.
num_running_toks = 0
num_unique_toks = 0

# Corpus-wide token frequencies, accumulated one sentence at a time.
counter = Counter()
for toks in tqdm(tokens, unit=' sentences', desc='compute frequencies of tokens'):
    counter.update(toks)

compute frequencies of tokens: 100%|██████████| 3594672/3594672 [00:21<00:00, 169660.67 sentences/s]


In [12]:
# Corpus summary: sentence count, vocabulary size (distinct tokens) and the
# total number of token occurrences.
print(f'Number of sentences: {len(lines):,}')
print(f'Number of unique words or equivalantly, the size of vocabulary: {len(counter):,}')
print(f'Number of running words: {sum(counter.values()):,}')

Number of sentences: 3,594,672
Number of unique words or equivalantly, the size of vocabulary: 778,862
Number of running words: 51,151,273


In [13]:
# the 20 most frequent tokens with their counts, most frequent first
counter.most_common(n=20)

[('।', 3393061),
 (',', 1191253),
 ('ଓ', 534792),
 ('ଏହି', 437185),
 ('ପାଇଁ', 373726),
 ('ସେ', 240775),
 ('ବୋଲି', 239837),
 ('ପରେ', 224959),
 ('କରି', 221628),
 ('ଏକ', 213516),
 ('ମଧ୍ୟ', 210907),
 ('ଏବଂ', 198988),
 ('କରିଥିଲେ', 195168),
 ('ସହ', 177040),
 ('-', 174796),
 ('ଖବର', 169373),
 ('.', 166728),
 ('କରିବା', 166276),
 ('ନେଇ', 161728),
 ('ବେଳେ', 156327)]

## Learn embeddings

In [22]:
# Train on the full tokenized corpus. Tokens seen fewer than 20 times are left
# out of the vocabulary, and the learned vectors are also written to
# embeddings.txt in text word2vec format.
w2vmodel = train_w2v(
    tokenized_text=tokens,
    size=100,
    window=5,
    min_count=20,
    epochs=10,
    random_seed=123,
    vec_file_path='embeddings.txt',
)

INFO - 02:35:17: collecting all words and their counts
INFO - 02:35:17: PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
INFO - 02:35:21: PROGRESS: at sentence #1000000, processed 14406915 words, keeping 356423 word types
INFO - 02:35:24: PROGRESS: at sentence #2000000, processed 28227997 words, keeping 518060 word types
INFO - 02:35:28: PROGRESS: at sentence #3000000, processed 42532970 words, keeping 692676 word types
INFO - 02:35:30: collected 778862 word types from a corpus of 51151273 raw words and 3594672 sentences
INFO - 02:35:30: Loading a fresh vocabulary
INFO - 02:35:30: effective_min_count=20 retains 72827 unique words (9% of original 778862, drops 706035)
INFO - 02:35:30: effective_min_count=20 leaves 49262024 word corpus (96% of original 51151273, drops 1889249)
INFO - 02:35:31: deleting the raw counts dictionary of 778862 items
INFO - 02:35:31: sample=0.001 downsamples 22 most-common words
INFO - 02:35:31: downsampling leaves estimated 43948889 word corpu

## Evaluate embeddings

Here we evaluate the learned embeddings informally: we simply look (👀) at the nearest neighbours of a few words and check whether they are similar in meaning.

In [25]:
# find the 10 words whose vectors are most similar to the query word's vector;
# each result is a (word, similarity score) pair, highest score first
w2vmodel.wv.most_similar('ଗଛ', topn=10)

[('ଗଛକୁ', 0.8128724694252014),
 ('ଗଛଗୁଡ଼ିକ', 0.7983400821685791),
 ('ଆମ୍ବଗଛ', 0.7657703161239624),
 ('ଗଛର', 0.7544435262680054),
 ('ତାଳଗଛ', 0.753383994102478),
 ('ଗଛଟିଏ', 0.7315728664398193),
 ('ଗଛଟି', 0.7306054830551147),
 ('ଗଛପତ୍ର', 0.7241652011871338),
 ('ବୃକ୍ଷ', 0.714855968952179),
 ('ଗଛଗୁଡିକୁ', 0.7127572298049927)]

In [26]:
# same nearest-neighbour check for a second query word
w2vmodel.wv.most_similar('ସଙ୍ଗୀତ', topn=10)

[('ସଂଗୀତ', 0.9450031518936157),
 ('ସଂଙ୍ଗୀତ', 0.8198200464248657),
 ('ଓଡ଼ିଶୀ', 0.7925612926483154),
 ('ସଙ୍ଗିତ', 0.7761118412017822),
 ('ନାଟ୍ୟ', 0.7644525766372681),
 ('ନୃତ୍ୟ', 0.7506800889968872),
 ('ସଙ୍ଗୀତରେ', 0.7364827394485474),
 ('ସଙ୍ଗୀତର', 0.7345959544181824),
 ('ହିନ୍ଦୁସ୍ଥାନୀ', 0.7241092920303345),
 ('ନୃତ୍ୟାଙ୍ଗନା', 0.71225905418396)]

In [27]:
# same nearest-neighbour check for a third query word
w2vmodel.wv.most_similar('ଚଳଚ୍ଚିତ୍ର', topn=10)

[('ଚଳଚିତ୍ର', 0.8641869425773621),
 ('ଫିଲ୍ମ', 0.8291294574737549),
 ('ସିନେମା', 0.8122552633285522),
 ('ସିନେ', 0.748349666595459),
 ('ଚଳଚ୍ଚିତ୍ରର', 0.7161926031112671),
 ('ଧାରାବାହିକ', 0.7104039192199707),
 ('ଚଳଚ୍ଚିତ୍ରଟି', 0.7052581906318665),
 ('ସିନେମାର', 0.7023676037788391),
 ('ଧାରାବାହିକର', 0.6961969137191772),
 ('ଆଲବମ୍', 0.6868106722831726)]