In [1]:
import multiprocessing
import os

import numpy as np
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [2]:
import logging

# Configure the root logger to emit INFO-and-above records, each prefixed with
# a timestamp and its level, so gensim's progress messages show up in the cell
# output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [3]:
# Number of CPUs reported by multiprocessing; used later in the notebook to
# size the Word2Vec worker pool.
cores = multiprocessing.cpu_count()

In [4]:
# Corpus reader over the preprocessed monolingual file.
# NOTE(review): presumably one whitespace-tokenised sentence per line, as
# LineSentence expects — confirm against the preprocessing step.
texts = LineSentence('Data/monolingual_processed.hi')

In [9]:
from gensim.models.callbacks import CallbackAny2Vec

class losscallback(CallbackAny2Vec):
    """
    Callback that prints and records the training loss after each epoch.

    Attributes:
        epoch: zero-based index of the next epoch to be reported.
        losses: loss value read at the end of each epoch (as floats), in order.
        loss_previous_step: loss read at the most recent epoch end, or None
            until the first epoch has finished.
    """
    def __init__(self):
        self.epoch = 0
        self.losses = []
        # Defined up front so the attribute exists before the first epoch ends
        # (previously it was only created inside on_epoch_end).
        self.loss_previous_step = None

    def on_epoch_end(self, model):
        """
        Print and store the loss for the epoch that just finished.

        Args:
            model: the model being trained; must provide
                get_latest_training_loss() and a writable
                running_training_loss attribute.
        """
        loss = model.get_latest_training_loss()
        self.losses.append(float(loss))
        print('Loss after epoch {}: {}'.format(self.epoch, loss))

        self.epoch += 1
        self.loss_previous_step = loss

        # Zero the model's running tally so the next reading covers only the
        # following epoch (assumes gensim accumulates the loss into
        # running_training_loss across epochs — see the gensim docs).
        model.running_training_loss = 0.0

In [4]:
# Fresh, untrained model with 200-dimensional vectors and 20 training epochs.
# One core is left free for the rest of the system, but at least one worker is
# always requested: on a single-core machine `cores - 1` would be 0, which
# would leave gensim without any training threads.
w2v_model = Word2Vec(workers = max(1, cores - 1), vector_size = 200, epochs = 20)

2023-07-04 08:59:39,799 : INFO : Word2Vec lifecycle event {'params': 'Word2Vec<vocab=0, vector_size=200, alpha=0.025>', 'datetime': '2023-07-04T08:59:39.799383', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'created'}


In [7]:
# Scan the corpus once to collect words and their counts; this builds the
# vocabulary the model needs before it can be trained.
w2v_model.build_vocab(texts)

2023-07-03 22:56:22,696 : INFO : collecting all words and their counts
2023-07-03 22:56:23,583 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-07-03 22:56:23,668 : INFO : PROGRESS: at sentence #10000, processed 264835 words, keeping 20686 word types
2023-07-03 22:56:23,755 : INFO : PROGRESS: at sentence #20000, processed 524586 words, keeping 34609 word types
2023-07-03 22:56:23,838 : INFO : PROGRESS: at sentence #30000, processed 721931 words, keeping 57814 word types
2023-07-03 22:56:23,909 : INFO : PROGRESS: at sentence #40000, processed 924646 words, keeping 75529 word types
2023-07-03 22:56:24,004 : INFO : PROGRESS: at sentence #50000, processed 1114071 words, keeping 89641 word types
2023-07-03 22:56:24,077 : INFO : PROGRESS: at sentence #60000, processed 1310358 words, keeping 104915 word types
2023-07-03 22:56:24,163 : INFO : PROGRESS: at sentence #70000, processed 1515973 words, keeping 121481 word types
2023-07-03 22:56:24,251 : INFO : PROGRESS

In [8]:
# Pretraining run on the monolingual corpus. The callback instance is kept in
# callBack1 so the losses it records can be inspected after training.
callBack1 = losscallback()
w2v_model.train(
    texts,
    epochs = w2v_model.epochs,
    total_examples = w2v_model.corpus_count,
    compute_loss = True,
    callbacks = [callBack1],
    report_delay = 60,
)

2023-07-03 23:02:02,970 : INFO : Word2Vec lifecycle event {'msg': 'training model with 5 workers on 740394 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-07-03T23:02:02.970515', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'train'}
2023-07-03 23:02:04,095 : INFO : EPOCH 0 - PROGRESS: at 0.00% examples, 7147 words/s, in_qsize 8, out_qsize 1
2023-07-03 23:03:04,096 : INFO : EPOCH 0 - PROGRESS: at 8.99% examples, 848788 words/s, in_qsize 9, out_qsize 0
2023-07-03 23:04:04,119 : INFO : EPOCH 0 - PROGRESS: at 18.04% examples, 855741 words/s, in_qsize 9, out_qsize 0
2023-07-03 23:05:04,137 : INFO : EPOCH 0 - PROGRESS: at 27.04% examples, 856345 words/s, in_qsize 9, out_qsize 0
2023-07-03 23:06:04,152 : INFO : EPOCH 0 - PROGRESS: at 36.10% examples, 858180 words/s, in_qsize 9, out_qsize 0
2023-07-03 23:07:04,152 : IN

Loss after epoch 0: 78535904.0
Loss after epoch 1: 79059480.0
Loss after epoch 2: 78365112.0
Loss after epoch 3: 79930576.0
Loss after epoch 4: 78301584.0
Loss after epoch 5: 79019376.0
Loss after epoch 6: 78766920.0
Loss after epoch 7: 78452648.0
Loss after epoch 8: 77023200.0
Loss after epoch 9: 77386176.0
Loss after epoch 10: 77515152.0
Loss after epoch 11: 77350352.0
Loss after epoch 12: 75947288.0
Loss after epoch 13: 75895560.0
Loss after epoch 14: 74675920.0
Loss after epoch 15: 74035488.0
Loss after epoch 16: 73609560.0
Loss after epoch 17: 72379432.0
Loss after epoch 18: 70213040.0
Loss after epoch 19: 69539624.0


(11553297175, 14519090460)

In [9]:
# Create the output directory first so a missing folder cannot make the save
# fail after the long pretraining run.
os.makedirs('w2vecModels', exist_ok=True)
w2v_model.save('w2vecModels/w2vecModel_pretrained')

2023-07-04 02:39:40,925 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'w2vecModels/w2vecModel_pretrained', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-07-04T02:39:40.925313', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'saving'}
2023-07-04 02:39:40,926 : INFO : storing np array 'vectors' to w2vecModels/w2vecModel_pretrained.wv.vectors.npy
2023-07-04 02:39:45,548 : INFO : storing np array 'syn1neg' to w2vecModels/w2vecModel_pretrained.syn1neg.npy
2023-07-04 02:39:49,535 : INFO : not storing attribute cum_table
2023-07-04 02:39:50,202 : INFO : saved w2vecModels/w2vecModel_pretrained


Continue training Word2Vec on the training data

In [6]:
# Reload the model that was saved after pretraining on the monolingual corpus,
# so training can continue from it.
# NOTE(review): gensim's save/load is pickle-based as far as I know — only
# load model files you produced or trust.
w2v_model = Word2Vec.load('w2vecModels/w2vecModel_pretrained')

2023-07-04 08:36:55,551 : INFO : loading Word2Vec object from w2vecModels/w2vecModel_pretrained
2023-07-04 08:36:56,669 : INFO : loading wv recursively from w2vecModels/w2vecModel_pretrained.wv.* with mmap=None
2023-07-04 08:36:56,670 : INFO : loading vectors from w2vecModels/w2vecModel_pretrained.wv.vectors.npy with mmap=None
2023-07-04 08:37:11,498 : INFO : loading syn1neg from w2vecModels/w2vecModel_pretrained.syn1neg.npy with mmap=None
2023-07-04 08:37:26,018 : INFO : setting ignored attribute cum_table to None
2023-07-04 08:37:30,947 : INFO : Word2Vec lifecycle event {'fname': 'w2vecModels/w2vecModel_pretrained', 'datetime': '2023-07-04T08:37:30.947556', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'loaded'}


In [7]:
# Corpus reader over the preprocessed training split.
# NOTE: this rebinds `texts`, which earlier in the notebook referred to the
# monolingual corpus.
texts = LineSentence('Data/train_processed.hi')

In [8]:
# update=True adds words from the training split to the vocabulary of the
# loaded model instead of building a vocabulary from scratch.
w2v_model.build_vocab(texts, update=True)

2023-07-04 08:37:31,003 : INFO : collecting all words and their counts
2023-07-04 08:37:31,138 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-07-04 08:37:31,159 : INFO : PROGRESS: at sentence #10000, processed 33103 words, keeping 1671 word types
2023-07-04 08:37:31,179 : INFO : PROGRESS: at sentence #20000, processed 62742 words, keeping 2049 word types
2023-07-04 08:37:31,202 : INFO : PROGRESS: at sentence #30000, processed 98109 words, keeping 2901 word types
2023-07-04 08:37:31,228 : INFO : PROGRESS: at sentence #40000, processed 152401 words, keeping 3334 word types
2023-07-04 08:37:31,251 : INFO : PROGRESS: at sentence #50000, processed 195898 words, keeping 3935 word types
2023-07-04 08:37:31,276 : INFO : PROGRESS: at sentence #60000, processed 238895 words, keeping 4401 word types
2023-07-04 08:37:31,298 : INFO : PROGRESS: at sentence #70000, processed 276937 words, keeping 4672 word types
2023-07-04 08:37:31,324 : INFO : PROGRESS: at sentence #

In [10]:
# Continue training on the training split. The callback is kept in a variable
# (as the pretraining cell does with callBack1) so the per-epoch losses it
# records stay available after training instead of being discarded.
callBack2 = losscallback()
w2v_model.train(texts, total_examples = w2v_model.corpus_count, epochs = w2v_model.epochs, report_delay = 10, callbacks = [callBack2], compute_loss = True)

2023-07-04 08:38:06,046 : INFO : Word2Vec lifecycle event {'msg': 'training model with 5 workers on 750325 vocabulary and 200 features, using sg=0 hs=0 sample=0.001 negative=5 window=5 shrink_windows=True', 'datetime': '2023-07-04T08:38:06.046906', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'train'}
2023-07-04 08:38:07,050 : INFO : EPOCH 0 - PROGRESS: at 8.75% examples, 485363 words/s, in_qsize 8, out_qsize 0
2023-07-04 08:38:17,071 : INFO : EPOCH 0 - PROGRESS: at 83.30% examples, 805965 words/s, in_qsize 10, out_qsize 1
2023-07-04 08:38:20,659 : INFO : EPOCH 0: training on 15383151 raw words (12194885 effective words) took 14.6s, 834691 effective words/s
2023-07-04 08:38:21,686 : INFO : EPOCH 1 - PROGRESS: at 10.10% examples, 539607 words/s, in_qsize 9, out_qsize 0
2023-07-04 08:38:31,695 : INFO : EPOCH 1 - PROGRESS: at 83.34% examples, 806191 words/s, in_qsize 9, out_qsize 0


Loss after epoch 0: 3243746.25
Loss after epoch 1: 3652342.25
Loss after epoch 2: 3568681.25
Loss after epoch 3: 3298875.25
Loss after epoch 4: 3273404.75
Loss after epoch 5: 3278452.5
Loss after epoch 6: 3175872.75
Loss after epoch 7: 3070387.75
Loss after epoch 8: 3146217.0
Loss after epoch 9: 3108932.75
Loss after epoch 10: 2962722.25
Loss after epoch 11: 2730233.5
Loss after epoch 12: 2989535.5
Loss after epoch 13: 2946964.0
Loss after epoch 14: 2748697.25
Loss after epoch 15: 2942714.75
Loss after epoch 16: 2880073.5
Loss after epoch 17: 2663759.75
Loss after epoch 18: 2731174.0
Loss after epoch 19: 2700848.0


(243916081, 307663020)

In [11]:
# Create the output directory first so a missing folder cannot make the save
# fail after training has finished.
os.makedirs('w2vecModels', exist_ok=True)
w2v_model.save('w2vecModels/w2vecModel_trained.hi')

2023-07-04 08:43:01,846 : INFO : Word2Vec lifecycle event {'fname_or_handle': 'w2vecModels/w2vecModel_trained.hi', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2023-07-04T08:43:01.846155', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'saving'}
2023-07-04 08:43:01,847 : INFO : storing np array 'vectors' to w2vecModels/w2vecModel_trained.hi.wv.vectors.npy
2023-07-04 08:43:06,621 : INFO : storing np array 'syn1neg' to w2vecModels/w2vecModel_trained.hi.syn1neg.npy
2023-07-04 08:43:10,858 : INFO : not storing attribute cum_table
2023-07-04 08:43:11,553 : INFO : saved w2vecModels/w2vecModel_trained.hi


In [5]:
# Reload the model saved after continued training on the training split.
# NOTE(review): gensim's save/load is pickle-based as far as I know — only
# load model files you produced or trust.
w2v_model = Word2Vec.load('w2vecModels/w2vecModel_trained.hi')

2023-07-04 09:00:02,272 : INFO : loading Word2Vec object from w2vecModels/w2vecModel_trained.hi
2023-07-04 09:00:03,612 : INFO : loading wv recursively from w2vecModels/w2vecModel_trained.hi.wv.* with mmap=None
2023-07-04 09:00:03,613 : INFO : loading vectors from w2vecModels/w2vecModel_trained.hi.wv.vectors.npy with mmap=None
2023-07-04 09:00:24,787 : INFO : loading syn1neg from w2vecModels/w2vecModel_trained.hi.syn1neg.npy with mmap=None
2023-07-04 09:00:44,628 : INFO : setting ignored attribute cum_table to None
2023-07-04 09:00:49,960 : INFO : Word2Vec lifecycle event {'fname': 'w2vecModels/w2vecModel_trained.hi', 'datetime': '2023-07-04T09:00:49.960291', 'gensim': '4.3.1', 'python': '3.10.11 (main, May 16 2023, 00:28:57) [GCC 11.2.0]', 'platform': 'Linux-5.15.0-1035-azure-x86_64-with-glibc2.31', 'event': 'loaded'}
