# Creating Word2Vec Embeddings

We will use sentences from periodika to create a word2vec model

In [1]:
# TODO load chunked data into a single dataframe
# TODO install gensim if it is not already installed
# TODO load lemma data in a format suitable for Word2Vec

# first check python version
import sys
print(f"Python version: {sys.version}")
import pickle

# date
from datetime import datetime
now = datetime.now()
print(f"Date: {now}")

# we will use Path from pathlib to work with paths
from pathlib import Path

# we will use tqdm for progress bars
from tqdm import tqdm

## We will be loading all parquet files from a parquet folder into a single pandas dataframe

## then we will calculate word richness for all novels and plot it using Plotly
# first install Pandas if needed using pip and optional dependencies
# https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#install-optional-dependencies

# pip install "pandas[performance, plot, output_formatting, computation, excel, html, parquet, compression, consortium-standard]"

# try to import pandas
try:
    import pandas as pd
    print(f"pandas version: {pd.__version__}")
except ImportError:
    print("pandas not found")
    raise ImportError

# issue with scipy library removing triu function
# https://stackoverflow.com/questions/78279136/importerror-cannot-import-name-triu-from-scipy-linalg-when-importing-gens
# https://github.com/piskvorky/gensim/issues/3525
try:
    import gensim
    print(f"gensim version: {gensim.__version__}")
except ImportError:
    print("gensim not found")
    raise ImportError

Python version: 3.12.6 (tags/v3.12.6:a4a2d2b, Sep  6 2024, 20:11:23) [MSC v.1940 64 bit (AMD64)]
Date: 2025-02-25 10:11:06.638636
pandas version: 2.2.1
gensim version: 4.3.2


In [2]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [3]:
import gensim.models

## Load and prepare parquet from latsenrom

In [4]:
src = Path("../../lnb_lat_sen_rom/parquet/latsenrom_2025_02_05.parquet")
# assert file exists
if src.exists():
    print(f"File {src} exists")
else:
    print(f"File does not exist")


File ..\..\lnb_lat_sen_rom\parquet\latsenrom_2025_02_05.parquet exists


In [5]:
# let's load df
df = pd.read_parquet(src)
# shape
print(f"Shape: {df.shape}")
# head
df.head()

Shape: (37605476, 17)


Unnamed: 0,deprel,form,index,lemma,parent,pos,tag,ufeats,upos,sent_ndx,author,title,dom_id,file_stem,file_stem_short,firstEdition,term
0,nmod,Mīlas,1,mīla,2.0,ncfsg_,ncfsg4,Case=Gen|Gender=Fem|Number=Sing,NOUN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,mīla
1,nmod,ārprāta,2,ārprāts,3.0,ncmsg_,ncmsg1,Case=Gen|Gender=Masc|Number=Sing,NOUN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,ārprāts
2,obl,varā,3,vara,6.0,ncfsl_,ncfsl4,Case=Loc|Gender=Fem|Number=Sing,NOUN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,vara
3,nmod,ROMĀNS,4,Romāns,6.0,npmsn_,npmsn1,Case=Nom|Gender=Masc|Number=Sing,PROPN,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,Romāns
4,punct,„,5,"""",6.0,zq,zq,_,PUNCT,0,AizsV,MilaU,1049452,AizsV_MilaU_1049452,AizsV_MilaU,1933,""""


In [6]:
# how many unique file_stem
print(f"Unique file_stem: {df.file_stem.nunique()}")

Unique file_stem: 485


In [7]:
# now we need to create a list of sentences from the lemma column
# we will group by file_stem and sent_ndx and then join the lemmas with a space
# we will use tqdm to show progress bar

# first we will group by file_stem and sent_ndx
grouped = df.groupby(["file_stem", "sent_ndx"])["lemma"].apply(list)
# let's check the first 5 rows
grouped.head()


file_stem            sent_ndx
AizsV_MilaU_1049452  0           [mīla, ārprāts, vara, Romāns, ", redzēt, ,, Ve...
                     1           [Līdija, pieliekties, un, nebēdīgi, uzšļākt, V...
                     2           [nu, tu, savs, sodīt, izbēgt, !, ", likties, ,...
                     3           [Vents, piedraudēt, un, satvert, viņa, roka, t...
                     4           [nekas, nebij, !, ", zināt, zinātnieks, rādīt,...
Name: lemma, dtype: object

In [8]:
# create a datafame from the grouped series that also has a sentence_length column
sentences_df = grouped.reset_index(name="sentences")
# add sentence_length
sentences_df["sentence_length"] = sentences.sentences.apply(len)
# check first 5 rows
sentences_df.head()

Unnamed: 0,file_stem,sent_ndx,sentences,sentence_length
0,AizsV_MilaU_1049452,0,"[mīla, ārprāts, vara, Romāns, "", redzēt, ,, Ve...",34
1,AizsV_MilaU_1049452,1,"[Līdija, pieliekties, un, nebēdīgi, uzšļākt, V...",34
2,AizsV_MilaU_1049452,2,"[nu, tu, savs, sodīt, izbēgt, !, "", likties, ,...",28
3,AizsV_MilaU_1049452,3,"[Vents, piedraudēt, un, satvert, viņa, roka, t...",35
4,AizsV_MilaU_1049452,4,"[nekas, nebij, !, "", zināt, zinātnieks, rādīt,...",34


In [9]:
# describe sentence_length
sentences_df.sentence_length.describe()

count    680143.000000
mean         55.290543
std          46.876084
min           1.000000
25%          25.000000
50%          43.000000
75%          70.000000
max         671.000000
Name: sentence_length, dtype: float64

## Save and Load sentences for word2vec parquet

In [4]:
# save sentences to parquet
sentences_dst = Path("../parquet/sentences_for_word2vec.parquet")
# sentences_df.to_parquet(sentences_dst, index=False, compression="gzip")
sentences_df = pd.read_parquet(sentences_dst)
# shape
print(f"Shape: {sentences_df.shape}")
# head
sentences_df.head()

Shape: (680143, 4)


Unnamed: 0,file_stem,sent_ndx,sentences,sentence_length
0,AizsV_MilaU_1049452,0,"[mīla, ārprāts, vara, Romāns, "", redzēt, ,, Ve...",34
1,AizsV_MilaU_1049452,1,"[Līdija, pieliekties, un, nebēdīgi, uzšļākt, V...",34
2,AizsV_MilaU_1049452,2,"[nu, tu, savs, sodīt, izbēgt, !, "", likties, ,...",28
3,AizsV_MilaU_1049452,3,"[Vents, piedraudēt, un, satvert, viņa, roka, t...",35
4,AizsV_MilaU_1049452,4,"[nekas, nebij, !, "", zināt, zinātnieks, rādīt,...",34


In [11]:
# sentences_df = sentences


In [5]:
# create a list of sentences by joining the lemmas with a space 
# sentences = sentences_df.sentences.apply(lambda x: " ".join(x)).tolist()
sentences = sentences_df.sentences.tolist()
# how many sentences?
print(f"Number of sentences: {len(sentences)}")

Number of sentences: 680143


In [12]:
# save sentences to pickle
sentences_pickle = Path("../pickle/sentences_for_word2vec_2025_02_07.pickle")
with open(sentences_pickle, "wb") as f:
    pickle.dump(sentences, f)
# print resulting file size
print(f"File size: {sentences_pickle.stat().st_size}")

File size: 128703282


In [15]:
# first 3 sentences
print(sentences[:3])

[['mīla', 'ārprāts', 'vara', 'Romāns', 'redzēt', 'Vents', 'cik', 'līksmi', 'griezties', 'dzirnaviņas', 'spārns', 'kā', 'zelts', 'taurenītis', 'mīla', 'rotaļa', 'viņš', 'nostatīt', 'tāļredze', 'spogulis', 'acs', 'Vents', 'kļūt', 'tā', 'žēl'], ['Līdija', 'pieliekties', 'un', 'nebēdīgi', 'uzšļākt', 'Vents', 'šalts', 'vēss', 'ūdens', 'desmit', 'kilometrs', 'attāļums', 'no', 'Bornholma', 'ārprātīgs', 'orkāns', 'sirmot', 'jūra', 'dzelme', 'viņš', 'pateikties', 'dievs', 'kas', 'būt', 'ļaut', 'nogalināt', 'šī', 'nelaimīgs', 'ģēnijs'], ['nu', 'tu', 'savs', 'sodīt', 'izbēgt', 'likties', 'ka', 'milzīgs', 'vilnis', 'skārta', 'padebesis', 'tik', 'augstu', 'lidot', 'vilnis', 'strūkla', 'ārprātīgs', 'ciešana', 'atspoguļoties', 'ikviens', 'Ādlers', 'vaibsts']]


In [17]:
# save sentences as text with each sentence on a new line
# then save it in a compressed zip file
sentences_txt = Path("../txt/sentences_for_word2vec_2025_02_07.txt")
with open(sentences_txt, "w", encoding="utf-8") as f:
    f.write("\n".join((" ".join(row_list) for row_list in sentences)))
# print resulting file size
print(f"File size: {sentences_txt.stat().st_size}")

# now compress the file using zip

import zipfile
zipfile_dst = Path("../zip/sentences_for_word2vec_2025_02_07.zip")
with zipfile.ZipFile(zipfile_dst, "w", compression=zipfile.ZIP_DEFLATED) as z:
    z.write(sentences_txt)

# print resulting file size
print(f"File size: {zipfile_dst.stat().st_size}")


File size: 194258734
File size: 70280009


In [6]:
# let's get rid of any elements that are punctuation
import string
# let's iterate over sentences and remove punctuation
sentences = [[word for word in sentence if word not in string.punctuation] for sentence in sentences]
# how many sentences?
print(f"Number of sentences: {len(sentences)}")

Number of sentences: 680143


In [7]:
# remove sentences that have less than 2 words
sentences = [sentence for sentence in sentences if len(sentence) > 1]
# how many sentences?
print(f"Number of sentences: {len(sentences)}")

Number of sentences: 676869


## Load Pickle

In [5]:
# # src will be sentences_periodika.pkl in pickles folder
# src = Path("../pickles/sentences_periodika.pkl")
# # assert that the file exists
# assert src.exists(), f"{src} does not exist"
# # read pickle
# with open(src, "rb") as f:
#     sentences = pickle.load(f)
# # how many sentences do we have?
# print(f"Number of sentences: {len(sentences)}")



Number of sentences: 9767880


In [8]:
# sentence lengths
sentence_lengths = [len(sentence) for sentence in sentences]
# min, avg and max length of sentences
min_length = min(sentence_lengths)
avg_length = sum(sentence_lengths) / len(sentences)
max_length = max(sentence_lengths)
print(f"Min sentence length: {min_length}")
print(f"Avg sentence length: {avg_length}")
print(f"Max sentence length: {max_length}")

Min sentence length: 2
Avg sentence length: 44.00467446433505
Max sentence length: 553


In [9]:
# lets do some extra logging using a callback
# https://datascience.stackexchange.com/questions/9819/number-of-epochs-in-gensim-word2vec-implementation

from gensim.models.callbacks import CallbackAny2Vec



class LossLogger(CallbackAny2Vec):
    '''Output loss at each epoch'''
    def __init__(self):
        self.epoch = 1
        self.start_losses = []
        self.end_losses = []

    def on_epoch_begin(self, model):
        loss = model.get_latest_training_loss()
        print(f'Epoch: {self.epoch} - START loss {loss}')
        self.start_losses.append(loss)

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        print(f'Epoch {self.epoch} - END  Loss: {loss}')
        self.end_losses.append(loss)
        self.epoch += 1

In [10]:
# Your model params:
loss_logger = LossLogger()
CONTEXT_WINDOW = 5
NEGATIVES = 5
MIN_COUNT = 5
EPOCHS = 20
sentences = sentences
# start time
start = datetime.now()
print(f"Start time: {start}")
model = gensim.models.word2vec.Word2Vec(sentences=sentences,
                                      sg=1,
                                      window=CONTEXT_WINDOW,
                                      negative=NEGATIVES,
                                      min_count=MIN_COUNT,
                                      callbacks=[loss_logger],
                                      compute_loss=True,
                                      epochs=EPOCHS)
# end time
end = datetime.now()
print(f"End time: {end}")
# save model
model.save("../models/word2vec_latsenrom.model")

2025-02-07 11:29:46,057 : INFO : collecting all words and their counts
2025-02-07 11:29:46,058 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2025-02-07 11:29:46,132 : INFO : PROGRESS: at sentence #10000, processed 404960 words, keeping 53341 word types
2025-02-07 11:29:46,203 : INFO : PROGRESS: at sentence #20000, processed 919455 words, keeping 78732 word types


Start time: 2025-02-07 11:29:46.056113


2025-02-07 11:29:46,283 : INFO : PROGRESS: at sentence #30000, processed 1364882 words, keeping 97160 word types
2025-02-07 11:29:46,339 : INFO : PROGRESS: at sentence #40000, processed 1618759 words, keeping 117713 word types
2025-02-07 11:29:46,391 : INFO : PROGRESS: at sentence #50000, processed 1945606 words, keeping 128202 word types
2025-02-07 11:29:46,461 : INFO : PROGRESS: at sentence #60000, processed 2372376 words, keeping 137170 word types
2025-02-07 11:29:46,518 : INFO : PROGRESS: at sentence #70000, processed 2686281 words, keeping 147298 word types
2025-02-07 11:29:46,671 : INFO : PROGRESS: at sentence #80000, processed 3499235 words, keeping 184174 word types
2025-02-07 11:29:46,778 : INFO : PROGRESS: at sentence #90000, processed 4208269 words, keeping 200358 word types
2025-02-07 11:29:46,838 : INFO : PROGRESS: at sentence #100000, processed 4585048 words, keeping 214584 word types
2025-02-07 11:29:46,905 : INFO : PROGRESS: at sentence #110000, processed 5004970 words,

Epoch: 1 - START loss 0.0


2025-02-07 11:29:54,882 : INFO : EPOCH 0 - PROGRESS: at 1.58% examples, 328881 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:29:55,931 : INFO : EPOCH 0 - PROGRESS: at 2.90% examples, 332301 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:29:56,950 : INFO : EPOCH 0 - PROGRESS: at 3.98% examples, 332426 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:29:57,948 : INFO : EPOCH 0 - PROGRESS: at 6.25% examples, 338527 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:29:58,968 : INFO : EPOCH 0 - PROGRESS: at 8.25% examples, 339139 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:30:00,017 : INFO : EPOCH 0 - PROGRESS: at 9.98% examples, 339801 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:30:01,034 : INFO : EPOCH 0 - PROGRESS: at 11.27% examples, 338684 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:30:02,054 : INFO : EPOCH 0 - PROGRESS: at 11.69% examples, 336754 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:30:03,083 : INFO : EPOCH 0 - PROGRESS: at 12.51% examples, 336549 words/s, in_qsize 5, o

Epoch 1 - END  Loss: 51315424.0
Epoch: 2 - START loss 51315424.0


2025-02-07 11:31:05,422 : INFO : EPOCH 1 - PROGRESS: at 1.49% examples, 307412 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:31:06,452 : INFO : EPOCH 1 - PROGRESS: at 2.88% examples, 325462 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:31:07,469 : INFO : EPOCH 1 - PROGRESS: at 3.74% examples, 324104 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:31:08,489 : INFO : EPOCH 1 - PROGRESS: at 6.05% examples, 330172 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:31:09,505 : INFO : EPOCH 1 - PROGRESS: at 8.07% examples, 332624 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:31:10,543 : INFO : EPOCH 1 - PROGRESS: at 9.47% examples, 334092 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:31:11,580 : INFO : EPOCH 1 - PROGRESS: at 11.25% examples, 336154 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:31:12,591 : INFO : EPOCH 1 - PROGRESS: at 11.65% examples, 334564 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:31:13,607 : INFO : EPOCH 1 - PROGRESS: at 12.48% examples, 335195 words/s, in_qsize 5, o

Epoch 2 - END  Loss: 68194984.0
Epoch: 3 - START loss 68194984.0


2025-02-07 11:32:16,180 : INFO : EPOCH 2 - PROGRESS: at 1.58% examples, 323830 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:32:17,184 : INFO : EPOCH 2 - PROGRESS: at 2.88% examples, 328409 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:32:18,228 : INFO : EPOCH 2 - PROGRESS: at 3.92% examples, 328920 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:32:19,247 : INFO : EPOCH 2 - PROGRESS: at 6.25% examples, 337667 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:32:20,262 : INFO : EPOCH 2 - PROGRESS: at 8.27% examples, 340170 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:32:21,285 : INFO : EPOCH 2 - PROGRESS: at 9.69% examples, 337129 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:32:22,281 : INFO : EPOCH 2 - PROGRESS: at 11.25% examples, 336782 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:32:23,298 : INFO : EPOCH 2 - PROGRESS: at 11.66% examples, 336492 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:32:24,344 : INFO : EPOCH 2 - PROGRESS: at 12.49% examples, 335644 words/s, in_qsize 5, o

Epoch 3 - END  Loss: 70496864.0
Epoch: 4 - START loss 70496864.0


2025-02-07 11:33:26,724 : INFO : EPOCH 3 - PROGRESS: at 1.58% examples, 318672 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:33:27,755 : INFO : EPOCH 3 - PROGRESS: at 2.89% examples, 325846 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:33:28,768 : INFO : EPOCH 3 - PROGRESS: at 3.86% examples, 326246 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:33:29,787 : INFO : EPOCH 3 - PROGRESS: at 6.15% examples, 332868 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:33:30,820 : INFO : EPOCH 3 - PROGRESS: at 8.25% examples, 335982 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:33:31,838 : INFO : EPOCH 3 - PROGRESS: at 9.98% examples, 338866 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:33:32,888 : INFO : EPOCH 3 - PROGRESS: at 11.28% examples, 339107 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:33:33,906 : INFO : EPOCH 3 - PROGRESS: at 11.78% examples, 337943 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:33:34,975 : INFO : EPOCH 3 - PROGRESS: at 12.59% examples, 336158 words/s, in_qsize 5, o

Epoch 4 - END  Loss: 72775416.0
Epoch: 5 - START loss 72775416.0


2025-02-07 11:34:37,675 : INFO : EPOCH 4 - PROGRESS: at 1.58% examples, 322711 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:34:38,709 : INFO : EPOCH 4 - PROGRESS: at 2.88% examples, 325011 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:34:39,727 : INFO : EPOCH 4 - PROGRESS: at 3.74% examples, 322037 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:34:40,728 : INFO : EPOCH 4 - PROGRESS: at 6.08% examples, 330631 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:34:41,762 : INFO : EPOCH 4 - PROGRESS: at 8.19% examples, 335072 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:34:42,801 : INFO : EPOCH 4 - PROGRESS: at 9.69% examples, 336505 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:34:43,797 : INFO : EPOCH 4 - PROGRESS: at 11.26% examples, 338593 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:34:44,837 : INFO : EPOCH 4 - PROGRESS: at 11.67% examples, 336320 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:34:45,865 : INFO : EPOCH 4 - PROGRESS: at 12.50% examples, 335643 words/s, in_qsize 5, o

Epoch 5 - END  Loss: 75099304.0
Epoch: 6 - START loss 75099304.0


2025-02-07 11:35:48,253 : INFO : EPOCH 5 - PROGRESS: at 1.60% examples, 333831 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:35:49,270 : INFO : EPOCH 5 - PROGRESS: at 2.90% examples, 335907 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:35:50,268 : INFO : EPOCH 5 - PROGRESS: at 3.98% examples, 336496 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:35:51,302 : INFO : EPOCH 5 - PROGRESS: at 6.25% examples, 338734 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:35:52,319 : INFO : EPOCH 5 - PROGRESS: at 8.27% examples, 341138 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:35:53,321 : INFO : EPOCH 5 - PROGRESS: at 9.98% examples, 342813 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:35:54,326 : INFO : EPOCH 5 - PROGRESS: at 11.27% examples, 342896 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:35:55,354 : INFO : EPOCH 5 - PROGRESS: at 11.71% examples, 340650 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:35:56,356 : INFO : EPOCH 5 - PROGRESS: at 12.51% examples, 339704 words/s, in_qsize 5, o

Epoch 6 - END  Loss: 77436248.0
Epoch: 7 - START loss 77436248.0


2025-02-07 11:36:58,193 : INFO : EPOCH 6 - PROGRESS: at 1.58% examples, 326339 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:36:59,202 : INFO : EPOCH 6 - PROGRESS: at 2.89% examples, 333704 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:37:00,241 : INFO : EPOCH 6 - PROGRESS: at 3.92% examples, 331126 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:37:01,259 : INFO : EPOCH 6 - PROGRESS: at 6.25% examples, 337669 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:37:02,277 : INFO : EPOCH 6 - PROGRESS: at 8.27% examples, 339518 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:37:03,294 : INFO : EPOCH 6 - PROGRESS: at 9.88% examples, 339864 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:37:04,328 : INFO : EPOCH 6 - PROGRESS: at 11.27% examples, 338641 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:37:05,362 : INFO : EPOCH 6 - PROGRESS: at 11.67% examples, 335845 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:37:06,363 : INFO : EPOCH 6 - PROGRESS: at 12.50% examples, 336260 words/s, in_qsize 5, o

Epoch 7 - END  Loss: 79717616.0
Epoch: 8 - START loss 79717616.0


2025-02-07 11:38:08,696 : INFO : EPOCH 7 - PROGRESS: at 1.42% examples, 306908 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:38:09,715 : INFO : EPOCH 7 - PROGRESS: at 2.87% examples, 321091 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:38:10,716 : INFO : EPOCH 7 - PROGRESS: at 3.71% examples, 322796 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:38:11,733 : INFO : EPOCH 7 - PROGRESS: at 6.02% examples, 328470 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:38:12,739 : INFO : EPOCH 7 - PROGRESS: at 8.07% examples, 334127 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:38:13,768 : INFO : EPOCH 7 - PROGRESS: at 9.42% examples, 334563 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:38:14,802 : INFO : EPOCH 7 - PROGRESS: at 11.23% examples, 335261 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:38:15,820 : INFO : EPOCH 7 - PROGRESS: at 11.61% examples, 333076 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:38:16,838 : INFO : EPOCH 7 - PROGRESS: at 12.45% examples, 333682 words/s, in_qsize 5, o

Epoch 8 - END  Loss: 81959000.0
Epoch: 9 - START loss 81959000.0


2025-02-07 11:39:19,055 : INFO : EPOCH 8 - PROGRESS: at 1.58% examples, 325220 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:39:20,072 : INFO : EPOCH 8 - PROGRESS: at 2.89% examples, 332662 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:39:21,102 : INFO : EPOCH 8 - PROGRESS: at 3.98% examples, 334305 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:39:22,112 : INFO : EPOCH 8 - PROGRESS: at 6.15% examples, 335082 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:39:23,108 : INFO : EPOCH 8 - PROGRESS: at 8.22% examples, 338853 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:39:24,126 : INFO : EPOCH 8 - PROGRESS: at 9.69% examples, 339380 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:39:25,132 : INFO : EPOCH 8 - PROGRESS: at 11.26% examples, 341121 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:39:26,128 : INFO : EPOCH 8 - PROGRESS: at 11.66% examples, 338919 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:39:27,145 : INFO : EPOCH 8 - PROGRESS: at 12.47% examples, 336770 words/s, in_qsize 5, o

Epoch 9 - END  Loss: 84180160.0
Epoch: 10 - START loss 84180160.0


2025-02-07 11:40:29,313 : INFO : EPOCH 9 - PROGRESS: at 1.56% examples, 319750 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:40:30,347 : INFO : EPOCH 9 - PROGRESS: at 2.88% examples, 326849 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:40:31,381 : INFO : EPOCH 9 - PROGRESS: at 3.91% examples, 329515 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:40:32,415 : INFO : EPOCH 9 - PROGRESS: at 6.12% examples, 330107 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:40:33,433 : INFO : EPOCH 9 - PROGRESS: at 8.22% examples, 335135 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:40:34,450 : INFO : EPOCH 9 - PROGRESS: at 9.80% examples, 337493 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:40:35,480 : INFO : EPOCH 9 - PROGRESS: at 11.26% examples, 337392 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:40:36,502 : INFO : EPOCH 9 - PROGRESS: at 11.67% examples, 335719 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:40:37,508 : INFO : EPOCH 9 - PROGRESS: at 12.50% examples, 336078 words/s, in_qsize 6, o

Epoch 10 - END  Loss: 86395552.0
Epoch: 11 - START loss 86395552.0


2025-02-07 11:41:39,386 : INFO : EPOCH 10 - PROGRESS: at 1.62% examples, 338668 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:41:40,404 : INFO : EPOCH 10 - PROGRESS: at 2.92% examples, 340712 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:41:41,421 : INFO : EPOCH 10 - PROGRESS: at 4.01% examples, 333975 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:41:42,423 : INFO : EPOCH 10 - PROGRESS: at 6.25% examples, 339950 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:41:43,423 : INFO : EPOCH 10 - PROGRESS: at 8.25% examples, 340948 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:41:44,440 : INFO : EPOCH 10 - PROGRESS: at 9.98% examples, 343104 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:41:45,458 : INFO : EPOCH 10 - PROGRESS: at 11.27% examples, 341781 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:41:46,459 : INFO : EPOCH 10 - PROGRESS: at 11.67% examples, 339162 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:41:47,493 : INFO : EPOCH 10 - PROGRESS: at 12.51% examples, 339243 words/s, in_q

Epoch 11 - END  Loss: 88559400.0
Epoch: 12 - START loss 88559400.0


2025-02-07 11:42:49,293 : INFO : EPOCH 11 - PROGRESS: at 1.58% examples, 325740 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:42:50,298 : INFO : EPOCH 11 - PROGRESS: at 2.89% examples, 333936 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:42:51,345 : INFO : EPOCH 11 - PROGRESS: at 3.98% examples, 332640 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:42:52,380 : INFO : EPOCH 11 - PROGRESS: at 6.30% examples, 337240 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:42:53,398 : INFO : EPOCH 11 - PROGRESS: at 8.25% examples, 337443 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:42:54,398 : INFO : EPOCH 11 - PROGRESS: at 9.80% examples, 338328 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:42:55,432 : INFO : EPOCH 11 - PROGRESS: at 11.27% examples, 338635 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:42:56,450 : INFO : EPOCH 11 - PROGRESS: at 11.69% examples, 337268 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:42:57,489 : INFO : EPOCH 11 - PROGRESS: at 12.52% examples, 337212 words/s, in_q

Epoch 12 - END  Loss: 90684384.0
Epoch: 13 - START loss 90684384.0


2025-02-07 11:43:59,480 : INFO : EPOCH 12 - PROGRESS: at 1.58% examples, 324235 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:44:00,485 : INFO : EPOCH 12 - PROGRESS: at 2.88% examples, 323744 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:44:01,536 : INFO : EPOCH 12 - PROGRESS: at 3.77% examples, 323813 words/s, in_qsize 4, out_qsize 1
2025-02-07 11:44:02,570 : INFO : EPOCH 12 - PROGRESS: at 6.15% examples, 331281 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:44:03,571 : INFO : EPOCH 12 - PROGRESS: at 8.19% examples, 334197 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:44:04,589 : INFO : EPOCH 12 - PROGRESS: at 9.58% examples, 335672 words/s, in_qsize 4, out_qsize 1
2025-02-07 11:44:05,590 : INFO : EPOCH 12 - PROGRESS: at 11.25% examples, 336350 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:44:06,640 : INFO : EPOCH 12 - PROGRESS: at 11.66% examples, 334801 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:44:07,658 : INFO : EPOCH 12 - PROGRESS: at 12.48% examples, 334113 words/s, in_q

Epoch 13 - END  Loss: 92783712.0
Epoch: 14 - START loss 92783712.0


2025-02-07 11:45:09,688 : INFO : EPOCH 13 - PROGRESS: at 1.58% examples, 330227 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:45:10,737 : INFO : EPOCH 13 - PROGRESS: at 2.92% examples, 341015 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:45:11,796 : INFO : EPOCH 13 - PROGRESS: at 4.24% examples, 345012 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:45:12,822 : INFO : EPOCH 13 - PROGRESS: at 6.62% examples, 347136 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:45:13,840 : INFO : EPOCH 13 - PROGRESS: at 8.53% examples, 350786 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:45:14,850 : INFO : EPOCH 13 - PROGRESS: at 10.36% examples, 353156 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:45:15,860 : INFO : EPOCH 13 - PROGRESS: at 11.36% examples, 352101 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:45:16,869 : INFO : EPOCH 13 - PROGRESS: at 11.94% examples, 351241 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:45:17,912 : INFO : EPOCH 13 - PROGRESS: at 12.75% examples, 350225 words/s, in_

Epoch 14 - END  Loss: 94906232.0
Epoch: 15 - START loss 94906232.0


2025-02-07 11:46:17,185 : INFO : EPOCH 14 - PROGRESS: at 1.62% examples, 347233 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:46:18,195 : INFO : EPOCH 14 - PROGRESS: at 2.90% examples, 340056 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:46:19,209 : INFO : EPOCH 14 - PROGRESS: at 4.12% examples, 343766 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:46:20,225 : INFO : EPOCH 14 - PROGRESS: at 6.50% examples, 351641 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:46:21,253 : INFO : EPOCH 14 - PROGRESS: at 8.49% examples, 353612 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:46:22,268 : INFO : EPOCH 14 - PROGRESS: at 10.27% examples, 353947 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:46:23,294 : INFO : EPOCH 14 - PROGRESS: at 11.35% examples, 354263 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:46:24,296 : INFO : EPOCH 14 - PROGRESS: at 11.92% examples, 353385 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:46:25,323 : INFO : EPOCH 14 - PROGRESS: at 12.74% examples, 352763 words/s, in_

Epoch 15 - END  Loss: 96881160.0
Epoch: 16 - START loss 96881160.0


2025-02-07 11:47:24,570 : INFO : EPOCH 15 - PROGRESS: at 1.62% examples, 346483 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:47:25,575 : INFO : EPOCH 15 - PROGRESS: at 2.89% examples, 336546 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:47:26,636 : INFO : EPOCH 15 - PROGRESS: at 4.14% examples, 341678 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:47:27,641 : INFO : EPOCH 15 - PROGRESS: at 6.50% examples, 348844 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:47:28,655 : INFO : EPOCH 15 - PROGRESS: at 8.48% examples, 350805 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:47:29,664 : INFO : EPOCH 15 - PROGRESS: at 10.24% examples, 351975 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:47:30,695 : INFO : EPOCH 15 - PROGRESS: at 11.33% examples, 350179 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:47:31,708 : INFO : EPOCH 15 - PROGRESS: at 11.88% examples, 349277 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:47:32,724 : INFO : EPOCH 15 - PROGRESS: at 12.71% examples, 349535 words/s, in_

Epoch 16 - END  Loss: 98855712.0
Epoch: 17 - START loss 98855712.0


2025-02-07 11:48:32,217 : INFO : EPOCH 16 - PROGRESS: at 1.62% examples, 343659 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:48:33,221 : INFO : EPOCH 16 - PROGRESS: at 2.91% examples, 343261 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:48:34,266 : INFO : EPOCH 16 - PROGRESS: at 4.12% examples, 339633 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:48:35,305 : INFO : EPOCH 16 - PROGRESS: at 6.56% examples, 348402 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:48:36,320 : INFO : EPOCH 16 - PROGRESS: at 8.51% examples, 351991 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:48:37,343 : INFO : EPOCH 16 - PROGRESS: at 10.32% examples, 353382 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:48:38,363 : INFO : EPOCH 16 - PROGRESS: at 11.36% examples, 352922 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:48:39,371 : INFO : EPOCH 16 - PROGRESS: at 11.94% examples, 351983 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:48:40,394 : INFO : EPOCH 16 - PROGRESS: at 12.75% examples, 351640 words/s, in_

Epoch 17 - END  Loss: 100763416.0
Epoch: 18 - START loss 100763416.0


2025-02-07 11:49:39,813 : INFO : EPOCH 17 - PROGRESS: at 1.58% examples, 329935 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:49:40,822 : INFO : EPOCH 17 - PROGRESS: at 2.90% examples, 339595 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:49:41,842 : INFO : EPOCH 17 - PROGRESS: at 4.12% examples, 342864 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:49:42,847 : INFO : EPOCH 17 - PROGRESS: at 6.41% examples, 348002 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:49:43,849 : INFO : EPOCH 17 - PROGRESS: at 8.35% examples, 349266 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:49:44,852 : INFO : EPOCH 17 - PROGRESS: at 10.20% examples, 352420 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:49:45,855 : INFO : EPOCH 17 - PROGRESS: at 11.31% examples, 351967 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:49:46,879 : INFO : EPOCH 17 - PROGRESS: at 11.85% examples, 350341 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:49:47,892 : INFO : EPOCH 17 - PROGRESS: at 12.65% examples, 348046 words/s, in_

Epoch 18 - END  Loss: 102551256.0
Epoch: 19 - START loss 102551256.0


2025-02-07 11:50:47,545 : INFO : EPOCH 18 - PROGRESS: at 1.60% examples, 339939 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:50:48,584 : INFO : EPOCH 18 - PROGRESS: at 2.92% examples, 343768 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:50:49,633 : INFO : EPOCH 18 - PROGRESS: at 4.14% examples, 339694 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:50:50,688 : INFO : EPOCH 18 - PROGRESS: at 6.62% examples, 346788 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:50:51,719 : INFO : EPOCH 18 - PROGRESS: at 8.53% examples, 349671 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:50:52,730 : INFO : EPOCH 18 - PROGRESS: at 10.36% examples, 352077 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:50:53,738 : INFO : EPOCH 18 - PROGRESS: at 11.36% examples, 352362 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:50:54,762 : INFO : EPOCH 18 - PROGRESS: at 11.95% examples, 350783 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:50:55,768 : INFO : EPOCH 18 - PROGRESS: at 12.75% examples, 350335 words/s, in_

Epoch 19 - END  Loss: 104406640.0
Epoch: 20 - START loss 104406640.0


2025-02-07 11:51:56,037 : INFO : EPOCH 19 - PROGRESS: at 1.62% examples, 347015 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:51:57,060 : INFO : EPOCH 19 - PROGRESS: at 2.92% examples, 345831 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:51:58,072 : INFO : EPOCH 19 - PROGRESS: at 4.12% examples, 342295 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:51:59,087 : INFO : EPOCH 19 - PROGRESS: at 6.41% examples, 346663 words/s, in_qsize 6, out_qsize 0
2025-02-07 11:52:00,103 : INFO : EPOCH 19 - PROGRESS: at 8.38% examples, 348869 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:52:01,120 : INFO : EPOCH 19 - PROGRESS: at 10.22% examples, 351224 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:52:02,169 : INFO : EPOCH 19 - PROGRESS: at 11.33% examples, 349782 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:52:03,190 : INFO : EPOCH 19 - PROGRESS: at 11.88% examples, 348563 words/s, in_qsize 5, out_qsize 0
2025-02-07 11:52:04,201 : INFO : EPOCH 19 - PROGRESS: at 12.70% examples, 348260 words/s, in_

Epoch 20 - END  Loss: 106169448.0
End time: 2025-02-07 11:53:02.743469


2025-02-07 11:53:02,973 : INFO : not storing attribute cum_table
2025-02-07 11:53:03,439 : INFO : saved ../models/word2vec_latsenrom.model
