In [18]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import sentencepiece as spm
from pathlib import Path

## Data

In [6]:
# Load the cleaned corpus: one sentence per line.
# - The encoding is given explicitly so the result does not depend on the OS
#   locale default (assumes the file is UTF-8 -- TODO confirm).
# - Blank lines are dropped: a trailing newline would otherwise produce an empty
#   "sentence" that tokenizes to nothing but is still counted as a training example.
sentences = [
    line
    for line in Path('data/cleaned.txt').read_text(encoding='utf-8').split('\n')
    if line.strip()
]
sentences[:5]

['hi my name is sanyam .',
 'today i went to the zoo .',
 'zoo had a dog .',
 'zoo had a cat .',
 'the zoo was good .']

In [7]:
# Tokenize: load the trained SentencePiece model, then split every sentence
# into its subword pieces (out_type=str returns the piece strings, not ids).
sp = spm.SentencePieceProcessor(
    model_file='tokenizers/models/dev.model',
)
tokens = sp.encode(sentences, out_type=str)

In [9]:
# Spot-check the tokenizer output: the subword pieces of the first sentence.
tokens[0]

['▁h',
 'i',
 '▁',
 'm',
 'y',
 '▁',
 'n',
 'am',
 'e',
 '▁i',
 's',
 '▁',
 's',
 'a',
 'n',
 'y',
 'am',
 '▁.']

## Init model

In [15]:
# Configure an untrained Word2Vec model. No corpus is passed here, so the
# vocabulary is built and training is run explicitly in the following cells.
w2v_model = Word2Vec(
    # --- vocabulary / context ---
    min_count=20,      # ignore pieces seen fewer than 20 times in the corpus
    window=5,          # max distance between the current and the predicted piece
    sample=6e-5,       # down-sampling threshold for very frequent pieces
    # --- embedding ---
    size=300,          # dimensionality of the learned vectors
    negative=20,       # number of negative samples drawn per positive example
    # --- optimisation ---
    alpha=0.03,        # initial learning rate
    min_alpha=0.0007,  # learning rate decays towards this value during training
    iter=5,            # default epoch count (the train() call passes its own `epochs`)
    workers=3,         # worker threads used for training
)

## Build vocab

In [16]:
# Scan the tokenized corpus once to build the model vocabulary before training.
# progress_per=10000 controls how often progress is logged during the scan.
w2v_model.build_vocab(tokens, progress_per=10000)

## Train

In [17]:
# Train on the tokenized sentences.
# - total_examples reuses the sentence count recorded on the model (corpus_count).
# - epochs=30 is passed explicitly, so this run makes 30 passes over the corpus
#   regardless of the `iter=5` given to the constructor.
# - report_delay=1: interval (seconds) between progress reports, per gensim docs.
# The call's return value (a tuple of word counts) is shown as the cell output.
w2v_model.train(tokens, total_examples=w2v_model.corpus_count, epochs=30, report_delay=1)

(4878, 133500)

In [22]:
# Precompute L2-normalized word vectors for similarity queries.
# NOTE(review): with replace=True the raw vectors are overwritten by the
# normalized ones, so (per gensim docs) the model cannot be trained further
# afterwards. init_sims is deprecated in newer gensim releases -- confirm
# against the installed version before upgrading.
w2v_model.init_sims(replace=True)

In [27]:
# Display the learned vocabulary: a mapping of subword piece -> gensim Vocab entry.
# NOTE(review): this dumps every entry and the reprs are only memory addresses;
# consider len(w2v_model.wv.vocab) or list(w2v_model.wv.vocab)[:20] for a
# shorter, reproducible output.
w2v_model.wv.vocab

{'▁h': <gensim.models.keyedvectors.Vocab at 0x26840ca0248>,
 'i': <gensim.models.keyedvectors.Vocab at 0x26840ca0808>,
 '▁': <gensim.models.keyedvectors.Vocab at 0x26840ca05c8>,
 'm': <gensim.models.keyedvectors.Vocab at 0x26840ca0448>,
 'y': <gensim.models.keyedvectors.Vocab at 0x26840ca0308>,
 'n': <gensim.models.keyedvectors.Vocab at 0x26840c5e188>,
 'am': <gensim.models.keyedvectors.Vocab at 0x26840c5e1c8>,
 'e': <gensim.models.keyedvectors.Vocab at 0x26840c5e248>,
 '▁i': <gensim.models.keyedvectors.Vocab at 0x26840ca0b88>,
 's': <gensim.models.keyedvectors.Vocab at 0x26840ca0148>,
 'a': <gensim.models.keyedvectors.Vocab at 0x26840c5e3c8>,
 '▁.': <gensim.models.keyedvectors.Vocab at 0x26840c5e888>,
 '▁to': <gensim.models.keyedvectors.Vocab at 0x26840c5e688>,
 'd': <gensim.models.keyedvectors.Vocab at 0x26840c5e588>,
 '▁w': <gensim.models.keyedvectors.Vocab at 0x26840c5e6c8>,
 't': <gensim.models.keyedvectors.Vocab at 0x26840c5e7c8>,
 '▁t': <gensim.models.keyedvectors.Vocab at 0x268

In [28]:
# Sanity-check the embeddings: nearest neighbours of the piece "am".
# NOTE(review): every neighbour in the recorded output scores ~0.9999, i.e. the
# vectors are nearly identical -- presumably the corpus/tokenization is too small
# for the model to learn useful structure; verify before relying on these vectors.
w2v_model.wv.most_similar(positive=["am"])

[('y', 0.9999210238456726),
 ('s', 0.9999194145202637),
 ('▁h', 0.999911367893219),
 ('d', 0.9999109506607056),
 ('n', 0.9999079704284668),
 ('▁.', 0.999904990196228),
 ('▁', 0.9999023675918579),
 ('▁i', 0.9999021291732788),
 ('▁w', 0.9998984932899475),
 ('a', 0.9998965263366699)]

## Save

In [20]:
# Keep a handle on just the trained vectors (the model's `wv` attribute);
# this is the object that gets saved to disk below.
word_vectors = w2v_model.wv

In [21]:
# Persist the vectors to embedding_models/dev_vectors.kv.
# The path is built with pathlib instead of a hard-coded backslash: the old
# literal relied on the invalid escape sequence `\d` and only resolved to a
# directory + file on Windows (elsewhere it created a single file whose name
# contained a backslash). The output directory is created first so the save
# does not fail on a fresh checkout.
Path('embedding_models').mkdir(parents=True, exist_ok=True)
word_vectors.save(str(Path('embedding_models') / 'dev_vectors.kv'))