# Building Word2Vec models from scratch

### Importing the Word2Vec module from Gensim

In [1]:
from gensim.models import Word2Vec

### Defining a text corpus

In [2]:
# Toy corpus: each sentence is pre-tokenised into a list of words, which is the
# input format Word2Vec consumes. Splitting on whitespace yields the token lists.
sentences = [
    "I am trying to understand Natural Language Processing".split(),
    "Natural Language Processing is fun to learn".split(),
    "There are numerous use cases of Natural Language Processing".split(),
]

### Building a basic Word2Vec model

In [3]:
# Train with the library defaults; min_count=1 keeps even words seen only once.
model = Word2Vec(sentences=sentences, min_count=1)

### Inspecting the embedding for the word "Natural"

In [4]:
# Indexing `wv` by a token returns that token's learned embedding vector.
model.wv["Natural"]

array([-1.1506568e-03,  2.3529895e-03,  4.5520775e-03, -3.0040720e-03,
       -8.5354433e-04, -4.7675124e-04, -2.0673673e-03, -3.0746455e-03,
       -1.9341263e-03, -4.3878607e-03, -8.3469070e-04,  1.4558269e-03,
        2.7519549e-04, -4.3559028e-03,  2.3670145e-03,  2.0943601e-03,
       -3.8913325e-03, -6.1548827e-04, -4.6169399e-03,  9.4052072e-04,
       -3.4304196e-04, -3.4094250e-04,  2.9447235e-03,  4.3063508e-03,
       -3.7732773e-04, -2.2485292e-04,  1.6115705e-03,  1.2149011e-03,
       -4.8515568e-03,  4.7764364e-03, -2.5412594e-03,  3.3789561e-03,
        1.4883940e-03,  1.5283572e-03,  1.4682902e-03, -3.5080998e-03,
        3.9456845e-03,  7.0160354e-04, -4.1346089e-03,  4.0484671e-04,
       -1.1999794e-04,  4.8559154e-03,  2.0889365e-03,  4.5922971e-03,
        2.0818750e-03,  2.4181246e-04,  6.7220608e-05, -1.0669195e-03,
       -3.9796722e-03,  3.9165807e-03, -3.3378431e-03, -2.7101892e-03,
        1.1821726e-03, -4.4203321e-03,  3.9127376e-04,  3.1629119e-03,
      

### Size of each word vector

In [5]:
# No size was passed to Word2Vec above, so this reports the default dimensionality.
model.vector_size

100

### Size of the vocabulary

In [6]:
# min_count=1 discards nothing, so this is the number of distinct tokens in the corpus.
len(model.wv.vocab)

17

### Building a 2nd Word2Vec model restricting the vocabulary using min_count

In [7]:
# Same defaults as before, but drop every token that occurs fewer than 2 times.
model = Word2Vec(sentences=sentences, min_count=2)

### Inspecting the embedding for the word "Natural"

In [8]:
# "Natural" occurs in all three sentences, so it survives the min_count=2 cutoff
# and can still be looked up.
model.wv["Natural"]

array([-1.1506568e-03,  2.3529895e-03,  4.5520775e-03, -3.0040720e-03,
       -8.5354433e-04, -4.7675124e-04, -2.0673673e-03, -3.0746455e-03,
       -1.9341263e-03, -4.3878607e-03, -8.3469070e-04,  1.4558269e-03,
        2.7519549e-04, -4.3559028e-03,  2.3670145e-03,  2.0943601e-03,
       -3.8913325e-03, -6.1548827e-04, -4.6169399e-03,  9.4052072e-04,
       -3.4304196e-04, -3.4094250e-04,  2.9447235e-03,  4.3063508e-03,
       -3.7732773e-04, -2.2485292e-04,  1.6115705e-03,  1.2149011e-03,
       -4.8515568e-03,  4.7764364e-03, -2.5412594e-03,  3.3789561e-03,
        1.4883940e-03,  1.5283572e-03,  1.4682902e-03, -3.5080998e-03,
        3.9456845e-03,  7.0160354e-04, -4.1346089e-03,  4.0484671e-04,
       -1.1999794e-04,  4.8559154e-03,  2.0889365e-03,  4.5922971e-03,
        2.0818750e-03,  2.4181246e-04,  6.7220608e-05, -1.0669195e-03,
       -3.9796722e-03,  3.9165807e-03, -3.3378431e-03, -2.7101892e-03,
        1.1821726e-03, -4.4203321e-03,  3.9127376e-04,  3.1629119e-03,
      

### Size of the vocabulary

In [9]:
# Only tokens seen at least twice in the corpus remain after the min_count=2 filter.
len(model.wv.vocab)

4

### Vocabulary

In [10]:
# Maps each retained token to its Vocab entry.
# NOTE(review): `wv.vocab` is the Gensim 3.x attribute; Gensim 4 replaced it with
# `wv.key_to_index` — confirm against the installed version.
model.wv.vocab

{'to': <gensim.models.keyedvectors.Vocab at 0x127591a58>,
 'Natural': <gensim.models.keyedvectors.Vocab at 0x127591a90>,
 'Language': <gensim.models.keyedvectors.Vocab at 0x127591ac8>,
 'Processing': <gensim.models.keyedvectors.Vocab at 0x127591b00>}

### Size of each word vector

In [11]:
# min_count only prunes the vocabulary; with no size argument the vectors keep
# the default dimensionality.
model.vector_size

100

### Building a 3rd Word2Vec model restricting the vocabulary using min_count and setting each vector's size to 300

In [12]:
# Keep the min_count=2 vocabulary filter and request 300-dimensional vectors.
# NOTE(review): `size` is the Gensim 3.x keyword; Gensim 4 renamed it to
# `vector_size` — confirm against the installed version.
model = Word2Vec(
    sentences=sentences,
    min_count=2,
    size=300,
)

### Inspecting the embedding for the word "Natural"

In [13]:
# Same lookup as before; the model was built with size=300, so the returned
# vector now has 300 components.
model.wv["Natural"]

array([-3.83552251e-04,  7.84329837e-04,  1.51735917e-03, -1.00135733e-03,
       -2.84514768e-04, -1.58917072e-04, -6.89122418e-04, -1.02488184e-03,
       -6.44708751e-04, -1.46262010e-03, -2.78230233e-04,  4.85275639e-04,
        9.17318248e-05, -1.45196752e-03,  7.89004902e-04,  6.98120042e-04,
       -1.29711081e-03, -2.05162767e-04, -1.53898005e-03,  3.13506898e-04,
       -1.14347313e-04, -1.13647504e-04,  9.81574529e-04,  1.43545028e-03,
       -1.25775900e-04, -7.49509709e-05,  5.37190179e-04,  4.04967024e-04,
       -1.61718554e-03,  1.59214553e-03, -8.47086427e-04,  1.12631870e-03,
        4.96131310e-04,  5.09452366e-04,  4.89430095e-04, -1.16936653e-03,
        1.31522818e-03,  2.33867846e-04, -1.37820304e-03,  1.34948903e-04,
       -3.99993114e-05,  1.61863840e-03,  6.96312229e-04,  1.53076567e-03,
        6.93958311e-04,  8.06041498e-05,  2.24068681e-05, -3.55639815e-04,
       -1.32655737e-03,  1.30552694e-03, -1.11261441e-03, -9.03396402e-04,
        3.94057512e-04, -

### Size of the vocabulary

In [14]:
# The vector size does not affect the vocabulary; the min_count=2 filter still applies.
len(model.wv.vocab)

4

### Size of each word vector

In [15]:
# Should match the size=300 requested when the model was built.
model.vector_size

300

### Building a 4th Word2Vec model using 2 worker threads, the skip-gram approach and negative sampling

In [16]:
# Full-vocabulary, 300-dimensional model trained with skip-gram and negative sampling.
# NOTE(review): `size` is the Gensim 3.x keyword; Gensim 4 renamed it to
# `vector_size` — confirm against the installed version.
model = Word2Vec(
    sentences=sentences,
    min_count=1,   # keep every token, including those seen only once
    size=300,      # dimensionality of each word vector
    workers=2,     # number of training threads
    sg=1,          # 1 selects skip-gram instead of the default CBOW
    negative=1,    # negative sampling with a single noise word per positive example
)

### Size of the vocabulary

In [17]:
# Back to min_count=1, so every distinct token in the corpus is kept again.
len(model.wv.vocab)

17

### Vocabulary

In [18]:
# Maps each token in the (unfiltered) vocabulary to its Vocab entry.
# NOTE(review): `wv.vocab` is the Gensim 3.x attribute; Gensim 4 replaced it with
# `wv.key_to_index` — confirm against the installed version.
model.wv.vocab

{'I': <gensim.models.keyedvectors.Vocab at 0x1275ab5c0>,
 'am': <gensim.models.keyedvectors.Vocab at 0x1275ab588>,
 'trying': <gensim.models.keyedvectors.Vocab at 0x1275ab518>,
 'to': <gensim.models.keyedvectors.Vocab at 0x1275ab4e0>,
 'understand': <gensim.models.keyedvectors.Vocab at 0x1275ab4a8>,
 'Natural': <gensim.models.keyedvectors.Vocab at 0x1275ab438>,
 'Language': <gensim.models.keyedvectors.Vocab at 0x1275ab400>,
 'Processing': <gensim.models.keyedvectors.Vocab at 0x1275ab3c8>,
 'is': <gensim.models.keyedvectors.Vocab at 0x1275ab390>,
 'fun': <gensim.models.keyedvectors.Vocab at 0x1275ab358>,
 'learn': <gensim.models.keyedvectors.Vocab at 0x1275ab2e8>,
 'There': <gensim.models.keyedvectors.Vocab at 0x1275ab208>,
 'are': <gensim.models.keyedvectors.Vocab at 0x1275ab240>,
 'numerous': <gensim.models.keyedvectors.Vocab at 0x1275ab1d0>,
 'use': <gensim.models.keyedvectors.Vocab at 0x127591a20>,
 'cases': <gensim.models.keyedvectors.Vocab at 0x1275919e8>,
 'of': <gensim.models.ke

### Size of each word vector

In [19]:
# Should match the size=300 passed to the constructor for this model.
model.vector_size

300