# Import Libraries

In [18]:
import gensim
from gensim.models import Word2Vec
from operator import itemgetter

# Continuous Bag of Words (CBOW)

In [23]:
# Toy corpus for the first experiment: every sentence is stored as a list of
# whitespace-separated tokens, which is the input format Word2Vec is given below.
sentences_1 = [
    text.split()
    for text in (
        "I love natural language processing",
        "Word2Vec is a great tool",
        "Machine learning is fun",
    )
]

sentences_1

[['I', 'love', 'natural', 'language', 'processing'],
 ['Word2Vec', 'is', 'a', 'great', 'tool'],
 ['Machine', 'learning', 'is', 'fun']]

In [24]:
# Number of tokenized sentences in the toy corpus.
len(sentences_1)

3

**Train the Word2Vec model (fit the model on `sentences_1`):**

In [25]:
# Train a CBOW model on the toy corpus.
# gensim's `sg` flag picks the training algorithm: 1 means skip-gram, anything
# else means CBOW. This section (and the variable name) is about CBOW, so the
# flag must be 0 here.
# min_count=1 keeps every token; in this corpus each word appears only once or twice.
cbow_model_1 = Word2Vec(sentences_1, vector_size=100, window=5, min_count=1, sg=0)
type(cbow_model_1)

gensim.models.word2vec.Word2Vec

**Get the vector for a word (`language`) :-**

In [26]:
# Look up the learned embedding for the token 'language' via the model's
# keyed-vector store (`.wv`), then show what kind of object comes back.
vector_1 = cbow_model_1.wv['language']
type(vector_1)

numpy.ndarray

In [27]:
# Dimensionality of the retrieved embedding (the model was built with vector_size=100).
vector_1.shape

(100,)

In [28]:
# Print the raw embedding values for 'language'.
print("Vector for 'language':\n\n", vector_1)

Vector for 'language':

 [-0.00515624 -0.00666834 -0.00777684  0.00831073 -0.00198234 -0.00685496
 -0.00415439  0.00514413 -0.00286914 -0.00374966  0.00162143 -0.00277629
 -0.00158436  0.00107449 -0.00297794  0.00851928  0.00391094 -0.00995886
  0.0062596  -0.00675425  0.00076943  0.00440423 -0.00510337 -0.00211067
  0.00809548 -0.00424379 -0.00763626  0.00925791 -0.0021555  -0.00471943
  0.0085708   0.00428334  0.00432484  0.00928451 -0.00845308  0.00525532
  0.00203935  0.00418828  0.0016979   0.00446413  0.00448629  0.00610452
 -0.0032021  -0.00457573 -0.00042652  0.00253373 -0.00326317  0.00605772
  0.00415413  0.00776459  0.00256927  0.00811668 -0.00138721  0.00807793
  0.00371702 -0.00804732 -0.00393361 -0.00247188  0.00489304 -0.00087216
 -0.00283091  0.00783371  0.0093229  -0.00161493 -0.00515925 -0.00470176
 -0.00484605 -0.00960283  0.00137202 -0.00422492  0.00252671  0.00561448
 -0.00406591 -0.00959658  0.0015467  -0.00670012  0.00249517 -0.00378063
  0.00707842  0.00064022  

**Find the words most similar to `language`:**

In [29]:
# Ask the keyed vectors for the 5 entries (topn=5) most similar to 'language',
# then show the container type of the result.
similar_words_language = cbow_model_1.wv.most_similar('language', topn=5)
type(similar_words_language)

list

In [30]:
# Keep only the first element of every result entry (the word itself) and print the list.
print("Words similar to 'language':\n\n", [entry[0] for entry in similar_words_language])

Words similar to 'language':

 ['is', 'processing', 'natural', 'a', 'learning']


# Skip-gram (and comparison with CBOW)

In [31]:
# Second toy corpus: the three sentences used earlier plus one extra sentence,
# each stored as a list of whitespace-separated tokens.
sentences_2 = [
    text.split()
    for text in (
        "I love natural language processing",
        "Word2Vec is a great tool",
        "Machine learning is fun",
        "Natural language processing is awesome",
    )
]

sentences_2

[['I', 'love', 'natural', 'language', 'processing'],
 ['Word2Vec', 'is', 'a', 'great', 'tool'],
 ['Machine', 'learning', 'is', 'fun'],
 ['Natural', 'language', 'processing', 'is', 'awesome']]

In [32]:
# Report the corpus size and its container type, one value per line.
print(len(sentences_2), type(sentences_2), sep="\n")

4
<class 'list'>


In [33]:
# Train the CBOW model (sg=0) for the side-by-side comparison.
# The corpus for this section is `sentences_2`; the previous reference to
# `sentences` pointed at a name that is never defined in this notebook, so the
# cell failed (or silently used stale kernel state) on a fresh run.
cbow_model = Word2Vec(sentences_2, vector_size=100, window=2, min_count=1, sg=0)
type(cbow_model)

gensim.models.word2vec.Word2Vec

In [34]:
# Train the skip-gram model (sg=1) with otherwise identical hyperparameters.
# The corpus for this section is `sentences_2`; the previous reference to
# `sentences` pointed at a name that is never defined in this notebook, so the
# cell failed (or silently used stale kernel state) on a fresh run.
skipgram_model = Word2Vec(sentences_2, vector_size=100, window=2, min_count=1, sg=1)
type(skipgram_model)

gensim.models.word2vec.Word2Vec

In [35]:
# Token whose embeddings are compared across the two models.
word = "language"

# Fetch the embedding of `word` from each model's keyed-vector store.
skipgram_vector = skipgram_model.wv[word]
cbow_vector = cbow_model.wv[word]

# Show the type of each embedding, CBOW first, one per line.
print(type(cbow_vector), type(skipgram_vector), sep="\n")

<class 'numpy.ndarray'>
<class 'numpy.ndarray'>


In [36]:
# Print the embedding that the CBOW model holds for `word`.
print(f"CBOW Vector for '{word}':\n\n", cbow_vector)

CBOW Vector for 'language':

 [-0.00515624 -0.00666834 -0.00777684  0.00831073 -0.00198234 -0.00685496
 -0.00415439  0.00514413 -0.00286914 -0.00374966  0.00162143 -0.00277629
 -0.00158436  0.00107449 -0.00297794  0.00851928  0.00391094 -0.00995886
  0.0062596  -0.00675425  0.00076943  0.00440423 -0.00510337 -0.00211067
  0.00809548 -0.00424379 -0.00763626  0.00925791 -0.0021555  -0.00471943
  0.0085708   0.00428334  0.00432484  0.00928451 -0.00845308  0.00525532
  0.00203935  0.00418828  0.0016979   0.00446413  0.00448629  0.00610452
 -0.0032021  -0.00457573 -0.00042652  0.00253373 -0.00326317  0.00605772
  0.00415413  0.00776459  0.00256927  0.00811668 -0.00138721  0.00807793
  0.00371702 -0.00804732 -0.00393361 -0.00247188  0.00489304 -0.00087216
 -0.00283091  0.00783371  0.0093229  -0.00161493 -0.00515925 -0.00470176
 -0.00484605 -0.00960283  0.00137202 -0.00422492  0.00252671  0.00561448
 -0.00406591 -0.00959658  0.0015467  -0.00670012  0.00249517 -0.00378063
  0.00707842  0.00064

In [37]:
# Print the embedding that the skip-gram model holds for `word`.
print(f"Skip-gram Vector for '{word}':\n\n", skipgram_vector)

Skip-gram Vector for 'language':

 [-0.00515624 -0.00666834 -0.00777684  0.00831073 -0.00198234 -0.00685496
 -0.00415439  0.00514413 -0.00286914 -0.00374966  0.00162143 -0.00277629
 -0.00158436  0.00107449 -0.00297794  0.00851928  0.00391094 -0.00995886
  0.0062596  -0.00675425  0.00076943  0.00440423 -0.00510337 -0.00211067
  0.00809548 -0.00424379 -0.00763626  0.00925791 -0.0021555  -0.00471943
  0.0085708   0.00428334  0.00432484  0.00928451 -0.00845308  0.00525532
  0.00203935  0.00418828  0.0016979   0.00446413  0.00448629  0.00610452
 -0.0032021  -0.00457573 -0.00042652  0.00253373 -0.00326317  0.00605772
  0.00415413  0.00776459  0.00256927  0.00811668 -0.00138721  0.00807793
  0.00371702 -0.00804732 -0.00393361 -0.00247188  0.00489304 -0.00087216
 -0.00283091  0.00783371  0.0093229  -0.00161493 -0.00515925 -0.00470176
 -0.00484605 -0.00960283  0.00137202 -0.00422492  0.00252671  0.00561448
 -0.00406591 -0.00959658  0.0015467  -0.00670012  0.00249517 -0.00378063
  0.00707842  0.

In [38]:
# Query each model for the 5 entries most similar to `word`.
skipgram_similar_words = skipgram_model.wv.most_similar(word, topn=5)
cbow_similar_words = cbow_model.wv.most_similar(word, topn=5)

# Show the container type of each result, CBOW first, one per line.
print(type(cbow_similar_words), type(skipgram_similar_words), sep="\n")

<class 'list'>
<class 'list'>


In [39]:
# Keep only the first element of every CBOW result entry (the word) and print the list.
print(f"CBOW - Words similar to '{word}':\n\n", [entry[0] for entry in cbow_similar_words])

CBOW - Words similar to 'language':

 ['is', 'processing', 'natural', 'a', 'learning']


In [40]:
# Keep only the first element of every skip-gram result entry (the word) and print the list.
print(f"Skip-gram - Words similar to '{word}':\n\n", [entry[0] for entry in skipgram_similar_words])

Skip-gram - Words similar to 'language':

 ['is', 'processing', 'natural', 'a', 'learning']
