### Word embeddings are a form of word representation
### that bridges human understanding of language and that of a machine.
### They are distributed representations of text in an n-dimensional space.
### Word2Vec is a family of models for generating word embeddings.

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
warnings.filterwarnings(action = 'ignore')

In [2]:
import gensim 
from gensim.models import Word2Vec 
  
# Location of the Alice in Wonderland text file; adjust for your machine.
ALICE_PATH = "C:/Users/HP/Downloads/datasets/Alice.txt"

# Read the whole book into memory. The `with` block guarantees the file
# handle is closed as soon as the text has been read.
with open(ALICE_PATH, "r") as sample:
    s = sample.read()

# Replace newline characters with spaces so that a sentence wrapped across
# several lines reaches the sentence tokenizer as one continuous string.
f = s.replace("\n", " ")

# Build the training corpus: one list of lower-cased word tokens per sentence.
data = [
    [word.lower() for word in word_tokenize(sentence)]
    for sentence in sent_tokenize(f)
]

In [3]:
# Preview only the first two tokenized sentences; echoing the entire corpus
# floods the notebook output.
data[:2]

[['project',
  'gutenberg',
  "'s",
  'alice',
  "'s",
  'adventures',
  'in',
  'wonderland',
  ',',
  'by',
  'lewis',
  'carroll',
  'this',
  'ebook',
  'is',
  'for',
  'the',
  'use',
  'of',
  'anyone',
  'anywhere',
  'at',
  'no',
  'cost',
  'and',
  'with',
  'almost',
  'no',
  'restrictions',
  'whatsoever',
  '.'],
 ['you',
  'may',
  'copy',
  'it',
  ',',
  'give',
  'it',
  'away',
  'or',
  're-use',
  'it',
  'under',
  'the',
  'terms',
  'of',
  'the',
  'project',
  'gutenberg',
  'license',
  'included',
  'with',
  'this',
  'ebook',
  'or',
  'online',
  'at',
  'www.gutenberg.org',
  'title',
  ':',
  'alice',
  "'s",
  'adventures',
  'in',
  'wonderland',
  'author',
  ':',
  'lewis',
  'carroll',
  'posting',
  'date',
  ':',
  'june',
  '25',
  ',',
  '2008',
  '[',
  'ebook',
  '#',
  '11',
  ']',
  'release',
  'date',
  ':',
  'march',
  ',',
  '1994',
  '[',
  'last',
  'updated',
  ':',
  'december',
  '20',
  ',',
  '2011',
  ']',
  'language',
  ':'

In [4]:
# Train the CBOW model on the tokenized sentences. No `sg` argument is
# passed, so gensim's default training algorithm is used.
model1 = Word2Vec(
    data,
    size=100,      # dimensionality of each word vector
    window=5,      # context window size
    min_count=1,   # minimum-frequency threshold passed to gensim
)

In [5]:
# Report the cosine similarity of two word pairs under the CBOW model.
# Similarity is queried through `model1.wv` (the trained word vectors);
# calling `similarity` directly on the model is deprecated in gensim 3.x
# and removed in gensim 4.0.
print("Cosine similarity between 'alice' and 'wonderland' - CBOW : ",
      model1.wv.similarity('alice', 'wonderland'))

# The label names the pair that is actually compared: 'alice' and 'machines'.
print("Cosine similarity between 'alice' and 'machines' - CBOW : ",
      model1.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - CBOW :  0.999428
Cosine similarity between 'harry' and 'machines' - CBOW :  0.974867


In [6]:
# Train the Skip Gram model on the same tokenized sentences; `sg=1` is the
# only setting that differs from the CBOW model.
model2 = Word2Vec(
    data,
    size=100,      # dimensionality of each word vector
    window=5,      # context window size
    min_count=1,   # minimum-frequency threshold passed to gensim
    sg=1,          # select the skip-gram training algorithm
)

In [7]:
# Report the cosine similarity of two word pairs under the Skip Gram model.
# Similarity is queried through `model2.wv` (the trained word vectors);
# calling `similarity` directly on the model is deprecated in gensim 3.x
# and removed in gensim 4.0.
print("Cosine similarity between 'alice' and 'wonderland' - Skip Gram : ",
      model2.wv.similarity('alice', 'wonderland'))

# The label names the pair that is actually compared: 'alice' and 'machines'.
print("Cosine similarity between 'alice' and 'machines' - Skip Gram : ",
      model2.wv.similarity('alice', 'machines'))

Cosine similarity between 'alice' and 'wonderland' - Skip Gram :  0.93779063
Cosine similarity between 'harry' and 'machines' - Skip Gram :  0.94011056
