# Chapter 5: Word Embeddings and Distance Measurements for Text

## Exploring the pretrained Word2vec model using gensim

In [1]:
#importing the gensim library
import gensim
from gensim.models import KeyedVectors

In [2]:
#loading the model
model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [3]:
#Validating the size of the pretrained Word2vec vocabulary
len(model.key_to_index)

3000000

In [4]:
#Exploring the size of each Word2Vec
model.vector_size

300

In [5]:
#Explore the pretrained Word2Vec vocabulary
model.key_to_index

{'</s>': 0,
 'in': 1,
 'for': 2,
 'that': 3,
 'is': 4,
 'on': 5,
 '##': 6,
 'The': 7,
 'with': 8,
 'said': 9,
 'was': 10,
 'the': 11,
 'at': 12,
 'not': 13,
 'as': 14,
 'it': 15,
 'be': 16,
 'from': 17,
 'by': 18,
 'are': 19,
 'I': 20,
 'have': 21,
 'he': 22,
 'will': 23,
 'has': 24,
 '####': 25,
 'his': 26,
 'an': 27,
 'this': 28,
 'or': 29,
 'their': 30,
 'who': 31,
 'they': 32,
 'but': 33,
 '$': 34,
 'had': 35,
 'year': 36,
 'were': 37,
 'we': 38,
 'more': 39,
 '###': 40,
 'up': 41,
 'been': 42,
 'you': 43,
 'its': 44,
 'one': 45,
 'about': 46,
 'would': 47,
 'which': 48,
 'out': 49,
 'can': 50,
 'It': 51,
 'all': 52,
 'also': 53,
 'two': 54,
 'after': 55,
 'first': 56,
 'He': 57,
 'do': 58,
 'time': 59,
 'than': 60,
 'when': 61,
 'We': 62,
 'over': 63,
 'last': 64,
 'new': 65,
 'other': 66,
 'her': 67,
 'people': 68,
 'into': 69,
 'In': 70,
 'our': 71,
 'there': 72,
 'A': 73,
 'she': 74,
 'could': 75,
 'just': 76,
 'years': 77,
 'some': 78,
 'U.S.': 79,
 'three': 80,
 'million': 81

In [6]:
#Checking similarity
model.most_similar('Delhi')

[('Kolkata', 0.7663769125938416),
 ('Mumbai', 0.7306069731712341),
 ('Lucknow', 0.7277829647064209),
 ('Patna', 0.7159016728401184),
 ('Guwahati', 0.7072612643241882),
 ('Jaipur', 0.6992815136909485),
 ('Hyderabad', 0.6983195543289185),
 ('Ranchi', 0.6962575912475586),
 ('Bhubaneswar', 0.6959235072135925),
 ('Chandigarh', 0.6940240263938904)]

In [7]:
#validating the king, queen, woman, and man examples from earlier
#both in terms of the closest word and the second-closest word
result = model.most_similar(positive=['man', 'queen'], negative=['king'], topn=1)
print(result)

[('woman', 0.7609435319900513)]


In [8]:
#Checking for the two closest words
result = model.most_similar(positive=['man', 'queen'], negative=['king'], topn=2)
print(result)

[('woman', 0.7609435319900513), ('girl', 0.6139994263648987)]


In [9]:
#validating the country and capital example
result = model.most_similar(positive=['France', 'Rome'], negative=['Italy'], topn=1)
print(result)

[('Paris', 0.7190686464309692)]


## Word2Vec Architecture

### Softmax

In [10]:
import numpy as np
#example of output vector
z = [2.0, 3.0, 1.0, 4.0, 2.0, 3.0, 2.0]
#softmaxing
np.exp(z) / np.sum(np.exp(z))

array([0.06175318, 0.16786254, 0.02271772, 0.45629768, 0.06175318,
       0.16786254, 0.06175318])

## Traning a Word2Vec Model

### Building a basic Word2vec model

In [11]:
#importing word2vec fromo gensim
from gensim.models import Word2Vec

#sentences as data
sentences = [["I", "am", "trying", "to", "understand", "Natural", "Language", "Processing"], 
             ["Natural", "Language", "Processing", "is", "fun", "to", "learn"], 
             ["There", "are", "numerous", "use", "cases", "of", "Natural", "Language", "Processing"]]

#building the model
model = Word2Vec(sentences, min_count=1)

In [12]:
#checking size of vector we just built
model.vector_size

100

In [13]:
#checking the vocabulary size 
len(model.wv.key_to_index)

17

### Modifying the min_count parameter

In [14]:
#building model for words that occur atleast 2 times
model = Word2Vec(sentences, min_count=2)

In [15]:
#checking the vocabulary size 
len(model.wv.key_to_index)

4

In [16]:
#checking the words in the vocab
model.wv.key_to_index

{'Processing': 0, 'Language': 1, 'Natural': 2, 'to': 3}

In [17]:
#checking dimensions of the vectors
model.vector_size

100

### Playing with vector size

In [18]:
#building model where each vector has 300-dimensions
model = Word2Vec(sentences, min_count=2, vector_size = 300)

In [19]:
#checking dimensions of vectors
model.vector_size

300

### Other important configurable parameters

In [20]:
#sg = 1 (using Skip-gram approach)
#negative = 1(if > 0 indicates subsampling should be used and the integer value signifies how many)
#workers (no. of threads to use for training)
model = Word2Vec (sentences, min_count=1, vector_size = 300, workers = 2, sg = 1, negative = 1)

In [21]:
#checking the vocabulary size 
len(model.wv.key_to_index)

17

In [22]:
#checking the words in the vocabulary
model.wv.key_to_index

{'Processing': 0,
 'Natural': 1,
 'Language': 2,
 'to': 3,
 'of': 4,
 'am': 5,
 'trying': 6,
 'understand': 7,
 'is': 8,
 'cases': 9,
 'fun': 10,
 'learn': 11,
 'There': 12,
 'are': 13,
 'numerous': 14,
 'use': 15,
 'I': 16}

## Word Mover's Distance

### Implementation of WMD using gensim

In [23]:
#importing gensim
import gensim
from gensim.models import KeyedVectors

In [24]:
#loading pretrained model
model=KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [25]:
#defining the data
sentence_1 = "Obama speaks to the media in Illinois"
sentence_2 = "President greets the press in Chicago"
sentence_3 = "Apple is my favorite company"

In [26]:
#calculating WMD between sentence_1 and sentence_2
word_mover_distance = model.wmdistance(sentence_1, sentence_2)
word_mover_distance

0.42775531265615635

In [27]:
#calculating WMD between sentence_1 and sentence_3
word_mover_distance = model.wmdistance(sentence_1, sentence_3)
word_mover_distance

0.477934024647689

In [28]:
#normalizing word embeddings
model.fill_norms()

In [29]:
#recomputing the WMD between the sentences based on the normalized embeddings
#for 1 and 2
word_mover_distance = model.wmdistance(sentence_1, sentence_2)
word_mover_distance

0.42775531265615635

In [30]:
#for 1 and 3
word_mover_distance = model.wmdistance(sentence_1, sentence_3)
word_mover_distance

0.477934024647689