## Using a pre-trained word2vec model

In [28]:
import os
import gzip
import shutil

import warnings #This module ignores the various types of warnings generated
warnings.filterwarnings("ignore") 

import psutil #This module helps in retrieving information on running processes and system resource utilization
process = psutil.Process(os.getpid())
from psutil import virtual_memory
mem = virtual_memory()

import time #This module is used to calculate the time  

for dirname, _, filenames in os.walk('../input/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz
../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin


In [29]:
gn_vec_path = "../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin"
gn_vec_zip_path = "../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz"


print(f"Model at {gn_vec_path}")

Model at ../input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin


In [30]:
from gensim.models import Word2Vec, KeyedVectors
pretrainedpath = gn_vec_path

#Load W2V model. This will take some time, but it is a one time effort! 
pre = process.memory_info().rss
print("Memory used in GB before Loading the Model: %0.2f"%float(pre/(10**9))) #Check memory usage before loading the model
print('-'*10)

start_time = time.time() #Start the timer
ttl = mem.total #Toal memory available

w2v_model = KeyedVectors.load_word2vec_format(pretrainedpath, binary=True) #load the model
print("%0.2f seconds taken to load"%float(time.time() - start_time)) #Calculate the total time elapsed since starting the timer
print('-'*10)

print('Finished loading Word2Vec')
print('-'*10)

post = process.memory_info().rss
print("Memory used in GB after Loading the Model: {:.2f}".format(float(post/(10**9)))) #Calculate the memory used after loading the model
print('-'*10)

print("Percentage increase in memory usage: {:.2f}% ".format(float((post/pre)*100))) #Percentage increase in memory after loading the model
print('-'*10)

print("Numver of words in vocablulary: ",len(list(w2v_model.index_to_key))) #Number of words in the vocabulary.  list(w2v_model.wv.index_to_key)

Memory used in GB before Loading the Model: 4.88
----------
45.44 seconds taken to load
----------
Finished loading Word2Vec
----------
Memory used in GB after Loading the Model: 9.05
----------
Percentage increase in memory usage: 185.54% 
----------
Numver of words in vocablulary:  3000000


In [31]:
#Let us examine the model by knowing what the most similar words are, for a given word!
w2v_model.most_similar('beautiful')

[('gorgeous', 0.8353005051612854),
 ('lovely', 0.8106936812400818),
 ('stunningly_beautiful', 0.7329413294792175),
 ('breathtakingly_beautiful', 0.7231340408325195),
 ('wonderful', 0.6854086518287659),
 ('fabulous', 0.6700063943862915),
 ('loveliest', 0.6612576246261597),
 ('prettiest', 0.6595001816749573),
 ('beatiful', 0.6593326330184937),
 ('magnificent', 0.6591402888298035)]

In [32]:
#Let us try with another word! 
w2v_model.most_similar('toronto')

[('montreal', 0.6984112858772278),
 ('vancouver', 0.6587257385253906),
 ('nyc', 0.6248832941055298),
 ('alberta', 0.6179691553115845),
 ('boston', 0.611499547958374),
 ('calgary', 0.61032634973526),
 ('edmonton', 0.6100260615348816),
 ('canadian', 0.5944076776504517),
 ('chicago', 0.5911980271339417),
 ('springfield', 0.5888351798057556)]

In [33]:
#What is the vector representation for a word? 
w2v_model['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [34]:
#What if I am looking for a word that is not in this vocabulary?

#w2v_model['practicalnlp']

Two things to note while using pre-trained models:
 1. Tokens/Words are always lowercased. If a word is not in the vocabulary, the model throws an exception.
 2. So, it is always a good idea to encapsulate those statements in try/except blocks.

## 2. Getting the embedding representation for full text
We have seen how to get embedding vectors for single words. How do we use them to get such a representation for a full text? A simple way is to just sum or average the embeddings for individual words. We will see an example of this using Word2Vec in Chapter 4. Let us see a small example using another NLP library Spacy - which we saw earlier in Chapter 2 too.

In [35]:
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.3.0/en_core_web_md-3.3.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting typing-extensions<4.2.0,>=3.7.4
  Downloading typing_extensions-4.1.1-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, en-core-web-md
  Attempting uninstall: typing-extensions
    Found existing installation: typing_extensions 4.4.0
    Uninstalling typing_extensions-4.4.0:
      Successfully uninstalled typing_extensions-4.4.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
tensorflow 2.6.4 requires h5py~=3.1.0, but you

In [38]:
import spacy

%time 
nlp = spacy.load('en_core_web_md')
# process a sentence using the model
mydoc = nlp("Canada is a large country")
#Get a vector for individual words
#print(doc[0].vector) #vector for 'Canada', the first word in the text 
print(mydoc.vector) #Averaged vector for the entire sentence

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 8.82 µs
[-6.34728014e-01  1.15934394e-01 -2.98869964e-02  4.20099981e-02
 -3.67478020e-02  4.15896103e-02  1.75415993e-01 -1.72071941e-02
  7.19052032e-02  2.21598005e+00 -1.77238017e-01 -1.15464009e-01
 -1.78752810e-01 -2.10909848e-03  4.38740030e-02  3.31759974e-02
 -2.07035989e-01  1.29705787e+00  2.45502405e-03  4.44060005e-02
 -2.26073980e-01 -2.55783975e-01  3.79201164e-03 -2.58204401e-01
 -2.83650398e-01 -1.15281843e-01  3.59430015e-01  1.04026809e-01
  5.20303957e-02 -8.42919946e-02 -1.23832203e-01 -1.36483997e-01
 -3.81584018e-02 -1.85376003e-01 -1.22726001e-01 -1.02931604e-01
 -9.99336019e-02 -1.16094187e-01 -1.00685999e-01 -1.07025996e-01
  2.86531985e-01 -1.46295205e-01  1.21236876e-01  8.73500109e-03
  8.22656527e-02 -2.35577583e-01  6.14646003e-02 -2.13399883e-02
 -7.77979940e-02  1.25501603e-01 -3.31021219e-01  5.10863960e-01
 -6.77040219e-03 -9.42272097e-02  3.10105979e-01  1.58439189e-01
  1.82869211e-01  1.967399

In [39]:
#What happens when I give a sentence with strange words (and stop words), and try to get its word vector in Spacy?
temp = nlp('practicalnlp is a newword')
temp[0].vector

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.