# Creating Word Vectors with word2vec

In this notebook, we create word vectors from a corpus of public-domain books, a selection from [Project Gutenberg](https://www.gutenberg.org/).

#### Load dependencies

In [1]:
# nltk stands for Natural Language Toolkit.
import nltk
from nltk import word_tokenize, sent_tokenize
import gensim
from gensim.models.word2vec import Word2Vec
# scikit-learn (pronounced "sy-kit learn") provides t-SNE, used to reduce the dimensionality of the word vectors so they can be plotted on a 2D graph.
from sklearn.manifold import TSNE
import pandas as pd
# bokeh for fun interactive plots.
from bokeh.io import output_notebook
from bokeh.plotting import show, figure

In [2]:
# Download NLTK's English-language tokenizer data ('punkt').
# We can think of it as a reference dictionary that helps us break strings of natural language
# into tokens: individual sentences or words.
# NOTE(review): newer NLTK releases reportedly look for 'punkt_tab' instead of 'punkt' -- confirm against the installed version.
nltk.download('punkt') 
# English-language sentence tokenizer (not all periods end sentences; 
# not all sentences start with a capital letter)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sshivagangeprakash/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

#### Load data

In [3]:
# Download the Gutenberg corpus: this is the text whose words we will turn into word vectors.
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to
[nltk_data]     /Users/sshivagangeprakash/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.


True

In [4]:
from nltk.corpus import gutenberg

In [5]:
len(gutenberg.fileids())

18

In [6]:
gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

#### Tokenize text

In [7]:
# Split the raw corpus text into a list of sentence strings.
# Line breaks inside a sentence are kept as '\n' characters (see the output below).
gberg_sent_tokens = sent_tokenize(gutenberg.raw())

In [8]:
gberg_sent_tokens[0:6]

['[Emma by Jane Austen 1816]\n\nVOLUME I\n\nCHAPTER I\n\n\nEmma Woodhouse, handsome, clever, and rich, with a comfortable home\nand happy disposition, seemed to unite some of the best blessings\nof existence; and had lived nearly twenty-one years in the world\nwith very little to distress or vex her.',
 "She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period.",
 'Her mother\nhad died too long ago for her to have more than an indistinct\nremembrance of her caresses; and her place had been supplied\nby an excellent woman as governess, who had fallen little short\nof a mother in affection.',
 "Sixteen years had Miss Taylor been in Mr. Woodhouse's family,\nless as a governess than a friend, very fond of both daughters,\nbut particularly of Emma.",
 'Between _them_ it was more the intimacy\nof sisters.',
 "Even before Miss Taylor had ceased to hold the nominal

In [9]:
gberg_sent_tokens[1]

"She was the youngest of the two daughters of a most affectionate,\nindulgent father; and had, in consequence of her sister's marriage,\nbeen mistress of his house from a very early period."

In [10]:
# tokenize into words
word_tokenize(gberg_sent_tokens[1])

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'s",
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [11]:
word_tokenize(gberg_sent_tokens[1])[14]

'father'

In [12]:
# A convenient corpus-reader method that handles newlines and tokenizes into sentences and words in one shot,
# giving a sequence of sentences, each a list of string tokens.
# Note that it tokenizes differently from word_tokenize: "sister's" becomes 'sister', "'", 's' here,
# whereas word_tokenize gives 'sister', "'s".
gberg_sents = gutenberg.sents()

In [13]:
gberg_sents[0:6]

[['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']'],
 ['VOLUME', 'I'],
 ['CHAPTER', 'I'],
 ['Emma',
  'Woodhouse',
  ',',
  'handsome',
  ',',
  'clever',
  ',',
  'and',
  'rich',
  ',',
  'with',
  'a',
  'comfortable',
  'home',
  'and',
  'happy',
  'disposition',
  ',',
  'seemed',
  'to',
  'unite',
  'some',
  'of',
  'the',
  'best',
  'blessings',
  'of',
  'existence',
  ';',
  'and',
  'had',
  'lived',
  'nearly',
  'twenty',
  '-',
  'one',
  'years',
  'in',
  'the',
  'world',
  'with',
  'very',
  'little',
  'to',
  'distress',
  'or',
  'vex',
  'her',
  '.'],
 ['She',
  'was',
  'the',
  'youngest',
  'of',
  'the',
  'two',
  'daughters',
  'of',
  'a',
  'most',
  'affectionate',
  ',',
  'indulgent',
  'father',
  ';',
  'and',
  'had',
  ',',
  'in',
  'consequence',
  'of',
  'her',
  'sister',
  "'",
  's',
  'marriage',
  ',',
  'been',
  'mistress',
  'of',
  'his',
  'house',
  'from',
  'a',
  'very',
  'early',
  'period',
  '.'],
 ['Her',
  'mother',
  'h

In [14]:
gberg_sents[4]

['She',
 'was',
 'the',
 'youngest',
 'of',
 'the',
 'two',
 'daughters',
 'of',
 'a',
 'most',
 'affectionate',
 ',',
 'indulgent',
 'father',
 ';',
 'and',
 'had',
 ',',
 'in',
 'consequence',
 'of',
 'her',
 'sister',
 "'",
 's',
 'marriage',
 ',',
 'been',
 'mistress',
 'of',
 'his',
 'house',
 'from',
 'a',
 'very',
 'early',
 'period',
 '.']

In [15]:
gberg_sents[4][14]

'father'

In [16]:
# another convenient method that we don't immediately need: 
gutenberg.words() 

['[', 'Emma', 'by', 'Jane', 'Austen', '1816', ']', ...]

In [17]:
# gutenberg.words() is roughly analogous to the following line, which need not be run
# (the corpus reader uses its own tokenizer, so the exact tokens can differ): 
# word_tokenize(gutenberg.raw())

In [18]:
# our Gutenberg corpus is about 2.6m tokens in length (punctuation marks count as tokens): 
len(gutenberg.words())

2621613

#### Run word2vec

In [20]:
# Train the word2vec model.
# sentences - the tokenized corpus: a sequence of sentences, each a list of string tokens.
# vector_size - number of dimensions of the word-vector space. It is our choice; we picked 64,
#   so every word vector will have 64 elements.
# sg - training architecture: 1 selects skip-gram, 0 selects CBOW.
#   Rule of thumb: skip-gram tends to suit smaller corpora, CBOW trains faster on larger ones.
# window - context size: 10 means up to 10 words on either side of the target word
#   (so up to 20 context words in total).
#   With skip-gram (as here) the target word is used to predict each of its context words;
#   CBOW does the reverse and uses the context words to predict the target word.
# min_count - a word must occur at least 5 times in the corpus for a word vector to be created for it.
# workers - number of worker threads used for training (ideally no more than the available CPU cores);
#   it affects training speed rather than what the model learns.
model = Word2Vec(sentences=gberg_sents, vector_size=64, sg=1, window=10, min_count=5, workers=8)

In [21]:
# save the model to a file.
model.save('raw_gutenberg_model.w2v')

#### Explore model

In [22]:
# If the model has already been trained and saved to a file, there is no need
# to re-train it: simply load it back from disk.
model = Word2Vec.load('raw_gutenberg_model.w2v')

In [24]:
# Look up the word vector for 'dog'; it is displayed as a 1-D array.
# We chose a 64-dimensional vector space, so the array has 64 elements.
model.wv['dog']

array([ 0.2713596 , -0.32924688,  0.11472257,  0.3351441 , -0.30741057,
       -0.23049717,  0.18597332, -0.50526196,  0.0210742 ,  0.39238462,
        0.5664741 , -0.5492611 , -0.15522766, -0.19445723, -0.13773799,
        0.99456644, -0.17534003,  0.2565941 , -0.28944844,  0.28690422,
        0.16329953,  0.36687422,  0.07957038,  0.409416  ,  0.09103061,
        0.56766564, -0.571075  ,  0.24915783,  0.04004098,  0.2911941 ,
       -0.14861006, -0.05701988, -0.1547411 , -0.33097813,  0.02262263,
       -0.15625168,  0.4689804 ,  0.32878602,  0.13954316, -0.12812825,
       -0.47772878,  0.07650188,  0.08373575, -0.23279129,  0.1028755 ,
        0.01376102,  0.10149986, -0.22131407,  0.08463605,  0.41793898,
       -0.12431577, -0.38518965,  0.3131011 ,  0.08848778, -0.0461773 ,
        0.33207992, -0.17040533, -0.02537598,  0.07207736,  0.25269946,
       -0.1350164 , -0.0142925 ,  0.10114615, -0.17924385], dtype=float32)

In [25]:
len(model.wv['dog'])

64

In [26]:
# List the words whose vectors are closest to `dog`.
# Closeness is measured by cosine similarity (a higher score means more similar), not by Euclidean distance.
model.wv.most_similar('dog') # (word, cosine similarity) pairs, most similar first

[('puppy', 0.7888559103012085),
 ('thief', 0.7744528651237488),
 ('cage', 0.7598984241485596),
 ('broth', 0.7489614486694336),
 ('sweeper', 0.7468899488449097),
 ('butcher', 0.7442649006843567),
 ('cow', 0.7399811148643494),
 ('bridle', 0.7391263246536255),
 ('chimney', 0.7368165254592896),
 ('boy', 0.7342114448547363)]

In [27]:
model.wv.most_similar('think')

[('suppose', 0.8353540897369385),
 ('contradict', 0.8350876569747925),
 ('manage', 0.8255674839019775),
 ('Mamma', 0.8251374363899231),
 ('awfully', 0.8210975527763367),
 ('know', 0.8210111260414124),
 ('really', 0.8178511261940002),
 ('believe', 0.8144370913505554),
 ('beaux', 0.813211977481842),
 ('NOW', 0.8130669593811035)]

In [28]:
model.wv.most_similar('day')

[('morning', 0.7813193202018738),
 ('night', 0.779412567615509),
 ('time', 0.7721558213233948),
 ('month', 0.752544105052948),
 ('week', 0.7454372644424438),
 ('evening', 0.7122092247009277),
 ('feasting', 0.6935468316078186),
 ('Saturday', 0.6819261312484741),
 ('Adar', 0.6806393265724182),
 ('fourteenth', 0.6693751811981201)]

In [29]:
model.wv.most_similar('father')

[('mother', 0.8717845678329468),
 ('brother', 0.857100784778595),
 ('sister', 0.8104619979858398),
 ('daughter', 0.7923604249954224),
 ('wife', 0.7680190801620483),
 ('uncle', 0.7439402341842651),
 ('Amnon', 0.7314168810844421),
 ('servant', 0.7286089658737183),
 ('bondwoman', 0.7266589403152466),
 ('younger', 0.7237505912780762)]

In [30]:
# Out of the 4 words, pick the one that does not belong with the others.
# gensim compares each word's vector with the mean of all the vectors in the list using cosine similarity
# (not Euclidean distance); the word that is least similar to that mean is reported as the one that doesn't match.
model.wv.doesnt_match("mother father daughter dog".split())

'dog'

In [31]:
model.wv.similarity('father', 'dog')

0.48597765

In [32]:
# Word-vector arithmetic: father - man + woman.
# The `positive` vectors are added and the `negative` vectors subtracted; the words closest
# (by cosine similarity) to the resulting vector are returned.
# Close, but not quite: the results are distinctly in the female direction,
# yet 'daughter' narrowly outranks the expected 'mother'.
model.wv.most_similar(positive=['father', 'woman'], negative=['man']) 

[('daughter', 0.7736457586288452),
 ('mother', 0.7729701995849609),
 ('wife', 0.7643260955810547),
 ('husband', 0.7586399912834167),
 ('sister', 0.7413561344146729),
 ('brother', 0.7309970259666443),
 ('Rachel', 0.7104281187057495),
 ('daughters', 0.6923649311065674),
 ('child', 0.678975522518158),
 ('Sarah', 0.6787348985671997)]

In [33]:
# more confident about this one: 
model.wv.most_similar(positive=['son', 'woman'], negative=['man']) 

[('Leah', 0.7589835524559021),
 ('daughter', 0.7383713722229004),
 ('wife', 0.7178788185119629),
 ('Rachel', 0.717748761177063),
 ('Sarai', 0.7163435816764832),
 ('Bethuel', 0.7106838226318359),
 ('Sarah', 0.7095574736595154),
 ('Hagar', 0.7044370174407959),
 ('Jephunneh', 0.7035619020462036),
 ('Abram', 0.7031012773513794)]

In [34]:
model.wv.most_similar(positive=['husband', 'woman'], negative=['man']) 

[('wife', 0.7204158306121826),
 ('child', 0.6890517473220825),
 ('widow', 0.6763985753059387),
 ('daughter', 0.6758059859275818),
 ('sister', 0.6665168404579163),
 ('mother', 0.6595192551612854),
 ('nurse', 0.650743842124939),
 ('maid', 0.6497228145599365),
 ('Rachel', 0.636143147945404),
 ('conceived', 0.623807430267334)]

In [35]:
model.wv.most_similar(positive=['king', 'woman'], negative=['man'], topn=30) 

[('Sarah', 0.6889079213142395),
 ('Babylon', 0.680236279964447),
 ('Rachel', 0.6771531701087952),
 ('Leah', 0.6528046727180481),
 ('Padanaram', 0.6503288149833679),
 ('Abram', 0.6444354057312012),
 ('tribute', 0.641265869140625),
 ('queen', 0.6361493468284607),
 ('Vashti', 0.6355467438697815),
 ('Solomon', 0.6344469785690308),
 ('Sarai', 0.6311295628547668),
 ('Judah', 0.6270841360092163),
 ('Ephron', 0.6246798634529114),
 ('Laban', 0.6235942244529724),
 ('Bethuel', 0.6219090819358826),
 ('Jerusalem', 0.620144784450531),
 ('Abraham', 0.6173027157783508),
 ('Heth', 0.6169054508209229),
 ('daughter', 0.6160672307014465),
 ('servants', 0.6158027052879333),
 ('Assyria', 0.6147435307502747),
 ('Jerubbaal', 0.6138827204704285),
 ('Nebuchadnezzar', 0.6124151945114136),
 ('Cain', 0.6115320324897766),
 ('Rahab', 0.6108534932136536),
 ('sware', 0.6097025871276855),
 ('Hanun', 0.6094104647636414),
 ('Isaac', 0.6089671850204468),
 ('Pharaoh', 0.6072671413421631),
 ('Lot', 0.6064099073410034)]

In [36]:
# impressive for such a small data set, without any cleaning, e.g., to lower case (covered next)

#### Reduce word vector dimensionality with t-SNE

In [38]:
# The input corpus has about 2.6 million tokens, but the model's vocabulary has only about 17,000 entries:
# that is the number of distinct tokens that occur at least min_count (5) times. Nothing bad here.
len(model.wv.key_to_index)

17011

In [39]:
# Matrix of all word vectors: one row per vocabulary entry, in vocabulary-index order
# (row i is the vector of model.wv.index_to_key[i]).
# Read the stored matrix directly instead of indexing model.wv with the key_to_index dict,
# which looks up every word one at a time and stacks the rows into a copy of this same matrix.
X = model.wv.vectors

In [40]:
# So far we have created a 64-dimensional vector space.
# That is hard to visualize, so we reduce it to 2 dimensions with t-SNE for plotting.
# n_components - number of output dimensions; 2 or 3 are the useful choices for visualization.
# random_state - fixes the random number generator so that re-running the notebook
#   reproduces the same 2D layout.
# The number of optimisation iterations is left at scikit-learn's default (1000; the documented
# minimum is 250). More iterations give the optimiser more time to converge, at the cost of run time.
# It is deliberately not passed by name: the keyword was renamed from n_iter to max_iter in newer
# scikit-learn releases, so spelling out either one ties the notebook to a version range.
tsne = TSNE(n_components=2, random_state=42)

In [41]:
X_2d = tsne.fit_transform(X)

In [42]:
X_2d[0:5]

array([[-13.193214, -38.32847 ],
       [-13.455554, -37.764153],
       [-13.208204, -38.044792],
       [-13.587155, -39.043606],
       [-13.054964, -37.547115]], dtype=float32)

In [44]:
# Create a DataFrame for storing the results and plotting: one row per vocabulary entry,
# holding its 2D t-SNE coordinates.
coords_df = pd.DataFrame(X_2d, columns=['x','y'])
# index_to_key is the list of tokens in vocabulary-index order, i.e. the same order in which
# the word vectors were stacked before the t-SNE transform. Using the list avoids relying on
# dict iteration order and on pandas coercing a dict view into a column.
coords_df['token'] = model.wv.index_to_key

In [45]:
coords_df.head()

Unnamed: 0,x,y,token
0,-13.193214,-38.328468,","
1,-13.455554,-37.764153,the
2,-13.208204,-38.044792,and
3,-13.587155,-39.043606,.
4,-13.054964,-37.547115,of


In [46]:
# Save the t-SNE fit_transform output to a file, so the time-consuming transform need not be run again.
coords_df.to_csv('raw_gutenberg_tsne.csv', index=False)

#### Visualize 2D representation of word vectors

In [47]:
# Re-load the saved t-SNE coordinates.
# keep_default_na=False stops pandas from converting tokens that happen to be spelled like
# missing-value markers (e.g. 'null', 'NA', 'nan') into NaN, and the explicit dtype keeps
# every token a string. The x and y columns are still parsed as numbers.
coords_df = pd.read_csv('raw_gutenberg_tsne.csv', keep_default_na=False, dtype={'token': str})

In [48]:
output_notebook() # output bokeh plots inline in notebook

In [49]:
# Take a random subset of at most 5000 tokens to plot (presumably to keep the chart readable).
# random_state makes the subset the same on every run; min() avoids the error that sample()
# raises when asked for more rows than the frame contains.
subset_df = coords_df.sample(n=min(5000, len(coords_df)), random_state=42)

In [51]:
# Square 800x800 bokeh figure to draw the word map on.
p = figure(width=800, height=800)
# Draw each sampled token as a text glyph at its (x, y) t-SNE coordinates.
# Assigning the result to `_` keeps the returned renderer from being echoed as cell output.
_ = p.text(x=subset_df.x, y=subset_df.y, text=subset_df.token)

In [52]:
show(p)