In [1]:
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import RegexpTokenizer

In [2]:
# Toy corpus: three short documents.
doc1 = "Apple and orange are fruits. Apple and Google are IT companies."
doc2 = "Hyderabad and Secunderabad are twin cities. Vijayawada and Bejawada are names for same city"
doc3 = "India has 29 states. Telangana is one of them"
# The pattern keeps runs of word characters only, so punctuation marks are dropped.
tokenizer = RegexpTokenizer(r'\w+')
token1, token2, token3 = [tokenizer.tokenize(doc) for doc in (doc1, doc2, doc3)]
print(type(token1))
# One token list per document, in corpus order.
sentence = [token1, token2, token3]
print(sentence)

<class 'list'>
[['Apple', 'and', 'orange', 'are', 'fruits', 'Apple', 'and', 'Google', 'are', 'IT', 'companies'], ['Hyderabad', 'and', 'Secunderabad', 'are', 'twin', 'cities', 'Vijayawada', 'and', 'Bejawada', 'are', 'names', 'for', 'same', 'city'], ['India', 'has', '29', 'states', 'Telangana', 'is', 'one', 'of', 'them']]


In [3]:
# create word2vec
# Train a model on the tokenised toy corpus held in `sentence`.
#   size      - number of dimensions of every word vector
#   min_count - 1, so that no token of this tiny corpus is filtered out
#   iter      - number of passes over the corpus
# workers=1: a fixed `seed` only yields repeatable vectors when training is
# single-threaded (gensim also asks for PYTHONHASHSEED to be pinned on
# Python 3); with several worker threads the OS scheduling order leaks into
# the result, which defeats the purpose of seed=10.
model = Word2Vec(sentence, sg=0, min_count=1, size=5, window=5, workers=1,
                 seed=10, alpha=0.01, min_alpha=0.001,
                 compute_loss=True, iter=50)
print(type(model))

<class 'gensim.models.word2vec.Word2Vec'>


In [4]:
model.get_latest_training_loss()

1328.726806640625

In [5]:
# print word vector for any word
# Look the word up through `model.wv`, like the similarity cells do; indexing
# the Word2Vec model object directly is deprecated and emits a warning.
model.wv['Apple']

  


array([ 0.04576896, -0.0460228 ,  0.08032094, -0.06570196, -0.0931375 ],
      dtype=float32)

In [6]:
# Look the word up through `model.wv`; indexing the model directly is deprecated.
model.wv['orange']

  """Entry point for launching an IPython kernel.


array([ 0.02377045,  0.09000102,  0.03034329, -0.08768976,  0.00173387],
      dtype=float32)

In [8]:
# Look the word up through `model.wv`; indexing the model directly is deprecated.
model.wv['Google']

  """Entry point for launching an IPython kernel.


array([-0.09084674, -0.09710447,  0.05933991, -0.07595898,  0.05437203],
      dtype=float32)

In [7]:
# similarity between two words
model.wv.similarity('Hyderabad','Secunderabad')

-0.55234265

In [9]:
model.wv.similarity('India','Telangana')

0.35862145

In [17]:
model.wv.similarity('Apple','orange')

0.098391965

In [10]:
model.wv.similarity('Apple','Google')

0.19958498

In [11]:
model.wv.similarity('Apple','fruits')

0.36182353

In [35]:
model.wv.similarity('Apple','IT')

-0.26950532

In [36]:
model.wv.similarity('Vijayawada','Bejawada')

-0.085544378

In [10]:
# query with 'Apple' and 'fruits' as positive terms and 'orange' as the negative
# term; topn=1 keeps only the single best-scoring word
model.wv.most_similar(positive=['Apple', 'fruits'], negative=['orange'], topn=1)

[('India', 0.7632635831832886)]

In [11]:
# print all words
print(model.wv.index2word)
print(len(model.wv.index2word))

['and', 'are', 'Apple', 'orange', 'fruits', 'Google', 'IT', 'companies', 'Hyderabad', 'Secunderabad', 'twin', 'cities', 'Vijayawada', 'Bejawada', 'names', 'for', 'same', 'city', 'India', 'has', '29', 'states', 'Telangana', 'is', 'one', 'of', 'them']
27


In [12]:
# show only the first two rows of the vector matrix (slice 0:2), not every word
model.wv.vectors[0:2]

array([[ 0.02612994,  0.00679522,  0.03088141, -0.04218544,  0.01178725,
        -0.02711239,  0.00707143, -0.01302773, -0.02990203, -0.00189833],
       [ 0.02507149, -0.04845297, -0.04416666,  0.01537336, -0.02588061,
         0.03807232,  0.01327009, -0.04607012,  0.01074946, -0.03296657]],
      dtype=float32)

In [14]:
# pair every vocabulary word with the matching row of the vector matrix
w2v = {word: vector for word, vector in zip(model.wv.index2word, model.wv.vectors)}

In [21]:
# build word2vec in two phases - first build the vocabulary, then train on it
model3 = Word2Vec(min_count=1, size=10)
model3.build_vocab(sentence)
print(model3.epochs)
print(len(sentence))
# `sentence` holds one token list per document, so its length is a count of
# sentences, not of words. Training progress must therefore be sized with
# total_examples (build_vocab recorded that count as corpus_count); passing the
# sentence count as total_words under-reports the corpus size.
model3.train(sentence, total_examples=model3.corpus_count, epochs=model3.epochs)

5
3


(35, 170)

In [22]:
# print only the first row of the vector matrix (slice 0:1), not every word
print(model3.wv.vectors[0:1])

[[ 0.00998848 -0.00347426  0.01405674 -0.04747547 -0.03307188 -0.00033746
  -0.0415465  -0.02504256 -0.01051377 -0.02789557]]


In [12]:
# read pre-trained vectors from glove
import os
os.chdir('C:/Vishnu Murthy Chakka/Knowledge Base/Data Science/Manuals/Natural Language Processing/glove')
print(os.getcwd())

import numpy as np

# Build {word: vector} from the GloVe text file. The first whitespace-separated
# field of a line is used as the word, the remaining fields as its vector.
# The file is deliberately read as bytes: bytes.split() breaks on ASCII
# whitespace only, so a token containing a non-ASCII space character is not
# torn apart; only the word itself is decoded to str.
w2v = {}
with open("glove.6B.50d.txt", "rb") as lines:
    for line in lines:
        fields = line.split()       # split once per line and reuse the result
        if not fields:              # a blank line carries no word - skip it
            continue
        w2v[fields[0].decode("utf-8")] = np.array(fields[1:]).astype(float)

C:\Vishnu Murthy Chakka\Knowledge Base\Data Science\Manuals\Natural Language Processing\glove


In [13]:
w2v["the"]

array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01])

In [14]:
print(w2v['frog'])
print(w2v['frogs'])

[ 0.61038  -0.20757  -0.71951   0.89304   0.32482   0.76564   0.1814
 -0.33086   0.79173  -0.31664   0.011143  0.45412   1.5992    0.013494
 -0.093646  0.19245   0.251     1.1277   -1.0897   -0.42909  -1.1327
 -0.90465   0.5617   -0.058464  1.0007   -0.39017  -0.41665   0.73721
 -0.53824  -0.95993   0.67929  -0.59053   0.13408   0.54273  -0.36615
  0.014978 -0.2496   -0.81088   0.078905 -0.97552  -0.66394  -0.18508
 -0.87174   0.30782   1.2839   -0.14884   0.62178  -1.509     0.14582
 -0.31682 ]
[ 0.16142  -1.0424   -0.46239   0.77606   0.26464   0.31932  -0.76033
 -0.52584   0.38692   0.019854  0.13353   0.51689   1.1251    0.10543
  0.055542 -0.18489   0.16058   0.30495  -1.6176   -0.88914  -2.0381
 -0.65167   1.6266   -0.50523   0.72486   0.061601  0.024211 -0.12515
 -0.26353  -1.3289    0.94717   0.71643  -0.24545  -0.13676   0.33316
  0.15309  -0.40259  -0.22679   0.28291  -0.69532  -1.3783   -0.61586
 -0.70103   1.3187    1.043     0.77337   0.49553  -0.91989  -0.28027
 -0.45603 

In [15]:
# load pre-trained vectors from google - 300 dimensions i.e. a word is represented by array of 300 numbers
import os
# the vectors file below is opened by bare name, so the working directory is
# switched to the folder that contains it
os.chdir('C:/Vishnu Murthy Chakka/Knowledge Base/Data Science/Manuals/Natural Language Processing/google')
print(os.getcwd())
from gensim.models import KeyedVectors 
# NOTE(review): this rebinds `model`, which earlier in the notebook held the
# Word2Vec model trained on the toy corpus; the cells that follow use `model`
# as the vectors loaded here (binary word2vec format).
model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

C:\Vishnu Murthy Chakka\Knowledge Base\Data Science\Manuals\Natural Language Processing\google


In [16]:
model['dog']

array([ 5.12695312e-02, -2.23388672e-02, -1.72851562e-01,  1.61132812e-01,
       -8.44726562e-02,  5.73730469e-02,  5.85937500e-02, -8.25195312e-02,
       -1.53808594e-02, -6.34765625e-02,  1.79687500e-01, -4.23828125e-01,
       -2.25830078e-02, -1.66015625e-01, -2.51464844e-02,  1.07421875e-01,
       -1.99218750e-01,  1.59179688e-01, -1.87500000e-01, -1.20117188e-01,
        1.55273438e-01, -9.91210938e-02,  1.42578125e-01, -1.64062500e-01,
       -8.93554688e-02,  2.00195312e-01, -1.49414062e-01,  3.20312500e-01,
        3.28125000e-01,  2.44140625e-02, -9.71679688e-02, -8.20312500e-02,
       -3.63769531e-02, -8.59375000e-02, -9.86328125e-02,  7.78198242e-03,
       -1.34277344e-02,  5.27343750e-02,  1.48437500e-01,  3.33984375e-01,
        1.66015625e-02, -2.12890625e-01, -1.50756836e-02,  5.24902344e-02,
       -1.07421875e-01, -8.88671875e-02,  2.49023438e-01, -7.03125000e-02,
       -1.59912109e-02,  7.56835938e-02, -7.03125000e-02,  1.19140625e-01,
        2.29492188e-01,  

In [14]:
import numpy as np
np.mean(model['dog'])

-0.0088713523

In [4]:
#printing similarity index
print(model.similarity('woman', 'man'))

0.766401


In [5]:
#printing similarity index
print(model.similarity('Delhi', 'India'))

0.663204


In [42]:
#printing similarity index
print(model.similarity('Delhi', 'Islamabad'))

0.643999


In [43]:
#printing similarity index
print(model.similarity('Delhi', 'London'))

0.447594


In [6]:
# load vectors from 'word2vec_pre_kv_c', a file located through
# gensim.test.utils.datapath (this is not the GoogleNews file), read as text
# rather than binary
from gensim.test.utils import datapath
wv_from_text = KeyedVectors.load_word2vec_format(datapath('word2vec_pre_kv_c'), binary=False)  # C text format

In [44]:
type(wv_from_text)

gensim.models.keyedvectors.Word2VecKeyedVectors

In [11]:
wv_from_text['the']

array([-0.56110603, -1.97569799,  1.66395497, -1.23224604,  0.75475103,
        0.98576403,  2.26144099, -0.59829003, -0.47433099, -1.41610503], dtype=float32)

In [12]:
wv_from_text['for']

array([-0.478865  , -2.14878297,  1.82030797, -1.33254898,  0.86115199,
        1.06142497,  2.51318002, -0.62415099, -0.508928  , -1.59284794], dtype=float32)

In [8]:
# 'dog' is missing from this small vocabulary, so the lookup raises KeyError.
# Catch and print it: the failure is still shown, but the notebook keeps
# running when executed top to bottom.
try:
    print(wv_from_text['dog'])
except KeyError as err:
    print("KeyError:", err)

KeyError: "word 'dog' not in vocabulary"

In [21]:
class MeanEmbeddingVectorizer(object):
    """Turn tokenised documents into fixed-length vectors by averaging word vectors.

    Parameters
    ----------
    word2vec : mapping of word -> vector
        Anything that supports ``word in word2vec`` and ``word2vec[word]``.
        The vector length is read from the first stored value when the object
        has a ``values()`` method, otherwise from a ``vector_size`` attribute
        if one exists (0 when neither is available).
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        # An empty document (or one made only of unknown words) must map to a
        # zero vector of the same length as every other row, so the
        # dimensionality has to be known up front.
        values = getattr(word2vec, "values", None)
        if callable(values):
            first_vector = next(iter(values()), None)
            self.dim = 0 if first_vector is None else len(first_vector)
        else:
            self.dim = getattr(word2vec, "vector_size", 0)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` unchanged."""
        return self

    def transform(self, X):
        """Return one row per document in ``X``.

        ``X`` is an iterable of token sequences. Each row is the element-wise
        mean of the vectors of the tokens found in ``word2vec``; a document
        with no known token yields a row of ``self.dim`` zeros.
        """
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

In [18]:
# use keras for tokenization
from keras.preprocessing import text, sequence
doc1 = "Apple and orange are fruits. Apple and Google are IT companies."
doc2 = "Hyderabad and Secunderabad are twin cities. Vijayawada and Bejawada are names for same city"
doc3 = "India has 29 states. Telangana is one of them"
# the corpus is the three raw document strings, in order
corpus = [doc1, doc2, doc3]
# fit the tokenizer on the corpus, then keep its word -> integer index mapping
token = text.Tokenizer()
token.fit_on_texts(corpus)
word_index = token.word_index

Using TensorFlow backend.


In [56]:
type(word_index)

dict

In [59]:
word_index.items()

dict_items([('and', 1), ('are', 2), ('apple', 3), ('orange', 4), ('fruits', 5), ('google', 6), ('it', 7), ('companies', 8), ('hyderabad', 9), ('secunderabad', 10), ('twin', 11), ('cities', 12), ('vijayawada', 13), ('bejawada', 14), ('names', 15), ('for', 16), ('same', 17), ('city', 18), ('india', 19), ('has', 20), ('29', 21), ('states', 22), ('telangana', 23), ('is', 24), ('one', 25), ('of', 26), ('them', 27)])

In [19]:
# get index of tokens created
print(word_index)          # this is dictionary/vocabulary

{'and': 1, 'are': 2, 'apple': 3, 'orange': 4, 'fruits': 5, 'google': 6, 'it': 7, 'companies': 8, 'hyderabad': 9, 'secunderabad': 10, 'twin': 11, 'cities': 12, 'vijayawada': 13, 'bejawada': 14, 'names': 15, 'for': 16, 'same': 17, 'city': 18, 'india': 19, 'has': 20, '29': 21, 'states': 22, 'telangana': 23, 'is': 24, 'one': 25, 'of': 26, 'them': 27}


In [20]:
# convert words/tokens to index
print(token.texts_to_sequences(corpus))

[[3, 1, 4, 2, 5, 3, 1, 6, 2, 7, 8], [9, 1, 10, 2, 11, 12, 13, 1, 14, 2, 15, 16, 17, 18], [19, 20, 21, 22, 23, 24, 25, 26, 27]]


In [21]:
# padding
# every document becomes a row of exactly maxlen=20 indices; in the printed
# result the shorter rows are filled with 0 on the left
# NOTE(review): the name `input` shadows Python's built-in input() for the rest
# of the session - consider renaming once it is confirmed nothing else reads it
input = sequence.pad_sequences(token.texts_to_sequences(corpus), maxlen=20)
print(input)

[[ 0  0  0  0  0  0  0  0  0  3  1  4  2  5  3  1  6  2  7  8]
 [ 0  0  0  0  0  0  9  1 10  2 11 12 13  1 14  2 15 16 17 18]
 [ 0  0  0  0  0  0  0  0  0  0  0 19 20 21 22 23 24 25 26 27]]


In [61]:
# create token-embedding mapping
# Row i receives the vector of the word whose tokenizer index is i. Row 0 and
# the rows of words that are absent from w2v stay all-zero.
# NOTE(review): the 50 columns presumably match the glove.6B.50d vectors, so
# w2v should be the GloVe dictionary when this cell runs - confirm.
embedding_matrix = np.zeros((len(word_index) + 1, 50))
for word, i in word_index.items():
    vector = w2v.get(word)
    if vector is not None:
        embedding_matrix[i] = vector
print(embedding_matrix)

[[ 0.        0.        0.       ...,  0.        0.        0.      ]
 [ 0.26818   0.14346  -0.27877  ..., -0.6321   -0.25028  -0.38097 ]
 [ 0.96193   0.012516  0.21733  ...,  0.14032  -0.38468  -0.38712 ]
 ..., 
 [ 0.31474   0.41662   0.1348   ..., -0.20526   0.07009  -0.11568 ]
 [ 0.70853   0.57088  -0.4716   ..., -0.22562  -0.093918 -0.80375 ]
 [ 0.64642  -0.556     0.47038  ..., -0.35831  -0.10995  -0.447   ]]
