In [2]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader

import numpy as np

import re

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

import relations

In [3]:
# import word list
imported_tuples = relations.read_relations("./relations/country-city.txt")

In [4]:
# Load the model
model = gensim.downloader.load('glove-wiki-gigaword-50')
dims = 50

In [5]:
# preprocess data
word_tuples = []
for tup in imported_tuples:
    if " " not in tup[0].strip() and " " not in tup[1].strip():
        word_tuples.append((tup[0].strip().lower(), tup[1].strip().lower()))

In [6]:
# conversion to vectors
word_vectors = []
for tup in word_tuples:
    if tup[0] in model.index_to_key and tup[1] in model.index_to_key:
        word_vectors.append((model.get_vector(tup[0]), model.get_vector(tup[1])))

In [7]:
word_tuples[:5]

[('afghanistan', 'kabul'),
 ('albania', 'tirana'),
 ('algeria', 'algiers'),
 ('angola', 'luanda'),
 ('armenia', 'yerevan')]

In [8]:
word_vectors[1]

(array([ 0.83302  , -0.45109  ,  0.036124 , -0.071542 , -0.43803  ,
        -0.35345  ,  0.44737  ,  0.80139  , -0.68828  , -0.78343  ,
         0.72771  , -1.2222   ,  0.19957  , -0.77006  ,  0.58453  ,
        -0.091821 ,  0.087678 ,  0.31854  ,  0.70997  ,  0.9374   ,
        -0.46157  , -0.17633  , -0.40542  ,  0.67545  ,  0.089017 ,
        -0.75488  ,  0.24461  , -0.059126 ,  0.0026154,  0.27766  ,
         1.6444   ,  0.94396  , -1.3238   ,  0.020666 , -0.52746  ,
         0.83906  , -0.86562  ,  0.067062 , -0.38675  ,  0.21263  ,
        -0.069293 ,  0.36148  ,  0.83636  , -1.8639   , -0.85996  ,
         0.55211  , -0.090987 ,  0.70084  , -0.11893  , -1.7953   ],
       dtype=float32),
 array([ 0.82105 , -0.21744 ,  0.43371 ,  0.31729 , -0.94054 , -1.2881  ,
         0.7401  ,  0.68178 , -1.1084  , -0.045213,  0.31385 , -1.5741  ,
        -0.069105, -0.92858 , -0.12261 , -0.05564 , -0.58251 ,  1.3842  ,
         0.28483 ,  0.50501 ,  0.24093 ,  1.017   , -0.26853 ,  0.60525 ,


## Actual Work

#### Create an Array X: x<sub>i</sub> = v<sub>i</sub> - u<sub>i</sub>

In [9]:
X = list()
for u, v in word_vectors:
    X.append(u - v)
    
X[1]

array([ 0.01196998, -0.23365001, -0.39758602, -0.388832  ,  0.50251   ,
        0.93465   , -0.29273003,  0.11961001,  0.42012   , -0.738217  ,
        0.41386002,  0.35189998,  0.268675  ,  0.15851998,  0.70713997,
       -0.036181  ,  0.670188  , -1.06566   ,  0.42514   ,  0.43238997,
       -0.7025    , -1.1933299 , -0.13689   ,  0.07020003,  0.173776  ,
       -0.55845   , -0.0662    , -0.326136  ,  0.9675654 ,  0.59071004,
        0.97688   , -0.01770997, -0.41065997,  0.005025  , -0.71237   ,
        0.32705003, -0.94807404,  0.322702  , -0.7925    , -0.38448998,
       -0.22698301,  0.02359   ,  0.48071998, -0.5540999 , -0.32920003,
        0.15780002, -0.010754  ,  0.03328001, -0.66625   , -0.3699    ],
      dtype=float32)

#### Shift Origin by computing M and subtracting from X

In [10]:
M = np.zeros((dims))

for x in X:
    M += x

M /= len(X)

In [11]:
M

array([-0.16746133, -0.26994048,  0.14773648,  0.40287001,  0.13896023,
        0.83586751, -0.09433237,  0.01654571,  0.60620341, -0.19630702,
        0.45520297,  0.43295857,  0.16052792, -0.00137272,  0.34746849,
       -0.12763229,  0.47484247, -0.4396687 ,  0.22809654,  0.02233975,
       -0.78939633, -0.56049448,  0.35158877, -0.03756598,  0.28124142,
       -0.50379695,  0.10671058, -0.60775114,  0.1895121 ,  0.30341821,
        0.91621018,  0.22692747, -0.15959574,  0.40773427, -0.27974377,
        0.08509954, -0.4772205 ,  0.19109579, -0.54973325, -0.5939445 ,
       -0.07247331,  0.09992009,  0.42285058, -0.36621386,  0.02810357,
        0.44685965, -0.39466281,  0.02177201, -0.26247528, -0.21880445])

In [12]:
for i in range(len(X)):
    X[i] = X[i] - M

X[1]

array([ 0.17943131,  0.03629046, -0.5453225 , -0.79170201,  0.36354978,
        0.0987825 , -0.19839766,  0.1030643 , -0.18608341, -0.54190997,
       -0.04134295, -0.08105859,  0.10814708,  0.15989271,  0.35967148,
        0.09145129,  0.19534554, -0.6259913 ,  0.19704346,  0.41005022,
        0.08689634, -0.63283545, -0.48847876,  0.107766  , -0.10746542,
       -0.05465303, -0.17291058,  0.28161515,  0.77805332,  0.28729183,
        0.06066984, -0.24463744, -0.25106423, -0.40270927, -0.43262621,
        0.24195049, -0.47085354,  0.1316062 , -0.24276677,  0.20945452,
       -0.1545097 , -0.0763301 ,  0.0578694 , -0.18788606, -0.3573036 ,
       -0.28905963,  0.38390881,  0.011508  , -0.40377471, -0.15109554])

#### Compute the Covariance matrix K

In [13]:
dim = dict()

for i in range(dims):
    dim[i] = np.zeros((len(X)))
    for j in range(len(X)):
        dim[i][j] = X[j][i]

# should be 0.17555907 (see above)
dim[0][1]

0.17943131063374684

In [14]:
# test that * works as element-wise product
print(dim[1][0], dim[2][0])

print((dim[1] * dim[2])[0])

-0.03580950646788528 -0.11962651271078226
0.004283766380647317


In [15]:
def covariance(i, j):
    temp_array = dim[i] * dim[j]    
    k = np.sum(temp_array)/len(X)
    #print(k, temp_array)
    return k
    
covariance(1, 2)

0.02107461676235216

In [16]:
K = np.zeros((dims, dims))

for i in range(dims):    
    for j in range(dims):
        K[i][j] = covariance(i, j)
        
K[1][2]

0.02107461676235216

In [33]:
# maybe replace with custom function

eigenvalues, eigenvectors = np.linalg.eig(K)

eigenvalues, eigenvectors

(array([1.00244385, 0.68029064, 0.5850016 , 0.50181911, 0.45811383,
        0.42209691, 0.36218514, 0.34618343, 0.32222055, 0.3154165 ,
        0.26328737, 0.25958291, 0.23576551, 0.21104976, 0.20178519,
        0.18492403, 0.17671467, 0.14980042, 0.1475237 , 0.13995451,
        0.12886042, 0.11449635, 0.10983665, 0.10055874, 0.09503518,
        0.09103855, 0.08417678, 0.07793335, 0.07367732, 0.0705138 ,
        0.06465424, 0.06235307, 0.0604404 , 0.05617269, 0.04196334,
        0.04007382, 0.03712316, 0.03084347, 0.02913823, 0.02467993,
        0.02362469, 0.01784515, 0.01404871, 0.00497884, 0.00129363,
        0.00167848, 0.00224649, 0.00302789, 0.00368294, 0.00358384]),
 array([[ 0.10686747, -0.00548069,  0.18507237, ...,  0.15276877,
         -0.08820277, -0.03310572],
        [-0.08439332, -0.34861174, -0.25638246, ...,  0.10971348,
         -0.00039904,  0.04962353],
        [-0.02751158,  0.02971406, -0.21743432, ..., -0.15147681,
          0.24564685, -0.13620884],
        ...,

In [34]:
eigenvectors = eigenvectors.T

In [35]:
# selecting top k eigen vectors
k = 10
top_k_eig = eigenvectors[:k]

In [36]:
def getProjectionOnEigSpace(point):
    projected_vector = []
    for eigvec in top_k_eig:
        projected_vector.append(np.dot(point, eigvec))
    return projected_vector

In [37]:
getProjectionOnEigSpace(model.get_vector("india"))

[-1.1601698835577354,
 -1.115070366022729,
 -1.2812101352924623,
 -1.611899824373968,
 1.7889040385397394,
 0.7522699097705754,
 -1.4527401450323325,
 0.5906267584522675,
 2.2436400066530533,
 0.10116232939267411]

In [42]:
np.dot(eigenvectors[0], eigenvectors[6])

8.153200337090993e-17