In [1]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader

import numpy as np

import re

from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

import relations

In [3]:
# import word list
imported_tuples = relations.read_relations("./relations/country-city.txt")

In [4]:
# Load the model
model = gensim.downloader.load('glove-wiki-gigaword-50')
dims = 50

In [6]:
# preprocess data
word_tuples = []
for tup in imported_tuples:
    if " " not in tup[0].strip() and " " not in tup[1].strip():
        word_tuples.append((tup[0].strip().lower(), tup[1].strip().lower()))

In [7]:
# conversion to vectors
word_vectors = []
for tup in word_tuples:
    if tup[0] in model.vocab and tup[1] in model.vocab:
        word_vectors.append((model.get_vector(tup[0]), model.get_vector(tup[1])))

In [9]:
word_tuples[:5]

[('afghanistan', 'kabul'),
 ('albania', 'tirana'),
 ('algeria', 'algiers'),
 ('angola', 'luanda'),
 ('armenia', 'yerevan')]

In [11]:
word_vectors[1]

(array([ 0.83302  , -0.45109  ,  0.036124 , -0.071542 , -0.43803  ,
        -0.35345  ,  0.44737  ,  0.80139  , -0.68828  , -0.78343  ,
         0.72771  , -1.2222   ,  0.19957  , -0.77006  ,  0.58453  ,
        -0.091821 ,  0.087678 ,  0.31854  ,  0.70997  ,  0.9374   ,
        -0.46157  , -0.17633  , -0.40542  ,  0.67545  ,  0.089017 ,
        -0.75488  ,  0.24461  , -0.059126 ,  0.0026154,  0.27766  ,
         1.6444   ,  0.94396  , -1.3238   ,  0.020666 , -0.52746  ,
         0.83906  , -0.86562  ,  0.067062 , -0.38675  ,  0.21263  ,
        -0.069293 ,  0.36148  ,  0.83636  , -1.8639   , -0.85996  ,
         0.55211  , -0.090987 ,  0.70084  , -0.11893  , -1.7953   ],
       dtype=float32),
 array([ 0.82105 , -0.21744 ,  0.43371 ,  0.31729 , -0.94054 , -1.2881  ,
         0.7401  ,  0.68178 , -1.1084  , -0.045213,  0.31385 , -1.5741  ,
        -0.069105, -0.92858 , -0.12261 , -0.05564 , -0.58251 ,  1.3842  ,
         0.28483 ,  0.50501 ,  0.24093 ,  1.017   , -0.26853 ,  0.60525 ,


## Actual Work

#### Create an Array X: x<sub>i</sub> = v<sub>i</sub> - u<sub>i</sub>

In [40]:
X = list()
for u, v in word_vectors:
    X.append(u - v)
    
X[1]

array([ 0.01196998, -0.23365001, -0.39758602, -0.388832  ,  0.50251   ,
        0.93465   , -0.29273003,  0.11961001,  0.42012   , -0.738217  ,
        0.41386002,  0.35189998,  0.268675  ,  0.15851998,  0.70713997,
       -0.036181  ,  0.670188  , -1.06566   ,  0.42514   ,  0.43238997,
       -0.7025    , -1.1933299 , -0.13689   ,  0.07020003,  0.173776  ,
       -0.55845   , -0.0662    , -0.326136  ,  0.9675654 ,  0.59071004,
        0.97688   , -0.01770997, -0.41065997,  0.005025  , -0.71237   ,
        0.32705003, -0.94807404,  0.322702  , -0.7925    , -0.38448998,
       -0.22698301,  0.02359   ,  0.48071998, -0.5540999 , -0.32920003,
        0.15780002, -0.010754  ,  0.03328001, -0.66625   , -0.3699    ],
      dtype=float32)

#### Shift Origin by computing M and subtracting from X

In [42]:
M = np.zeros((dims))

for x in X:
    M += x

M /= len(X)

In [43]:
M

array([-0.16358909, -0.29327068,  0.14220066,  0.41063505,  0.13229261,
        0.8263264 , -0.07271546,  0.0092053 ,  0.60268251, -0.1694377 ,
        0.46712787,  0.42413205,  0.16789689, -0.00893131,  0.34081271,
       -0.13949517,  0.45539964, -0.42583672,  0.24460038,  0.01284352,
       -0.78364116, -0.56847489,  0.36275386, -0.03245362,  0.264267  ,
       -0.47392792,  0.08204632, -0.60435721,  0.17515299,  0.30515276,
        0.89206032,  0.21477669, -0.17062754,  0.41196745, -0.27975058,
        0.07242421, -0.49159007,  0.19052438, -0.54776943, -0.60771226,
       -0.06636489,  0.08486955,  0.41415574, -0.36608785,  0.00916646,
        0.42577849, -0.38137972,  0.01636136, -0.25266835, -0.22136516])

In [44]:
for i in range(len(X)):
    X[i] = X[i] - M

X[1]

array([ 0.17555907,  0.05962067, -0.53978668, -0.79946705,  0.37021741,
        0.1083236 , -0.22001458,  0.11040471, -0.18256251, -0.56877929,
       -0.05326785, -0.07223207,  0.10077811,  0.16745129,  0.36632726,
        0.10331417,  0.21478837, -0.63982328,  0.18053961,  0.41954646,
        0.08114117, -0.62485504, -0.49964386,  0.10265365, -0.090491  ,
       -0.08452206, -0.14824632,  0.27822121,  0.79241243,  0.28555728,
        0.0848197 , -0.23248667, -0.24003243, -0.40694245, -0.4326194 ,
        0.25462582, -0.45648398,  0.13217761, -0.24473059,  0.22322228,
       -0.16061812, -0.06127955,  0.06656424, -0.18801207, -0.33836649,
       -0.26797847,  0.37062572,  0.01691866, -0.41358164, -0.14853483])

#### Compute the Covariance matrix K

In [53]:
dim = dict()

for i in range(dims):
    dim[i] = np.zeros((len(X)))
    for j in range(len(X)):
        dim[i][j] = X[j][i]

# should be 0.17555907 (see above)
dim[0][1]

0.17555907144085736

In [55]:
# test that * works as element-wise product
print(dim[1][0], dim[2][0])

print((dim[1] * dim[2])[0])

-0.01247930047201351 -0.11409069716613343
0.0014237720909976795


In [61]:
def covariance(i, j):
    temp_array = dim[i] * dim[j]    
    k = np.sum(temp_array)/len(X)
    #print(k, temp_array)
    return k
    
covariance(1, 2)

0.016251801836633637

In [65]:
K = np.zeros((dims, dims))

for i in range(dims):    
    for j in range(dims):
        K[i][j] = covariance(i, j)
        
K[1][2]

0.016251801836633637

In [68]:
# maybe replace with custom function

eigenvalues, eigenvectors = np.linalg.eig(K)

eigenvalues, eigenvectors

(array([1.00510262, 0.69495076, 0.57102298, 0.49854698, 0.47618954,
        0.38597204, 0.3439272 , 0.33056629, 0.30771702, 0.29542961,
        0.26532177, 0.25464861, 0.22036294, 0.20498949, 0.19928709,
        0.17553225, 0.15996901, 0.16287064, 0.14757134, 0.14003818,
        0.11645407, 0.114497  , 0.1030991 , 0.09884275, 0.09080937,
        0.09384069, 0.08280012, 0.07604933, 0.07477184, 0.06784962,
        0.06575399, 0.0625915 , 0.05708753, 0.05194403, 0.04509545,
        0.03920581, 0.03612059, 0.03004703, 0.0291661 , 0.02349725,
        0.02428605, 0.01775964, 0.0167138 , 0.00500559, 0.00131223,
        0.00171265, 0.00230432, 0.00303247, 0.00360654, 0.00380415]),
 array([[ 0.06849259, -0.07427186, -0.08238581, ..., -0.13856332,
          0.0907733 , -0.04034255],
        [-0.07439516, -0.30155978,  0.06519565, ..., -0.07920169,
          0.04277651,  0.05022674],
        [-0.01152306, -0.00170462,  0.15533989, ...,  0.08274053,
         -0.30193931, -0.05878471],
        ...,