# Establish Relations 

In [1]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader
#Import numpy for array calculation
import numpy as np

import re

#import nltk for string operations
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

import relations

In [2]:
# Load the pre-trained GloVe word vectors named 'glove-wiki-gigaword-50' through
# gensim's downloader API.
model = gensim.downloader.load('glove-wiki-gigaword-50')
# Read the dimensionality from the loaded vectors instead of hard-coding 50, so that
# switching to another pre-trained model cannot leave `dims` out of sync with it.
dims = model.vector_size

In [3]:
# Read the relation file; each returned tuple is used below as (country, capital).
relations_file = "./relations/country-city.txt"
country_capital_tuples = relations.read_relations(relations_file)

In [4]:
# Look up the embedding of both words of every (country, capital) tuple and keep the
# pairs for which both lookups succeed.
country_capital_vectors = []

for tup in country_capital_tuples:
    try:
        country_vec = model[tup[0].lower()]
        capital_vec = model[tup[1].lower().strip()]
    except KeyError:
        # A word that is missing from the model's vocabulary raises KeyError: report
        # the pair and skip it. Any other exception signals a real problem (malformed
        # tuple, interrupted kernel, ...) and is left to propagate instead of being
        # hidden by a bare `except`.
        print(tup[0], tup[1])
    else:
        country_capital_vectors.append((country_vec, capital_vec))


## Relation Visualisation

### Reduction to 3 dimensions

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook

In [6]:
# Truncate every word vector to its first three components (a plain slice, not a
# projection) so that each country/capital pair can be drawn in a 3-D plot.
start_idx, end_idx = 0, 3
vec_3d = [
    (country[start_idx:end_idx], capital[start_idx:end_idx])
    for country, capital in country_capital_vectors
]

In [7]:
# Separate the truncated pairs into per-axis coordinate arrays.
country_pts = [country for country, _ in vec_3d]
capital_pts = [capital for _, capital in vec_3d]

# country points
x1, y1, z1 = (np.array([pt[axis] for pt in country_pts]) for axis in range(3))
# capital points
x2, y2, z2 = (np.array([pt[axis] for pt in capital_pts]) for axis in range(3))

fig = plt.figure(figsize=(7, 5))
ax = plt.axes(projection="3d")

# Draw countries in blue, then capitals in red. Every point is annotated with the
# 1-based index of its pair, so a country and its capital carry the same number.
for xs, ys, zs, colour in ((x1, y1, z1, 'blue'), (x2, y2, z2, 'red')):
    ax.scatter3D(xs, ys, zs, color=colour)
    for label, (px, py, pz) in enumerate(zip(xs, ys, zs), start=1):
        ax.text(px, py, pz, str(label))

# Legend entries follow the order in which the two scatter groups were drawn.
plt.legend(["Country", "Capital"])

plt.show()

## Multivariate Multiple Linear Regression

Each country vector and each capital vector has 50 dimensions. To learn a mapping from capital vectors to country vectors, we fit a separate linear regression for every dimension of the dependent (country) vector, using all dimensions of the capital vector as predictors.

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
# Inspect one word vector. In the regression below, each component of a country vector
# is modelled as a linear combination of the capital vector's components plus an
# intercept, e.g. y1 = a*x1 + b*x2 + c*x3 + ... + intercept.
model.get_vector("india")

array([-0.20356 , -0.8707  , -0.19172 ,  0.73862 ,  0.18494 ,  0.14926 ,
        0.48079 , -0.21633 ,  0.72753 , -0.36912 ,  0.13397 , -0.1143  ,
       -0.18075 , -0.64683 , -0.18484 ,  0.83575 ,  0.48179 ,  0.76026 ,
       -0.50381 ,  0.80743 ,  1.2195  ,  0.3459  ,  0.22185 ,  0.31335 ,
        1.2066  , -1.8441  ,  0.14064 , -0.99715 , -1.1402  ,  0.32342 ,
        3.2128  ,  0.42708 ,  0.19504 ,  0.80113 ,  0.38555 , -0.12568 ,
       -0.26533 ,  0.055264, -1.1557  ,  0.16836 , -0.82228 ,  0.20394 ,
        0.089235, -0.60125 , -0.032878,  1.3735  , -0.51661 ,  0.29611 ,
        0.23951 , -1.3801  ], dtype=float32)

In [17]:
# `country_vecs` / `capital_vecs` were not defined anywhere in the notebook (execution
# counts jump from 7 to 15), so this cell raised NameError on a fresh kernel. Rebuild
# them from the (country, capital) vector pairs collected earlier.
# NOTE(review): confirm this is the same training set that produced the saved outputs.
country_vecs = [country for country, _ in country_capital_vectors]
capital_vecs = [capital for _, capital in country_capital_vectors]

# One linear regression per dimension of the response (country) vector:
# coeff[i] and intercepts[i] map a full capital vector to component i of the country
# vector.
coeff = []
intercepts = []
avg_score = 0
for i in range(dims):
    y = [vec[i] for vec in country_vecs]   # component i of every country vector
    x = capital_vecs                       # predictors: the full capital vectors
    regr = LinearRegression()
    regr.fit(x, y)
    coeff.append(regr.coef_)
    intercepts.append(regr.intercept_)
    # score() is evaluated on the same data the model was fitted on (in-sample R^2).
    avg_score += regr.score(x, y)
# Average of the per-dimension scores.
print("Model score: ", avg_score/dims)

Model score:  0.7872284582058566


Each entry of `coeff` holds the regression coefficients for one dimension of the country vector; the matching entry of `intercepts` holds that dimension's intercept.

In [18]:
# Apply the fitted per-dimension models to every capital vector:
# component `dim` of a prediction is dot(capital, coeff[dim]) + intercepts[dim].
country_vecs_pred = [
    np.array([
        sum([feat * weight for feat, weight in zip(capital, coeff[dim])]) + intercepts[dim]
        for dim in range(dims)
    ])
    for capital in capital_vecs
]

Plotting the predicted and actual values in 3d space

# First three components of the actual country vectors ...
x1, y1, z1 = (np.array([vec[axis] for vec in country_vecs]) for axis in range(3))
# ... and of the country vectors predicted by the regression.
x2, y2, z2 = (np.array([vec[axis] for vec in country_vecs_pred]) for axis in range(3))

# 3-D scatter of both groups
fig = plt.figure(figsize=(7, 5))
ax = plt.axes(projection="3d")

# Actual points in blue, predicted points in red. Each point is annotated with its
# 1-based position, so an actual vector and its prediction share the same number.
for xs, ys, zs, colour in ((x1, y1, z1, 'blue'), (x2, y2, z2, 'red')):
    ax.scatter3D(xs, ys, zs, color=colour)
    for label, (px, py, pz) in enumerate(zip(xs, ys, zs), start=1):
        ax.text(px, py, pz, str(label))

# Legend entries follow the order in which the two scatter groups were drawn.
plt.legend(["Actual", "Predicted"])

plt.show()

**Testing on country-capital pairs outside the training set**

In [20]:
# Predict a country vector from the embedding of "delhi" with the fitted
# per-dimension coefficients and intercepts.
cap_vec = model.get_vector("delhi")
country_vec = [
    sum([feat * weight for feat, weight in zip(cap_vec, coeff[i])]) + intercepts[i]
    for i in range(dims)
]
# Display the prediction as an array; `country_vec` itself stays a plain list.
np.array(country_vec)

array([-0.28454536, -0.57818883, -0.14240528,  0.59526229,  0.10127134,
       -0.1738913 ,  0.27610445, -0.16543022,  0.89064522, -0.62444191,
        0.19509159, -0.21489233,  0.13871893, -0.45864954, -0.060893  ,
        1.04250209,  0.48640189,  0.47583331, -0.49979887,  1.0555295 ,
        0.79811794,  0.11726616,  0.2773406 ,  0.31930587,  1.10137744,
       -1.7283029 ,  0.40846794, -0.75157191, -1.02560588,  0.36716251,
        2.6931011 ,  0.69254462,  0.25064179,  0.66257842,  0.31880789,
        0.04387083,  0.04128279, -0.05560709, -1.11510517,  0.15490074,
       -0.81673772,  0.30231851,  0.04179684, -0.52633328,  0.13519456,
        1.27182813, -0.63266816,  0.59181991,  0.42529267, -1.21362986])

In [21]:
# Actual embedding of "india", displayed for comparison with the vector predicted
# from "delhi".
model.get_vector("india")

array([-0.20356 , -0.8707  , -0.19172 ,  0.73862 ,  0.18494 ,  0.14926 ,
        0.48079 , -0.21633 ,  0.72753 , -0.36912 ,  0.13397 , -0.1143  ,
       -0.18075 , -0.64683 , -0.18484 ,  0.83575 ,  0.48179 ,  0.76026 ,
       -0.50381 ,  0.80743 ,  1.2195  ,  0.3459  ,  0.22185 ,  0.31335 ,
        1.2066  , -1.8441  ,  0.14064 , -0.99715 , -1.1402  ,  0.32342 ,
        3.2128  ,  0.42708 ,  0.19504 ,  0.80113 ,  0.38555 , -0.12568 ,
       -0.26533 ,  0.055264, -1.1557  ,  0.16836 , -0.82228 ,  0.20394 ,
        0.089235, -0.60125 , -0.032878,  1.3735  , -0.51661 ,  0.29611 ,
        0.23951 , -1.3801  ], dtype=float32)

In [22]:
# Euclidean distance between the country vector predicted from "delhi" and the
# embedding of "china". `country_vec` is a plain list; subtracting the NumPy vector
# from it yields an array.
# NOTE(review): the prediction is compared with "china" although the previous cell
# displays "india" - confirm whether this is an intentional contrast or a typo.
np.linalg.norm(country_vec-model.get_vector("china"))

4.338153551923051