# Establish Relations 

In [1]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader
#Import numpy for array calculation
import numpy as np

import re

#import nltk for string operations
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

In [41]:
# Load the pre-trained 'glove-wiki-gigaword-50' word vectors through gensim's downloader.
model = gensim.downloader.load('glove-wiki-gigaword-50')
# Read the dimensionality from the loaded model instead of hard-coding it, so that
# `dims` cannot drift out of sync if a different pre-trained model is loaded above.
dims = model.vector_size

In [3]:
# Hand-picked (country, capital) word pairs, all lower-case.
# Written as a country -> capital mapping and flattened into an ordered list of
# 2-tuples (dicts preserve insertion order).
country_capital_tuples = list(
    {
        "india": "delhi",
        "afghanistan": "kabul",
        "angola": "luanda",
        "australia": "canberra",
        "brazil": "brasilia",
        "canada": "ottawa",
        "denmark": "copenhagen",
        "egypt": "cairo",
        "england": "london",
        "greece": "athens",
        "indonesia": "jakarta",
        "iran": "tehran",
        "ireland": "dublin",
        "japan": "tokyo",
        "russia": "moscow",
        "spain": "madrid",
    }.items()
)

In [4]:
# Look up the embedding of every country and of its capital; the result keeps
# the pair order of `country_capital_tuples` as (country_vector, capital_vector).
country_capital_vectors = [
    (model.get_vector(country), model.get_vector(capital))
    for country, capital in country_capital_tuples
]


## Relation Visualisation

### Reduction to 3 dimensions

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook

In [44]:
# Crude reduction to three dimensions: keep only components
# [start_idx, end_idx) of each country and capital vector and drop the rest.
start_idx, end_idx = 0, 3
vec_3d = [
    (country[start_idx:end_idx], capital[start_idx:end_idx])
    for country, capital in country_capital_vectors
]

In [7]:
# Split the truncated vectors into one coordinate array per plotting axis.
# Country points -> x1, y1, z1
x1, y1, z1 = (
    np.array([country[axis] for country, _ in vec_3d]) for axis in range(3)
)
# Capital points -> x2, y2, z2
x2, y2, z2 = (
    np.array([capital[axis] for _, capital in vec_3d]) for axis in range(3)
)

In [8]:
# 3-D scatter of the truncated vectors: countries in blue, capitals in red.
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111, projection="3d")

# Attach the legend label to each artist so the legend does not depend on the
# order in which the artists were added to the axes.
ax.scatter3D(x1, y1, z1, color="blue", label="Country")
ax.scatter3D(x2, y2, z2, color="red", label="Capital")

# Tag both points of a pair with the same 1-based index so a country can be
# matched to its capital by eye.
for i in range(len(x1)):
    pair_id = str(i + 1)
    ax.text(x1[i], y1[i], z1[i], pair_id)
    ax.text(x2[i], y2[i], z2[i], pair_id)

ax.set_title("Country vs. capital vectors (first 3 embedding dimensions)")
ax.set_xlabel("dim 0")
ax.set_ylabel("dim 1")
ax.set_zlabel("dim 2")
ax.legend()

plt.show()

<IPython.core.display.Javascript object>

### Reduction using PCA

In [9]:
# Spot-check the value range of one embedding to judge whether the dimensions
# share a scale; if they do, standardization before PCA can be skipped.
# (A single vector is only a rough indication of the per-dimension scale.)
model["earth"]

array([ 1.0828   ,  0.8278   ,  1.0101   ,  0.012863 ,  0.8429   ,
       -0.27394  , -0.0047314, -0.47085  ,  0.33517  ,  0.042372 ,
        0.59308  ,  0.16438  ,  0.28729  ,  1.2204   , -0.034446 ,
        0.77297  ,  0.70817  ,  0.502    , -1.373    , -0.29783  ,
       -0.21993  ,  0.27519  ,  0.89638  , -0.73123  ,  1.439    ,
       -0.61846  , -0.90979  ,  0.28174  ,  0.79897  ,  0.21661  ,
        2.5256   , -0.54832  , -0.66141  , -0.94656  , -0.79848  ,
       -0.083708 , -0.44645  , -0.088577 ,  0.1127   ,  0.15461  ,
       -0.41408  , -0.47965  ,  0.22811  , -0.025582 , -0.58716  ,
        0.34786  ,  0.57678  , -1.2095   , -0.60164  , -0.89549  ],
      dtype=float32)

In [10]:
# Unzip the (country_vector, capital_vector) pairs into two parallel lists;
# index i of both lists refers to the same country/capital pair.
country_vecs = [country for country, _ in country_capital_vectors]
capital_vecs = [capital for _, capital in country_capital_vectors]

In [11]:
from sklearn.decomposition import PCA
# Fit one PCA per group with the default settings (all components kept):
# pca1 on the country vectors, pca2 on the capital vectors.
# NOTE(review): the two fits are independent, so the country and capital
# projections are expressed in different bases and are not directly comparable.
pca1, pca2 = PCA(), PCA()
pca_dims_country = pca1.fit_transform(country_vecs)
pca_dims_capital = pca2.fit_transform(capital_vecs)

In [12]:
# Report the fraction of variance captured by each principal component,
# first for the country fit and then for the capital fit.
for heading, fitted_pca in (
    ("Variance in country pca vectors:", pca1),
    ("Variance in capital pca vectors:", pca2),
):
    print(heading)
    print(fitted_pca.explained_variance_ratio_)

Variance in country pca vectors:
[2.51804657e-01 1.76597847e-01 1.72036515e-01 1.15442417e-01
 6.38507755e-02 4.80079333e-02 4.55293836e-02 3.13409977e-02
 2.23540935e-02 2.10899850e-02 1.85147336e-02 1.44538571e-02
 9.01669485e-03 5.38033412e-03 4.57977562e-03 1.00478678e-32]
Variance in capital pca vectors:
[2.32343950e-01 1.63358919e-01 1.44292240e-01 1.13458289e-01
 8.10088189e-02 5.94548936e-02 4.73744895e-02 3.72699603e-02
 2.84446167e-02 2.53870245e-02 2.30685599e-02 1.80619811e-02
 1.23486991e-02 7.33564690e-03 6.79191146e-03 2.62044231e-33]


Even after applying PCA, 15 components with non-negligible variance remain (the 16th is numerically zero). Most of the variance is explained by the first 4 components. Visualisation is easiest with the first 3 components, which together explain ~60% of the variance in the country vectors (~54% in the capital vectors).

In [13]:
# country points
x1 = np.array([a[0] for a in pca_dims_country])
y1 = np.array([a[1] for a in pca_dims_country])
z1 = np.array([a[2] for a in pca_dims_country])
# capital points
x2 = np.array([a[0] for a in pca_dims_capital])
y2 = np.array([a[1] for a in pca_dims_capital])
z2 = np.array([a[2] for a in pca_dims_capital])

In [14]:
# 3-D scatter of the first three principal-component scores:
# countries in blue, capitals in red.
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111, projection="3d")

# Attach the legend label to each artist so the legend does not depend on the
# order in which the artists were added to the axes.
ax.scatter3D(x1, y1, z1, color="blue", label="Country")
ax.scatter3D(x2, y2, z2, color="red", label="Capital")

# Tag both points of a pair with the same 1-based index so a country can be
# matched to its capital by eye.
for i in range(len(x1)):
    pair_id = str(i + 1)
    ax.text(x1[i], y1[i], z1[i], pair_id)
    ax.text(x2[i], y2[i], z2[i], pair_id)

ax.set_title("Country vs. capital vectors (first 3 principal components)")
ax.set_xlabel("PC 1")
ax.set_ylabel("PC 2")
ax.set_zlabel("PC 3")
ax.legend()

plt.show()

<IPython.core.display.Javascript object>

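PCA does not work well here and gives poor results: the country and capital vectors were transformed by two independently fitted PCA models (`pca1` and `pca2`), so their coordinates are not expressed in a common basis and the two point clouds cannot be compared directly.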

## Multivariate Multiple Linear Regression

Since we want to establish a relationship between each country vector and capital vector with each vector having 50 dimensions, we'll have to regress separately on each dimension of the dependent variable.

In [29]:
from sklearn.linear_model import LinearRegression

In [80]:
# Fit a linear map from capital vectors (predictors) to country vectors (targets).
# LinearRegression accepts a 2-D target and fits every output column in one
# call, so no Python loop over the target dimensions is needed.
regr = LinearRegression()
regr.fit(capital_vecs, country_vecs)

# coeff[i] holds the coefficients and intercepts[i] the intercept used to
# predict dimension i of the country vector.
coeff = regr.coef_
intercepts = regr.intercept_

# For a 2-D target, score() returns R^2 averaged uniformly over the outputs.
# NOTE: this is an in-sample score, computed on the same pairs the model was
# fitted on; it says nothing about how well the map generalises to unseen pairs.
avg_score = regr.score(capital_vecs, country_vecs)
print("Model score: ", avg_score)

Model score:  0.9999999999995224


Each entry of `coeff` holds the regression coefficients for the corresponding dimension of the country vector.

In [89]:
# Predict every country vector from its capital vector using the fitted
# coefficients: prediction = capital @ coeff^T + intercepts.
# A single matrix product replaces the hand-written nested dot-product loops.
capital_matrix = np.asarray(capital_vecs)   # one row per capital vector
coeff_matrix = np.asarray(coeff)            # row i -> coefficients for output dimension i
predicted_matrix = capital_matrix @ coeff_matrix.T + np.asarray(intercepts)

# Keep the original container shape: a list with one 1-D array per country.
country_vecs_pred = list(predicted_matrix)

Plotting the predicted and actual values in 3d space

In [96]:
# First three components of every vector, one coordinate array per axis.
# Actual country vectors -> x1, y1, z1
x1, y1, z1 = (
    np.array([vec[axis] for vec in country_vecs]) for axis in range(3)
)
# Predicted country vectors -> x2, y2, z2
x2, y2, z2 = (
    np.array([vec[axis] for vec in country_vecs_pred]) for axis in range(3)
)

# 3-D scatter comparing actual (blue) and predicted (red) country vectors,
# using only their first three components.
fig = plt.figure(figsize=(7, 5))
ax = fig.add_subplot(111, projection="3d")

# Attach the legend label to each artist so the legend does not depend on the
# order in which the artists were added to the axes.
ax.scatter3D(x1, y1, z1, color="blue", label="Actual")
ax.scatter3D(x2, y2, z2, color="red", label="Predicted")

# Tag the actual and predicted point of each country with the same 1-based
# index so the two can be matched by eye.
for i in range(len(x1)):
    pair_id = str(i + 1)
    ax.text(x1[i], y1[i], z1[i], pair_id)
    ax.text(x2[i], y2[i], z2[i], pair_id)

ax.set_title("Actual vs. predicted country vectors (first 3 dimensions)")
ax.set_xlabel("dim 0")
ax.set_ylabel("dim 1")
ax.set_zlabel("dim 2")
ax.legend()

plt.show()

<IPython.core.display.Javascript object>

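The model reproduces the actual values almost perfectly. Note, however, that it is evaluated on the same 16 country–capital pairs it was fitted on, and each pair has 50 features, so a near-exact fit is expected; this result alone does not show that the learned mapping generalises to unseen pairs.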