# Establish Relations 

In [1]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import gensim.downloader
#Import numpy for array calculation
import numpy as np

import re

#import nltk for string operations
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize

import relations

In [2]:
# Load the pretrained 'glove-wiki-gigaword-50' word vectors through gensim's downloader.
model = gensim.downloader.load('glove-wiki-gigaword-50')
# Read the embedding width from the loaded model instead of hard-coding 50,
# so `dims` stays correct if a different pretrained model is substituted above.
dims = model.vector_size

In [3]:
# Read the (masculine, feminine) word pairs from the relations file and
# display them as the cell output.
relations_path = "./relations/male-female.txt"
male_female_tuples = relations.read_relations(relations_path)
male_female_tuples

[('Father', 'Mother'),
 ('Son', 'Daughter'),
 ('Brother', 'Sister'),
 ('God', 'goddess'),
 ('Man', 'Woman'),
 ('Bachelor', 'Maid'),
 ('Cock', 'Hen'),
 ('Dog', 'Bitch'),
 ('Bull', 'Cow'),
 ('Drone', 'Bee'),
 ('Gander', 'Goose'),
 ('Waiter', 'Waitress'),
 ('Stag', 'Hind'),
 ('Gentleman', 'Lady'),
 ('Earl', 'Countess'),
 ('Husband', 'Wife'),
 ('Dad', 'mom'),
 ('Lord', 'Lady'),
 ('King', 'Queen'),
 ('Monk', 'Nun'),
 ('Boy', 'Girl'),
 ('Sir', 'Madam'),
 ('Uncle', 'Aunt'),
 ('Nephew', 'Niece'),
 ('Daddy', 'mummy'),
 ('Wizard', 'Witch'),
 ('Hart', 'Roe'),
 ('Drake', 'Duck'),
 ('Lion', 'Lioness'),
 ('Count', 'Countess'),
 ('Giant', 'Giantess'),
 ('Priest', 'Priestess'),
 ('Prophet', 'Prophetess'),
 ('Poet', 'Poetess'),
 ('Patron', 'Patroness'),
 ('Host', 'Hostess'),
 ('Viscount', 'Viscountess'),
 ('Shepherd', 'Shepherdess'),
 ('Steward', 'Stewardess'),
 ('Author', 'Authoress'),
 ('Manager', 'Manageress'),
 ('Bridegroom', 'Bride'),
 ('Jew', 'Jewess'),
 ('Baron', 'Baroness'),
 ('Mayor', 'Mayores

In [4]:
# Build a list of (male_vector, female_vector) embedding pairs.
# Words are lowercased before lookup, and a pair is kept only when BOTH
# words are present in the model; other pairs are silently skipped.
male_female_vectors = []

for tup in male_female_tuples:
    male_word, female_word = tup[0].lower(), tup[1].lower()
    # `word in model` is supported by KeyedVectors in gensim 3.x and 4.x,
    # whereas the `model.vocab` attribute was removed in gensim 4.0.
    if male_word in model and female_word in model:
        male_vec = model.get_vector(male_word)
        female_vec = model.get_vector(female_word)
        male_female_vectors.append((male_vec, female_vec))


## Relation Visualisation

### Reduction to 3 - dimensions

In [5]:
import matplotlib.pyplot as plt
import pandas as pd
%matplotlib notebook

In [6]:
# Keep only three components of every vector so each pair can be drawn in 3-D.
# Note this is a plain slice [start_idx:end_idx] of the embedding, not a
# projection such as PCA.
start_idx, end_idx = 0, 3
vec_3d = [
    (male_vec[start_idx:end_idx], female_vec[start_idx:end_idx])
    for male_vec, female_vec in male_female_vectors
]

In [7]:
# Split the 3-component pairs into per-axis coordinate arrays for plotting.
male_pts = [m for m, _ in vec_3d]
female_pts = [f for _, f in vec_3d]

# male points (first element of each pair)
x1, y1, z1 = (np.array([p[k] for p in male_pts]) for k in range(3))
# female points (second element of each pair)
x2, y2, z2 = (np.array([p[k] for p in female_pts]) for k in range(3))

In [8]:
# 3-D scatter of the male (blue) and female (red) points.
fig = plt.figure(figsize = (7,5))
ax = plt.axes(projection ="3d")

# Each point is tagged with its 1-based pair index, so a male point and its
# female counterpart carry the same number.
for xs, ys, zs, colour in ((x1, y1, z1, 'blue'), (x2, y2, z2, 'red')):
    ax.scatter3D(xs, ys, zs, color=colour)
    for idx, (px, py, pz) in enumerate(zip(xs, ys, zs), start=1):
        ax.text(px, py, pz, str(idx), fontsize="xx-small")

plt.legend(["male", "female"])

plt.show()

<IPython.core.display.Javascript object>

## Multivariate Multiple Linear Regression

Since we want to establish a relationship between each male word vector and its female counterpart, and each vector has 50 dimensions, we regress separately on each dimension of the dependent (female) vector, using all 50 dimensions of the male vector as predictors.

In [9]:
from sklearn.linear_model import LinearRegression

In [10]:
# Inspect one embedding. In the regression below, each output dimension y_i is
# modelled as a linear function of all input components:
#   y_i = a_1*x_1 + a_2*x_2 + ... + a_n*x_n + c_i
male_word_vec = model.get_vector("male")
male_word_vec

array([-0.23046  ,  0.65937  , -0.28411  , -0.44366  ,  1.5922   ,
        1.8564   , -0.0054708, -0.58679  , -0.1506   , -0.021166 ,
        1.1029   , -0.79502  ,  1.1899   ,  0.53535  ,  0.25256  ,
       -0.15882  , -0.31825  ,  0.53609  , -0.5944   , -0.21288  ,
       -0.94989  ,  0.91619  ,  0.4879   ,  0.77063  , -0.16215  ,
       -1.0515   , -0.70571  , -0.79814  , -0.79354  , -0.086372 ,
        2.2497   ,  0.68785  , -0.085613 , -0.68004  ,  0.62212  ,
       -0.02536  ,  0.10967  , -0.38748  , -0.62791  , -1.0871   ,
       -0.37412  , -0.061965 ,  0.19225  ,  0.89263  ,  0.51763  ,
       -1.4791   , -0.23219  , -1.1589   ,  0.066075 , -0.038772 ],
      dtype=float32)

In [11]:
# Inputs (male vectors) and targets (female vectors) for the regression.
male_vecs = [pair[0] for pair in male_female_vectors]
female_vecs = [pair[1] for pair in male_female_vectors]

# coeff[d] / intercepts[d] hold the fitted weights and bias for output dimension d.
coeff = []
intercepts = []
avg_score = 0

# One linear model per dimension of the female vector, each using the full
# male vector as its features.
for dim in range(dims):
    target = [vec[dim] for vec in female_vecs]
    regr = LinearRegression().fit(male_vecs, target)
    coeff.append(regr.coef_)
    intercepts.append(regr.intercept_)
    # Scored on the same pairs used for fitting, i.e. an in-sample score.
    avg_score += regr.score(male_vecs, target)

# Mean of the per-dimension scores.
print("Model score: ", avg_score/dims)

Model score:  0.9300574388443092


Each entry `coeff[i]` holds the regression coefficients for the i-th dimension of the female vector, and `intercepts[i]` holds the matching intercept.

In [12]:
# Predict a female vector for every male vector used in fitting: for each
# output dimension d, take the dot product of the male vector with coeff[d]
# and add intercepts[d].
female_vecs_pred = [
    np.array([
        sum([m * w for m, w in zip(male_vec, coeff[d])]) + intercepts[d]
        for d in range(dims)
    ])
    for male_vec in male_vecs
]

Plotting the predicted and actual values in 3d space

In [13]:
# First three components of the actual female vectors ...
x1, y1, z1 = (np.array([vec[k] for vec in female_vecs]) for k in range(3))
# ... and of the vectors predicted by the regression.
x2, y2, z2 = (np.array([vec[k] for vec in female_vecs_pred]) for k in range(3))

# 3-D scatter: actual points in blue, predicted points in red.
fig = plt.figure(figsize = (7,5))
ax = plt.axes(projection ="3d")

# Every point is labelled with its 1-based pair index, so an actual point and
# its prediction share the same number.
for xs, ys, zs, colour in ((x1, y1, z1, 'blue'), (x2, y2, z2, 'red')):
    ax.scatter3D(xs, ys, zs, color=colour)
    for idx, (px, py, pz) in enumerate(zip(xs, ys, zs), start=1):
        ax.text(px, py, pz, str(idx), fontsize="xx-small")

plt.legend(["Actual", "Predicted"])

plt.show()

<IPython.core.display.Javascript object>

On the pairs it was fitted on, the model reproduces the actual values almost perfectly (in the three plotted dimensions).

**Testing on a male-female word pair outside the training set**

In [14]:
# Apply the fitted per-dimension models to the vector for "actor".
# NOTE: the variable names are misleading here: `cap_vec` is the input
# (masculine) word vector and `country_vec` is the predicted feminine vector.
cap_vec = model.get_vector("actor")
country_vec = [
    sum([a*b for a,b in zip(cap_vec,coeff[d])]) + intercepts[d]
    for d in range(dims)
]
np.array(country_vec)

array([-4.19593817e-01,  1.44451965e+00, -8.13946096e-01,  1.97332624e-01,
        2.84080867e-01,  1.99529558e+00, -4.59791172e-01, -1.98559969e-01,
       -1.96600950e-01,  7.04800222e-01,  3.09124341e-01,  1.92178743e-01,
       -1.11358085e-01,  3.88489471e-01,  1.38690102e+00, -2.79801955e-01,
       -1.22225999e-01,  2.80242853e-01, -3.54325311e-01,  5.96508711e-01,
        1.00452984e-01,  2.17452968e+00,  4.31288703e-01,  7.57532930e-01,
        6.03170233e-01, -1.41616027e+00, -4.65612176e-01, -2.28942136e-01,
       -1.09925996e+00, -4.82310296e-01,  1.32740881e+00,  2.48367146e-01,
        7.56706873e-01, -4.59155763e-01,  5.55280991e-01, -1.94409099e-01,
       -1.40803159e-01,  2.51954916e-01, -5.91125519e-01, -1.65395953e+00,
        6.98611140e-04,  1.65581198e+00,  4.49069808e-01, -1.90838032e+00,
       -5.87454959e-01, -8.94251765e-01, -4.14123952e-01, -1.79459343e+00,
       -2.00338785e-01,  9.19299743e-01])

In [15]:
model.get_vector("actress")

array([-0.50281 ,  1.3602  , -0.79567 ,  0.17978 ,  0.30326 ,  2.0034  ,
       -0.29183 , -0.13314 , -0.22833 ,  0.74604 ,  0.29479 ,  0.058113,
        0.080724,  0.60262 ,  1.3897  , -0.49205 , -0.1202  ,  0.46184 ,
       -0.25282 ,  0.60568 ,  0.1024  ,  2.1639  ,  0.52841 ,  0.73382 ,
        0.68283 , -1.2     , -0.59849 , -0.15926 , -1.1145  , -0.49408 ,
        1.1492  ,  0.2328  ,  0.85071 , -0.28973 ,  0.60732 , -0.051936,
       -0.14617 ,  0.34957 , -0.50612 , -1.5682  , -0.078745,  1.7379  ,
        0.46563 , -1.991   , -0.69053 , -1.0609  , -0.37481 , -1.713   ,
       -0.14678 ,  0.94823 ], dtype=float32)

In [16]:
from sklearn.metrics.pairwise import cosine_similarity,cosine_distances

In [18]:
def get_cosine_similarity(masculine_word, feminine_word):
    """Print the cosine similarity between the predicted and actual feminine vector.

    The vector of `masculine_word` is passed through the fitted per-dimension
    linear models (module-level `coeff` / `intercepts`) to predict a feminine
    vector, which is compared with the model's own vector for `feminine_word`.

    Parameters
    ----------
    masculine_word : str
        Word whose embedding is the regression input.
    feminine_word : str
        Word whose embedding is the reference to compare against.

    Returns
    -------
    None
        The 1x1 similarity matrix is printed, not returned.
    """
    source_vec = model.get_vector(masculine_word)
    predicted = [
        sum([a*b for a,b in zip(source_vec,coeff[d])]) + intercepts[d]
        for d in range(dims)
    ]
    actual = model.get_vector(feminine_word)
    # cosine_similarity expects 2-D inputs, hence the reshape to a single row.
    similarity = cosine_similarity(
        np.array(predicted).reshape(1,-1),
        np.array(actual).reshape(1,-1),
    )
    print(similarity)


get_cosine_similarity("actor","actress")

[[0.99331978]]
