In [7]:
import io, sys
import numpy as np
from heapq import *

In [23]:
def load_vectors(filename):
    """Read word vectors from a whitespace-separated text file.

    The first line of the file must contain two integers (presumably the
    vocabulary size and the vector dimension); it is parsed for validation
    only and otherwise ignored. Every following non-blank line holds a
    token followed by its space-separated float components.

    Parameters
    ----------
    filename : str
        Path of the UTF-8 encoded vector file.

    Returns
    -------
    dict
        Maps each token (str) to a 1-D numpy array of floats.

    Raises
    ------
    ValueError
        If the header line is not exactly two integers, or a component
        cannot be parsed as a float.
    """
    data = {}
    # The context manager guarantees the handle is closed, even on a parse error.
    with io.open(filename, 'r', encoding='utf-8', newline='\n') as fin:
        # Header line: parsed so that a malformed file fails early.
        _n, _d = map(int, fin.readline().split())
        for line in fin:
            stripped = line.rstrip()
            if not stripped:
                # Skip blank lines (e.g. a trailing newline) instead of
                # registering an empty-string token with an empty vector.
                continue
            tokens = stripped.split(' ')
            data[tokens[0]] = np.asarray([float(t) for t in tokens[1:]])

    return data

In [59]:
# Load the word vectors and preview one entry.

# Banner: blank line, title, blank line.
for banner_line in ('', ' ** Word vectors ** ', ''):
    print(banner_line)

word_vectors = load_vectors('wiki.en.vec')
# Last expression of the cell: shows the vector stored for 'queen'.
word_vectors['queen']


 ** Word vectors ** 



array([-0.060511  ,  0.049607  , -0.20885   ,  0.10349   ,  0.14276   ,
        0.13577   , -0.46691   ,  0.15968   , -0.18779   ,  0.36288   ,
       -0.10825   ,  0.096172  ,  0.02492   ,  0.11521   , -0.27606   ,
       -0.26194   , -0.13782   , -0.21345   , -0.30993   ,  0.18901   ,
       -0.68209   ,  0.33919   ,  0.0658    , -0.37599   , -0.25713   ,
        0.04412   , -0.23731   , -0.28451   ,  0.13815   ,  0.45995   ,
        0.15902   ,  0.21194   , -0.17454   , -0.13933   , -0.24543   ,
       -0.010022  , -0.014216  , -0.32172   ,  0.26391   , -0.68158   ,
        0.11384   , -0.1007    , -0.37438   , -0.0032968 ,  0.049377  ,
       -0.34198   , -0.37895   , -0.075533  ,  0.1981    , -0.20643   ,
        0.10688   ,  0.016414  ,  0.17665   ,  0.27321   ,  0.13638   ,
        0.15284   ,  0.29849   , -0.071184  , -0.18808   ,  0.14807   ,
        0.15638   , -0.04346   , -0.084261  , -0.31188   ,  0.07669   ,
        0.16689   ,  0.046978  , -0.0044613 ,  0.15034   ,  0.04

In [25]:
## This function computes the cosine similarity between vectors u and v

def cosine(u, v):
    """Return the cosine similarity between vectors u and v.

    Parameters
    ----------
    u, v : numpy.ndarray
        Vectors of the same length.

    Returns
    -------
    float
        ``(u . v) / (||u|| * ||v||)``. If either vector has zero norm the
        similarity is undefined; 0.0 is returned instead of dividing by
        zero (which would yield ``nan`` and a RuntimeWarning).
    """
    norm_product = np.linalg.norm(u) * np.linalg.norm(v)
    if norm_product == 0:
        # A zero vector has no direction; treat it as unrelated.
        return 0.0
    return (u @ v.T) / norm_product

## The function nearest_neighbor (defined in a later cell) returns the
## word whose vector is the nearest neighbor of vector x.
## The list exclude_words can be used to exclude some
## words from the nearest-neighbor search.

In [71]:
# compute similarity between words

# Each label is built from the same pair that is scored, so the printed
# text cannot drift from the computation.
# NOTE(review): the last pair used to be labelled "(queen, women)" while
# actually scoring 'queen' against 'kind'. The scored pair is kept here so
# the number is unchanged -- confirm whether 'king' or 'women' was intended.
word_pairs = [
    ('apple', 'apples'),
    ('apple', 'banana'),
    ('apple', 'tiger'),
    ('queen', 'kind'),
]
for first, second in word_pairs:
    print('similarity(%s, %s) = %.3f' %
          (first, second, cosine(word_vectors[first], word_vectors[second])))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212
similarity(queen, women) = 0.157


In [76]:
## Functions for nearest neighbors

def nearest_neighbor(x, word_vectors, exclude_words=None):
    """Return the word whose vector has the highest cosine similarity to x.

    Parameters
    ----------
    x : numpy.ndarray
        Query vector.
    word_vectors : dict
        Maps each word to its vector.
    exclude_words : iterable of str, optional
        Words to skip during the search. Defaults to excluding nothing.

    Returns
    -------
    str
        The best-scoring word, or ``''`` if no candidate scores above -1.0
        (for instance when every word is excluded).
    """
    # None instead of a mutable [] default; a frozenset gives O(1) membership
    # tests inside a loop that runs over the whole vocabulary.
    excluded = frozenset(exclude_words) if exclude_words is not None else frozenset()

    best_score = -1.0
    best_word = ''
    for word, vector in word_vectors.items():
        if word in excluded:
            continue
        sim = cosine(x, vector)
        if sim > best_score:
            best_score = sim
            best_word = word

    return best_word

## This function returns the K nearest neighbors of vector x
## as (similarity, word) pairs, most similar first.
## You can use the functions heappush and heappop.

def knn(x, vectors, k):
    """Return the k entries of `vectors` most similar to x.

    A min-heap bounded to k items is kept while scanning every word, so
    the weakest retained candidate is always the one evicted.

    Returns a list of (similarity, word) tuples ordered from most to
    least similar.
    """
    top_k = []
    for candidate in vectors:
        entry = (cosine(x, vectors[candidate]), candidate)
        if len(top_k) >= k:
            # Heap is full: push the new entry and drop the smallest one.
            heappushpop(top_k, entry)
        else:
            heappush(top_k, entry)

    # Popping yields ascending order; flip it so the best match comes first.
    ranked = []
    while top_k:
        ranked.append(heappop(top_k))
    ranked.reverse()
    return ranked

In [77]:
# looking at nearest neighbors of a word

print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words=['cat', 'cats']))

# Compute the neighbors once and reuse them: each knn call scans the
# whole vocabulary, so calling it again in the loop doubled the cost.
knn_cat = knn(word_vectors['cat'], word_vectors, 5)
print('')
print('cat')
print('--------------')
for score, word in knn_cat:
    print(word + '\t%.3f' % score)

The nearest neighbor of cat is: dog

cat
--------------
cat	1.000
cats	0.732
dog	0.638
pet	0.573
rabbit	0.549


In [121]:
## This function returns the word d such that the pairs a:b and c:d
## satisfy the same relation

def analogy(a, b, c, word_vectors):
    """Solve the analogy "a is to b as c is to ?".

    The three input words are lower-cased, the query vector
    ``v_b - v_a + v_c`` is formed from their vectors, and the nearest
    neighbor of that query is returned, with a, b and c themselves
    excluded from the search.

    Parameters
    ----------
    a, b, c : str
        Words of the analogy; matching is case-insensitive on the input side.
    word_vectors : dict
        Maps each word to its vector.

    Returns
    -------
    str
        The word returned by ``nearest_neighbor`` for the query vector.

    Raises
    ------
    KeyError
        If a lower-cased input word is missing from `word_vectors`.
    """
    a = a.lower()
    b = b.lower()
    c = c.lower()

    v_a = word_vectors[a]
    v_b = word_vectors[b]
    v_c = word_vectors[c]

    # Offset between a and b, applied to c.
    x = v_b - v_a + v_c
    return nearest_neighbor(x, word_vectors, exclude_words=[a, b, c])

In [122]:
# Word analogies: each result is computed, then printed after its label.

print('')
paris_france_rome = analogy('pAris', 'France', 'rome', word_vectors)
print('france - paris + rome = ' + paris_france_rome)

man_king_woman = analogy('man', 'king', 'woman', word_vectors)
print('king - man + woman = ' + man_king_woman)

italy_rome_france = analogy('italy', 'rome', 'france', word_vectors)
print('rome - italy + france = ' + italy_rome_france)


france - paris + rome = italy
king - man + woman = queen
rome - italy + france = paris


In [45]:
## A word about biases in word vectors:
## compare how close 'genius' lies to 'man' and to 'woman'.

print('')
sim_genius_man = cosine(word_vectors['man'], word_vectors['genius'])
print('similarity(genius, man) = %.3f' % sim_genius_man)

sim_genius_woman = cosine(word_vectors['woman'], word_vectors['genius'])
print('similarity(genius, woman) = %.3f' % sim_genius_woman)


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [49]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    """Return how much more strongly word w associates with A than with B.

    The result is the mean cosine similarity between w and the words of
    attribute set A, minus the mean cosine similarity between w and the
    words of attribute set B. All words are looked up in `vectors`.
    """
    def _mean_similarity(attributes):
        # Average cosine similarity between w and each attribute word.
        total = 0.0
        for attribute in attributes:
            total += cosine(vectors[w], vectors[attribute])
        return total / len(attributes)

    mean_to_A = _mean_similarity(A)
    mean_to_B = _mean_similarity(B)
    return mean_to_A - mean_to_B

## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X, Y, A, B, vectors):
    """Return the word embedding association test score.

    The score is the summed association strength (relative to attribute
    sets A and B) of the words in X, minus the same sum over the words
    in Y.
    """
    def _total_association(targets):
        # Sum of association strengths over one target set.
        running = 0.0
        for target in targets:
            running += association_strength(target, A, B, vectors)
        return running

    total_X = _total_association(X)
    total_Y = _total_association(Y)
    return total_X - total_Y

In [50]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)

# Target word sets.
career = [
    'executive', 'management', 'professional', 'corporation',
    'salary', 'office', 'business', 'career',
]
family = [
    'home', 'parents', 'children', 'family',
    'cousins', 'marriage', 'wedding', 'relatives',
]

# Attribute word sets (first names).
male = [
    'john', 'paul', 'mike', 'kevin',
    'steve', 'greg', 'jeff', 'bill',
]
female = [
    'amy', 'joan', 'lisa', 'sarah',
    'diana', 'kate', 'ann', 'donna',
]

print('')
weat_score = weat(career, family, male, female, word_vectors)
print('Word embedding association test: %.3f' % weat_score)


Word embedding association test: 0.847
