<a href="https://colab.research.google.com/github/PimiYvan/word2vector/blob/main/intro_to_wordvectors.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


<h1 style="font-family:verdana;font-size:300%;text-align:center;background-color:#f2f2f2;color:#0d0d0d">AMMI NLP - Review sessions</h1>

<h1 style="font-family:verdana;font-size:180%;text-align:Center;color:#993333"> Lab 2: Introduction to wordvectors </h1>

**Big thanks to Amr Khalifa who improved this lab and made it to a Jupyter Notebook!**

In [1]:
import io, sys
import numpy as np

In [2]:
def load_vectors(filename):
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    n, d = map(int, fin.readline().split())
    data = {}
    for line in fin:
        tokens = line.rstrip().split(' ')
        data[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
    return data

In [6]:
# Loading word vectors

print('')
print(' ** Word vectors ** ')
print('')

'''
word_vectors is a dictionary that maps words to their numerical word vector
[word (string)] = [np-array] 
'''
word_vectors = load_vectors('wiki.en.vec')

tree_vector = word_vectors['tree']
print(type(tree_vector), len(tree_vector))


 ** Word vectors ** 

<class 'numpy.ndarray'> 300


In [7]:
word_vectors['apples'].shape

(300,)

In [8]:
## This function computes the cosine similarity between vectors u and v

def cosine(u, v):
    '''
    Parameters:
    u : 1-D numpy array
    v : 1-D numpy array 
    
    Returns:
    cos (float) : value of the cosine similairy between vectors u, v 
    '''

    ## FILL CODE
    cos = (u.T @ v)/(np.linalg.norm(u)*np.linalg.norm(v))
    return cos 


In [9]:
# compute similarity between words

print('similarity(apple, apples) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['apples']))
print('similarity(apple, banana) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['banana']))
print('similarity(apple, tiger) = %.3f' %
      cosine(word_vectors['apple'], word_vectors['tiger']))

similarity(apple, apples) = 0.637
similarity(apple, banana) = 0.431
similarity(apple, tiger) = 0.212


In [10]:
import heapq

a = [2, 5, 3, 7, 6, 8]
heapq.heapify(a)

# heapq.heappush(a, 4)
print(a)
heapq.heappop(a)
a

[2, 5, 3, 7, 6, 8]


[3, 5, 8, 7, 6]

In [93]:
type(word_vectors)

dict

In [11]:
## Functions for nearest neighbor
## This function returns the word corresponding to 
## nearest neighbor vector of x
## The list exclude_words can be used to exclude some
## words from the nearest neighbors search

def nearest_neighbor(x, word_vectors, exclude_words=[]):
    '''
    Parameters:
    x (string): word to find its nearest neighbour 
    word_vectors (Python dict): {word (string): np-array of word vector}
    exclude_words (list of strings): words to be excluded from the search
    
    Returns:
    best_word (string) : the word whose word vector is the nearest neighbour 
    to the word vector of x
    '''
    best_score = -1.0
    best_word = None

    ## FILL CODE
    for word in word_vectors:
        
        score = cosine(x, word_vectors[word])
        if score > best_score and word not in exclude_words:
            best_score = score
            best_word = word
    # print(best_score)
    return best_word

In [12]:
print('')
print('The nearest neighbor of cat is: ' +
      nearest_neighbor(word_vectors['cat'], word_vectors, exclude_words = ['cat', 'cats']))


The nearest neighbor of cat is: dog


#### Hint (using python priorty queues with the heapq datastructure): 
if you don't want to store all the words and scores you can use the priortiy queue and only store the best K element so far. 

In [13]:
a=[3, 1, 3]
heapq.heapify(a)
a

[1, 3, 3]

In [14]:
h = []
heapq.heapify(h)
heapq.heappush(h, (5, 'write code'))
heapq.heappush(h, (7, 'release product'))
heapq.heappush(h, (1, 'write spec'))
heapq.heappush(h, (3, 'create tests'))
# heapq.heappop(h)
print(len(h))

4


In [15]:

## This function return the words corresponding to the
## K nearest neighbors of vector x.
## You can use the functions heappush and heappop.
import heapq

def knn(x, vectors, k):
    '''
    Parameters:
    x (string): word to find its nearest neighbour 
    word_vectors (Python dict): {word (string): np-array of word vector}
    k (int): number of nearest neighbours to be found
    
    Returns: 
    k_nearest_neighbors (list of tuples): [(score, word), (score, word), ....]
    '''
    # k_nearest_neighbors = []
    ## FILL CODE
    # one bad way to do it, optimization problem
    # exclude_words = []
    # for i in range(k):
    #     word = nearest_neighbor(x, vectors, exclude_words)
    #     score = cosine(vectors[word], x)
    #     k_nearest_neighbors.append((score, word))
    #     exclude_words.append(word)
    
    # k_nearest_neighbors = []
    # for word in vectors:
    #     score = cosine(x, vectors[word])
    #     if len(k_nearest_neighbors) < k:
    #         heapq.heappush(k_nearest_neighbors, (score, word))
    #         k_nearest_neighbors.sort(key=lambda x: x[0],reverse=True)
    #     else:
    #         if score >= k_nearest_neighbors[-1][0]:
    #             k_nearest_neighbors.pop()
    #             heapq.heappush(k_nearest_neighbors, (score, word))
    #             k_nearest_neighbors.sort(key=lambda x: x[0],reverse=True)
    
    k_nearest_neighbors = []
    heapq.heapify(k_nearest_neighbors)
    for word in vectors:
        score = cosine(x, vectors[word])
        if len(k_nearest_neighbors) < k:
            heapq.heappush(k_nearest_neighbors, (score, word))
        else:
            if score >= k_nearest_neighbors[0][0]:
                heapq.heappop(k_nearest_neighbors)
                heapq.heappush(k_nearest_neighbors, (score, word))
    return k_nearest_neighbors

In [16]:
knn_cat = knn(word_vectors['cat'], word_vectors, 5)
print('')
print('cat')
print('--------------')
for score, word in knn(word_vectors['cat'], word_vectors, 5):
    print (word + '\t%.3f' % score)



cat
--------------
rabbit	0.549
pet	0.573
dog	0.638
cat	1.000
cats	0.732


#### Hint: 
To find the analogies, we find the nearest neighbour associated with the wordvector d
$$ d = \frac{c}{\Vert {c} \Vert} + \frac{b}{\Vert {b} \Vert} - \frac{a}{\Vert {a} \Vert}$$


In [17]:
## This function return the words d, such that a:b and c:d
## verifies the same relation

def analogy(a, b, c, word_vectors):
    '''
    Parameters:
    a (string): word a
    b (string): word b
    c (string): word c
    word_vectors (Python dict): {word (string): np-array of word vector}
    
    Returnrs: 
    the word d (string) associated with c such that c:d is similar to a:b 
    
    '''
    
    ## FILL CODE
    a_vector = word_vectors[a]
    b_vector = word_vectors[b]
    c_vector = word_vectors[c]

    d_vector = c_vector/np.linalg.norm(c_vector) + b_vector/np.linalg.norm(b_vector) - a_vector/np.linalg.norm(a_vector)
    d = nearest_neighbor(d_vector, word_vectors)
    return d

In [18]:
# Word analogies

print('')
print('france - paris + rome = ' + analogy('paris', 'france', 'rome', word_vectors))


france - paris + rome = italy


## A word about biases in word vectors

In [19]:
## A word about biases in word vectors:

print('')
print('similarity(genius, man) = %.3f' %
      cosine(word_vectors['man'], word_vectors['genius']))
print('similarity(genius, woman) = %.3f' %
      cosine(word_vectors['woman'], word_vectors['genius']))


similarity(genius, man) = 0.445
similarity(genius, woman) = 0.325


In [20]:
## Compute the association strength between:
##   - a word w
##   - two sets of attributes A and B

def association_strength(w, A, B, vectors):
    '''
    Parameters:
    w (string): word w
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}
    
    Returnrs: 
    strength (float): the value of the association strength 
    '''
    
    strength = 0.0
    part_a = 0.0
    part_b = 0.0 
    
    ## FILL CODE
    for a in A:
        part_a += cosine(vectors[w], vectors[a])
        
    for b in B:
        part_b += cosine(vectors[w], vectors[b])
    
    strength = (part_a/len(A)) - (part_b/len(B))
    return strength

In [21]:
## Perform the word embedding association test between:
##   - two sets of words X and Y
##   - two sets of attributes A and B

def weat(X, Y, A, B, vectors):
    '''
    Parameters:
    X (list of strings): The words belonging to set X
    Y (list of strings): The words belonging to set Y
    A (list of strings): The words belonging to set A
    B (list of strings): The words belonging to set B
    vectors (Python dict): {word (string): np-array of word vector}
    
    Returns: 
    score (float): the value of the group association strength  
    '''
    
    score = 0.0

    part_x = 0.0
    part_y = 0.0
    ## FILL CODE

    for x in X : 
        part_x += association_strength(x, A, B, vectors)
    for y in Y : 
        part_y += association_strength(y, A, B, vectors)
    score = part_x - part_y
    return score

In [22]:
## Replicate one of the experiments from:
##
## Semantics derived automatically from language corpora contain human-like biases
## Caliskan, Bryson, Narayanan (2017)

career = ['executive', 'management', 'professional', 'corporation', 
          'salary', 'office', 'business', 'career']
family = ['home', 'parents', 'children', 'family',
          'cousins', 'marriage', 'wedding', 'relatives']
male = ['john', 'paul', 'mike', 'kevin', 'steve', 'greg', 'jeff', 'bill']
female = ['amy', 'joan', 'lisa', 'sarah', 'diana', 'kate', 'ann', 'donna']

print('')
print('Word embedding association test: %.3f' %
      weat(career, family, male, female, word_vectors))


Word embedding association test: 0.847


## Word translation using word vectors

In the following, we will use word vectors in English and French to translate words from English to French. The idea is to learn a linear function that maps English word vectors to their correponding French word vectors. To learn this linear mapping, we will use a small bilingual lexicon, that contains pairs of words in English and French that are translations of each other.

The following function will load the small English-French bilingual lexicon:

In [23]:
def load_lexicon(filename):
    '''
    Parameters:
    filename(string): the path of the lexicon
    
    Returns:
    data(list of pairs of string): the bilingual lexicon
    '''
    fin = io.open(filename, 'r', encoding='utf-8', newline='\n')
    data = []
    for line in fin:
        a, b = line.rstrip().split(' ')
        data.append((a, b))
    return data

('the', 'le')

In [24]:
word_vectors_en = load_vectors('wiki.en.vec')
word_vectors_fr = load_vectors('wiki.fr.vec')
lexicon = load_lexicon("lexicon-en-fr.txt")
print(lexicon[:5])

[('the', 'le'), ('the', 'les'), ('the', 'la'), ('and', 'et'), ('was', 'fut')]


In [25]:
# We split the lexicon into a train and validation set
train = lexicon[:5000]
valid = lexicon[5000:5100]

The following function will learn the mapping from English to French. The idea is to build two matrices $X_{\text{en}}$ and $X_{\text{fr}}$, and to find a mapping $M$ that minimizes $||X_{\text{en}} W - X_{\text{fr}} ||_2$. In numpy, this mapping can be obtained by using the `numpy.linalg.lstsq` function.

In [180]:
def align1(word_vectors_en, word_vectors_fr, lexicon):
    '''
    Parameters:
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    lexicon(list of pairs of string): bilingual training lexicon
    
    Returns
    mapping(np.array): the mapping from English to French vectors
    '''
    x_en, x_fr = [], []
    
    ## FILL CODE
    for lex in lexicon:
        x_en.append(word_vectors_en[lex[0]])
        x_fr.append(word_vectors_fr[lex[1]])
    
    x_en = np.array(x_en)
    x_fr = np.array(x_fr)

    M = np.linalg.inv(x_en.T @ x_en) @ (x_en.T @ x_fr)
    print(x_en.shape) 
    return M

In [181]:
def align(word_vectors_en, word_vectors_fr, lexicon):
    '''
    Parameters:
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    lexicon(list of pairs of string): bilingual training lexicon
    
    Returns
    mapping(np.array): the mapping from English to French vectors
    '''
    x_en, x_fr = [], []
    
    ## FILL CODE
    for lex in lexicon:
        x_en.append(word_vectors_en[lex[0]])
        x_fr.append(word_vectors_fr[lex[1]])
    
    x_en = np.array(x_en)
    x_fr = np.array(x_fr)
    # print(x_en.shape, x_fr.shape)

    M, C, A, B = np.linalg.lstsq(x_en, x_fr, rcond=None)
    # print(M.shape)
    # print(C.shape)
    # print(A)
    # print(B.shape)
    
    

    return M

In [182]:
mapping = align(word_vectors_en, word_vectors_fr, lexicon)
mapping.shape

(300, 300)

In [183]:
# mapping1 = align1(word_vectors_en, word_vectors_fr, lexicon)
# mapping1.shape

In [184]:
# (mapping1 == mapping).any()

Given a mapping, a set of word English word vector and French word vectors, the next function will translate the English word to French. To do so, we apply the mapping on the English word, and retrieve the nearest neighbor of the obtained vector in the set of French word vectors. The translation is then the corresponding French word.

In [207]:
def translate(word, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    word(string): an English word
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors
    
    Returns
    A string containing the translation of the English word
    '''
    input_vector = word_vectors_en[word]
    #print(input_vector.shape)
    #print('oo',mapping.shape)
    mapping_result = input_vector @ mapping
    # print(mapping_result.shape)
    out = knn(mapping_result, word_vectors_fr, k=1)
    ## FILL CODE
    #print(mapping_result.shape)
    # print(mapping_result.shape)

    return out[0][1]

In [208]:
# result = translate("world", word_vectors_en, word_vectors_fr, mapping)
# result

In [209]:
# cosine(word_vectors_fr['monde'], result )

In [210]:
# word_vectors_fr['monde']

In [211]:
print(translate("world", word_vectors_en, word_vectors_fr, mapping))
print(translate("machine", word_vectors_en, word_vectors_fr, mapping))
print(translate("learning", word_vectors_en, word_vectors_fr, mapping))

monde
machine
apprentissage


Finally, let's implement a function to evaluate this method on the validation lexicon:

In [219]:
def evaluate(valid, word_vectors_en, word_vectors_fr, mapping):
    '''
    Parameters:
    valid(a list of pairs of string): the validation lexicon
    word_vectors_en(dict: string -> np.array): English word vectors
    word_vectors_en(dict: string -> np.array): French word vectors
    mapping(np.array): the mapping from English to French vectors
    
    Returns
    Accuracy(float): the accuracy on the validation lexicon
    '''
    acc, n = 0.0, 0
    ## FILL CODE
    for word_en_fr in valid:
        out = translate(word_en_fr[0], word_vectors_en, word_vectors_fr, mapping)
        # print(out, word_en_fr[1])
        # print(word_en_fr[0], out)
        if out == word_en_fr[1]:
            n +=  1

    acc = n/len(valid)
    return acc

In [220]:
evaluate(valid, word_vectors_en, word_vectors_fr, mapping)

0.64