# Improving vector space using retrofitting


In [41]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

In [42]:
import argparse
import gzip
import math
import numpy as np
import re
import sys
import pandas as pd

import gensim
import gensim.downloader as api
from gensim import corpora, matutils

from scipy.spatial.distance import cosine

from nltk.corpus import wordnet as wn
from copy import deepcopy
from scipy.sparse import csr_matrix

## 1. Implement the retrofitting algorithm proposed by Faruqui et al. on a lexicon of distributional vector representations of words

In [43]:
''' Getting the pretrained word embeddings '''
print(gensim.__file__)

C:\Users\ninan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\gensim\__init__.py


In [44]:
# available pre-trained models
# gensim.downloader.info()

# load pre-trained Word2Vec model
model = api.load('word2vec-google-news-300')

# download the pre-trained word2vec model
model_name = 'word2vec-google-news-300'
path = api.load(model_name, return_path=True)
print(path)

C:\Users\ninan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


### 1.1 Initial functions (from Faruqi)

In [45]:
# From Faruqi
isNumber = re.compile(r'\d+.*')

def norm_word(word):
  """
  - input: word
  - return: a normalized version of it
  Normalization process: includes checking if the word is a number or a punctuation mark and replacing it with special tokens
  """
  if isNumber.search(word.lower()):
    return '---num---'
  # check if the word consists only of non-alphanumeric characters by removing all non-alphanumeric characters from the word 
  # and checking if the result is an empty string
  elif re.sub(r'\W+', '', word) == '':
    return '---punc---'
  else:
  # if input word not a number nor a punctuation mark, return a lowercase version of input word
    return word.lower()
  

  
''' Read all the word vectors and normalize them '''
def read_word_vecs(filename):
  """
  - input: name of the file containing the word vectors
  """
  wordVectors = {}
  with open(filename, 'r', encoding='utf-8') as fileObject:
    for line in fileObject:
      line = line.strip().lower()
      # The first word is assumed to be the word itself, and the remaining words are assumed to be the components of the word vector
      word = line.split()[0]
      # initialize a numpy array of zeros with the same length as the word vector
      wordVectors[word] = np.zeros(len(line.split())-1, dtype=float)
      for index, vecVal in enumerate(line.split()[1:]):
        # assign the values in the numpy array to the corresponding components of the word vector
        wordVectors[word][index] = float(vecVal)
      ''' normalize weight vector '''
      # divide each element by the square root of the sum of the squares of all the elements in the array
      # plus a small constant (1e-6) to avoid division by zero
      wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6)
  
  # standard error indicating that the vectors have been read from the file 
  sys.stderr.write("Vectors read from: "+filename+" \n")
  return wordVectors


''' Write word vectors to file '''
def print_word_vecs(wordVectors, outFileName):
  """
  - input: a dictionary wordVectors where keys are words and values are their corresponding word vectors
           file name outFileName
  """
  sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
  outFile = open(outFileName, 'w', encoding= 'utf-8')  
  for word, values in wordVectors.items():
    outFile.write(word+' ')
    for val in wordVectors[word]:
      # write the word vectors to the ouptut file in the format:
      # word1 val1 val2 val3 ...
      # word2 val1 val2 val3 ...
      # ...
      outFile.write('%.4f' %(val)+' ')
    outFile.write('\n')      
  outFile.close()


  ''' Read the PPDB word relations as a dictionary '''
def read_lexicon(filename):
  lexicon = {}
  for line in open(filename, 'r'):
    words = line.lower().strip().split()
    lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
  return lexicon

### 1.2 Data

In [46]:
input_file = "../data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean"
lexicon_file = "../data/English/lexicon/ws353_lexical_similarity.txt" 
output_file = "../data/English/output_vectors/output_vectors.txt"

wordVecs = read_word_vecs(input_file)
lexicon = read_lexicon(lexicon_file)
outFileName = output_file

Vectors read from: ../data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean 


### 1.3 Retrofitting 

In [47]:
# TODO: add comments of the form of
"""
Retrofit word vectors to a lexical ontology.

Parameters:
-----------
wordVecs : numpy.ndarray
    Matrix containing word embeddings where each row represents a word vector.
lexicon : scipy.sparse.csr_matrix
    Sparse matrix where each row represents a node and its neighbors in the ontology.
alpha : float
    Scaling factor for the original embedding. Default is 1.0.
beta : float
    Scaling factor for the ontology. Default is 1.0.
num_iters : int
    Number of iterations to run the algorithm. Default is 10.

Returns:
--------
numpy.ndarray
    Matrix containing the retrofitted word embeddings where each row represents a word vector.
"""

'\nRetrofit word vectors to a lexical ontology.\n\nParameters:\n-----------\nwordVecs : numpy.ndarray\n    Matrix containing word embeddings where each row represents a word vector.\nlexicon : scipy.sparse.csr_matrix\n    Sparse matrix where each row represents a node and its neighbors in the ontology.\nalpha : float\n    Scaling factor for the original embedding. Default is 1.0.\nbeta : float\n    Scaling factor for the ontology. Default is 1.0.\nnum_iters : int\n    Number of iterations to run the algorithm. Default is 10.\n\nReturns:\n--------\nnumpy.ndarray\n    Matrix containing the retrofitted word embeddings where each row represents a word vector.\n'

In [48]:
''' Working version but no matrix involved'''
# Hyperparameters
ALPHA = 1 #coefficient minimizing the euclidean distance
BETA = 1/ len(lexicon) # minimizes each vector distance to ? 
NB_ITER = 10

''' Retrofit word vectors to a lexicon '''
def retrofit(wordVecs, lexicon):
  newWordVecs = deepcopy(wordVecs)
  wvVocab = set(newWordVecs.keys())
  loopVocab = wvVocab.intersection(set(lexicon.keys()))
  for _ in range(NB_ITER):
    # loop through every node also in ontology (else just use data estimate)
    for word in loopVocab:
      wordNeighbours = set(lexicon[word]).intersection(wvVocab)
      numNeighbours = len(wordNeighbours)
      #no neighbours, pass - use data estimate
      if numNeighbours == 0:
        continue
      # the weight of the data estimate if the number of neighbours
      newVec = ALPHA * wordVecs[word] + BETA * sum(newWordVecs[ppWord] for ppWord in wordNeighbours) 
      newWordVecs[word] = newVec/ (ALPHA + BETA * numNeighbours)
  return newWordVecs


In [49]:
''' Version with modified beta = 1 over set of synonyms '''

def retrofit_beta_updated(wordVecs, lexicon):
    newWordVecs = deepcopy(wordVecs)
    wvVocab = set(newWordVecs.keys())
    loopVocab = wvVocab.intersection(set(lexicon.keys()))
    for _ in range(NB_ITER):
        # loop through every node also in ontology (else just use data estimate)
        for word in loopVocab:
            wordNeighbours = set(lexicon[word]).intersection(wvVocab)
            numNeighbours = len(wordNeighbours)
            # no neighbours, pass - use data estimate
            if numNeighbours == 0:
                continue
            BETA = 1 / numNeighbours  # BETA as one over the number of neighbors
            # the weight of the data estimate is the number of neighbours
            newVec = ALPHA * wordVecs[word] + BETA * sum(newWordVecs[ppWord] for ppWord in wordNeighbours)
            newWordVecs[word] = newVec / (ALPHA + BETA * numNeighbours)
    return newWordVecs

In [50]:
''' Retrofit word vectors to a lexicon using matrix operations '''
def retrofit_matrix(wordVecs, lexicon):
    # Convert the word vectors dictionary to a matrix
    wordVecMat = np.array(list(wordVecs.values()))
    
    # Create a set of vocabulary indices based on the shape of wordVecMat
    wvVocab = set(range(wordVecMat.shape[0]))
    
    # Find the common vocabulary between wordVecMat and the lexicon
    loopVocab = wvVocab.intersection(set(lexicon.keys()))
    
    # Iterate over the specified number of iterations
    for _ in range(NB_ITER):
        for word in loopVocab:
            wordNeighbours = set(lexicon[word]).intersection(wvVocab)
            numNeighbours = len(wordNeighbours)
            
            if numNeighbours == 0:
                continue
            
            newVec = ALPHA * wordVecMat[word] + BETA * np.sum(wordVecMat[list(wordNeighbours)], axis=0)
            newWordVec = newVec / (ALPHA + BETA * numNeighbours)
            
            wordVecMat[word] = newWordVec
    
    # Convert the matrix back to a dictionary of word vectors
    newWordVecs = {word: vec for word, vec in zip(wordVecs.keys(), wordVecMat)}
    
    return newWordVecs

In [51]:
# print_word_vecs(retrofit(wordVecs, lexicon, numIter), outFileName)
retrofittedVecs = retrofit_beta_updated(wordVecs, lexicon)

with open(output_file, 'w', encoding='utf-8') as outputFile:
    for word in retrofittedVecs.keys():
        outputFile.write(word + ' ' + ' '.join(str(x) for x in retrofittedVecs[word]) + '\n')

In [52]:
# Apply retrofitting using matrix operations
retrofittedVecMat = retrofit_matrix(wordVecs, lexicon)

# Convert the retrofitted matrix back to a dictionary of word vectors
retrofittedVecs = {}
for word, vec in zip(wordVecs.keys(), retrofittedVecMat):
    retrofittedVecs[word] = vec

# Save the retrofitted vectors to an output file
with open(output_file, 'w', encoding='utf-8') as outputFile:
    for word, vec in retrofittedVecs.items():
        outputFile.write(word + ' ' + ' '.join(str(val) for val in vec) + '\n')

  wordVecMat = np.array(list(wordVecs.values()))


In [53]:
# Compare the retrofitted vectors with the original word vectors
data = []
for word in retrofittedVecs.keys():
    originalVec = wordVecs[word]
    retrofittedVec = retrofittedVecs[word]
    data.append([word, originalVec, retrofittedVec])
    
    # Compare the vectors and print the results
    if np.array_equal(originalVec, retrofittedVec):
        print(f"The vector for word '{word}' has not been updated.")
    else:
        print(f"The vector for word '{word}' has been updated.")

The vector for word '125776' has been updated.
The vector for word ',' has been updated.
The vector for word 'the' has been updated.
The vector for word '.' has been updated.
The vector for word 'of' has been updated.
The vector for word '-' has been updated.
The vector for word 'and' has been updated.
The vector for word 'in' has been updated.
The vector for word 'to' has been updated.
The vector for word ''' has been updated.
The vector for word 'a' has been updated.
The vector for word ')' has been updated.
The vector for word '(' has been updated.
The vector for word 'is' has been updated.
The vector for word 's' has been updated.
The vector for word 'for' has been updated.
The vector for word 'was' has been updated.
The vector for word 'on' has been updated.
The vector for word 'that' has been updated.
The vector for word 'as' has been updated.
The vector for word 'it' has been updated.
The vector for word 'with' has been updated.
The vector for word 'by' has been updated.
The vec

In [54]:
# Create the DataFrame with appropriate column names
df = pd.DataFrame(data, columns=["Word", "Original Vector", "Retrofitted Vector"])
df

Unnamed: 0,Word,Original Vector,Retrofitted Vector
0,125776,[0.999999999992],125776
1,",","[0.09237275157258702, -0.015168234649546966, 0...",","
2,the,"[0.10407956575011922, -0.08842506250895255, 0....",the
3,.,"[0.0782619069941855, 0.016955160717094176, 0.0...",.
4,of,"[0.07484435145234972, 0.09453938682497252, -0....",of
...,...,...,...
125772,scow,"[-0.032097010483529725, -0.045305368695939846,...",scow
125773,orgone,"[0.05948983169732881, -0.068219823071458, 0.03...",orgone
125774,tambourines,"[0.09232989179466854, 0.10664696891927536, 0.0...",tambourines
125775,khyentse,"[0.08054658855202265, -0.05460365172834627, 0....",khyentse


## 2. Use semantic resources (such as WOLF for French or PPDB/WordNet for English) to enhance the lexicon by incorporating knowledge from synonymy, hypernymy relations, etc.

### 2.1 Extraction of semantic relations in both languages 

In [55]:
''' Loading ppdb datafile '''
# lexicon = {}
# with open('ppdb-2.0-xl-all', 'r') as f:
#     for line in f:
#         fields = line.strip().split('\t')
#         if len(fields) == 2:
#             lexicon[(fields[0], fields[1])] = 1.0
#         elif len(fields) == 3:
#             lexicon[(fields[0], fields[1])] = float(fields[2])

' Loading ppdb datafile '

## 3. Evaluate the performance of the retrofitting algorithm on two tasks

### 3.1 Lexical similarity task: Measure the improvement in capturing semantic relationships between words in the lexicon

In [56]:
# TODO: check similarity before and after retroffit
# ''' Cosine similarity '''
# for i, word1 in enumerate(output_file[:-1]):
#     print(f'Similarities with "{word1}":')
#     for j, word2 in enumerate(output_file[i+1:]):
#         similarity = model.similarity(word1, word2)
#         print(f'  - "{word2}": {similarity:.2f}')
#     print()

''' Word2Vec, GloVe: These models assign a vector to each word in a high-dimensional vector space based on the context in which the word appears. The similarity between two words can then be calculated as the cosine similarity between their corresponding vectors.'''

' Word2Vec, GloVe: These models assign a vector to each word in a high-dimensional vector space based on the context in which the word appears. The similarity between two words can then be calculated as the cosine similarity between their corresponding vectors.'

In [57]:
toy_corpus = ["cat", "dog", "caramel", "cheese", "chocolate", "right", "left"]

In [58]:
''' Cosine similarity '''
for i, word1 in enumerate(toy_corpus[:-1]):
    print(f'Similarities with "{word1}":')
    for j, word2 in enumerate(toy_corpus[i+1:]):
        similarity = model.similarity(word1, word2)
        print(f'  - "{word2}": {similarity:.2f}')
    print()

Similarities with "cat":
  - "dog": 0.76
  - "caramel": 0.17
  - "cheese": 0.14
  - "chocolate": 0.24
  - "right": 0.08
  - "left": 0.11

Similarities with "dog":
  - "caramel": 0.16
  - "cheese": 0.18
  - "chocolate": 0.30
  - "right": 0.11
  - "left": 0.08

Similarities with "caramel":
  - "cheese": 0.48
  - "chocolate": 0.67
  - "right": 0.07
  - "left": 0.07

Similarities with "cheese":
  - "chocolate": 0.61
  - "right": 0.08
  - "left": 0.08

Similarities with "chocolate":
  - "right": 0.04
  - "left": 0.05

Similarities with "right":
  - "left": 0.49



In [59]:
''' Convert toy_corpus to a co-occurrence matrix '''
# Create a dictionary from the corpus
dictionary = corpora.Dictionary([toy_corpus])

# Convert the corpus to a bag-of-words representation
bow_corpus = [dictionary.doc2bow(text) for text in [toy_corpus]]

# Convert the corpus to a co-occurrence matrix
cooccur_matrix = matutils.corpus2csc(bow_corpus).dot(matutils.corpus2csc(bow_corpus).transpose())

In [60]:
''' Use the pre-trained Word2Vec model to create an embedding matrix '''
embedding_matrix = {}
for word in toy_corpus:
    embedding = model[word]
    embedding_matrix[dictionary.token2id[word]] = embedding

In [61]:
''' Retrofit the embedding matrix '''
# Could also use the 'from retrofitting import retrofit' package
# = an implementation of the algorithm proposed by Mrksic et al. (2017) which is faster and more scalable
cooccur_matrix_dict = cooccur_matrix.todok()
retrofitted_embeddings = retrofit(embedding_matrix, cooccur_matrix_dict)

In [62]:
''' Use the retrofitted embeddings to compute semantic similarity '''
for i, word1 in enumerate(toy_corpus):
    for j, word2 in enumerate(toy_corpus[i+1:], i+1):
        similarity_before = cosine(embedding_matrix[i], embedding_matrix[j])
        similarity_after = cosine(retrofitted_embeddings[i], retrofitted_embeddings[j])
        print(f"Similarity between '{word1}' and '{word2}' before retrofitting: {similarity_before:.2f}")
        print(f"Similarity between '{word1}' and '{word2}' after retrofitting: {similarity_after:.2f}\n")

Similarity between 'cat' and 'dog' before retrofitting: 0.83
Similarity between 'cat' and 'dog' after retrofitting: 0.83

Similarity between 'cat' and 'caramel' before retrofitting: 0.52
Similarity between 'cat' and 'caramel' after retrofitting: 0.52

Similarity between 'cat' and 'cheese' before retrofitting: 0.33
Similarity between 'cat' and 'cheese' after retrofitting: 0.33

Similarity between 'cat' and 'chocolate' before retrofitting: 0.84
Similarity between 'cat' and 'chocolate' after retrofitting: 0.84

Similarity between 'cat' and 'right' before retrofitting: 0.93
Similarity between 'cat' and 'right' after retrofitting: 0.93

Similarity between 'cat' and 'left' before retrofitting: 0.93
Similarity between 'cat' and 'left' after retrofitting: 0.93

Similarity between 'dog' and 'caramel' before retrofitting: 0.86
Similarity between 'dog' and 'caramel' after retrofitting: 0.86

Similarity between 'dog' and 'cheese' before retrofitting: 0.76
Similarity between 'dog' and 'cheese' afte

### 3.2. Sentiment analysis task: Apply the retrofitted word vectors to a corpus of film reviews and assess if they lead to better sentiment analysis performance compared to the original word vectors.