In [3]:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-

In [45]:
import argparse
import gzip
import math
import numpy as np
import re
import sys

import gensim
import gensim.downloader as api
from gensim import corpora, matutils

from scipy.spatial.distance import cosine

from nltk.corpus import wordnet as wn
from copy import deepcopy

In [5]:
# TODO: keep?

isNumber = re.compile(r'\d+.*')

def norm_word(word):
  """
  - input: word
  - return: a normalized version of it
  Normalization process: includes checking if the word is a number or a punctuation mark and replacing it with special tokens
  """
  if isNumber.search(word.lower()):
    return '---num---'
  # check if the word consists only of non-alphanumeric characters by removing all non-alphanumeric characters from the word 
  # and checking if the result is an empty string
  elif re.sub(r'\W+', '', word) == '':
    return '---punc---'
  else:
  # if input word not a number nor a punctuation mark, return a lowercase version of input word
    return word.lower()

In [6]:
''' Read all the word vectors and normalize them '''
def read_word_vecs(filename):
  """
  - input: name of the file containing the word vectors
  """
  wordVectors = {}
  with open(filename, 'r', encoding='utf-8') as fileObject:
    for line in fileObject:
      line = line.strip().lower()
      # The first word is assumed to be the word itself, and the remaining words are assumed to be the components of the word vector
      word = line.split()[0]
      # initialize a numpy array of zeros with the same length as the word vector
      wordVectors[word] = np.zeros(len(line.split())-1, dtype=float)
      for index, vecVal in enumerate(line.split()[1:]):
        # assign the values in the numpy array to the corresponding components of the word vector
        wordVectors[word][index] = float(vecVal)
      ''' normalize weight vector '''
      # divide each element by the square root of the sum of the squares of all the elements in the array
      # plus a small constant (1e-6) to avoid division by zero
      wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6)
  
  # standard error indicating that the vectors have been read from the file 
  sys.stderr.write("Vectors read from: "+filename+" \n")
  return wordVectors

In [38]:
''' Write word vectors to file '''
def print_word_vecs(wordVectors, outFileName):
  """
  - input: a dictionary wordVectors where keys are words and values are their corresponding word vectors
           file name outFileName
  """
  sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
  outFile = open(outFileName, 'w', encoding= 'utf-8')  
  for word, values in wordVectors.items():
    outFile.write(word+' ')
    for val in wordVectors[word]:
      # write the word vectors to the ouptut file in the format:
      # word1 val1 val2 val3 ...
      # word2 val1 val2 val3 ...
      # ...
      outFile.write('%.4f' %(val)+' ')
    outFile.write('\n')      
  outFile.close()

In [8]:
''' Read the PPDB word relations as a dictionary '''
def read_lexicon(filename):
  lexicon = {}
  for line in open(filename, 'r'):
    words = line.lower().strip().split()
    lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]
  return lexicon

## Data

In [11]:
input_file = "../data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean"
lexicon_file = "../data/English/lexicon/ws353_lexical_similarity.txt"
output_file = "../data/English/output_vectors/output_vectors.txt"

wordVecs = read_word_vecs(input_file)
lexicon = read_lexicon(lexicon_file)
outFileName = output_file

Vectors read from: ../data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean 


In [12]:
# TODO: modify the input so that it doesn't take one word a the time but rather matrices

In [39]:
# TODO: Should we modify it to use np.arrays, tensors?

# Hyperparameters
ALPHA = 1 #coefficient minimizing the euclidean distance
BETA = 1/ len(lexicon) # minimizes each vector distance to ? TODO: 1 over the number of neighbors?
NB_ITER = 10

''' Retrofit word vectors to a lexicon '''
def retrofit(wordVecs, lexicon):
  newWordVecs = deepcopy(wordVecs)
  wvVocab = set(newWordVecs.keys())
  loopVocab = wvVocab.intersection(set(lexicon.keys()))
  for _ in range(NB_ITER):
    # loop through every node also in ontology (else just use data estimate)
    for word in loopVocab:
      wordNeighbours = set(lexicon[word]).intersection(wvVocab)
      numNeighbours = len(wordNeighbours)
      #no neighbours, pass - use data estimate
      if numNeighbours == 0:
        continue
      # the weight of the data estimate if the number of neighbours
      newVec = ALPHA * wordVecs[word] + BETA * sum(newWordVecs[ppWord] for ppWord in wordNeighbours) 
      newWordVecs[word] = newVec/ (ALPHA + BETA * numNeighbours)
  return newWordVecs


In [40]:
# TODO: work in progress, not effective yet
from scipy.sparse import lil_matrix, csr_matrix

def retrofit_matrix(wordVecs, lexicon):
    """
    Retrofit word vectors to a lexical ontology.

    Parameters:
    -----------
    wordVecs : numpy.ndarray
        Matrix containing word embeddings where each row represents a word vector.
    lexicon : scipy.sparse.csr_matrix
        Sparse matrix where each row represents a node and its neighbors in the ontology.
    alpha : float
        Scaling factor for the original embedding. Default is 1.0.
    beta : float
        Scaling factor for the ontology. Default is 1.0.
    num_iters : int
        Number of iterations to run the algorithm. Default is 10.

    Returns:
    --------
    numpy.ndarray
        Matrix containing the retrofitted word embeddings where each row represents a word vector.
    """
   # Convert the word vectors dictionary to a matrix
    wordVecMat = csr_matrix(np.array(list(wordVecs.values())))

    # Create a copy of the word vectors matrix
    newWordVecMat = wordVecMat.copy()

    # Create a dictionary of word indices
    wordIndices = {word: idx for idx, word in enumerate(wordVecs.keys())}

    # Loop through the words in the lexicon
    for _ in range(NB_ITER):
        for word in lexicon.keys():
            # Get the scores and neighbors for the current word
            scores = np.array(lexicon[word]).ravel()
            neighbors = np.array([wordVecs.get(w, np.zeros(wordVecMat.shape[1])) for w in lexicon[word].indices])

            # Compute the neighbor matrix
            neighborMat = csr_matrix(neighbors)

            # Compute the weighted average of the neighbors
            weightedNeighborVec = (neighborMat.transpose().dot(scores) / neighborMat.sum(axis=1)).ravel()

            # Update the word vector for the current word
            wordVec = wordVecMat[wordIndices[word], :]
            newWordVec = ALPHA * wordVec + BETA * weightedNeighborVec
            newWordVecMat[wordIndices[word], :] = newWordVec

    # Convert the new word vectors matrix to a dictionary
    newWordVecs = {word: newWordVecMat[wordIndices[word], :].toarray().ravel() for word in wordVecs.keys()}

    # Return the new word vectors dictionary
    return newWordVecs


In [30]:
# ''' Enrich the word vectors using ppdb and print the enriched vectors '''
# print_word_vecs(retrofit(wordVecs, lexicon, numIter), outFileName)
retrofittedVecs = retrofit(wordVecs, lexicon)

with open(output_file, 'w', encoding='utf-8') as outputFile:
    for word in retrofittedVecs.keys():
        outputFile.write(word + ' ' + ' '.join(str(x) for x in retrofittedVecs[word]) + '\n')


## Similarity

Word2Vec, GloVe: These models assign a vector to each word in a high-dimensional vector space based on the context in which the word appears. The similarity between two words can then be calculated as the cosine similarity between their corresponding vectors.

In [41]:
# TODO: check similarity before and after retroffit

### Getting the pretrained word embeddings

In [17]:
print(gensim.__file__)

C:\Users\ninan\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\gensim\__init__.py


In [18]:
# available pre-trained models
# gensim.downloader.info()

In [19]:
# load pre-trained Word2Vec model
model = api.load('word2vec-google-news-300')

In [20]:
# download the pre-trained word2vec model
model_name = 'word2vec-google-news-300'
path = api.load(model_name, return_path=True)
print(path)

C:\Users\ninan/gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


In [21]:
toy_corpus = ["cat", "dog", "caramel", "cheese", "chocolate", "right", "left"]

In [37]:
''' Cosine similarity '''
for i, word1 in enumerate(toy_corpus[:-1]):
    print(f'Similarities with "{word1}":')
    for j, word2 in enumerate(toy_corpus[i+1:]):
        similarity = model.similarity(word1, word2)
        print(f'  - "{word2}": {similarity:.2f}')
    print()

Similarities with "cat":
  - "dog": 0.76
  - "caramel": 0.17
  - "cheese": 0.14
  - "chocolate": 0.24
  - "right": 0.08
  - "left": 0.11

Similarities with "dog":
  - "caramel": 0.16
  - "cheese": 0.18
  - "chocolate": 0.30
  - "right": 0.11
  - "left": 0.08

Similarities with "caramel":
  - "cheese": 0.48
  - "chocolate": 0.67
  - "right": 0.07
  - "left": 0.07

Similarities with "cheese":
  - "chocolate": 0.61
  - "right": 0.08
  - "left": 0.08

Similarities with "chocolate":
  - "right": 0.04
  - "left": 0.05

Similarities with "right":
  - "left": 0.49



In [44]:
''' Convert toy_corpus to a co-occurrence matrix '''
# Create a dictionary from the corpus
dictionary = corpora.Dictionary([toy_corpus])

# Convert the corpus to a bag-of-words representation
bow_corpus = [dictionary.doc2bow(text) for text in [toy_corpus]]

# Convert the corpus to a co-occurrence matrix
cooccur_matrix = matutils.corpus2csc(bow_corpus).dot(matutils.corpus2csc(bow_corpus).transpose())

In [24]:
''' Use the pre-trained Word2Vec model to create an embedding matrix '''
embedding_matrix = {}
for word in toy_corpus:
    embedding = model[word]
    embedding_matrix[dictionary.token2id[word]] = embedding

In [25]:
''' Retrofit the embedding matrix '''
# Could also use the 'from retrofitting import retrofit' package
# = an implementation of the algorithm proposed by Mrksic et al. (2017) which is faster and more scalable
cooccur_matrix_dict = cooccur_matrix.todok()
retrofitted_embeddings = retrofit(embedding_matrix, cooccur_matrix_dict)

In [46]:
''' Use the retrofitted embeddings to compute semantic similarity '''
for i, word1 in enumerate(toy_corpus):
    for j, word2 in enumerate(toy_corpus[i+1:], i+1):
        similarity_before = cosine(embedding_matrix[i], embedding_matrix[j])
        similarity_after = cosine(retrofitted_embeddings[i], retrofitted_embeddings[j])
        print(f"Similarity between '{word1}' and '{word2}' before retrofitting: {similarity_before:.2f}")
        print(f"Similarity between '{word1}' and '{word2}' after retrofitting: {similarity_after:.2f}\n")

Similarity between 'cat' and 'dog' before retrofitting: 0.83
Similarity between 'cat' and 'dog' after retrofitting: 0.83

Similarity between 'cat' and 'caramel' before retrofitting: 0.52
Similarity between 'cat' and 'caramel' after retrofitting: 0.52

Similarity between 'cat' and 'cheese' before retrofitting: 0.33
Similarity between 'cat' and 'cheese' after retrofitting: 0.33

Similarity between 'cat' and 'chocolate' before retrofitting: 0.84
Similarity between 'cat' and 'chocolate' after retrofitting: 0.84

Similarity between 'cat' and 'right' before retrofitting: 0.93
Similarity between 'cat' and 'right' after retrofitting: 0.93

Similarity between 'cat' and 'left' before retrofitting: 0.93
Similarity between 'cat' and 'left' after retrofitting: 0.93

Similarity between 'dog' and 'caramel' before retrofitting: 0.86
Similarity between 'dog' and 'caramel' after retrofitting: 0.86

Similarity between 'dog' and 'cheese' before retrofitting: 0.76
Similarity between 'dog' and 'cheese' afte