In [1]:
# Standard library imports
import argparse
import gzip
import math
import re
import sys
import urllib.request
import io
import random
from copy import deepcopy

# Third-party library imports
import numpy as np
import pandas as pd
import gensim
import gensim.downloader as api
from gensim import corpora, matutils
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import wordnet
from sklearn.cluster import KMeans

In [2]:
# load pre-trained Word2Vec model
model = api.load('word2vec-google-news-300')

In [3]:
isNumber = re.compile(r'\d+.*')

def norm_word(word):
  """
  - input: word
  - return: a normalized version of it
  Normalization process: includes checking if the word is a number or a punctuation mark and replacing it with special tokens
  """
  if isNumber.search(word.lower()):
    return '---num---'
  # check if the word consists only of non-alphanumeric characters by removing all non-alphanumeric characters from the word 
  # and checking if the result is an empty string
  elif re.sub(r'\W+', '', word) == '':
    return '---punc---'
  else:
  # if input word not a number nor a punctuation mark, return a lowercase version of input word
    return word.lower()
  

  
''' Read all the word vectors and normalize them '''
def read_word_vecs(filename):
  """
  - input: name of the file containing the word vectors
  """
  wordVectors = {}
  with open(filename, 'r', encoding='utf-8') as fileObject:
    for line in fileObject:
      line = line.strip().lower()
      # The first word is assumed to be the word itself, and the remaining words are assumed to be the components of the word vector
      word = line.split()[0]
      # initialize a numpy array of zeros with the same length as the word vector
      wordVectors[word] = np.zeros(len(line.split())-1, dtype=float)
      for index, vecVal in enumerate(line.split()[1:]):
        # assign the values in the numpy array to the corresponding components of the word vector
        wordVectors[word][index] = float(vecVal)
      ''' normalize weight vector '''
      # divide each element by the square root of the sum of the squares of all the elements in the array
      # plus a small constant (1e-6) to avoid division by zero
      wordVectors[word] /= math.sqrt((wordVectors[word]**2).sum() + 1e-6)
  
  # standard error indicating that the vectors have been read from the file 
  sys.stderr.write("Vectors read from: "+filename+" \n")
  return wordVectors

  ''' Write word vectors to file '''
def print_word_vecs(wordVectors, outFileName):
  """
  - input: a dictionary wordVectors where keys are words and values are their corresponding word vectors
           file name outFileName
  """
  sys.stderr.write('\nWriting down the vectors in '+outFileName+'\n')
  outFile = open(outFileName, 'w', encoding= 'utf-8')  
  for word, values in wordVectors.items():
    outFile.write(word+' ')
    for val in wordVectors[word]:
      # write the word vectors to the ouptut file in the format:
      # word1 val1 val2 val3 ...
      # word2 val1 val2 val3 ...
      # ...
      outFile.write('%.4f' %(val)+' ')
    outFile.write('\n')      
  outFile.close()

''' Read the PPDB word relations as a dictionary '''
def read_lexicon(filename):
    lexicon = {}
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            words = line.lower().strip().split()
            lexicon[norm_word(words[0])] = [norm_word(word) for word in words[1:]]

In [16]:
def convert_matrix_to_dict(wordVecMat, wordList):
    wordVecs = {}

    for i, word in enumerate(wordList):
        wordVecs[word] = wordVecMat[i]

    return wordVecs

def convert_dict_to_matrix(wordVecs):
    words = list(wordVecs.keys())
    wordVecMat = np.array(list(wordVecs.values()))

    # Reshape the matrix if it is one-dimensional
    if wordVecMat.ndim == 1:
        wordVecMat = wordVecMat.reshape(1, -1)

    return wordVecMat

def vectorize_list(corpus):
    corpus_vecs = [model[word] for word in corpus]

    return corpus_vecs

In [5]:
toy_corpus = ["cat", "kitten", "dog", "donkey", "caramel", "cheese", "chocolate", "cacao", "right", "left"]

In [6]:
toy_corpus_vecs = vectorize_list(toy_corpus)
toy_corpus_vecs

[array([ 0.0123291 ,  0.20410156, -0.28515625,  0.21679688,  0.11816406,
         0.08300781,  0.04980469, -0.00952148,  0.22070312, -0.12597656,
         0.08056641, -0.5859375 , -0.00445557, -0.296875  , -0.01312256,
        -0.08349609,  0.05053711,  0.15136719, -0.44921875, -0.0135498 ,
         0.21484375, -0.14746094,  0.22460938, -0.125     , -0.09716797,
         0.24902344, -0.2890625 ,  0.36523438,  0.41210938, -0.0859375 ,
        -0.07861328, -0.19726562, -0.09082031, -0.14160156, -0.10253906,
         0.13085938, -0.00346375,  0.07226562,  0.04418945,  0.34570312,
         0.07470703, -0.11230469,  0.06738281,  0.11230469,  0.01977539,
        -0.12353516,  0.20996094, -0.07226562, -0.02783203,  0.05541992,
        -0.33398438,  0.08544922,  0.34375   ,  0.13964844,  0.04931641,
        -0.13476562,  0.16308594, -0.37304688,  0.39648438,  0.10693359,
         0.22167969,  0.21289062, -0.08984375,  0.20703125,  0.08935547,
        -0.08251953,  0.05957031,  0.10205078, -0.1

In [7]:
def calculate_cosine_similarity(vec1, vec2):
    # Calculate cosine similarity between two vectors
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    similarity = dot_product / norm_product
    return similarity

def generate_cosine_similarity_matrix(vectors):
    num_vectors = len(vectors)
    similarity_matrix = np.zeros((num_vectors, num_vectors))

    for i in range(num_vectors):
        for j in range(num_vectors):
            similarity = calculate_cosine_similarity(vectors[i], vectors[j])
            similarity_matrix[i, j] = similarity
    return similarity_matrix

def print_vec_similarities(corpus, vectorized_corpus):
    for i, vec1 in enumerate(vectorized_corpus):
        word1 = corpus[i]
        print(f'Similarities with "{word1}":')
        for j, vec2 in enumerate(vectorized_corpus):
            if i == j:
                continue
            similarity = calculate_cosine_similarity(vec1, vec2)
            print(f'  - "{corpus[j]}": {similarity:.4f}')
        print()

In [8]:
similarity_matrix = generate_cosine_similarity_matrix(toy_corpus_vecs)
print_vec_similarities(toy_corpus, similarity_matrix)

Similarities with "cat":
  - "kitten": 0.9688
  - "dog": 0.9724
  - "donkey": 0.7901
  - "caramel": 0.4268
  - "cheese": 0.3910
  - "chocolate": 0.5103
  - "cacao": 0.3105
  - "right": 0.2302
  - "left": 0.2539

Similarities with "kitten":
  - "cat": 0.9688
  - "dog": 0.9466
  - "donkey": 0.7895
  - "caramel": 0.4282
  - "cheese": 0.3506
  - "chocolate": 0.4904
  - "cacao": 0.2887
  - "right": 0.1814
  - "left": 0.2117

Similarities with "dog":
  - "cat": 0.9724
  - "kitten": 0.9466
  - "donkey": 0.8274
  - "caramel": 0.4426
  - "cheese": 0.4316
  - "chocolate": 0.5462
  - "cacao": 0.3257
  - "right": 0.2435
  - "left": 0.2427

Similarities with "donkey":
  - "cat": 0.7901
  - "kitten": 0.7895
  - "dog": 0.8274
  - "caramel": 0.4150
  - "cheese": 0.4585
  - "chocolate": 0.5234
  - "cacao": 0.3917
  - "right": 0.2643
  - "left": 0.2802

Similarities with "caramel":
  - "cat": 0.4268
  - "kitten": 0.4282
  - "dog": 0.4426
  - "donkey": 0.4150
  - "cheese": 0.8493
  - "chocolate": 0.9353


In [9]:
toy_wordVecs = convert_matrix_to_dict(toy_corpus_vecs, toy_corpus)

In [49]:
import numpy as np

def retrofitting_wordVecs(wordVecs, wordnet_lexicon, alpha, beta_, nb_iter, normalizing=True):
    # Convert the word vectors dictionary to a matrix
    wordVecMat = np.array(list(wordVecs.values()))
    wordList = list(wordVecs.keys())

    # Print the original shape of wordVecMat
    print("Original shape of wordVecMat:", wordVecMat.shape)

    # Print the lengths of the word vectors
    lengths = np.linalg.norm(wordVecMat, axis=1)
    print("Lengths of word vectors:", lengths)

    # Find the common vocabulary between wordVecMat and the wordnet_lexicon
    loopVocab = set(wordnet_lexicon.keys())

    # Iterate over the specified number of iterations
    for _ in range(nb_iter):
        wordIndices = [wordList.index(word) for word in loopVocab]
        wordNeighbours = np.array([list(wordnet_lexicon[word]) for word in loopVocab], dtype=object)
        numNeighbours = np.array([len(neighbours) for neighbours in wordNeighbours])

        # Retrieve semantic relations from WordNet
        wordnet_relations = np.array([wordnet_lexicon.get(word) for word in loopVocab], dtype=object)

        # Combine the relations with the existing lexicon
        combined_relations = np.concatenate((wordNeighbours, wordnet_relations), axis=0).reshape(-1, 1)

        beta = beta_ / combined_relations.shape[0]

        mask = numNeighbours != 0
        masked_wordVecMat = wordVecMat[wordIndices][mask]
        masked_combined_relations = combined_relations[wordIndices][mask]

        newVec = alpha * masked_wordVecMat + beta * np.sum(wordVecMat[np.where(masked_combined_relations.flatten())], axis=0)
        newWordVec = newVec / (alpha + beta * numNeighbours[mask].reshape(-1, 1))

        wordVecMat[wordIndices][mask] = newWordVec

    print("Shape of wordVecMat after retrofitting:", wordVecMat.shape)

    # Normalize the word vectors
    if normalizing:
        wordVecMat = wordVecMat / np.linalg.norm(wordVecMat, axis=1)[:, np.newaxis]
    print("Shape of wordVecMat after normalization:", wordVecMat.shape)

    # Convert the matrix back to a dictionary of word vectors
    newWordVecs = {word: vec for word, vec in zip(wordList, wordVecMat)}

    return newWordVecs



In [11]:
def get_wordnet_lexicon(target_words, synonyms=False, antonyms=False, hyponyms=False, hypernyms=False, meronyms=False, holonyms=False, homonyms=False):
    lexicon = {}
    for word in target_words:
        related_words = []
        
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                if lemma.name() != word:
                    if synonyms:
                        related_words.append(lemma.name())
            if antonyms:           
                #check antonym for first lemma of the synset and append
                if syn.lemmas()[0].antonyms():
                    related_words.append(syn.lemmas()[0].antonyms()[0].name())
            if hyponyms:
                for hypo in syn.hyponyms():
                    for lemma in hypo.lemmas():
                        related_words.append(lemma.name())
            if hypernyms:          
                for hyper in syn.hypernyms():
                    for lemma in hyper.lemmas():
                        related_words.append(lemma.name())
            if meronyms:             
                for part in syn.part_meronyms():
                    for lemma in part.lemmas():
                        related_words.append(lemma.name())
            if holonyms:             
                for whole in syn.part_holonyms():
                    for lemma in whole.lemmas(): 
                        related_words.append(lemma.name())
            if homonyms:             
                #iterate through each lemma for the current synset
                #for each lemma not the same as target word
                #find all lemmas that have same spelling and append
                for lemma in syn.lemmas():
                    if lemma.name() != word:
                        homonyms = wordnet.lemmas(lemma.name())
                        for homonym in homonyms:
                            related_words.append(homonym.name())
        lexicon[word] = related_words
    return lexicon

In [12]:
# word embeddings
wordVecs = read_word_vecs("../data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean")
# semantic relations
lexical_similarity = read_lexicon("../data/English/lexicon/ws353_lexical_similarity.txt")
# ppdb_lexicon0 = read_lexicon('../data/English/lexicon/ppdb-xl.txt')
# wordnet_lexicon0 = read_lexicon('../data/English/lexicon/wordnet-synonyms+.txt')
# the file for the updated embeddings
output_file = "../data/English/output_vectors/output_vectors.txt"

Vectors read from: ../data/English/wordEmbeddings/vectors_datatxt_250_sg_w10_i5_c500_gensim_clean 


In [13]:
wordnet_lexicon = get_wordnet_lexicon(toy_corpus, synonyms=True)

In [50]:
retrofitted_toy_vecs = retrofitting_wordVecs(toy_wordVecs, wordnet_lexicon, alpha=1, beta_=1, nb_iter=10, normalizing=False)

Original shape of wordVecMat: (10, 300)
Lengths of word vectors: [3.0376644 3.3723967 2.9811234 3.0983987 3.4778101 3.1255875 3.0988083
 4.049726  1.9395463 1.9547043]
Shape of wordVecMat after retrofitting: (10, 300)
Shape of wordVecMat after normalization: (10, 300)


In [51]:
retrofitted_toy_matrix = convert_dict_to_matrix(retrofitted_toy_vecs)
retrofitted_similarity_matrix = generate_cosine_similarity_matrix(retrofitted_toy_matrix)
print_vec_similarities(toy_corpus, retrofitted_similarity_matrix)

Similarities with "cat":
  - "kitten": 0.9688
  - "dog": 0.9724
  - "donkey": 0.7901
  - "caramel": 0.4268
  - "cheese": 0.3910
  - "chocolate": 0.5103
  - "cacao": 0.3105
  - "right": 0.2302
  - "left": 0.2539

Similarities with "kitten":
  - "cat": 0.9688
  - "dog": 0.9466
  - "donkey": 0.7895
  - "caramel": 0.4282
  - "cheese": 0.3506
  - "chocolate": 0.4904
  - "cacao": 0.2887
  - "right": 0.1814
  - "left": 0.2117

Similarities with "dog":
  - "cat": 0.9724
  - "kitten": 0.9466
  - "donkey": 0.8274
  - "caramel": 0.4426
  - "cheese": 0.4316
  - "chocolate": 0.5462
  - "cacao": 0.3257
  - "right": 0.2435
  - "left": 0.2427

Similarities with "donkey":
  - "cat": 0.7901
  - "kitten": 0.7895
  - "dog": 0.8274
  - "caramel": 0.4150
  - "cheese": 0.4585
  - "chocolate": 0.5234
  - "cacao": 0.3917
  - "right": 0.2643
  - "left": 0.2802

Similarities with "caramel":
  - "cat": 0.4268
  - "kitten": 0.4282
  - "dog": 0.4426
  - "donkey": 0.4150
  - "cheese": 0.8493
  - "chocolate": 0.9353


In [52]:
def print_similarity_difference(similarity_matrix, retrofitted_similarity_matrix):
    difference = np.abs(similarity_matrix - retrofitted_similarity_matrix)
    print("Similarity Difference Matrix:")
    print(difference)

print_similarity_difference(similarity_matrix, retrofitted_similarity_matrix)

Similarity Difference Matrix:
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [None]:
# TODO tune hyperparameters