In [1]:
from gensim.models import KeyedVectors
from gensim import models
from gensim.models import Word2Vec
import numpy as np
from scipy.spatial.distance import cosine
from gensim.models.poincare import PoincareModel
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
word2vec_model = Word2Vec.load("/workspaces/master_thesis/word2vec_wiki_snomed_preprocessed.model")

In [3]:
poincare_model=PoincareModel.load('/workspaces/master_thesis/poincare_100d_preprocessed')

In [4]:
def normalize_embeddings(embeddings):
    norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
    return embeddings / norms

Normalization is not strictly necessary, but it can be helpful when combining embeddings from different models. Normalizing the embeddings ensures that they are on the same scale, which can lead to more meaningful results when combining them using methods like weighted sum or concatenation.

If the embeddings from the two models are on very different scales or if one model's embeddings dominate the other's, the combined embeddings might be heavily influenced by one model, which can lead to a loss of valuable information from the other model. Normalizing the embeddings mitigates this issue by ensuring that both models contribute more evenly to the combined embeddings.

If you decide not to normalize the embeddings, you can still combine them using the weighted sum or concatenation techniques. However, you might need to carefully adjust the weights in the weighted sum method to balance the contributions from both models. Alternatively, you can experiment with and without normalization and compare the results to see which approach works better for your specific use case.

In [5]:
normalized_poincare_embeddings = normalize_embeddings(poincare_model.kv.vectors)
normalized_word2vec_embeddings = normalize_embeddings(word2vec_model.wv.vectors)

In [6]:
def combine_weighted_sum(embedding1, embedding2, weight1, weight2):
    return weight1 * embedding1 + weight2 * embedding2

In [7]:
def sentence_embedding(sentence, word2vec_model):
    words = sentence.split()
    word_vectors = [word2vec_model.wv[word] for word in words if word in word2vec_model.wv]
    
    if not word_vectors:
        return None
    
    return np.mean(word_vectors, axis=0)

In [8]:
import pickle

with open("/workspaces/master_thesis/snomed_preprocessed", "rb") as fp:   # Unpickling
  concepts = pickle.load(fp)

In [9]:
list_of_concepts = [' '.join(concept) for concept in concepts]

In [11]:
concept_word2vec_embeddings = {}

for concept in list_of_concepts:
    concept_word2vec_embeddings[concept] = sentence_embedding(concept, word2vec_model)


In [27]:
normalized_word2vec_embeddings = normalize_embeddings(list(concept_word2vec_embeddings.values()))

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (1263358,) + inhomogeneous part.

In [14]:
combined_embeddings = {}
weight1 = 0.5  
weight2 = 0.5

In [26]:
for concept in list_of_concepts:
    if concept in poincare_model.kv and concept in concept_word2vec_embeddings:
        combined_embeddings[concept] = combine_weighted_sum(
            normalized_poincare_embeddings[poincare_model.kv.key_to_index[concept]],
            normalized_word2vec_embeddings[list(concept_word2vec_embeddings.keys()).index(concept)],
            weight1, weight2
        )
    elif concept in concept_word2vec_embeddings:
        combined_embeddings[concept] = concept_word2vec_embeddings[concept]

ValueError: operands could not be broadcast together with shapes (100,) (300,) 