In [1]:
from gensim.models import KeyedVectors
from gensim import models
from gensim.models import Word2Vec
import numpy as np
from scipy.spatial.distance import cosine
from gensim.models.poincare import PoincareModel
import nltk
from nltk.tokenize import word_tokenize

# Ensure the NLTK 'punkt' data package is present locally; presumably required by
# word_tokenize, which tokenize_sentence calls further down — TODO confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# Load the trained Word2Vec model from disk. Its keyed vectors are reached through
# `word2vec_model.wv` in later cells.
# NOTE(review): absolute, environment-specific path — the notebook will not run elsewhere
# unless this location exists.
word2vec_model = Word2Vec.load("/workspaces/master_thesis/word2vec_wiki_snomed_preprocessed.model")

In [5]:
# Spot check: display the Word2Vec vector stored for a single token.
word2vec_model.wv['reproductive']

array([-1.3393052e+00,  9.6904553e-02, -9.6875226e-01, -1.0539744e+00,
       -2.5268003e-01,  1.1038302e+00, -3.0247996e+00, -5.6130666e-01,
        1.7299324e+00,  1.2606411e+00, -2.2933331e+00,  8.8047737e-01,
        2.8178045e-01,  1.6604291e+00, -1.1903011e+00, -2.0154345e+00,
       -1.2673988e+00,  1.6732681e+00,  2.7048326e+00,  1.1591354e-01,
       -7.2387719e-01,  2.6251552e+00,  6.1091506e-01,  1.0157254e+00,
       -1.4806093e+00, -2.3722546e+00,  1.8288122e+00,  3.9278510e+00,
        2.4022624e+00, -9.9955899e-01, -1.3298825e+00, -1.3173047e+00,
       -1.2632543e+00, -1.0562584e+00, -3.8755655e+00, -1.5524970e+00,
        8.1405795e-01, -1.8252021e+00,  3.8668224e-01,  2.2663846e+00,
        1.5557190e+00,  3.8574785e-01,  5.7300448e-01,  6.9619817e-01,
        2.6698589e+00,  8.4978956e-01,  9.0141617e-02, -1.1178824e+00,
       -6.0006320e-01, -1.5897150e+00, -6.3172233e-01, -5.0465184e-01,
        8.1736571e-01,  3.8620433e-01, -1.7984356e-01, -9.9856222e-01,
      

In [80]:
# Load the trained Poincare model from disk. Its keyed vectors are reached through
# `poincare_model.kv` in later cells.
# NOTE(review): absolute, environment-specific path — the notebook will not run elsewhere
# unless this location exists.
poincare_model=PoincareModel.load('/workspaces/master_thesis/poincare/poincare_20d_preprocessed')

In [81]:
# Similarity of two concept labels according to the Poincare keyed vectors. The keys are
# the lower-cased, stemmed label strings (the form the `preprocessing` cell at the end of
# this notebook returns for 'Necrospermia' and 'Male reproductive finding').
poincare_model.kv.similarity('necrospermia', 'male reproduct find')

0.606241537673438

In [82]:
def get_embedding_with_fallback(model, word, size, random_state=None):
    """Look up ``word`` in ``model``, substituting a random vector when it is missing.

    Parameters
    ----------
    model
        Any object indexable by word that raises ``KeyError`` for unknown words.
        It is indexed directly, so pass the keyed vectors themselves.
    word
        Key to look up.
    size
        Shape passed to the random generator for the fallback vector.
    random_state : numpy.random.Generator, optional
        Generator used for the fallback draw. A fresh, unseeded generator is
        created when omitted, so the fallback is then not reproducible.

    Returns
    -------
    ``model[word]`` when present, otherwise a draw from a normal distribution
    with mean 0 and standard deviation 1.
    """
    try:
        vector = model[word]
    except KeyError:
        # Out-of-vocabulary word: draw a stand-in vector instead of failing.
        rng = random_state if random_state is not None else np.random.default_rng()
        vector = rng.normal(0, 1, size)
    return vector

In [83]:
# Cosine similarity expressed through SciPy's cosine distance
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    Computed as one minus the cosine distance reported by
    ``scipy.spatial.distance.cosine``.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

In [84]:
# NOTE(review): an earlier cell (In [82]) already defines get_embedding_with_fallback with
# the same behaviour; this cell silently re-binds the name. One of the two can be removed.
def get_embedding_with_fallback(model, word, size, random_state=None):
    """Return ``model[word]``, or a random normal vector if the lookup raises ``KeyError``.

    ``model`` is indexed directly, so it must be the keyed-vector object itself.
    ``size`` is the shape of the fallback draw. ``random_state`` is an optional
    ``numpy.random.Generator``; when omitted a fresh unseeded generator is used.
    """
    try:
        return model[word]
    except KeyError:
        # Unknown word: fall back to a draw with mean 0 and standard deviation 1.
        generator = np.random.default_rng() if random_state is None else random_state
        return generator.normal(0, 1, size)

In [85]:
# Split a sentence into lower-cased word tokens
def tokenize_sentence(sentence):
    """Lower-case ``sentence`` and return the tokens produced by NLTK's ``word_tokenize``."""
    lowered = sentence.lower()
    return word_tokenize(lowered)

In [86]:
# Compute the average vector representation for a sentence
def sentence_vector_word2vec(sentence, model, random_state=None):
    """Return the element-wise mean of the token embeddings of ``sentence``.

    Parameters
    ----------
    sentence : str
        Text to embed; it is tokenised with ``tokenize_sentence``.
    model
        Object indexable by token that raises ``KeyError`` for unknown tokens
        (the caller in this notebook passes ``word2vec_model.wv``).
    random_state : numpy.random.Generator, optional
        Generator forwarded to ``get_embedding_with_fallback`` so the random
        vectors used for out-of-vocabulary tokens can be made reproducible.

    Returns
    -------
    numpy.ndarray
        Mean over all token vectors. If the sentence has no tokens, this is the
        result of ``np.mean`` over an empty list (expected to be NaN — confirm
        callers never pass empty text).
    """
    tokens = tokenize_sentence(sentence)

    # Size the fallback vectors from the model that was actually passed in. Only fall
    # back to the notebook-level `word2vec_model` when the object does not expose
    # `vector_size`, which preserves the previous behaviour for such objects.
    size = getattr(model, "vector_size", None)
    if size is None:
        size = word2vec_model.vector_size

    # Forward random_state: previously it was accepted but ignored, so every
    # out-of-vocabulary token got an unseeded random vector even when a generator
    # was supplied.
    vectors = [
        get_embedding_with_fallback(model, token, size, random_state)
        for token in tokens
    ]
    return np.mean(vectors, axis=0)

In [89]:
# Late fusion of a Word2Vec and a Poincare similarity score for two sentences
def fused_sentence_similarity(sentence1, sentence2, alpha=0.5, random_state=None):
    """Blend Word2Vec and Poincare cosine similarities of two sentences.

    The Word2Vec side embeds each sentence with ``sentence_vector_word2vec`` using
    ``word2vec_model.wv``. The Poincare side does not average anything: each whole
    sentence string is used as a single key into ``poincare_model.kv``, so both
    strings presumably have to exist as keys there (verify how missing keys fail).

    NOTE(review): the Poincare score here is a cosine similarity of the raw vectors,
    whereas the earlier cell calls ``poincare_model.kv.similarity`` — confirm which
    measure is intended.

    Parameters
    ----------
    sentence1, sentence2 : str
        Sentences (or concept labels) to compare.
    alpha : float
        Weight of the Word2Vec score; the Poincare score is weighted ``1 - alpha``.
    random_state
        Passed through to ``sentence_vector_word2vec`` for both sentences.

    Returns
    -------
    ``alpha * word2vec_cosine + (1 - alpha) * poincare_cosine``.
    """
    keyed_vectors = word2vec_model.wv
    w2v_first = sentence_vector_word2vec(sentence1, keyed_vectors, random_state)
    w2v_second = sentence_vector_word2vec(sentence2, keyed_vectors, random_state)

    # Whole-string lookups, one per sentence.
    hyp_first = poincare_model.kv[sentence1]
    hyp_second = poincare_model.kv[sentence2]

    w2v_score = cosine_similarity(w2v_first, w2v_second)
    hyp_score = cosine_similarity(hyp_first, hyp_second)

    # Weighted average of the two scores.
    return alpha * w2v_score + (1 - alpha) * hyp_score

In [90]:
# Example: fused score for one concept pair with the default alpha=0.5. The arguments are
# already in the lower-cased, stemmed form, since they are also used as Poincare keys.
fused_sentence_similarity('necrospermia', 'male reproduct find')

0.5471074756591261

In [None]:
import numpy as np
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize

# Load pre-trained Word2Vec embeddings
# NOTE(review): this rebinds `word2vec_model`, which an earlier cell loads with
# Word2Vec.load and which fused_sentence_similarity reads through `.wv`. Running this
# cell changes what that name refers to. The file name looks like a placeholder — confirm.
word2vec_model = KeyedVectors.load_word2vec_format('word2vec_model.bin', binary=True)

# Load pre-trained Poincare embeddings
# NOTE(review): likewise rebinds `poincare_model`, which an earlier cell loads with
# PoincareModel.load and which is accessed through `.kv` elsewhere — confirm this is intended.
poincare_model = KeyedVectors.load('poincare_model.kv')

# Cosine similarity derived from SciPy's cosine distance
def cosine_similarity(vec1, vec2):
    """Return ``1 - scipy.spatial.distance.cosine(vec1, vec2)``."""
    cosine_distance = cosine(vec1, vec2)
    return 1 - cosine_distance

# Rescale a Poincare embedding by 2 / (1 + ||v||^2)
def normalize_poincare(poincare_vec):
    """Return ``2 * poincare_vec / (1 + ||poincare_vec||**2)``.

    The result is the input multiplied by a positive scalar, so its direction is
    unchanged. Only the length is altered.
    """
    length = np.linalg.norm(poincare_vec)
    denominator = 1 + length**2
    return 2 * poincare_vec / denominator

# Late fusion of Word2Vec and Poincare embeddings
def fused_similarity(word1, word2, alpha=0.5):
    """Blend the Word2Vec and Poincare cosine similarities of two words.

    Both words are looked up directly in the notebook-level ``word2vec_model`` and
    ``poincare_model``. The Poincare vectors are passed through
    ``normalize_poincare`` before comparison.

    Parameters
    ----------
    word1, word2
        Keys present in both models.
    alpha : float
        Weight of the Word2Vec score; the Poincare score is weighted ``1 - alpha``.

    Returns
    -------
    ``alpha * word2vec_cosine + (1 - alpha) * poincare_cosine``.
    """
    w2v_first = word2vec_model[word1]
    w2v_second = word2vec_model[word2]

    hyp_first = normalize_poincare(poincare_model[word1])
    hyp_second = normalize_poincare(poincare_model[word2])

    w2v_score = cosine_similarity(w2v_first, w2v_second)
    hyp_score = cosine_similarity(hyp_first, hyp_second)

    # Weighted average; the result is returned directly rather than bound to a
    # local that shadows this function's own name.
    return alpha * w2v_score + (1 - alpha) * hyp_score

# Example usage: score a single word pair with the default weighting
word1, word2 = 'cat', 'dog'

similarity = fused_similarity(word1, word2)
print(f"The fused similarity between '{word1}' and '{word2}' is: {similarity:.4f}")


Ivergny	Arras
Avot	Dijon
Chabrac	Confolens
Luchem	Langerwehe

congenital rectal fissure	lesion of rectum
Necrospermia	Male reproductive finding

In [43]:
import spacy
from nltk.stem import PorterStemmer
# Load the spaCy pipeline once at notebook level; preprocessing() below reuses it on every call.
nlp = spacy.load("en_core_web_sm")
def preprocessing(sample):
    """Normalise a text label into a space-joined string of stems.

    The text is lower-cased and run through the notebook-level spaCy pipeline
    ``nlp``. Tokens flagged as stop words or punctuation are dropped, and each
    remaining token's text is stemmed with NLTK's ``PorterStemmer``.

    Example from this notebook: ``'Male reproductive finding'`` becomes
    ``'male reproduct find'``.
    """
    stemmer = PorterStemmer()
    stems = []
    for token in nlp(sample.lower()):
        # Skip tokens that carry no content for matching.
        if token.is_stop or token.is_punct:
            continue
        stems.append(stemmer.stem(token.text))
    return " ".join(stems)


In [44]:
# Example: a single-word label.
preprocessing('Necrospermia')

'necrospermia'

In [45]:
# Example: a multi-word label; each remaining word is stemmed.
preprocessing('Male reproductive finding')

'male reproduct find'