In [77]:
from gensim.models import KeyedVectors
from gensim import models
from gensim.models import Word2Vec
import numpy as np
from scipy.spatial.distance import cosine
from gensim.models.poincare import PoincareModel
import nltk
from nltk.tokenize import word_tokenize

# Make sure the NLTK 'punkt' package is available locally before word_tokenize
# is used further down; the call returns True when the package is present.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [78]:
# Load the previously trained Word2Vec model from disk.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory so the notebook runs elsewhere.
word2vec_model = Word2Vec.load("/workspaces/master_thesis/word2vec_pubmed.model")

In [79]:
# Smoke test: display the stored vector of one token via the model's
# KeyedVectors (`.wv`).
word2vec_model.wv['reproductive']

array([-0.7357295 , -1.3917009 , -3.0318334 , -2.357155  ,  1.1424465 ,
       -0.02943132, -0.39708394, -1.142466  ,  0.25679728, -0.8578018 ,
       -1.3225051 ,  0.357423  ,  0.4506469 , -1.465964  ,  0.83961564,
        0.8856604 ,  2.7216253 ,  1.0778252 , -3.4893215 ,  0.166895  ,
        0.16774897, -1.8470141 ,  0.27980718,  0.21232131,  1.3932807 ,
       -0.26180035,  2.4073963 , -1.935196  , -2.011056  , -0.7977714 ,
        0.84900767,  2.1022625 ,  0.48991847,  1.5260231 , -1.2548038 ,
       -0.06538101, -2.814732  , -0.78244203,  3.327612  ,  0.47770292,
        0.45387393, -1.0638123 , -0.6918469 , -2.8515708 ,  1.0283729 ,
       -0.19389367,  2.589224  ,  1.3255198 ,  3.061081  ,  0.4531893 ,
       -2.9245992 , -0.18764217, -0.24312156, -1.6939746 , -0.1621548 ,
        0.15502797, -2.6995914 ,  1.5194799 , -3.0585392 ,  2.5285625 ,
        1.0150543 ,  1.0662335 , -1.7643276 , -1.4378964 , -0.92283463,
        3.3171194 , -1.7529407 , -2.2593923 , -1.3944848 ,  0.87

In [80]:
# Load the previously trained Poincaré model from disk.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable base directory so the notebook runs elsewhere.
poincare_model=PoincareModel.load('/workspaces/master_thesis/poincare/poincare_20d_preprocessed')

In [81]:
# Smoke test: similarity between two keys of the Poincaré model. The keys are
# the lower-cased, stemmed forms that preprocessing() (defined in a later cell
# of this notebook) yields for 'Necrospermia' and 'Male reproductive finding'.
poincare_model.kv.similarity('necrospermia', 'male reproduct find')

0.606241537673438

In [82]:
def get_embedding_with_fallback(model, word, size, random_state=None):
    """Look up ``word`` in ``model``, substituting random noise when it is missing.

    Parameters
    ----------
    model : mapping-like
        Anything indexable as ``model[word]`` that raises ``KeyError`` for
        unknown keys.
    word : hashable
        Key to look up.
    size : int or tuple of int
        Shape of the random vector produced for an unknown key.
    random_state : object with a ``normal(loc, scale, size)`` method, optional
        Source of randomness for the fallback. A fresh
        ``np.random.default_rng()`` is created when omitted, so unseeded
        fallbacks differ from call to call.

    Returns
    -------
    The stored value for ``word``, or a standard-normal sample of shape
    ``size`` if the lookup raised ``KeyError``.
    """
    try:
        embedding = model[word]
    except KeyError:
        # Out-of-vocabulary key: draw from N(0, 1) instead of failing.
        rng = np.random.default_rng() if random_state is None else random_state
        embedding = rng.normal(0, 1, size)
    return embedding

In [83]:
# Cosine similarity expressed through SciPy's cosine *distance*
def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    ``scipy.spatial.distance.cosine`` yields a distance, so the similarity is
    obtained as one minus that value.
    """
    distance = cosine(vec1, vec2)
    return 1 - distance

In [84]:
def get_embedding_with_fallback(model, word, size, random_state=None):
    """Return ``model[word]``, or a random N(0, 1) vector if the key is unknown.

    This cell re-declares a helper of the same name that an earlier cell of the
    notebook also defines; on a top-to-bottom run this later definition is the
    one in effect.

    Parameters
    ----------
    model : mapping-like
        Indexed as ``model[word]``; a ``KeyError`` triggers the fallback.
    word : hashable
        Key to look up.
    size : int or tuple of int
        Shape of the fallback sample.
    random_state : object with a ``normal(loc, scale, size)`` method, optional
        Random source for the fallback; ``np.random.default_rng()`` is used
        when it is ``None``.
    """
    try:
        return model[word]
    except KeyError:
        # Unknown key: fall through to the random replacement below.
        pass
    generator = random_state if random_state is not None else np.random.default_rng()
    return generator.normal(0, 1, size)

In [85]:
# Split a sentence into lower-cased word tokens
def tokenize_sentence(sentence):
    """Lower-case ``sentence`` and return NLTK's ``word_tokenize`` result for it."""
    lowered = sentence.lower()
    return word_tokenize(lowered)

In [86]:
# Compute the average vector representation for a sentence
def sentence_vector_word2vec(sentence, model, random_state=None):
    """Return the mean of the word vectors of ``sentence``.

    The sentence is tokenised with ``tokenize_sentence``; every token is looked
    up through ``get_embedding_with_fallback`` so that unknown tokens contribute
    a random vector instead of raising.

    Parameters
    ----------
    sentence : str
        Text to embed.
    model : keyed-vectors object
        Must support ``model[token]`` and expose ``vector_size``; the fallback
        dimensionality is taken from this argument rather than from a
        notebook-level global, so the function works with whichever model is
        passed in.
    random_state : optional
        Random source forwarded to ``get_embedding_with_fallback``. Passing a
        seeded generator makes the fallback vectors reproducible.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the token vectors, or an all-zero vector of
        length ``model.vector_size`` when the sentence yields no tokens.
    """
    tokens = tokenize_sentence(sentence)
    if not tokens:
        # np.mean([]) would emit a RuntimeWarning and return a scalar NaN;
        # return a correctly shaped vector instead.
        return np.zeros(model.vector_size)
    vectors = [
        get_embedding_with_fallback(model, token, model.vector_size, random_state)
        for token in tokens
    ]
    return np.mean(vectors, axis=0)

In [89]:
# Late fusion of a Word2Vec score and a Poincaré score for two sentences
def fused_sentence_similarity(sentence1, sentence2, alpha=0.5, random_state=None):
    """Blend Word2Vec and Poincaré cosine similarities of two strings.

    Parameters
    ----------
    sentence1, sentence2 : str
        Strings to compare.
    alpha : float, default 0.5
        Weight of the Word2Vec score; the Poincaré score is weighted by
        ``1 - alpha``.
    random_state : optional
        Passed through to ``sentence_vector_word2vec``.

    Returns
    -------
    ``alpha * word2vec_cosine + (1 - alpha) * poincare_cosine``.

    Notes
    -----
    * The Word2Vec side uses ``sentence_vector_word2vec`` on the notebook-level
      ``word2vec_model.wv``.
    * The Poincaré side does NOT tokenise or average: each full string is used
      as a single key into the notebook-level ``poincare_model.kv``. A missing
      key is not handled here.
    """
    keyed_vectors = word2vec_model.wv
    w2v_first = sentence_vector_word2vec(sentence1, keyed_vectors, random_state)
    w2v_second = sentence_vector_word2vec(sentence2, keyed_vectors, random_state)

    # Whole-string lookups in the Poincaré vocabulary.
    hyperbolic_first = poincare_model.kv[sentence1]
    hyperbolic_second = poincare_model.kv[sentence2]

    w2v_score = cosine_similarity(w2v_first, w2v_second)
    hyperbolic_score = cosine_similarity(hyperbolic_first, hyperbolic_second)

    # Convex combination of the two scores.
    return alpha * w2v_score + (1 - alpha) * hyperbolic_score

In [90]:
# Example: fused score for two already pre-processed (lower-cased, stemmed)
# labels, using the default alpha of 0.5.
fused_sentence_similarity('necrospermia', 'male reproduct find')

0.5471074756591261

In [None]:
import numpy as np
from gensim.models import KeyedVectors
from scipy.spatial.distance import cosine
from sklearn.preprocessing import normalize

# Load pre-trained Word2Vec embeddings
# NOTE(review): this rebinds `word2vec_model`, which an earlier cell sets to a
# Word2Vec model and which fused_sentence_similarity accesses as
# `word2vec_model.wv` — running this cell presumably breaks that function;
# confirm before executing. The relative file name looks like a placeholder.
word2vec_model = KeyedVectors.load_word2vec_format('word2vec_model.bin', binary=True)

# Load pre-trained Poincare embeddings
# NOTE(review): likewise rebinds `poincare_model`, which an earlier cell sets to
# a PoincareModel accessed through `.kv` — confirm this is intended.
poincare_model = KeyedVectors.load('poincare_model.kv')

# Cosine similarity derived from SciPy's cosine distance
def cosine_similarity(vec1, vec2):
    """Return ``1 - scipy.spatial.distance.cosine(vec1, vec2)``.

    SciPy reports a distance; subtracting it from one gives the similarity.
    """
    cosine_distance = cosine(vec1, vec2)
    return 1 - cosine_distance

# Rescale a Poincaré-ball vector by 2 / (1 + ||v||^2)
def normalize_poincare(poincare_vec):
    """Return ``2 * poincare_vec / (1 + ||poincare_vec||**2)``.

    The result is the input multiplied by a positive scalar, so its direction
    is unchanged; a cosine similarity computed on the outputs therefore equals
    the one computed on the inputs. The expression matches the usual formula
    for converting Poincaré-ball coordinates to Beltrami–Klein coordinates.

    Parameters
    ----------
    poincare_vec : numpy.ndarray
        Vector to rescale.
    """
    length = np.linalg.norm(poincare_vec)
    denominator = 1 + length**2
    return (2 * poincare_vec) / denominator

# Late fusion of Word2Vec and Poincare embeddings
def fused_similarity(word1, word2, alpha=0.5):
    """Blend the Word2Vec and Poincaré cosine similarities of two words.

    Parameters
    ----------
    word1, word2 : str
        Keys looked up in the notebook-level ``word2vec_model`` and
        ``poincare_model``; missing keys are not handled here.
    alpha : float, default 0.5
        Weight of the Word2Vec score; the Poincaré score gets ``1 - alpha``.

    Returns
    -------
    ``alpha * word2vec_cosine + (1 - alpha) * poincare_cosine``.
    """
    # Euclidean (Word2Vec) vectors.
    w2v_first = word2vec_model[word1]
    w2v_second = word2vec_model[word2]

    # Poincaré vectors, passed through normalize_poincare before comparison.
    hyperbolic_first = normalize_poincare(poincare_model[word1])
    hyperbolic_second = normalize_poincare(poincare_model[word2])

    w2v_score = cosine_similarity(w2v_first, w2v_second)
    hyperbolic_score = cosine_similarity(hyperbolic_first, hyperbolic_second)

    # Weighted average of the two scores.
    return alpha * w2v_score + (1 - alpha) * hyperbolic_score

# Example usage: score one word pair with the default weighting and report it
word1, word2 = 'cat', 'dog'
similarity = fused_similarity(word1, word2)

print(f"The fused similarity between '{word1}' and '{word2}' is: {similarity:.4f}")


Ivergny	Arras
Avot	Dijon
Chabrac	Confolens
Luchem	Langerwehe

congenital rectal fissure	lesion of rectum
Necrospermia	Male reproductive finding

In [43]:
import spacy
from nltk.stem import PorterStemmer
# Load the spaCy pipeline once at cell level; preprocessing() below calls it to
# tokenise text and to read each token's is_stop / is_punct flags.
nlp = spacy.load("en_core_web_sm")
def preprocessing(sample):
    """Normalise a label: lower-case, drop stop words/punctuation, stem the rest.

    Parameters
    ----------
    sample : str
        Raw text.

    Returns
    -------
    str
        Porter stems of the remaining tokens, joined by single spaces (an
        empty string when nothing remains).
    """
    porter = PorterStemmer()
    stems = []
    for tok in nlp(sample.lower()):
        # Skip tokens that spaCy marks as stop words or punctuation.
        if tok.is_stop or tok.is_punct:
            continue
        stems.append(porter.stem(tok.text))
    return " ".join(stems)


In [44]:
# Example: a single-word label is only lower-cased here (see the output below).
preprocessing('Necrospermia')

'necrospermia'

In [45]:
# Example: a multi-word label is lower-cased and each word is stemmed.
preprocessing('Male reproductive finding')

'male reproduct find'