In [1]:
# Raw questionnaire items (source names) that the notebook maps to OMOP concepts.
test_data = [
    'What artery was this detected at?',
    'Atrioventricular block',
    'Atrioventricular block degree',
    'Are you aware of a coronary heart disease diagnosed by your doctor?',
    'Has a cardioversion already been performed on you?',
    'Congenital heart disease',
    'Was contrast agent used for cardio magnetic resonance imaging?',
    'Have you been in control echocardiography for the last 12 months?',
    'Have you ever been diagnosed with atrial fibrillation by a doctor?',
    'Have you ever been diagnosed with deep vein thrombosis by a doctor?',
    'Have you ever been diagnosed by a doctor with a sagging of another artery (aneurysm)?',
    'Have you ever been diagnosed by a doctor with window disease or blood circulation disorder in your legs, also called Claudicatio or arterial disease?',
    'Have you ever been diagnosed with a heart attack by a doctor?',
    'Have you ever been diagnosed with "open legs" by a doctor or an Ulcus cruris?',
    'Is a mobile electrocardiogram issued?',
    'How old were you when the diagnosis of atrial fibrillation was made?',
    'Do you have pain in one or both legs while walking?',
    'Riva-Rocci systolic before examination',
    'Sclerosis of the mitral valve',
]

## Data preparation for Bi-LSTM with Poincare and with DeepWalk

In [2]:
import spacy
import pandas as pd
import numpy as np
nlp = spacy.load("en_core_web_sm")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def preprocessing_without_stemming(sample):
    """Lowercase ``sample`` and strip stop words and punctuation.

    The text is tokenised with the module-level spaCy pipeline ``nlp``; the
    surface form (``token.text``) of every remaining token is kept, so no
    stemming or lemmatisation takes place.

    Args:
        sample: Text to normalise.

    Returns:
        The kept tokens joined by single spaces, or ``None`` (after printing
        a message) when ``sample`` is not a string.
    """
    if not isinstance(sample, str):
        print(f"Invalid input {sample}: expected a string.")
        return None

    kept_tokens = []
    for token in nlp(sample.lower()):
        if token.is_stop or token.is_punct:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)

In [4]:
# Normalise every raw question: lowercase, then drop stop words and punctuation.
test_data_prep = list(map(preprocessing_without_stemming, test_data))

## Initialise model BiLSTM

In [5]:
import torch.nn as nn
import torch

class BiLSTM(nn.Module):
    """Bidirectional LSTM that regresses a phrase onto a target embedding.

    Args:
        input_size: Number of features per time step (word-vector width).
        hidden_size: Hidden units per LSTM direction.
        output_size: Dimensionality of the predicted embedding.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        # Plain attributes (not parameters/buffers), so state_dict keys are unchanged
        # and existing checkpoints keep loading.
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # 2 for bidirection

    def forward(self, x):
        """Map a batch of (flattened) phrase matrices to output embeddings.

        Args:
            x: Tensor of shape ``(batch, seq_len * input_size)`` or
                ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Reshape to (batch_size, seq_len, features). The sequence length is
        # inferred and the feature width comes from the constructor instead of
        # the former hard-coded (20, 300); a (batch, 6000) input with
        # input_size=300 is reshaped exactly as before.
        x = x.view(x.size(0), -1, self.input_size)

        # out: tensor of shape (batch_size, seq_length, hidden_size*2)
        out, _ = self.lstm(x)

        # Decode from the last time step.
        # NOTE(review): at the last step the reverse direction has only seen the
        # final (often padded) position. Kept unchanged because the checkpoints
        # loaded in this notebook were presumably trained with this read-out.
        out = self.fc(out[:, -1, :])
        return out

In [6]:
def get_phrase_vector(phrase, model, max_len):
    """Encode ``phrase`` as a flat, zero-padded stack of word vectors.

    The phrase is split on whitespace. The first ``max_len`` words are looked
    up in ``model.wv``; words missing from the vocabulary and unused trailing
    positions stay all-zero.

    Args:
        phrase: Text to encode (converted with ``str``).
        model: Object exposing ``vector_size`` and a ``wv`` lookup.
        max_len: Number of word slots in the encoding.

    Returns:
        A float tensor of length ``max_len * model.vector_size``.
    """
    slots = np.zeros((max_len, model.vector_size))
    for position, word in enumerate(str(phrase).split()[:max_len]):
        if word in model.wv:
            slots[position] = model.wv[word]

    return torch.tensor(slots.flatten(), dtype=torch.float)

In [7]:
# Concept table, read as tab-separated text; malformed lines are skipped silently.
# Later cells look concepts up by its 'concept_id' column.
df_concept=pd.read_csv('/workspaces/master_thesis/CONCEPT.csv', on_bad_lines='skip', sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


## Load Poincare results

In [8]:
# Architecture must match the checkpoint loaded in the next cell:
# 300-d inputs per time step, 300 hidden units per direction, 100-d output.
model_map_poincare = BiLSTM(input_size=300, hidden_size=300, output_size=100)
# NOTE(review): criterion and optimizer are not used by any inference cell
# visible in this notebook; presumably leftovers from training.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_map_poincare.parameters())

In [9]:
# Restore the trained weights. map_location makes the load work on CPU-only
# machines even if the checkpoint was saved from a GPU; the model is never
# moved to another device in the cells visible here, so CPU is the target.
model_map_poincare.load_state_dict(
    torch.load(
        '/workspaces/master_thesis/combination_three_methods/models/model_50epochs_conceptid.ckpt',
        map_location=torch.device('cpu'),
    )
)
#device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Switch to inference mode; as the last expression it also displays the model summary.
model_map_poincare.eval()

BiLSTM(
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=600, out_features=100, bias=True)
)

In [10]:
from gensim.models import Word2Vec
from gensim.models.poincare import PoincareModel
# Word2Vec model whose word vectors (.wv) are used by get_phrase_vector to encode phrases.
w2v_model = Word2Vec.load("/workspaces/master_thesis/combination_three_methods/models/word2vec_pubmed_wiki.model")
# Poincaré embedding queried through .kv.most_similar in the next cell.
poincare_model = PoincareModel.load('/workspaces/master_thesis/combination_three_methods/models/poincare_100d_concept_id')
#deepwalk_model = Word2Vec.load("/workspaces/master_thesis/combination_three_methods/models/deepwalk_snomed.model")

In [11]:
# For every preprocessed phrase: encode it, project it with the BiLSTM and
# look up the five nearest entries of the Poincaré embedding. Per-hit frames
# are collected in a list and concatenated once, instead of re-copying a
# growing DataFrame on every iteration.
poincare_frames = []

with torch.no_grad():  # inference only, no autograd graph needed
    for n in test_data_prep:
        x = get_phrase_vector(n, w2v_model, 20).unsqueeze(0)
        output = model_map_poincare(x)
        result = poincare_model.kv.most_similar([output.detach().numpy()[0]], topn=5)

        for concept_key, score in result:
            # rows of the concept table describing this hit
            df_temp = df_concept[df_concept['concept_id'] == int(concept_key)].copy()
            df_temp['score'] = score
            df_temp['sourcename'] = n
            poincare_frames.append(
                df_temp[['sourcename', 'concept_id', 'concept_name', 'domain_id', 'score']]
            )

# pd.concat rejects an empty list, so fall back to an empty frame.
df_poincare_overall_results = pd.concat(poincare_frames) if poincare_frames else pd.DataFrame()

In [12]:
# Tag each consecutive block of five rows with the 0-based position of its phrase.
# NOTE(review): assumes every phrase contributed exactly five rows, i.e. each hit
# matched exactly one row of df_concept — confirm.
df_poincare_overall_results['id'] = np.repeat(np.arange(len(test_data_prep)), 5)

In [13]:
# Replace the preprocessed phrase with the original question text
# (each question repeated for its five rows); the column ends up last.
df_poincare_overall_results = (
    df_poincare_overall_results
    .drop(columns=['sourcename'])
    .assign(sourcename=np.repeat(test_data, 5))
)

In [14]:
df_poincare_overall_results

Unnamed: 0,concept_id,concept_name,domain_id,score,id,sourcename
723234,4114166,Systemic arterial finding,Condition,0.603746,0,What artery was this detected at?
284416,4042836,Disorder of head,Condition,0.732165,0,What artery was this detected at?
828468,321887,Disorder of artery,Condition,0.737198,0,What artery was this detected at?
908023,321052,Peripheral vascular disease,Condition,0.739125,0,What artery was this detected at?
723610,4115390,Mediastinal finding,Condition,0.758339,0,What artery was this detected at?
...,...,...,...,...,...,...
300179,4048213,Heart valve stenosis and regurgitation,Condition,1.459595,18,Sclerosis of the mitral valve
1039427,4189343,Aortic valve stenosis,Condition,1.467915,18,Sclerosis of the mitral valve
659653,4158911,Non-rheumatic heart valve disorder,Condition,1.501683,18,Sclerosis of the mitral valve
1158915,315273,Mitral valve stenosis,Condition,1.530830,18,Sclerosis of the mitral valve


## Load Deepwalk results

In [15]:
# Same architecture as the Poincaré mapper; must match the DeepWalk checkpoint
# loaded in the next cell.
model_map_deepwalk= BiLSTM(input_size=300, hidden_size=300, output_size=100)
# NOTE(review): criterion and optimizer are not used by any inference cell
# visible in this notebook; presumably leftovers from training.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model_map_deepwalk.parameters())

In [16]:
# Restore the trained weights. map_location makes the load work on CPU-only
# machines even if the checkpoint was saved from a GPU; the model is never
# moved to another device in the cells visible here, so CPU is the target.
model_map_deepwalk.load_state_dict(
    torch.load(
        '/workspaces/master_thesis/combination_three_methods/models/model_50epochs_conceptid_deepwalk.ckpt',
        map_location=torch.device('cpu'),
    )
)
#device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Switch to inference mode; as the last expression it also displays the model summary.
model_map_deepwalk.eval()

BiLSTM(
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=600, out_features=100, bias=True)
)

In [17]:
from gensim.models import Word2Vec
from gensim.models.poincare import PoincareModel
# Same Word2Vec file that the Poincaré section already loaded into w2v_model;
# reloading it here is redundant when the notebook runs top to bottom.
w2v_model = Word2Vec.load("/workspaces/master_thesis/combination_three_methods/models/word2vec_pubmed_wiki.model")
# DeepWalk embedding saved in Word2Vec format; queried through .wv.most_similar in the next cell.
deepwalk_model = Word2Vec.load("/workspaces/master_thesis/combination_three_methods/models/deepwalk_snomed.model")

In [18]:
# For every preprocessed phrase: encode it, project it with the BiLSTM and
# look up the five most similar entries of the DeepWalk embedding. Per-hit
# frames are collected in a list and concatenated once, instead of re-copying
# a growing DataFrame on every iteration.
deepwalk_frames = []

with torch.no_grad():  # inference only, no autograd graph needed
    for n in test_data_prep:
        x = get_phrase_vector(n, w2v_model, 20).unsqueeze(0)
        output = model_map_deepwalk(x)
        result = deepwalk_model.wv.most_similar([output.detach().numpy()[0]], topn=5)

        for concept_key, score in result:
            # rows of the concept table describing this hit
            df_temp = df_concept[df_concept['concept_id'] == int(concept_key)].copy()
            df_temp['score'] = score
            df_temp['sourcename'] = n
            deepwalk_frames.append(
                df_temp[['sourcename', 'concept_id', 'concept_name', 'domain_id', 'score']]
            )

# pd.concat rejects an empty list, so fall back to an empty frame.
df_deepwalk_overall_results = pd.concat(deepwalk_frames) if deepwalk_frames else pd.DataFrame()

In [19]:
# Tag each consecutive block of five rows with the 0-based position of its phrase.
# NOTE(review): assumes every phrase contributed exactly five rows, i.e. each hit
# matched exactly one row of df_concept — confirm.
df_deepwalk_overall_results['id'] = np.repeat(np.arange(len(test_data_prep)), 5)

In [20]:
# Replace the preprocessed phrase with the original question text
# (each question repeated for its five rows); the column ends up last.
df_deepwalk_overall_results = (
    df_deepwalk_overall_results
    .drop(columns=['sourcename'])
    .assign(sourcename=np.repeat(test_data, 5))
)

In [21]:
df_deepwalk_overall_results

Unnamed: 0,concept_id,concept_name,domain_id,score,id,sourcename
723234,4114166,Systemic arterial finding,Condition,0.875879,0,What artery was this detected at?
601086,4095634,Venous finding,Condition,0.842248,0,What artery was this detected at?
601065,4095631,Arterial finding,Condition,0.833645,0,What artery was this detected at?
967068,40484044,Surgically constructed radioulnar arteriovenou...,Condition,0.830968,0,What artery was this detected at?
962922,4207595,Surgically constructed ulnar to cephalic arter...,Condition,0.824887,0,What artery was this detected at?
...,...,...,...,...,...,...
848706,4281749,Heart valve disorder,Condition,0.938245,18,Sclerosis of the mitral valve
282915,319843,Mitral valve disorder,Condition,0.910050,18,Sclerosis of the mitral valve
914279,4230774,Heart valve regurgitation,Condition,0.904680,18,Sclerosis of the mitral valve
976328,40491478,Abnormality of aortic valve,Condition,0.904239,18,Sclerosis of the mitral valve


## Get Tf-idf mapping

In [22]:
# Customise the stop-word set used by the lemmatising preprocessor below:
# keep negations and "back" as content words, and treat "doctor" as noise.
# discard() instead of remove() keeps this cell re-runnable: remove() raises
# KeyError once a word has already been taken out of the set.
for kept_word in ("no", "not", "none", "noone", "back"):
    nlp.Defaults.stop_words.discard(kept_word)
nlp.Defaults.stop_words.add("doctor")

def preprocessing_with_lemmatisation(sample):
    """Lowercase ``sample`` and reduce it to the lemmas of its content words.

    The text is tokenised with the module-level spaCy pipeline ``nlp``. A
    token is dropped when it is punctuation or when its lemma is contained
    in ``nlp.Defaults.stop_words``.

    Args:
        sample: Text to normalise.

    Returns:
        The kept lemmas joined by single spaces, or ``None`` (after printing
        a message) when ``sample`` is not a string.
    """
    if not isinstance(sample, str):
        print(f"Invalid input {sample}: expected a string.")
        return None

    lemmas = []
    for token in nlp(sample.lower()):
        if token.lemma_ in nlp.Defaults.stop_words or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

In [23]:
# Lemmatised, stop-word-free version of every raw question (input to the TF-IDF matcher).
test_data_lemma = list(map(preprocessing_with_lemmatisation, test_data))

In [24]:
test_data_lemma

['artery detect',
 'atrioventricular block',
 'atrioventricular block degree',
 'aware coronary heart disease diagnose',
 'cardioversion perform',
 'congenital heart disease',
 'contrast agent use cardio magnetic resonance imaging',
 'control echocardiography 12 month',
 'diagnose atrial fibrillation',
 'diagnose deep vein thrombosis',
 'diagnose sagging artery aneurysm',
 'diagnose window disease blood circulation disorder leg claudicatio arterial disease',
 'diagnose heart attack',
 'diagnose open leg ulcus cruris',
 'mobile electrocardiogram issue',
 'old diagnosis atrial fibrillation',
 'pain leg walk',
 'riva rocci systolic examination',
 'sclerosis mitral valve']

In [25]:
#concepts
from concepts import OmopConcepts
import heapq
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
# Concept synonym table, read as tab-separated text; malformed lines are skipped silently.
synonym_file = '/workspaces/master_thesis/CONCEPT_SYNONYM.csv'
synonyms = pd.read_csv(synonym_file, on_bad_lines="skip", delimiter="\t", low_memory=False)

In [27]:
# Vocabulary table, read as tab-separated text; malformed lines are skipped silently.
vocabulary_file = '/workspaces/master_thesis/VOCABULARY.csv'
vocabulary = pd.read_csv(vocabulary_file, on_bad_lines="skip", delimiter="\t", low_memory=False)
vocabulary.head()

Unnamed: 0,vocabulary_id,vocabulary_name,vocabulary_reference,vocabulary_version,vocabulary_concept_id
0,,OMOP Standardized Vocabularies,OMOP generated,v5.0 04-FEB-22,44819096
1,Visit Type,OMOP Visit Type,OMOP generated,,44819150
2,OSM,OpenStreetMap,"https://www.openstreetmap.org/copyright/en, ht...",OSM Release 2019-02-21,32541
3,Type Concept,OMOP Type Concept,OMOP generated,Type Concept 20210212,32808
4,Note Type,OMOP Note Type,OMOP generated,,44819146


In [28]:
# Build the concept collection searched by the TF-IDF matcher. Later cells rely on
# concepts.get_names() and on concepts[i].name / .concept_id / .domain_id.
# NOTE(review): presumably merges concept names with their synonyms and restricts
# them to the 'SNOMED' vocabulary — confirm in the project's concepts module.
concepts=OmopConcepts.concatenate_concept_with_their_synonyms(df_concept, synonyms, vocabulary, 'SNOMED')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concepts["concept_name"] = (


In [29]:
# TF-IDF vectoriser with scikit-learn's default settings.
tfidf = TfidfVectorizer()

In [30]:
# Learn the vocabulary/IDF weights from the concept names and vectorise them.
# NOTE(review): the matcher below indexes concepts[i] by row number of this matrix,
# so get_names() is assumed to follow the same order as concepts[i] — confirm.
concepts_tfidf = tfidf.fit_transform(concepts.get_names())

In [31]:
# For every lemmatised phrase, rank all concept names by cosine similarity of
# their TF-IDF vectors and keep the five best. Per-phrase frames are collected
# in a list and concatenated once, instead of re-copying a growing DataFrame
# on every iteration.
tfidf_frames = []
for example in test_data_lemma:
    sample_tfidf = tfidf.transform([example])
    matrix_with_similarity_score = cosine_similarity(
            sample_tfidf, concepts_tfidf
        )
    predictions_list = []
    # (row number, score) pairs of the five highest-scoring concepts
    for seq_number, score in heapq.nlargest(
            5,
            enumerate(matrix_with_similarity_score[0]),
            key=lambda x: x[1],
        ):
            predictions_list.append((concepts[seq_number].name, concepts[seq_number].concept_id, concepts[seq_number].domain_id, score))
    #dataframe for each example prediction list
    df_temp = pd.DataFrame(predictions_list, columns=['concept_name', 'concept_id', 'domain_id', 'score'])
    df_temp['sourcename'] = example
    tfidf_frames.append(df_temp[['sourcename', 'concept_name', 'concept_id', 'domain_id', 'score']])

# pd.concat rejects an empty list, so fall back to an empty frame.
df_tfidf_overall_results = pd.concat(tfidf_frames) if tfidf_frames else pd.DataFrame()
    

In [32]:
df_tfidf_overall_results

Unnamed: 0,sourcename,concept_name,concept_id,domain_id,score
0,artery detect,ability to detect odor,4219729,Observation,0.640942
1,artery detect,ability to detect odour,4219729,Synonym,0.639337
2,artery detect,detect - error detection,4210550,Measurement,0.635013
3,artery detect,toxoplasma nucleic acid detect,4162819,Synonym,0.600570
4,artery detect,ability to detect odor (observable entity),4219729,Synonym,0.589273
...,...,...,...,...,...
0,sclerosis mitral valve,mitral valve sclerosis,4220765,Condition,1.000000
1,sclerosis mitral valve,mitral valve sclerosis (disorder),4220765,Synonym,0.966482
2,sclerosis mitral valve,mitral valve,4237831,Synonym,0.759840
3,sclerosis mitral valve,mitral valve disorder,319843,Condition,0.717271


In [33]:
# Add an 'id' column that tags each consecutive block of five rows with the 0-based
# position of its query: the first five rows get id 0, the next five id 1, and so on.
df_tfidf_overall_results['id'] = np.repeat(np.arange(len(test_data_lemma)), 5)

In [34]:
# Replace the per-query index (0..4, repeated for every query) left by the
# concatenation with one continuous index over all rows.
df_tfidf_overall_results = df_tfidf_overall_results.reset_index(drop=True)
df_tfidf_overall_results

Unnamed: 0,sourcename,concept_name,concept_id,domain_id,score,id
0,artery detect,ability to detect odor,4219729,Observation,0.640942,0
1,artery detect,ability to detect odour,4219729,Synonym,0.639337,0
2,artery detect,detect - error detection,4210550,Measurement,0.635013,0
3,artery detect,toxoplasma nucleic acid detect,4162819,Synonym,0.600570,0
4,artery detect,ability to detect odor (observable entity),4219729,Synonym,0.589273,0
...,...,...,...,...,...,...
90,sclerosis mitral valve,mitral valve sclerosis,4220765,Condition,1.000000,18
91,sclerosis mitral valve,mitral valve sclerosis (disorder),4220765,Synonym,0.966482,18
92,sclerosis mitral valve,mitral valve,4237831,Synonym,0.759840,18
93,sclerosis mitral valve,mitral valve disorder,319843,Condition,0.717271,18


In [35]:
# Drop the lemmatised 'sourcename' column and replace it with the original
# question text, where each question is repeated for its five rows.
df_tfidf_overall_results = (
    df_tfidf_overall_results
    .drop(columns=['sourcename'])
    .assign(sourcename=np.repeat(test_data, 5))
)

In [36]:
df_tfidf_overall_results[df_tfidf_overall_results['sourcename']=='Have you ever been diagnosed by a doctor with a sagging of another artery (aneurysm)?']

Unnamed: 0,concept_name,concept_id,domain_id,score,id,sourcename
50,pulmonary artery aneurysm,433783,Condition,0.817797,10,Have you ever been diagnosed by a doctor with ...
51,aneurysm,4332246,Condition,0.812089,10,Have you ever been diagnosed by a doctor with ...
52,aneurysm,4311963,Observation,0.812089,10,Have you ever been diagnosed by a doctor with ...
53,a-v aneurysm,4020316,Synonym,0.812089,10,Have you ever been diagnosed by a doctor with ...
54,aneurysm of spinal artery,4308138,Condition,0.802196,10,Have you ever been diagnosed by a doctor with ...


## Combination of different methods

In [37]:
def combine_dfs(df1, df2, df3, id_col='id', concept_col='concept_id', top_n=5):
    """Merge three ranked candidate tables into at most ``top_n`` rows per id.

    The tables are consumed in priority order ``df1`` (labelled 'df_tfidf'),
    ``df2`` ('df_poincare'), ``df3`` ('df_deepwalk'). For every value of
    ``id_col`` rows are taken in their existing order until ``top_n`` rows are
    collected; a row is skipped when its ``concept_col`` value was already
    collected for that id. The inputs are not modified.

    Args:
        df1, df2, df3: DataFrames containing ``id_col`` and ``concept_col``.
        id_col: Column identifying the query a row belongs to.
        concept_col: Column used to detect duplicate candidates.
        top_n: Maximum number of rows kept per id.

    Returns:
        DataFrame of the selected rows with two extra columns, ``source``
        (label of the originating table) and ``original_order`` (0-based rank
        of the row within its id in that table), sorted by ``id_col`` and
        ``original_order``. An empty DataFrame when nothing was selected.
    """
    sources = {'df_tfidf': df1, 'df_poincare': df2, 'df_deepwalk': df3}
    selected = {}  # id -> list of row dicts collected so far

    for name, df in sources.items():
        df = df.copy()  # never write into the caller's frame
        df['source'] = name
        df['original_order'] = df.groupby(id_col).cumcount()

        for unique_id, group in df.groupby(id_col):
            picked = selected.setdefault(unique_id, [])
            for _, row in group.iterrows():
                if len(picked) >= top_n:
                    break  # this id is already full
                if not any(d[concept_col] == row[concept_col] for d in picked):
                    picked.append(row.to_dict())

    rows = [item for picked in selected.values() for item in picked]
    if not rows:
        # Without rows the frame has no columns and sort_values would raise KeyError.
        return pd.DataFrame()

    return pd.DataFrame(rows).sort_values([id_col, 'original_order'])

# Per-method score gates applied before merging:
#   * TF-IDF: keep candidates scoring above 0.55
#   * Poincaré: keep candidates scoring below 0.75 (presumably a distance,
#     lower = closer — TODO confirm)
#   * DeepWalk: taken as-is
df1_filtered = df_tfidf_overall_results.loc[df_tfidf_overall_results['score'].gt(0.55)]
df2_filtered = df_poincare_overall_results.loc[df_poincare_overall_results['score'].lt(0.75)]
df3_filtered = df_deepwalk_overall_results  # no filter for this one

df_final = combine_dfs(df1=df1_filtered, df2=df2_filtered, df3=df3_filtered)


In [38]:
df_final

Unnamed: 0,concept_name,concept_id,domain_id,score,id,sourcename,source,original_order
0,ability to detect odor,4219729,Observation,0.640942,0,What artery was this detected at?,df_tfidf,0
3,Systemic arterial finding,4114166,Condition,0.603746,0,What artery was this detected at?,df_poincare,0
4,Disorder of head,4042836,Condition,0.732165,0,What artery was this detected at?,df_poincare,1
1,detect - error detection,4210550,Measurement,0.635013,0,What artery was this detected at?,df_tfidf,2
2,toxoplasma nucleic acid detect,4162819,Synonym,0.600570,0,What artery was this detected at?,df_tfidf,3
...,...,...,...,...,...,...,...,...
90,mitral valve sclerosis,4220765,Condition,1.000000,18,Sclerosis of the mitral valve,df_tfidf,0
94,Heart valve disorder,4281749,Condition,0.938245,18,Sclerosis of the mitral valve,df_deepwalk,0
91,mitral valve,4237831,Synonym,0.759840,18,Sclerosis of the mitral valve,df_tfidf,2
92,mitral valve disorder,319843,Condition,0.717271,18,Sclerosis of the mitral valve,df_tfidf,3


In [39]:
# Put 'sourcename', 'source' and 'original_order' first and keep the remaining
# columns in their current order. Selecting by name (instead of rotating the
# last three positions) gives the same layout however often the cell is re-run.
leading_cols = ['sourcename', 'source', 'original_order']
cols = leading_cols + [col for col in df_final.columns if col not in leading_cols]
df_final = df_final[cols]

In [40]:
df_final

Unnamed: 0,sourcename,source,original_order,concept_name,concept_id,domain_id,score,id
0,What artery was this detected at?,df_tfidf,0,ability to detect odor,4219729,Observation,0.640942,0
3,What artery was this detected at?,df_poincare,0,Systemic arterial finding,4114166,Condition,0.603746,0
4,What artery was this detected at?,df_poincare,1,Disorder of head,4042836,Condition,0.732165,0
1,What artery was this detected at?,df_tfidf,2,detect - error detection,4210550,Measurement,0.635013,0
2,What artery was this detected at?,df_tfidf,3,toxoplasma nucleic acid detect,4162819,Synonym,0.600570,0
...,...,...,...,...,...,...,...,...
90,Sclerosis of the mitral valve,df_tfidf,0,mitral valve sclerosis,4220765,Condition,1.000000,18
94,Sclerosis of the mitral valve,df_deepwalk,0,Heart valve disorder,4281749,Condition,0.938245,18
91,Sclerosis of the mitral valve,df_tfidf,2,mitral valve,4237831,Synonym,0.759840,18
92,Sclerosis of the mitral valve,df_tfidf,3,mitral valve disorder,319843,Condition,0.717271,18


In [41]:
# Export the combined candidate table to an Excel workbook; the DataFrame index is not written.
df_final.to_excel('/workspaces/master_thesis/combination_of_results_2807_055.xlsx', index=False)