In [27]:
# Source phrases (questionnaire items and finding labels) that the rest of the
# notebook cleans, embeds and matches against concepts.
test_data = [
    'What artery was this detected at?',
    'Atrioventricular block',
    'Atrioventricular block degree',
    'Are you aware of a coronary heart disease diagnosed by your doctor?',
    'Has a cardioversion already been performed on you?',
    'Congenital heart disease',
    'Was contrast agent used for cardio magnetic resonance imaging?',
    'Have you been in control echocardiography for the last 12 months?',
    'Have you ever been diagnosed with atrial fibrillation by a doctor?',
    'Have you ever been diagnosed with deep vein thrombosis by a doctor?',
    'Have you ever been diagnosed by a doctor with a sagging of another artery (aneurysm)?',
    'Have you ever been diagnosed by a doctor with window disease or blood circulation disorder in your legs, also called Claudicatio or arterial disease?',
    'Have you ever been diagnosed with a heart attack by a doctor?',
    'Have you ever been diagnosed with "open legs" by a doctor or an Ulcus cruris?',
    'Is a mobile electrocardiogram issued?',
    'How old were you when the diagnosis of atrial fibrillation was made?',
    'Do you have pain in one or both legs while walking?',
    'Riva-Rocci systolic before examination',
    'Sclerosis of the mitral valve',
]

In [28]:
import spacy
import pandas as pd
import numpy as np
# Load the spaCy pipeline "en_core_web_sm" once; `nlp` is used by the
# preprocessing function below for tokenisation and for the stop-word /
# punctuation flags on each token.
nlp = spacy.load("en_core_web_sm")

In [29]:
def preprocessing_without_stemming(sample):
    """Lower-case a phrase and strip stop words and punctuation.

    The lower-cased text is tokenised with the module-level spaCy pipeline
    ``nlp``. Surviving tokens are kept verbatim (no stemming or
    lemmatisation) and re-joined with single spaces.

    Args:
        sample: The phrase to clean.

    Returns:
        The cleaned phrase as a string, or ``None`` (after printing a
        message) when ``sample`` is not a ``str``.
    """
    # Guard clause: anything that is not a string is reported and rejected.
    if not isinstance(sample, str):
        print(f"Invalid input {sample}: expected a string.")
        return None

    kept_tokens = []
    for token in nlp(sample.lower()):
        if token.is_stop or token.is_punct:
            continue
        kept_tokens.append(token.text)
    return " ".join(kept_tokens)

In [30]:
# Clean every phrase: lower-case it and drop stop words and punctuation.
# NOTE(review): this rebinds `test_data`, so the raw phrases are no longer
# available once this cell has run; a separately named list would keep both.
test_data = [preprocessing_without_stemming(x) for x in test_data]

## Initialise model BiLSTM

In [31]:
import torch.nn as nn
import torch

class BiLSTM(nn.Module):
    """Bidirectional LSTM encoder followed by a linear projection.

    The network reads a sequence of ``input_size``-dimensional vectors and
    maps the LSTM output at the last time step to an ``output_size``-
    dimensional vector.

    Args:
        input_size: Number of features per time step.
        hidden_size: Hidden units per LSTM direction.
        output_size: Dimension of the projected output.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Encode a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len * input_size)`` (flattened
                sequences) or ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Restore (batch, seq_len, features). The feature width comes from the
        # LSTM itself and the sequence length is inferred, so the layer is no
        # longer tied to 20 steps of 300 features; a flattened 20 * 300 input
        # is reshaped exactly as before.
        x = x.reshape(x.size(0), -1, self.lstm.input_size)

        # out: (batch, seq_len, hidden_size * 2)
        out, _ = self.lstm(x)

        # Project the output of the last time step. At that position the
        # backward direction has only processed the final step; the read-out
        # is left unchanged so weights trained with it remain valid.
        return self.fc(out[:, -1, :])

In [32]:
# 300 features per time step, 300 hidden units per direction, 100-d output.
model = BiLSTM(input_size=300, hidden_size=300, output_size=100)
# NOTE(review): no cell visible in this notebook section trains the model, so
# the loss and optimizer below look unused here — confirm before removing.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [33]:
# Restore the trained weights. map_location='cpu' lets a checkpoint that was
# saved from a GPU run load on a CPU-only machine; in the cells shown here the
# model is created on the CPU and never moved to another device.
# NOTE: torch.load unpickles the file, so only load checkpoints you trust.
model.load_state_dict(
    torch.load('/workspaces/master_thesis/model_50epochs_conceptid.ckpt', map_location='cpu')
)
# Alternative checkpoint (DeepWalk variant), currently disabled:
#model.load_state_dict(torch.load('/workspaces/master_thesis/model_50epochs_conceptid_deepwalk.ckpt'))

# Preferred device; only defined here, the model is not moved to it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Switch to evaluation mode. Kept as the last expression so the cell still
# echoes the model architecture.
model.eval()

BiLSTM(
  (lstm): LSTM(300, 300, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=600, out_features=100, bias=True)
)

In [34]:
from gensim.models import Word2Vec
from gensim.models.poincare import PoincareModel
# Word2Vec model that supplies the per-word vectors fed to the BiLSTM.
w2v_model = Word2Vec.load("/workspaces/master_thesis/word2vec_pubmed.model")
# Poincare embedding that is searched for the nodes nearest to the BiLSTM output.
poincare_model = PoincareModel.load('/workspaces/master_thesis/poincare_100d_concept_id')
# DeepWalk alternative to the Poincare embedding, currently disabled:
#deepwalk_model = Word2Vec.load("/workspaces/master_thesis/deepwalk_snomed.model")

In [35]:
def get_phrase_vector(phrase, model, max_len):
    """Embed a phrase as a flat, fixed-length tensor of word vectors.

    The phrase is split on whitespace and each of the first ``max_len`` words
    is looked up in ``model.wv``. Words missing from the vocabulary, and
    positions beyond the end of the phrase, stay as zero vectors.

    Args:
        phrase: Text to embed; it is converted with ``str()`` before splitting.
        model: Object exposing ``vector_size`` and a ``wv`` lookup that
            supports ``in`` and item access by word.
        max_len: Number of word slots in the output.

    Returns:
        A 1-D float tensor with ``max_len * model.vector_size`` elements.
    """
    tokens = str(phrase).split()
    matrix = np.zeros((max_len, model.vector_size))
    # Slicing caps the phrase at max_len words; shorter phrases leave the
    # trailing rows zero-padded.
    for position, word in enumerate(tokens[:max_len]):
        if word in model.wv:
            matrix[position] = model.wv[word]
    return torch.tensor(matrix.flatten(), dtype=torch.float)

In [36]:
# Tab-separated concept table; later cells read its concept_id, concept_name
# and domain_id columns.
# NOTE(review): on_bad_lines='skip' drops unparsable rows without any message —
# confirm that losing those rows is acceptable.
df_concept=pd.read_csv('/workspaces/master_thesis/CONCEPT.csv', on_bad_lines='skip', sep='\t')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [39]:
# For every cleaned phrase: embed it, run the BiLSTM, look up the 5 nearest
# nodes in the Poincare embedding and collect the matching concept rows.
result_columns = ['sourcename', 'concept_id', 'concept_name', 'domain_id', 'score']
# Per-match frames are gathered here and concatenated once at the end, instead
# of re-copying a growing DataFrame on every iteration.
result_frames = []

# Inference only: no_grad() avoids building the autograd graph.
with torch.no_grad():
    for n in test_data:
        # (1, 20 * vector_size): a batch holding one flattened 20-word phrase.
        x = get_phrase_vector(n, w2v_model, 20).unsqueeze(0)
        output = model(x)
        #result = deepwalk_model.wv.most_similar([output.detach().numpy()[0]], topn=5)
        result = poincare_model.kv.most_similar([output.detach().numpy()[0]], topn=5)

        # Each entry is (node key, value). For the Poincare model the value is
        # a distance, so a smaller 'score' means a closer match.
        for concept_key, score in result:
            df_temp = df_concept[df_concept['concept_id'] == int(concept_key)].copy()
            df_temp['score'] = score
            df_temp['sourcename'] = n
            result_frames.append(df_temp[result_columns])

# pd.concat raises on an empty list, so fall back to an empty frame.
if result_frames:
    df_overall_results = pd.concat(result_frames)
else:
    df_overall_results = pd.DataFrame(columns=result_columns)

In [40]:
# Display the collected matches (the index is the row label from df_concept).
df_overall_results

Unnamed: 0,sourcename,concept_id,concept_name,domain_id,score
723234,artery detected,4114166,Systemic arterial finding,Condition,0.603746
284416,artery detected,4042836,Disorder of head,Condition,0.732165
828468,artery detected,321887,Disorder of artery,Condition,0.737198
908023,artery detected,321052,Peripheral vascular disease,Condition,0.739125
723610,artery detected,4115390,Mediastinal finding,Condition,0.758339
...,...,...,...,...,...
300179,sclerosis mitral valve,4048213,Heart valve stenosis and regurgitation,Condition,1.459595
1039427,sclerosis mitral valve,4189343,Aortic valve stenosis,Condition,1.467915
659653,sclerosis mitral valve,4158911,Non-rheumatic heart valve disorder,Condition,1.501683
1158915,sclerosis mitral valve,315273,Mitral valve stenosis,Condition,1.530830


In [26]:
# Write the results to an Excel workbook.
# NOTE(review): the cell that installs openpyxl sits below this one, so on a
# fresh kernel that install has to run first. The DataFrame index is written
# as the first column.
# DeepWalk variant of the export, currently disabled:
#df_overall_results.to_excel('/workspaces/master_thesis/results_deepwalk_bilstm_1706.xlsx')
df_overall_results.to_excel('/workspaces/master_thesis/results_poincare_bilstm_1706.xlsx')

In [25]:
pip install openpyxl

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting openpyxl
  Downloading openpyxl-3.1.2-py2.py3-none-any.whl (249 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.0/250.0 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.2
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
