In [1]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models.poincare import PoincareModel
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

### Preparing dataset

In [2]:
df = pd.read_csv('/workspaces/master_thesis/mapping/data_ready_to_use.csv')

In [3]:
df

Unnamed: 0,concept_id,concept_name,concept_synonym_name,preprocessed,preprocessed_synonyms,preprocessed_without_stemming,preprocessed_synonyms_without_stemming
0,4001098,Radiating chest pain,Radiating chest pain (finding),radiat chest pain,radiat chest pain find,radiating chest pain,radiating chest pain finding
1,37392117,Urine tryptophan:creatinine ratio,Urine tryptophan:creatinine ratio (observable ...,urin tryptophan creatinin ratio,urin tryptophan creatinin ratio observ entiti,urine tryptophan creatinine ratio,urine tryptophan creatinine ratio observable e...
2,37398455,Urine threonine:creatinine ratio,Urine threonine:creatinine ratio (observable e...,urin threonin creatinin ratio,urin threonin creatinin ratio observ entiti,urine threonine creatinine ratio,urine threonine creatinine ratio observable en...
3,37392118,Urine taurine:creatinine ratio,Urine taurine:creatinine ratio (observable ent...,urin taurin creatinin ratio,urin taurin creatinin ratio observ entiti,urine taurine creatinine ratio,urine taurine creatinine ratio observable entity
4,37392119,Urine phenylalanine:creatinine ratio,Urine phenylalanine:creatinine ratio (observab...,urin phenylalanin creatinin ratio,urin phenylalanin creatinin ratio observ entiti,urine phenylalanine creatinine ratio,urine phenylalanine creatinine ratio observabl...
...,...,...,...,...,...,...,...
491491,37398450,Urine homocysteine:creatinine ratio,Urine homocysteine:creatinine ratio (observabl...,urin homocystein creatinin ratio,urin homocystein creatinin ratio observ entiti,urine homocysteine creatinine ratio,urine homocysteine creatinine ratio observable...
491492,37398451,Urine aspartate:creatinine ratio,Urine aspartate:creatinine ratio (observable e...,urin aspart creatinin ratio,urin aspart creatinin ratio observ entiti,urine aspartate creatinine ratio,urine aspartate creatinine ratio observable en...
491493,37398452,Urine alanine:creatinine ratio,Urine alanine:creatinine ratio (observable ent...,urin alanin creatinin ratio,urin alanin creatinin ratio observ entiti,urine alanine creatinine ratio,urine alanine creatinine ratio observable entity
491494,37398453,Urine valine:creatinine ratio,Urine valine:creatinine ratio (observable entity),urin valin creatinin ratio,urin valin creatinin ratio observ entiti,urine valine creatinine ratio,urine valine creatinine ratio observable entity


In [4]:
class PhraseEmbeddingDataset(Dataset):
    def __init__(self, X, y, w2v_model, poincare_model, max_len=20):
        self.X = X
        self.y = y
        self.w2v_model = w2v_model
        self.poincare_model = poincare_model
        self.max_len = max_len

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Get Word2Vec embedding
        X = self.get_phrase_vector(self.X.iloc[idx], self.w2v_model, self.max_len)
        
        # Get Poincare embedding
        y = torch.tensor(self.poincare_model.kv[self.y.iloc[idx]], dtype=torch.float)

        return X, y

    @staticmethod
    def get_phrase_vector(phrase, model, max_len):
        words = str(phrase).split()
        phrase_vector = np.zeros((max_len, model.vector_size))

        for i in range(max_len):
            if i < len(words) and words[i] in model.wv:
                phrase_vector[i] = model.wv[words[i]]

        phrase_vector = phrase_vector.flatten()
        
        return torch.tensor(phrase_vector, dtype=torch.float)



In [5]:
w2v_model = Word2Vec.load("/workspaces/master_thesis/word2vec_pubmed.model")
poincare_model = PoincareModel.load('/workspaces/master_thesis/poincare_100d_preprocessed')

In [6]:
# Split your phrases into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_synonyms_without_stemming'], df['preprocessed'], test_size=0.2, random_state=42)

# Create your datasets
train_dataset = PhraseEmbeddingDataset(X_train, y_train, w2v_model, poincare_model)
test_dataset = PhraseEmbeddingDataset(X_test, y_test, w2v_model, poincare_model)

# Create your data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

### Training model

In [7]:
import torch.nn as nn

class BiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(BiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)  # 2 for bidirection

    def forward(self, x):
        # Reshape the input to (batch_size, seq_len, features)
        x = x.view(x.size(0), 20, 300)

        # Forward propagate LSTM
        out, _ = self.lstm(x)  # out: tensor of shape (batch_size, seq_length, hidden_size*2)

        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        return out

In [8]:
model = BiLSTM(input_size=300, hidden_size=300, output_size=100)

In [9]:
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

In [11]:
# Move model to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Define the number of epochs
num_epochs = 10

# Training loop
for epoch in range(num_epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if (i+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))

# Save the model checkpoint
torch.save(model.state_dict(), 'model.ckpt')


Epoch [1/10], Step [100/6144], Loss: 0.0071


KeyboardInterrupt: 

### Evaluation of the model

In [10]:
def hyporbolic_distance(x,y):
    #calculate hyporbolic distance between two vectors
    return np.arccosh(1 + 2 * np.linalg.norm(x-y)**2 / ((1 - np.linalg.norm(x)**2) * (1 - np.linalg.norm(y)**2)))

In [25]:
#load the model
model = BiLSTM(input_size=300, hidden_size=300, output_size=100)
model.load_state_dict(torch.load('model.ckpt'))
#device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.eval() 
k_values = [1, 5, 10, 20, 50]
accuracy_values = []

with torch.no_grad():
    for k in k_values:
        correct = 0
        total = 0
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)
            outputs = outputs.cpu().numpy()
            labels = labels.cpu().numpy()
            for i in range(len(outputs)):
                distances = []
                for j in range(len(labels)):
                    distances.append(hyporbolic_distance(outputs[i], labels[j]))
                # get the indices of the k nearest neighbors
                indices = np.argsort(distances)[:k]
                # get the labels of the k nearest neighbors
                nearest_neighbors = labels[indices]
                # check if the true label is among the k nearest neighbors
                true_label = labels[i]
                if true_label in nearest_neighbors:
                    correct += 1
                total += 1
        accuracy = correct / total
        accuracy_values.append(accuracy)
        
for i, accuracy in zip(k_values, accuracy_values):
    print(f"Accuracy for k={i}: {accuracy * 100}%")

  return np.arccosh(1 + 2 * np.linalg.norm(x-y)**2 / ((1 - np.linalg.norm(x)**2) * (1 - np.linalg.norm(y)**2)))


Accuracy for k=1: 15.828077314343844%
Accuracy for k=5: 34.91353001017294%
Accuracy for k=10: 53.097660223804674%
Accuracy for k=20: 70.43336724313326%
Accuracy for k=50: 96.91556459816887%


In [16]:
#load the model
model = BiLSTM(input_size=300, hidden_size=300, output_size=100)
model.load_state_dict(torch.load('model.ckpt'))
#device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#get predicted label from the model and calculate the distance between the predicted label and the true label
model.eval() 
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = model(inputs)
        outputs = outputs.cpu().numpy()
        print(outputs)
        print(outputs.shape)
        labels = labels.cpu().numpy()
        print(labels)
        print(labels.shape)
        distance = hyporbolic_distance(outputs, labels)
        print(distance)
        #stop after one iteration
        break

[[ 0.15149002 -0.00588327 -0.01927167 ...  0.13347381  0.05824211
  -0.110452  ]
 [ 0.09172472  0.01436075  0.04251928 ... -0.04174102  0.03248358
  -0.01895724]
 [ 0.03682062  0.01093031  0.01620699 ...  0.02730098  0.00170489
  -0.02050288]
 ...
 [ 0.13538103  0.09132892 -0.00368471 ...  0.16022784 -0.01432297
   0.05173496]
 [-0.0598892   0.07564677 -0.04869265 ... -0.0221441   0.00432681
   0.13782363]
 [ 0.02208643  0.06257834 -0.00210662 ...  0.08658007 -0.00262798
  -0.01696791]]
(64, 100)
[[ 0.19665678  0.02667253 -0.02934773 ...  0.18277912  0.03209405
  -0.08987681]
 [ 0.18238278  0.15877901  0.08966684 ...  0.02679526 -0.00069893
  -0.07635263]
 [ 0.04590896 -0.00326635  0.1301228  ... -0.10906892  0.13626042
  -0.28794193]
 ...
 [ 0.1141592   0.08765475  0.00559242 ...  0.09552231 -0.05993672
   0.04217599]
 [-0.07532855  0.06801189 -0.0207578  ...  0.01642261 -0.00719422
   0.10086993]
 [ 0.08028945  0.04673268  0.05113677 ...  0.03378233 -0.10373269
  -0.0321836 ]]
(64, 1