In [1]:
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec
from gensim.models.poincare import PoincareModel
import pandas as pd
import numpy as np
import torch
import torch.nn as nn


In [2]:
# Read the prepared concept/synonym table from disk into a DataFrame.
data_csv_path = '/workspaces/master_thesis/mapping/data_ready_to_use.csv'
df = pd.read_csv(data_csv_path)

In [3]:
# Show the loaded table as the cell output (pandas elides the middle rows in the rendering).
df

Unnamed: 0,concept_id,concept_name,concept_synonym_name,preprocessed,preprocessed_synonyms,preprocessed_without_stemming,preprocessed_synonyms_without_stemming
0,4001098,Radiating chest pain,Radiating chest pain (finding),radiat chest pain,radiat chest pain find,radiating chest pain,radiating chest pain finding
1,37392117,Urine tryptophan:creatinine ratio,Urine tryptophan:creatinine ratio (observable ...,urin tryptophan creatinin ratio,urin tryptophan creatinin ratio observ entiti,urine tryptophan creatinine ratio,urine tryptophan creatinine ratio observable e...
2,37398455,Urine threonine:creatinine ratio,Urine threonine:creatinine ratio (observable e...,urin threonin creatinin ratio,urin threonin creatinin ratio observ entiti,urine threonine creatinine ratio,urine threonine creatinine ratio observable en...
3,37392118,Urine taurine:creatinine ratio,Urine taurine:creatinine ratio (observable ent...,urin taurin creatinin ratio,urin taurin creatinin ratio observ entiti,urine taurine creatinine ratio,urine taurine creatinine ratio observable entity
4,37392119,Urine phenylalanine:creatinine ratio,Urine phenylalanine:creatinine ratio (observab...,urin phenylalanin creatinin ratio,urin phenylalanin creatinin ratio observ entiti,urine phenylalanine creatinine ratio,urine phenylalanine creatinine ratio observabl...
...,...,...,...,...,...,...,...
491491,37398450,Urine homocysteine:creatinine ratio,Urine homocysteine:creatinine ratio (observabl...,urin homocystein creatinin ratio,urin homocystein creatinin ratio observ entiti,urine homocysteine creatinine ratio,urine homocysteine creatinine ratio observable...
491492,37398451,Urine aspartate:creatinine ratio,Urine aspartate:creatinine ratio (observable e...,urin aspart creatinin ratio,urin aspart creatinin ratio observ entiti,urine aspartate creatinine ratio,urine aspartate creatinine ratio observable en...
491493,37398452,Urine alanine:creatinine ratio,Urine alanine:creatinine ratio (observable ent...,urin alanin creatinin ratio,urin alanin creatinin ratio observ entiti,urine alanine creatinine ratio,urine alanine creatinine ratio observable entity
491494,37398453,Urine valine:creatinine ratio,Urine valine:creatinine ratio (observable entity),urin valin creatinin ratio,urin valin creatinin ratio observ entiti,urine valine creatinine ratio,urine valine creatinine ratio observable entity


In [4]:
class PhraseEmbeddingDataset(Dataset):
    """Pairs each phrase's flattened word-vector matrix with its concept's target vector.

    Args:
        X: phrases, accessed positionally through ``.iloc``.
        y: concept identifiers aligned with ``X``, accessed positionally
            through ``.iloc``.
        w2v_model: object exposing ``.wv`` (membership test and lookup by
            word) and ``.vector_size``.
        poincare_model: object exposing ``.kv`` (lookup by concept identifier).
        max_len: number of word slots kept per phrase. Longer phrases are
            truncated; shorter ones are zero-padded.
    """

    def __init__(self, X, y, w2v_model, poincare_model, max_len=20):
        self.X = X
        self.y = y
        self.w2v_model = w2v_model
        self.poincare_model = poincare_model
        self.max_len = max_len

    def __len__(self):
        """Return the number of phrases in the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return ``(input, target)`` float tensors for the ``idx``-th row.

        ``input`` has ``max_len * vector_size`` elements; ``target`` is the
        vector stored in ``poincare_model.kv`` for the row's concept id.
        A concept id missing from ``poincare_model.kv`` propagates the
        lookup error raised by that container.
        """
        # Input: flattened word-vector matrix of the phrase.
        X = self.get_phrase_vector(self.X.iloc[idx], self.w2v_model, self.max_len)

        # Target: embedding of the concept the phrase belongs to.
        y = torch.tensor(self.poincare_model.kv[self.y.iloc[idx]], dtype=torch.float)
        # Alternative for a model exposing string-keyed vectors through ``.wv``:
        #y = torch.tensor(self.poincare_model.wv[str(self.y.iloc[idx])], dtype=torch.float)
        return X, y

    @staticmethod
    def get_phrase_vector(phrase, model, max_len):
        """Embed ``phrase`` as a flat float32 tensor of ``max_len * model.vector_size`` values.

        The phrase is split on whitespace. Slot ``i`` holds the vector of the
        ``i``-th word when that word is in ``model.wv``; unknown words and
        unused slots stay zero. Words beyond ``max_len`` are dropped.

        A missing phrase (``None`` or a float NaN, which is how pandas
        represents an empty cell) yields the all-zero vector.
        """
        # Do not stringify missing values: str(nan) == "nan" would otherwise be
        # looked up as though it were a genuine word of the phrase.
        is_missing = phrase is None or (
            isinstance(phrase, (float, np.floating)) and np.isnan(phrase)
        )
        words = [] if is_missing else str(phrase).split()

        # Allocate directly in float32, the dtype of the returned tensor.
        phrase_vector = np.zeros((max_len, model.vector_size), dtype=np.float32)

        # Only the first ``max_len`` words can occupy a slot.
        for i, word in enumerate(words[:max_len]):
            if word in model.wv:
                phrase_vector[i] = model.wv[word]

        return torch.tensor(phrase_vector.reshape(-1), dtype=torch.float)

In [5]:
# Load the two pretrained embedding models from disk.
word2vec_path = "/workspaces/master_thesis/word2vec_pubmed.model"
poincare_path = '/workspaces/master_thesis/poincare_100d_concept_id'

w2v_model = Word2Vec.load(word2vec_path)
poincare_model = PoincareModel.load(poincare_path)
# Disabled alternative embedding model:
#deepwalk_model = Word2Vec.load("/workspaces/master_thesis/deepwalk_snomed.model")

In [6]:
# Hold out 2% of the rows as a test set; the fixed random_state makes the split repeatable.
phrase_column = df['preprocessed_synonyms_without_stemming']
concept_id_column = df['concept_id']
X_train, X_test, y_train, y_test = train_test_split(
    phrase_column,
    concept_id_column,
    test_size=0.02,
    random_state=42,
)

# Wrap each split in a dataset that turns rows into (input, target) tensors.
train_dataset = PhraseEmbeddingDataset(X_train, y_train, w2v_model, poincare_model)
#train_dataset = PhraseEmbeddingDataset(X_train, y_train, w2v_model, deepwalk_model)
test_dataset = PhraseEmbeddingDataset(X_test, y_test, w2v_model, poincare_model)
#test_dataset = PhraseEmbeddingDataset(X_test, y_test, w2v_model, deepwalk_model)

# Batch both splits identically; only the training batches are shuffled.
loader_batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=loader_batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=loader_batch_size, shuffle=False)

In [18]:
import torch
from torch import nn

class TransformerModel(nn.Module):
    """Transformer encoder followed by a linear head on the last sequence position.

    Args:
        input_size: feature size of one token; used as the encoder's ``d_model``
            and must be divisible by ``nhead``.
        hidden_size: width of each encoder layer's feed-forward sublayer.
        output_size: size of the predicted vector.
        nhead: number of attention heads per encoder layer.
        num_layers: number of stacked encoder layers.
    """

    def __init__(self, input_size, hidden_size, output_size, nhead=10, num_layers=2):
        super().__init__()

        # Kept so forward() can recover the token layout without hard-coded sizes.
        self.input_size = input_size
        self.hidden_size = hidden_size
        encoder_layers = nn.TransformerEncoderLayer(d_model=input_size, nhead=nhead, dim_feedforward=hidden_size)
        self.transformer = nn.TransformerEncoder(encoder_layers, num_layers)
        # The encoder preserves the feature size, so the head maps input_size -> output_size.
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, x):
        """Map a batch of flattened token sequences to output vectors.

        Args:
            x: tensor of shape ``(batch, seq_len * input_size)`` in which each
                row holds one sample's tokens back to back (a
                ``(batch, seq_len, input_size)`` tensor is accepted as well).

        Returns:
            Tensor of shape ``(batch, output_size)``.

        Raises:
            RuntimeError: if a row's length is not a multiple of ``input_size``.
        """
        # Recover (batch, seq_len, features) first. Viewing the batch-major
        # buffer directly as (seq_len, batch, features) would not transpose it;
        # it would interleave tokens that belong to different samples.
        tokens = x.reshape(x.size(0), -1, self.input_size)
        # The encoder was built with batch_first=False: (seq_len, batch, features).
        tokens = tokens.transpose(0, 1)

        out = self.transformer(tokens)  # (seq_len, batch, input_size)

        # Predict from the encoding of the last sequence position.
        # NOTE(review): no padding mask is applied, so for zero-padded inputs
        # this position is a padding slot - confirm last-step pooling is intended.
        return self.fc(out[-1])


In [19]:
# Pick the compute device: CUDA when it is available, the CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [21]:
# Build the network on the selected device, together with its loss and optimizer.
model = TransformerModel(300, 300, 100).to(device)
criterion = nn.MSELoss()  # adjust the loss function to your problem
optimizer = torch.optim.Adam(model.parameters())

# Train for a fixed number of passes over the training loader.
num_epochs = 10
steps_per_epoch = len(train_loader)
progress_template = 'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'

for epoch in range(num_epochs):
    for step, (phrases, labels) in enumerate(train_loader, start=1):
        # Put the batch on the same device as the model.
        phrases, labels = phrases.to(device), labels.to(device)

        # Forward pass and loss.
        outputs = model(phrases)
        loss = criterion(outputs, labels)

        # Clear stale gradients, backpropagate, then update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 100 steps.
        if step % 100 == 0:
            print (progress_template.format(epoch+1, num_epochs, step, steps_per_epoch, loss.item()))


Epoch [1/10], Step [100/7527], Loss: 0.0167
Epoch [1/10], Step [200/7527], Loss: 0.0102


KeyboardInterrupt: 