# 1 Preprocess

In [None]:
import pandas as pd
import re
import numpy as np
from sentence_transformers import SentenceTransformer


In [None]:
# Mount Google Drive under /content/drive; the RELISH label file loaded later
# in this notebook is read from a path below that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install/upgrade gdown, then use it to download the PubMed metadata sample
# from Google Drive by file id.
!pip install -U gdown
import gdown

file_id = '165wV72OUUHYDO3avmcrVOI2QGkoGTrL-'
# The file is saved in the working directory under the name that the
# preprocessing cell passes to pd.read_csv.
gdown.download(f"https://drive.google.com/uc?id={file_id}", "pubmed_metadata_sample_full.csv", quiet=False)




Downloading...
From (original): https://drive.google.com/uc?id=165wV72OUUHYDO3avmcrVOI2QGkoGTrL-
From (redirected): https://drive.google.com/uc?id=165wV72OUUHYDO3avmcrVOI2QGkoGTrL-&confirm=t&uuid=f9698a8e-522c-428c-9d13-cdf68ff239d7
To: /content/pubmed_metadata_sample_full.csv
100%|██████████| 294M/294M [00:02<00:00, 107MB/s]


'pubmed_metadata_sample_full.csv'

In [None]:
# faiss-cpu provides the `faiss` module imported below to build the similarity-search indexes.
pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl (31.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.3/31.3 MB[0m [31m59.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.11.0


In [None]:
import json
import math
import numpy as np
import pandas as pd
import re
from sentence_transformers import SentenceTransformer
import faiss

##############################################
# 1. Preprocess the PubMed Metadata (Corpus)
##############################################

# Read the CSV file with columns: pmid, title, abstract, keywords
# Only the first four columns are loaded (selected by position) and then
# renamed, so the names below apply regardless of the file's own header text.
df = pd.read_csv("pubmed_metadata_sample_full.csv", usecols=[0, 1, 2, 3])
df.columns = ['pmid', 'title', 'abstract', 'keywords']
# Rows without a title or an abstract are dropped; rows with missing keywords are kept.
df = df.dropna(subset=['title', 'abstract'])

def clean_text(text):
    """Normalise a text field to ASCII letters, digits, '.', ',' and single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string: every whitespace run becomes one space, every
        character outside ``[a-zA-Z0-9., ]`` is deleted (this includes hyphens
        and accented letters), and leading/trailing spaces are stripped.
    """
    text = re.sub(r'\s+', ' ', text)  # merge whitespace (newlines/tabs become a space)
    text = re.sub(r'[^a-zA-Z0-9., ]', '', text)  # remove special characters
    # Deleting a symbol that sat between two spaces ("a & b") leaves a double
    # space behind, so collapse runs of spaces once more.
    text = re.sub(r' {2,}', ' ', text)
    return text.strip()

# Clean each text column. Missing keywords become "" and are lower-cased
# before cleaning; title and abstract keep their original case.
df['title'] = df['title'].apply(clean_text)
df['abstract'] = df['abstract'].apply(clean_text)
df['keywords'] = df['keywords'].fillna("").apply(lambda x: clean_text(x.lower()))
# Combine text fields as full text
df['full_text'] = df['title'] + " " + df['abstract'] + " " + df['keywords']
df.to_csv("cleaned_clinical_trials.csv", index=False)
print(f"✅ Cleaned dataset: {df.shape[0]} articles")

##############################################
# 2. Build Corpus from Cleaned CSV and Create Embeddings
##############################################
# The corpus is built from the cleaned in-memory frame; the CSV written above
# is only a saved copy and is not read back here.
# Make sure 'pmid' is treated as integer.
df['pmid'] = df['pmid'].astype(int)

# Mapping pmid -> full_text. Zipping the two columns avoids materialising one
# Series per row as iterrows() does; if a PMID occurs more than once the last
# row wins, exactly as with the row-by-row loop.
corpus_text = dict(zip(df['pmid'], df['full_text']))

# Parallel lists of PMIDs and texts (order matters for the FAISS index:
# position i in the index corresponds to all_pmids[i]).
all_pmids = list(corpus_text.keys())
all_texts = list(corpus_text.values())


✅ Cleaned dataset: 162360 articles


# 2 Choose one of the following to run (Autoencoder, BART, BioBERT, SBERT, VAE)

# Autoencoder

In [None]:
# Install the hf_xet extra of huggingface_hub; the next cell imports `hf_xet`.
!pip install huggingface_hub[hf_xet]

Collecting hf-xet>=0.1.4 (from huggingface_hub[hf_xet])
  Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (494 bytes)
Downloading hf_xet-1.1.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (53.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.6/53.6 MB[0m [31m41.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: hf-xet
Successfully installed hf-xet-1.1.0


In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader
import numpy as np
import faiss
import hf_xet

# Define the Autoencoder model
class TextAutoencoder(nn.Module):
    """Symmetric fully connected autoencoder.

    The encoder narrows ``input_dim -> 768 -> 512 -> encoding_dim`` and the
    decoder mirrors it back to ``input_dim``. A ReLU follows every linear
    layer except the last one of each half.
    """

    def __init__(self, input_dim, encoding_dim):
        super().__init__()
        widths = [input_dim, 768, 512, encoding_dim]
        # Encoder first, decoder second: keeps parameter creation order stable.
        self.encoder = self._stack(widths)
        self.decoder = self._stack(widths[::-1])

    @staticmethod
    def _stack(widths):
        """Chain Linear layers over consecutive widths with ReLU in between."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            if layers:
                layers.append(nn.ReLU())
            layers.append(nn.Linear(fan_in, fan_out))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Return ``(encoded, decoded)`` for the input batch ``x``."""
        code = self.encoder(x)
        return code, self.decoder(code)

# Initialize tokenizer and base BERT model
# (both are loaded from the 'bert-base-uncased' checkpoint).
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert_model = AutoModel.from_pretrained('bert-base-uncased')

# Model parameters
encoding_dim = 256  # Dimension of our final embeddings
# NOTE(review): no training step for this autoencoder is visible in this cell,
# so it appears to be used with its random initial weights — confirm that this
# is intended (and that the weights are persisted if the saved index is reused).
model = TextAutoencoder(768, encoding_dim)  # BERT hidden size is 768

# Move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
bert_model = bert_model.to(device)
model = model.to(device)
# Both networks are only run under torch.no_grad() below, so put them in eval mode.
model.eval()
bert_model.eval()

# Dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item, on access.

    Each item is a dict with 1-D ``input_ids`` and ``attention_mask`` tensors,
    padded/truncated by the tokenizer to ``max_length``.
    """

    def __init__(self, texts, tokenizer, max_length=512):
        self.texts, self.tokenizer, self.max_length = texts, tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(
            self.texts[idx],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # Flatten each returned tensor to one dimension.
        return {field: tokens[field].flatten() for field in ('input_ids', 'attention_mask')}

# Encode function
def encode_texts(texts, batch_size=32):
    """Embed ``texts`` with BERT mean pooling followed by the autoencoder's encoder.

    Uses the module-level ``tokenizer``, ``bert_model``, ``model`` and ``device``.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        A numpy array with one row per text, holding the first output
        (the encoding) of ``model``.
    """
    loader = DataLoader(TextDataset(texts, tokenizer), batch_size=batch_size)
    chunks = []

    with torch.no_grad():
        for batch in loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)

            hidden = bert_model(input_ids=ids, attention_mask=mask).last_hidden_state
            # Mask-weighted mean over the token axis; the clamp avoids a
            # division by zero when a row has no attended tokens.
            weights = mask.unsqueeze(-1).expand(hidden.size()).float()
            token_sum = torch.sum(hidden * weights, 1)
            token_count = torch.clamp(weights.sum(1), min=1e-9)
            pooled = token_sum / token_count

            # Only the encoding is kept; the reconstruction is discarded.
            latent, _ = model(pooled.float())
            chunks.append(latent.cpu())

    return torch.cat(chunks, dim=0).numpy()

# Embed the whole corpus; FAISS needs float32, so cast right away.
embeddings = encode_texts(all_texts, batch_size=32).astype('float32')

# L2-normalise the rows so that inner product behaves as cosine similarity.
faiss.normalize_L2(embeddings)

# Inner-product index sized to the embedding width.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Persist the index (optional).
faiss.write_index(index, "clinical_trials_faiss_autoencoder.index")

# FAISS row position -> PMID, in the same order the texts were encoded.
index_to_pmid = dict(enumerate(all_pmids))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

# BART


In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

# Checkpoint to load ('facebook/bart-large' is an alternative if you have enough memory).
model_name = 'facebook/bart-base'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Inference only: place the model on the chosen device and switch to eval mode.
model.to(device)
model.eval()

# Encode function
def encode_texts(texts, batch_size=32):
    """Embed ``texts`` with the module-level BART ``model``.

    Each text is represented by the hidden state at sequence position 0 of
    ``last_hidden_state``. Inputs are padded per batch and truncated to 512
    tokens.

    NOTE(review): BART is an encoder-decoder model, so position 0 here may not
    behave like BERT's [CLS] token — confirm against the Transformers docs.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        A numpy array with one row per text.
    """
    chunks = []
    for start in range(0, len(texts), batch_size):
        tokens = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        ids = tokens.input_ids.to(device)
        mask = tokens.attention_mask.to(device)
        with torch.no_grad():
            hidden = model(input_ids=ids, attention_mask=mask).last_hidden_state
            first_token = hidden[:, 0, :]  # (batch_size, hidden_size)
            chunks.append(first_token.cpu())
    return torch.cat(chunks, dim=0).numpy()


# Embed the full corpus, then L2-normalise so inner product acts as cosine similarity.
embeddings = encode_texts(all_texts, batch_size=32)
faiss.normalize_L2(embeddings)

# Inner-product index sized to the embedding width.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Persist the index (optional).
# NOTE(review): the SBERT section writes to this same file name, so running
# both sections overwrites one index with the other.
faiss.write_index(index, "clinical_trials_faiss.index")

# FAISS row position -> PMID, in the same order the texts were encoded.
index_to_pmid = dict(enumerate(all_pmids))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

# BioBERT

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch
import faiss

# BioBERT checkpoint used for both the tokenizer and the encoder.
model_name = "dmis-lab/biobert-base-cased-v1.1"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Inference only: place the model on the chosen device and switch to eval mode.
model.to(device)
model.eval()

# Mean Pooling - Take average of all token embeddings (better than [CLS])
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing ``last_hidden_state`` of shape
            (batch_size, seq_len, hidden_size).
        attention_mask: (batch_size, seq_len) mask; positions with 0 are
            excluded from the mean.

    Returns:
        Tensor of shape (batch_size, hidden_size). A fully masked row yields
        zeros because the divisor is clamped to at least 1e-9.
    """
    hidden = model_output.last_hidden_state
    weights = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    masked_sum = torch.sum(hidden * weights, 1)
    token_count = torch.clamp(weights.sum(1), min=1e-9)
    return masked_sum / token_count

# Function to encode texts
def encode_texts(texts, batch_size=32):
    """Embed ``texts`` with the module-level BioBERT ``model`` and mean pooling.

    Inputs are padded per batch and truncated to 512 tokens.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        A numpy array with one mean-pooled row per text.
    """
    pooled_batches = []
    for start in range(0, len(texts), batch_size):
        tokens = tokenizer(
            texts[start:start + batch_size],
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        )
        ids = tokens.input_ids.to(device)
        mask = tokens.attention_mask.to(device)
        with torch.no_grad():
            pooled = mean_pooling(model(input_ids=ids, attention_mask=mask), mask)
            pooled_batches.append(pooled.cpu())
    return torch.cat(pooled_batches, dim=0).numpy()

# Embed the full corpus, then L2-normalise so inner product acts as cosine similarity.
embeddings = encode_texts(all_texts, batch_size=32)
faiss.normalize_L2(embeddings)

# Inner-product index sized to the embedding width.
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Persist the index if needed.
faiss.write_index(index, "clinical_trials_faiss_biobert.index")

# FAISS row position -> PMID (index -> pmid), in the order the texts were encoded.
index_to_pmid = dict(enumerate(all_pmids))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

# SBERT

In [None]:
model = SentenceTransformer('all-mpnet-base-v2')
# Keep a dedicated handle so the query encoder below does not depend on the
# generic name `model`, which the other sections rebind.
sbert_model = model


def encode_texts(texts, batch_size=32):
    """Embed ``texts`` with the SBERT model and return a numpy array.

    The recommendation function calls ``encode_texts([query_text])``. The other
    embedding sections each define a function of this name, so this section
    must as well; otherwise queries either fail or are encoded by a different
    model than the one that built the index.

    Args:
        texts: List of strings to embed.
        batch_size: Number of texts per forward pass.

    Returns:
        A numpy array with one row per text (not normalised).
    """
    return sbert_model.encode(texts, batch_size=batch_size, show_progress_bar=False, convert_to_numpy=True)


# Encode all texts into embeddings.
embeddings = model.encode(all_texts, batch_size=32, show_progress_bar=True, convert_to_numpy=True)

# Normalize embeddings so cosine similarity equals inner product.
faiss.normalize_L2(embeddings)

# Build a FAISS index (inner product based, which works as cosine similarity for normalized vectors)
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Save the FAISS index (optional)
faiss.write_index(index, "clinical_trials_faiss.index")

# Create a mapping from FAISS index position to PMIDs.
index_to_pmid = {i: pid for i, pid in enumerate(all_pmids)}

# VAE

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# 1. Enhanced TF-IDF Vectorization
vectorizer = TfidfVectorizer(
    max_features=10000,  # Increased feature size
    ngram_range=(1, 2),  # Include bigrams for better representation
    min_df=2,  # Filter extremely rare terms
    max_df=0.95,  # Filter extremely common terms
    # Produce float32 directly: the dense matrix below has
    # n_documents x max_features entries, and the default float64 would need
    # twice the memory only to be cast down to float32 for torch anyway.
    dtype=np.float32,
)
X_tfidf = vectorizer.fit_transform(all_texts).toarray()
# Share the array's memory with the tensor instead of making a second dense
# copy. ascontiguousarray is a no-op when the array is already float32.
X_tensor = torch.from_numpy(np.ascontiguousarray(X_tfidf, dtype=np.float32))

# 2. Improved VAE Model with Deeper Architecture
class ImprovedVAE(nn.Module):
    """Fully connected variational autoencoder.

    Encoder and decoder are stacks of ``Linear -> BatchNorm1d -> LeakyReLU(0.2)
    -> Dropout`` blocks. The decoder mirrors the encoder widths and ends in a
    sigmoid, so reconstructions lie in [0, 1].

    Args:
        input_dim: Width of the input (and reconstructed) vectors.
        hidden_dims: Widths of the encoder's hidden layers, in order. The
            decoder uses them reversed. May be empty, in which case the latent
            heads and the output layer connect directly to input/latent.
        latent_dim: Width of the latent mean / log-variance vectors.
        dropout_rate: Dropout probability used after every hidden block.
    """

    def __init__(self, input_dim, hidden_dims=(1024, 512), latent_dim=256, dropout_rate=0.2):
        super().__init__()
        # Immutable default plus a local copy: the argument is never shared or
        # mutated between instances.
        hidden_dims = list(hidden_dims)

        # Encoder layers
        encoder_layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            encoder_layers.extend(self._block(prev_dim, h_dim, dropout_rate))
            prev_dim = h_dim

        self.encoder = nn.Sequential(*encoder_layers)
        # prev_dim is the last hidden width (or input_dim when there are no
        # hidden layers), so no indexing into hidden_dims is needed.
        self.fc_mu = nn.Linear(prev_dim, latent_dim)
        self.fc_logvar = nn.Linear(prev_dim, latent_dim)

        # Decoder layers
        decoder_layers = []
        prev_dim = latent_dim
        for h_dim in reversed(hidden_dims):
            decoder_layers.extend(self._block(prev_dim, h_dim, dropout_rate))
            prev_dim = h_dim

        self.decoder = nn.Sequential(*decoder_layers)
        self.fc_output = nn.Linear(prev_dim, input_dim)

    @staticmethod
    def _block(in_dim, out_dim, dropout_rate):
        """Return the layers of one hidden block, in application order."""
        return [
            nn.Linear(in_dim, out_dim),
            nn.BatchNorm1d(out_dim),
            nn.LeakyReLU(0.2),
            nn.Dropout(dropout_rate),
        ]

    def encode(self, x):
        """Return ``(mu, logvar)`` of the approximate posterior for ``x``."""
        h = self.encoder(x)
        return self.fc_mu(h), self.fc_logvar(h)

    def reparameterize(self, mu, logvar):
        """Sample ``z = mu + eps * std`` with ``eps ~ N(0, I)`` and ``std = exp(logvar / 2)``."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z):
        """Map latent vectors back to input space, squashed to [0, 1] by a sigmoid."""
        h = self.decoder(z)
        return torch.sigmoid(self.fc_output(h))

    def forward(self, x):
        """Return ``(reconstruction, mu, logvar)`` for the input batch ``x``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

# 3. Better Loss Function with KL annealing
class VAELoss:
    """VAE objective ``recon + beta * KL`` with linear KL annealing.

    ``beta`` rises linearly from ``beta_start`` on the first call towards
    ``beta_end`` over ``beta_steps`` calls, and is exactly ``beta_end`` on
    every call after that (also immediately when ``beta_steps`` is 0).

    Args:
        beta_start: KL weight used on the first call.
        beta_end: KL weight used once annealing has finished.
        beta_steps: Number of calls over which beta is annealed.
    """

    def __init__(self, beta_start=0.0, beta_end=1.0, beta_steps=10000):
        self.beta = beta_start
        self.beta_start = beta_start
        self.beta_end = beta_end
        self.beta_steps = beta_steps
        self.step_count = 0

    def __call__(self, recon_x, x, mu, logvar):
        """Return ``(total_loss_tensor, recon_loss_float, kl_loss_float)``.

        Both terms are summed over the whole batch (``reduction='sum'``).
        """
        recon_loss = F.binary_cross_entropy(recon_x, x, reduction='sum')
        kl_loss = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())

        # Update beta for annealing
        if self.step_count < self.beta_steps:
            self.beta = self.beta_start + (self.beta_end - self.beta_start) * (self.step_count / self.beta_steps)
            self.step_count += 1
        else:
            # Without this branch beta would stay at the last annealed value,
            # (beta_steps - 1) / beta_steps of the way, and never reach beta_end.
            self.beta = self.beta_end

        return recon_loss + self.beta * kl_loss, recon_loss.item(), kl_loss.item()

# 4. Training with Early Stopping and Learning Rate Scheduling
def train_vae(model, X_tensor, num_epochs=100, batch_size=128, learning_rate=1e-3, patience=10):
    """Train a VAE with KL annealing, LR scheduling and early stopping.

    Model selection, early stopping and the LR scheduler all monitor the
    fixed-weight objective ``recon + beta_end * KL``. The annealed training
    loss is not comparable between epochs: it grows merely because beta grows,
    which would freeze the "best" checkpoint at an early, low-beta epoch and
    stop training ``patience`` epochs later.

    Args:
        model: VAE whose forward returns ``(reconstruction, mu, logvar)``.
        X_tensor: 2-D float tensor of training rows.
        num_epochs: Maximum number of epochs.
        batch_size: Mini-batch size.
        learning_rate: Adam learning rate.
        patience: Epochs without improvement of the monitored objective before
            training stops.

    Returns:
        The same ``model`` object, carrying the weights of the best epoch
        whether training stopped early or ran to ``num_epochs``.

    Side effects:
        Writes ``best_vae_model.pt`` and ``vae_loss_curves.png`` to the working
        directory and prints one summary line per epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    dataset = TensorDataset(X_tensor)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=1e-5)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    loss_fn = VAELoss(beta_start=0.0, beta_end=1.0, beta_steps=10000)

    best_score = float('inf')
    best_saved = False  # becomes True once this run has written a checkpoint
    no_improve_epochs = 0
    history = {'total_loss': [], 'recon_loss': [], 'kl_loss': []}

    for epoch in range(num_epochs):
        model.train()
        epoch_total_loss = 0
        epoch_recon_loss = 0
        epoch_kl_loss = 0

        for (data,) in tqdm(loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
            data = data.to(device)
            optimizer.zero_grad()

            recon_batch, mu, logvar = model(data)
            loss, recon_loss, kl_loss = loss_fn(recon_batch, data, mu, logvar)

            loss.backward()
            # Gradient clipping to prevent exploding gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            epoch_total_loss += loss.item()
            epoch_recon_loss += recon_loss
            epoch_kl_loss += kl_loss

        # Per-batch averages (each batch loss is itself a sum over its rows).
        avg_loss = epoch_total_loss / len(loader)
        avg_recon = epoch_recon_loss / len(loader)
        avg_kl = epoch_kl_loss / len(loader)

        history['total_loss'].append(avg_loss)
        history['recon_loss'].append(avg_recon)
        history['kl_loss'].append(avg_kl)

        print(f"Epoch {epoch+1}: Loss = {avg_loss:.4f}, Recon = {avg_recon:.4f}, KL = {avg_kl:.4f}, Beta = {loss_fn.beta:.4f}")

        # Objective with the final KL weight: comparable across all epochs,
        # independent of where the annealing schedule currently is.
        monitored = avg_recon + loss_fn.beta_end * avg_kl
        scheduler.step(monitored)

        # Early stopping
        if monitored < best_score:
            best_score = monitored
            no_improve_epochs = 0
            # Save best model
            torch.save(model.state_dict(), "best_vae_model.pt")
            best_saved = True
        else:
            no_improve_epochs += 1
            if no_improve_epochs >= patience:
                print(f"Early stopping at epoch {epoch+1}")
                break

    # Restore the best weights on every exit path, not only on early stopping.
    # Skipped if nothing was saved (e.g. the loss was never finite), so a stale
    # checkpoint from an earlier run is not loaded by accident.
    if best_saved:
        model.load_state_dict(torch.load("best_vae_model.pt"))

    # Plot loss curves
    plt.figure(figsize=(10, 6))
    plt.plot(history['total_loss'], label='Total Loss')
    plt.plot(history['recon_loss'], label='Reconstruction Loss')
    plt.plot(history['kl_loss'], label='KL Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.title('VAE Training Loss')
    plt.savefig('vae_loss_curves.png')
    plt.close()

    return model

# 5. Improved Text Encoding Function
def encode_texts_with_improved_vae(model, texts, batch_size=64):
    """Embed ``texts`` as the VAE's latent means.

    Texts are vectorised with the module-level fitted ``vectorizer`` and passed
    through ``model.encode``. Only ``mu`` is kept, so the embedding is
    deterministic.

    Args:
        model: Trained VAE exposing ``encode(batch) -> (mu, logvar)``.
        texts: List of strings to embed.
        batch_size: Number of rows per forward pass.

    Returns:
        A numpy array with one latent-mean row per text.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    model.eval()

    # Keep the TF-IDF matrix in the form returned by transform() and densify
    # one batch at a time: a dense copy of the whole corpus
    # (n_texts x n_features) is never needed.
    tfidf = vectorizer.transform(texts)
    n_rows = tfidf.shape[0]

    with torch.no_grad():
        all_mu = []
        for i in range(0, n_rows, batch_size):
            dense_rows = tfidf[i:i+batch_size].toarray()
            batch = torch.tensor(dense_rows, dtype=torch.float32).to(device)
            mu, _ = model.encode(batch)
            all_mu.append(mu.cpu())
        return torch.cat(all_mu, dim=0).numpy()

# 6. Train the Improved VAE
input_dim = X_tfidf.shape[1]
improved_vae = ImprovedVAE(input_dim=input_dim)
trained_vae = train_vae(improved_vae, X_tensor, num_epochs=100, batch_size=128)


def encode_texts(texts, batch_size=32):
    """Embed ``texts`` with the trained VAE.

    The recommendation function calls ``encode_texts([query_text])``. Exposing
    the VAE encoder under that name makes this section usable on its own, like
    the other embedding sections that define the same function.
    """
    return encode_texts_with_improved_vae(trained_vae, texts, batch_size=batch_size)


# 7. Generate embeddings with the improved VAE
embeddings = encode_texts_with_improved_vae(trained_vae, all_texts, batch_size=32)

# Normalize embeddings for cosine similarity. The floor on the norm keeps an
# all-zero embedding from turning into NaNs, and FAISS requires float32.
norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
normalized_embeddings = (embeddings / np.maximum(norm, 1e-12)).astype('float32')

# Build FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # Inner product for cosine similarity with normalized vectors
index.add(normalized_embeddings)

# Save the FAISS index
faiss.write_index(index, "clinical_trials_faiss_improved_vae.index")

# Create a mapping from FAISS index position to PMIDs (needed by the
# recommendation function, and otherwise only defined by the other sections).
index_to_pmid = {i: pid for i, pid in enumerate(all_pmids)}

Epoch 1/100: 100%|██████████| 1269/1269 [00:11<00:00, 109.57it/s]


Epoch 1: Loss = 11948.1858, Recon = 11691.9001, KL = 5557.0481, Beta = 0.1268


Epoch 2/100: 100%|██████████| 1269/1269 [00:10<00:00, 121.94it/s]


Epoch 2: Loss = 8377.4475, Recon = 8019.8708, KL = 1936.8720, Beta = 0.2537


Epoch 3/100: 100%|██████████| 1269/1269 [00:10<00:00, 120.16it/s]


Epoch 3: Loss = 8459.2670, Recon = 8045.6955, KL = 1317.2186, Beta = 0.3806


Epoch 4/100: 100%|██████████| 1269/1269 [00:10<00:00, 122.07it/s]


Epoch 4: Loss = 8567.9318, Recon = 8123.1148, KL = 1007.6896, Beta = 0.5075


Epoch 5/100: 100%|██████████| 1269/1269 [00:10<00:00, 122.22it/s]


Epoch 5: Loss = 8664.1257, Recon = 8205.1994, KL = 806.9428, Beta = 0.6344


Epoch 6/100: 100%|██████████| 1269/1269 [00:10<00:00, 121.50it/s]


Epoch 6: Loss = 8744.5747, Recon = 8284.2494, KL = 661.4830, Beta = 0.7613


Epoch 7/100: 100%|██████████| 1269/1269 [00:10<00:00, 121.49it/s]


Epoch 7: Loss = 8814.7724, Recon = 8359.4903, KL = 553.3254, Beta = 0.8882


Epoch 8/100: 100%|██████████| 1269/1269 [00:10<00:00, 123.11it/s]


Epoch 8: Loss = 8874.2250, Recon = 8432.5494, KL = 465.4062, Beta = 0.9999


Epoch 9/100: 100%|██████████| 1269/1269 [00:10<00:00, 121.90it/s]


Epoch 9: Loss = 8865.4541, Recon = 8423.6901, KL = 441.8081, Beta = 0.9999


Epoch 10/100: 100%|██████████| 1269/1269 [00:10<00:00, 120.95it/s]


Epoch 10: Loss = 8859.5217, Recon = 8412.2709, KL = 447.2956, Beta = 0.9999


Epoch 11/100: 100%|██████████| 1269/1269 [00:10<00:00, 120.66it/s]


Epoch 11: Loss = 8856.7433, Recon = 8407.7039, KL = 449.0843, Beta = 0.9999


Epoch 12/100: 100%|██████████| 1269/1269 [00:10<00:00, 120.08it/s]


Epoch 12: Loss = 8854.3262, Recon = 8404.4439, KL = 449.9273, Beta = 0.9999
Early stopping at epoch 12


# 3 Evaluate recommendations against RELISH labels

In [None]:
##############################################
# 3. Load RELISH Labels and Build Ground-Truth Mapping
##############################################
# The RELISH JSON file contains query PMIDs and their candidate relevance information.
# It is assumed that each entry has a 'pmid' and a 'response' field,
# where response contains lists under keys 'relevant', 'partial', and 'irrelevant'.

def load_labeled_data(json_file_path, num_entries=100):
    """Read a JSON file and return its first ``num_entries`` top-level entries.

    Args:
        json_file_path: Path of the JSON file; its top level must support slicing.
        num_entries: Maximum number of leading entries to keep.

    Returns:
        The leading slice ``[:num_entries]`` of the parsed JSON content.
    """
    with open(json_file_path, 'r') as handle:
        entries = json.load(handle)
    head = entries[:num_entries]
    return head

def extract_pmid_and_responses(labeled_data):
    """Flatten labelled entries into one dict per query.

    Args:
        labeled_data: Iterable of dicts, each with a ``'pmid'`` and a
            ``'response'`` dict.

    Returns:
        A list of dicts with keys ``'pmid'``, ``'relevant'``, ``'partial'`` and
        ``'irrelevant'``. A label missing from the response becomes ``[]``.
    """
    def _as_query(entry):
        pmid, response = entry['pmid'], entry['response']
        query = {'pmid': pmid}
        for label in ('relevant', 'partial', 'irrelevant'):
            query[label] = response.get(label, [])
        return query

    return [_as_query(entry) for entry in labeled_data]

# Location of the RELISH label file; update the path as needed.
json_file_path = '/content/drive/MyDrive/RELISH_v1.json'
labeled_data = load_labeled_data(json_file_path)
queries_list = extract_pmid_and_responses(labeled_data)

# ground_truth: {query_pmid (int): {candidate_pmid (int): score}}
# Scores: relevant = 2, partial = 1, irrelevant = 0.
ground_truth = {}
for entry in queries_list:
    qid = int(entry['pmid'])
    scores = ground_truth[qid] = {}
    for pmid in entry['relevant']:
        scores[int(pmid)] = 2
    for pmid in entry['partial']:
        # A candidate already scored higher (e.g. 2) keeps that score.
        candidate = int(pmid)
        scores[candidate] = max(scores.get(candidate, 0), 1)
    for pmid in entry['irrelevant']:
        # Stored explicitly as 0 (optional, since absence can be treated as 0);
        # this overwrites any score assigned above.
        scores[int(pmid)] = 0



In [None]:
##############################################
# 4. Recommendation Function
##############################################
def recommend_articles(query_title, query_abstract, query_keywords, top_n=5, query_pmid=None):
    """Recommend the ``top_n`` corpus articles most similar to a query article.

    The query text (title + abstract + keywords, whitespace-normalised) is
    embedded with the module-level ``encode_texts`` and searched in the
    module-level FAISS ``index``. ``top_n + 5`` neighbours are retrieved so
    that some can be filtered out.

    Args:
        query_title: Title of the query article.
        query_abstract: Abstract of the query article.
        query_keywords: Keywords of the query article (may be "").
        top_n: Maximum number of PMIDs to return.
        query_pmid: Optional PMID of the query article. When given, exactly
            that article is excluded from the results. When omitted, the
            original heuristic is used: a candidate whose corpus text contains
            the query title (case-insensitive) is treated as the query itself
            and skipped.

    Returns:
        A list of at most ``top_n`` recommended PMIDs, best match first.
    """
    if top_n <= 0:
        return []

    query_text = " ".join((query_title + " " + query_abstract + " " + query_keywords).split())
    query_embedding = encode_texts([query_text])  # returns a numpy array
    faiss.normalize_L2(query_embedding)
    # Retrieve more than top_n to allow filtering.
    _, neighbours = index.search(query_embedding, top_n + 5)

    title_key = query_title.lower()
    # An empty/blank title is a substring of every text and would filter out
    # every candidate, so the title heuristic is only applied when non-blank.
    use_title_filter = query_pmid is None and bool(title_key.strip())
    exclude_pmid = int(query_pmid) if query_pmid is not None else None

    filtered = []
    for idx in neighbours[0]:
        idx = int(idx)
        if idx < 0:
            # FAISS pads the result with -1 when the index holds fewer vectors
            # than were requested; there is no PMID for such a slot.
            continue
        pid = index_to_pmid[idx]
        if exclude_pmid is not None and pid == exclude_pmid:
            continue
        if use_title_filter and title_key in corpus_text.get(pid, "").lower():
            continue
        filtered.append(pid)
        if len(filtered) == top_n:
            break
    return filtered

##############################################
# 5. Ranking Metrics Functions
##############################################
def average_precision_at_k(relevant_pmids, recommended_pmids, k):
    """Compute Average Precision at ``k`` with binary relevance.

    Args:
        relevant_pmids: Collection of PMIDs considered relevant (the caller
            decides what counts as relevant, e.g. graded score >= 1).
        recommended_pmids: Ranked list of recommended PMIDs, best first.
        k: Cut-off rank.

    Returns:
        The sum of precision values at each rank (within the top ``k``) where
        a relevant PMID appears, divided by ``min(#relevant, k)``. Returns 0.0
        when there are no relevant PMIDs or ``k <= 0``. A PMID repeated in the
        recommendations is only credited at its first occurrence, so the
        result never exceeds 1.0.
    """
    if not relevant_pmids or k <= 0:
        return 0.0
    relevant_set = set(relevant_pmids)
    credited = set()  # relevant PMIDs already counted as a hit
    hits = 0
    ap_sum = 0.0
    for rank, pid in enumerate(recommended_pmids[:k], start=1):
        if pid in relevant_set and pid not in credited:
            credited.add(pid)
            hits += 1
            ap_sum += hits / rank
    return ap_sum / min(len(relevant_set), k)

def mean_average_precision(all_relevant_list, all_recommended_list, k):
    """Average the per-query AP@k over paired relevant/recommended lists.

    Args:
        all_relevant_list: One collection of relevant PMIDs per query.
        all_recommended_list: One ranked recommendation list per query,
            aligned with ``all_relevant_list``.
        k: Cut-off rank passed to ``average_precision_at_k``.

    Returns:
        The mean of the per-query AP values, or 0.0 when there are no queries.
    """
    ap_scores = [
        average_precision_at_k(rels, recs, k)
        for rels, recs in zip(all_relevant_list, all_recommended_list)
    ]
    if not ap_scores:
        return 0.0
    return np.mean(ap_scores)

def reciprocal_rank(recommended_pmids, relevant_set):
    """Return 1/rank of the first recommended PMID found in ``relevant_set``.

    Ranks start at 1. Returns 0.0 when no recommended PMID is relevant.
    """
    hit_ranks = (
        rank
        for rank, pid in enumerate(recommended_pmids, start=1)
        if pid in relevant_set
    )
    first_hit = next(hit_ranks, None)
    return 0.0 if first_hit is None else 1.0 / first_hit

def dcg_at_k(recommended_pmids, ground_truth_dict, k):
    """Discounted cumulative gain of the top ``k`` recommendations.

    Each recommended PMID contributes ``score / log2(rank + 1)`` with ranks
    starting at 1. The score is looked up in ``ground_truth_dict`` and is 0
    for PMIDs that are not in it.
    """
    total = 0.0
    for position, pid in enumerate(recommended_pmids[:k]):
        gain = ground_truth_dict.get(pid, 0)
        # position is 0-based, so rank + 1 == position + 2
        total += gain / math.log2(position + 2)
    return total

def ndcg_at_k(recommended_pmids, ground_truth_dict, k):
    """DCG@k divided by the ideal DCG@k.

    The ideal ranking is formed from the ``k`` highest scores among all
    candidates in ``ground_truth_dict``. Returns 0.0 when the ideal DCG is not
    positive.
    """
    actual = dcg_at_k(recommended_pmids, ground_truth_dict, k)
    best_gains = sorted(ground_truth_dict.values(), reverse=True)[:k]
    ideal = sum(gain / math.log2(rank + 1) for rank, gain in enumerate(best_gains, start=1))
    if ideal > 0:
        return actual / ideal
    return 0.0

In [None]:
##############################################
# 6. Evaluation Over Multiple Queries
##############################################
# Evaluate only queries that are in our ground_truth and also appear in our corpus.
query_ids = [qid for qid in ground_truth if qid in corpus_text]
K = 5

if not query_ids:
    # Without this message the metrics below would silently print as nan.
    print("⚠️ No RELISH query PMID is present in the corpus; the metrics below are undefined (nan).")

all_AP = []
all_RR = []
all_NDCG = []
per_query_results = {}

for qid in query_ids:
    # Get query text from corpus_text
    query_text = corpus_text[qid]
    # The original title/abstract/keywords are not looked up here; instead the
    # full text is split at periods, taking the first segment as the title and
    # the rest (re-joined with spaces, periods dropped) as the abstract.
    parts = query_text.split(".")
    query_title = parts[0] if parts else query_text
    # Use the remainder for abstract (keywords might be embedded)
    query_abstract = " ".join(parts[1:]) if len(parts) > 1 else ""
    query_keywords = ""  # If not separately available

    recommended_pmids = recommend_articles(query_title, query_abstract, query_keywords, top_n=K)
    per_query_results[qid] = recommended_pmids

    # For binary metrics (AP and RR), consider candidates with score>=1 as relevant.
    true_relevant_set = {pid for pid, score in ground_truth[qid].items() if score >= 1}

    ap = average_precision_at_k(list(true_relevant_set), recommended_pmids, K)
    # Only the top-K recommendations exist, so the reciprocal rank is 0 when
    # no relevant article appears among them.
    rr = reciprocal_rank(recommended_pmids, true_relevant_set)
    ndcg = ndcg_at_k(recommended_pmids, ground_truth[qid], K)

    all_AP.append(ap)
    all_RR.append(rr)
    all_NDCG.append(ndcg)

# Means as percentages; nan (without a numpy warning) when nothing was evaluated.
MAP5 = np.mean(all_AP) * 100 if all_AP else float('nan')
MRR = np.mean(all_RR) * 100 if all_RR else float('nan')
NDCG5 = np.mean(all_NDCG) * 100 if all_NDCG else float('nan')

print(f"Overall MAP@5: {MAP5:.2f}%")
print(f"Overall MRR: {MRR:.2f}%")
print(f"Overall NDCG@5: {NDCG5:.2f}%")

# Optionally, print some per-query results.
for qid in query_ids[:5]:
    print(f"\nQuery PMID: {qid}")
    print(f"Recommended PMIDs: {per_query_results[qid]}")
    binary_truth = [pid for pid, score in ground_truth[qid].items() if score >= 1]
    print(f"Ground truth relevant PMIDs: {binary_truth}")

Overall MAP@5: 27.28%
Overall MRR: 58.46%
Overall NDCG@5: 30.76%

Query PMID: 22569528
Recommended PMIDs: [19583964, 18280112, 19720745, 19402821, 21413931]
Ground truth relevant PMIDs: [17928366, 18562239, 19052640, 19060905, 19242111, 19244124, 19414607, 19805545, 19816936, 20079430, 20811985, 22028468, 22177953, 23549785, 23712012, 24089523, 25350931, 26235619, 27376062, 28474232, 29454854]

Query PMID: 23613754
Recommended PMIDs: [27739137, 22540148, 18362916, 22260668, 20675210]
Ground truth relevant PMIDs: [18818436, 20022960, 20675210, 22085933, 25533345, 25690936, 29061959, 29304842, 22307056]

Query PMID: 29409062
Recommended PMIDs: [23667458, 23281855, 20637083, 29695703, 20487513]
Ground truth relevant PMIDs: [18443018, 19772615, 22916718, 23281855, 24931993, 26355502, 28570104, 18593717, 19087303, 19237334, 20637083, 21609501, 21846404, 22080466, 22761950, 22927994, 22962469, 23229795, 23514199, 23868775, 24726865, 26455801, 27153661, 27506132, 27571416, 28113697, 28937982]