In [6]:
import torch
from transformers import AutoTokenizer, AutoModel
import faiss
import numpy as np
import pandas as pd

# Load the pre-trained BioBERT model and tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.eval()

def get_embedding(text):
    """Generate BioBERT embeddings for a given text."""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state[:, 0, :].squeeze().numpy()

# Load the cleaned dataset
cleaned_df = pd.read_csv("cleaned_indiana_dataset.csv")

# Combine 'findings' and 'impression' for embedding
cleaned_df['text'] = cleaned_df['findings'] + " " + cleaned_df['impression']

# Generate embeddings for all reports
embeddings = np.array([get_embedding(text) for text in cleaned_df['text']])

# Create a FAISS index for fast similarity search
d, num_embeddings = embeddings.shape
index = faiss.IndexFlatL2(d)
index.add(embeddings)

def search_similar_cases(query, top_k=5):
    """Retrieve similar cases given a textual query."""
    query_embedding = get_embedding(query).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)
    return cleaned_df.iloc[indices[0]]

print("Text retrieval system is ready! Use search_similar_cases(query) to find related cases.")

AssertionError: 

In [5]:
# Sample query for testing
sample_query = "Patient shows signs of pneumonia with bilateral lung opacity."

# Retrieve similar cases
similar_cases = search_similar_cases(sample_query, top_k=3)

# Display the retrieved cases
print(similar_cases[['uid', 'findings', 'impression']])

NameError: name 'search_similar_cases' is not defined

In [4]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import re

# Load BioBERT model and tokenizer
biobert_model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(biobert_model_name)
model = AutoModel.from_pretrained(biobert_model_name)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load cleaned CSV file
csv_path = "cleaned_indiana_dataset.csv"  # Update the path if needed
df = pd.read_csv(csv_path)

# Text Cleaning Function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"\b(?:none|not provided|normal chest x-xxxx)\b", "", text)
    text = re.sub(r"[^a-z0-9\s.,]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Apply text cleaning
df["cleaned_findings"] = df["findings"].apply(clean_text)
df["cleaned_impression"] = df["impression"].apply(clean_text)

# Function to generate BioBERT embeddings
def get_biobert_embedding(text):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}  # Move inputs to GPU if available
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Use the mean of the last hidden state as the sentence embedding
    embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return embedding

# Generate embeddings for all samples
df["biobert_embedding"] = df["cleaned_findings"].apply(lambda x: get_biobert_embedding(x) if x else np.zeros(768))

# Convert embeddings to numpy array
embeddings = np.vstack(df["biobert_embedding"].values)

# Save embeddings for later retrieval
np.save("biobert_embeddings.npy", embeddings)

# Save the updated dataset
df.to_csv("cleaned_indiana_dataset_with_embeddings.csv", index=False)

print("BioBERT embeddings saved successfully!")

BioBERT embeddings saved successfully!
