# 1. QA retrieval with HealthCareMagic-100k-QA
100k real conversations between patients and doctors from HealthCareMagic.com

In [3]:
import json
import numpy as np
import torch
import os
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm

def load_json(filename):
    """Read ``filename`` as UTF-8 text and return the parsed JSON content."""
    with open(filename, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

def save_json(data, filename):
    """Save JSON data to file without ever leaving a half-written file behind.

    The data is serialised (UTF-8, non-ASCII kept verbatim, 2-space indent)
    to a sibling ``<filename>.tmp`` file first and then moved over
    ``filename`` with ``os.replace``. If serialisation fails or the process
    is interrupted mid-write, the previous content of ``filename`` is left
    untouched and the temporary file is removed.

    Args:
        data: Any JSON-serialisable object.
        filename: Destination path (``str`` or ``os.PathLike``).

    Raises:
        TypeError / ValueError: If ``data`` cannot be serialised by ``json``.
        OSError: If the temporary file cannot be written or moved.
    """
    target = os.fspath(filename)
    tmp_path = target + '.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as file:
            json.dump(data, file, ensure_ascii=False, indent=2)
        # Swap the complete file into place in a single step.
        os.replace(tmp_path, target)
    except BaseException:
        # BaseException so that Ctrl-C / kernel interrupt also cleans up.
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # temp file was never created or is already gone
        raise

def encode_text(text, model, tokenizer):
    """Encode text into a BERT embedding (hidden state of the first token).

    Args:
        text: Text to encode; it is truncated to at most 512 tokens.
        model: Callable model whose output exposes ``last_hidden_state``.
        tokenizer: Tokenizer called with ``return_tensors='pt'``.

    Returns:
        A plain ``list`` of Python floats (JSON-serialisable) holding the
        position-0 hidden state of the first sequence in the batch.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # Keep the inputs on the same device as the model; otherwise the forward
    # pass fails as soon as the model has been moved off the CPU.
    device = getattr(model, 'device', None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    first_token_state = outputs.last_hidden_state[:, 0, :]
    # .cpu() is required before .numpy() when the tensor lives on an accelerator.
    # Convert to list for JSON serialization
    return first_token_state[0].detach().cpu().numpy().tolist()

def preprocess_dataset_with_embeddings(json_file, model, tokenizer, batch_size=100):
    """
    Add an "embedding" field to every entry of the dataset and persist it.

    The whole dataset is loaded into memory, entries are processed in
    batches of ``batch_size``, and the complete dataset is written back to
    ``json_file`` after every batch that changed. Those writes act as
    checkpoints: entries that already carry an "embedding" are skipped, so an
    interrupted run can simply be restarted.

    Args:
        json_file: Path (str) of the JSON list of entries; each entry needs
            "input" and "output" string fields.
        model: Model handed to ``encode_text``.
        tokenizer: Tokenizer handed to ``encode_text``.
        batch_size: Number of entries between two checkpoints; must be > 0.

    Returns:
        The in-memory list of entries, including their embeddings.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        KeyError: If an entry without an embedding lacks "input" or "output".
    """
    # A zero step makes range() fail cryptically and a negative one would
    # silently process nothing, so reject both up front.
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Load data
    data = load_json(json_file)

    # Create a backup of the file as loaded, unless one already exists
    backup_file = json_file + '.backup'
    if not os.path.exists(backup_file):
        save_json(data, backup_file)
        print(f"Original data backed up to {backup_file}")

    for start in tqdm(range(0, len(data), batch_size), desc="Processing batches"):
        modified = False

        # Slicing clamps at the end of the list, so no explicit bound is needed.
        for entry in data[start:start + batch_size]:
            # Skip if already has embedding
            if "embedding" in entry:
                continue
            # Question and answer are embedded together as one text.
            text = entry["input"] + " " + entry["output"]
            entry["embedding"] = encode_text(text, model, tokenizer)
            modified = True

        if modified:
            # Checkpoint: this rewrites the complete dataset, not just the batch.
            save_json(data, json_file)

    return data

def find_best_matches_from_preprocessed(json_file, query, model, tokenizer, k=3):
    """Return the ``k`` answers whose stored embedding is closest to ``query``.

    The preprocessed dataset is loaded from ``json_file``, the query is
    embedded with ``encode_text``, and every entry that carries an
    "embedding" is scored by cosine similarity against it. Entries without an
    embedding are ignored.

    Returns:
        A list of ``(output_text, similarity)`` tuples, best match first,
        containing at most ``k`` items.
    """
    data = load_json(json_file)
    query_embedding = np.array(encode_text(query, model, tokenizer))

    # Score the entries one by one, remembering each entry's position in `data`.
    scored = []
    for position, entry in enumerate(tqdm(data, desc="Calculating similarities")):
        if "embedding" not in entry:
            continue
        candidate = np.array(entry["embedding"])
        score_matrix = cosine_similarity([query_embedding], [candidate])
        scored.append((position, score_matrix[0][0]))

    # Highest similarity first.
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)

    # Never take more than is available; a negative k yields no results.
    top_n = max(min(k, len(ranked)), 0)
    return [(data[idx]["output"], score) for idx, score in ranked[:top_n]]


In [None]:
# Driver cell for section 1: embed the dataset once, then rank answers for a query.
json_file = "Dataset/HealthCareMagic-100k-QA.json"
query = "My head is spinning when I stand up, but not when sitting. I also feel nauseous."
# Number of top-ranked answers to retrieve and print.
k = 10

# Initialize BERT model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

# First time: preprocess and add embeddings to JSON file
# (entries that already contain an "embedding" are skipped, so re-running only
# fills in what is missing)
preprocess_dataset_with_embeddings(json_file, model, tokenizer, batch_size=100)

# After preprocessing: find matches using saved embeddings
best_matches = find_best_matches_from_preprocessed(json_file, query, model, tokenizer, k)

# Each match is an (answer text, similarity score) pair, best match first.
for i, (answer, score) in enumerate(best_matches):
    print(f"Rank {i+1} | Score: {score:.4f}\n{answer}\n")

# 2. ClinicalBERT, a language model for the medical domain
As described by the model's authors, ClinicalBERT was trained on a large multicenter dataset comprising a corpus of 1.2B words covering diverse diseases. The base language model was then fine-tuned on a large-scale corpus of EHRs from over 3 million patient records.

In [None]:
# Load the ClinicalBERT tokenizer and model.
# This rebinds the notebook-level names `tokenizer` and `model`, which the
# section 1 cell bound to "bert-base-uncased".
# NOTE(review): embeddings already cached in the QA JSON file were produced
# with bert-base-uncased; querying them with this model would presumably mix
# two embedding spaces — confirm before reusing the cached file.
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("medicalai/ClinicalBERT")
model = AutoModel.from_pretrained("medicalai/ClinicalBERT")

# 3. PubMed, Disease - Symptom pair dataset
Get the Disease - Symptom pairs from the PubMed dataset.


## 3.1 Download dataset


In [None]:
import os
import gzip
import shutil
import urllib.request
from tqdm.notebook import tqdm

# Base URL for PubMed baseline files
base_url = "https://ftp.ncbi.nlm.nih.gov/pubmed/baseline/"

# Define download directory
download_dir = "Dataset/pubmed_data"

# Create directory for downloaded files if it doesn't exist
os.makedirs(download_dir, exist_ok=True)

# Download and extract files, in total 1274 files
# (only files 0001-0009 are fetched here; widen the range to get more)
for i in tqdm(range(1, 10), desc="Processing files"):
    file_num = str(i).zfill(4)
    gz_filename = f"pubmed25n{file_num}.xml.gz"
    xml_filename = gz_filename[:-3]  # Remove .gz extension
    xml_filepath = os.path.join(download_dir, xml_filename)
    gz_filepath = os.path.join(download_dir, gz_filename)
    # Extraction goes to a ".part" file first, so an interrupted run never
    # leaves a truncated .xml that the existence check below would accept.
    part_filepath = xml_filepath + ".part"

    # Skip if XML already exists
    if os.path.exists(xml_filepath):
        continue

    url = base_url + gz_filename

    try:
        # Download gz file
        urllib.request.urlretrieve(url, gz_filepath)

        # Extract gz file, streaming in chunks instead of reading the whole
        # decompressed document into memory
        with gzip.open(gz_filepath, 'rb') as gz_file, open(part_filepath, 'wb') as xml_file:
            shutil.copyfileobj(gz_file, xml_file)

        # Publish the fully extracted file under its final name
        os.replace(part_filepath, xml_filepath)

    except Exception as e:
        # Best effort: report the failure and move on to the next file
        tqdm.write(f"Error processing {gz_filename}: {str(e)}")

    finally:
        # Remove the archive and any partial extraction, on success or failure
        for leftover in (gz_filepath, part_filepath):
            try:
                os.remove(leftover)
            except FileNotFoundError:
                pass  # never created, or already renamed into place
            except OSError as e:
                tqdm.write(f"Could not remove {leftover}: {str(e)}")

## 3.2 Extract the PMID and abstract

In [None]:
import os
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
import pandas as pd

# Directory containing the XML files
xml_dir = "Dataset/pubmed_data"


def _extract_abstract(article):
    """Return the full abstract text of one article element, or None.

    The first element (in document order) that has ``AbstractText`` children
    is used. All of its ``AbstractText`` sections are joined with a single
    space, and ``itertext()`` keeps text that sits inside or after inline
    markup, which ``.text`` alone would drop.
    """
    container = next(
        (elem for elem in article.iter() if elem.find('AbstractText') is not None),
        None,
    )
    if container is None:
        return None

    sections = (
        "".join(section.itertext()).strip()
        for section in container.findall('AbstractText')
    )
    return " ".join(text for text in sections if text) or None


# List to store extracted data
papers = []

# Process each XML file (sorted so the row order is the same on every run)
for filename in tqdm(sorted(os.listdir(xml_dir)), desc="Processing batches"):
    if not filename.endswith('.xml'):
        continue

    filepath = os.path.join(xml_dir, filename)

    try:
        # Parse the XML file
        tree = ET.parse(filepath)
        root = tree.getroot()

        # Extract data from each article
        for article in root.findall('.//PubmedArticle'):
            try:
                # Extract PMID (first PMID element found within the article)
                pmid_elem = article.find('.//PMID')
                pmid = pmid_elem.text if pmid_elem is not None else None

                # Extract abstract (all sections, including inline-marked-up text)
                abstract = _extract_abstract(article)

                # Only add if we have an abstract
                if abstract:
                    papers.append({
                        'pmid': pmid,
                        'abstract': abstract
                    })
            except Exception as e:
                tqdm.write(f"Error extracting data from article: {str(e)}")
                continue

    except Exception as e:
        tqdm.write(f"Error processing file {filename}: {str(e)}")
        continue

# Create DataFrame from the extracted data; explicit columns keep the CSV
# header intact even when no paper was extracted
papers_df = pd.DataFrame(papers, columns=['pmid', 'abstract'])

# Display the first few rows (printed explicitly: a bare expression is only
# rendered when it is the last statement of a notebook cell)
print(f"Total papers extracted: {len(papers_df)}")
print(papers_df.head())

# Save to CSV
papers_df.to_csv("Dataset/pubmed_papers.csv", index=False)
print("Data saved to Dataset/pubmed_papers.csv")


## 3.3 Generate the disease - symptom dataset
Generate the disease - symptom dataset by learning from the abstracts.