In [1]:
# https://pmc.ncbi.nlm.nih.gov/articles/PMC8374293/

In [2]:
from Bio import Entrez
from datetime import datetime
import json
import os  

# NCBI E-utilities expect every client to identify itself with a contact e-mail.
# The address can be overridden through the ENTREZ_EMAIL environment variable so the
# notebook can be run by someone else without editing the code; when the variable is
# not set, the original address is used.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "yilianz@uci.edu")

# Convert a disease name into a string that is safe to use as a file name
def sanitize_filename(name):
    """Return ``name`` with every unsafe character replaced by an underscore.

    A character is kept unchanged when it is alphanumeric (``str.isalnum``) or
    is one of ``-`` / ``_``; every other character (spaces, slashes,
    punctuation, ...) becomes ``_``.  The result therefore has the same length
    as the input.
    """
    kept_punctuation = ("-", "_")
    safe_chars = []
    for ch in name:
        if ch.isalnum() or ch in kept_punctuation:
            safe_chars.append(ch)
        else:
            safe_chars.append("_")
    return "".join(safe_chars).strip()

# Split the list of diseases into n pieces
def split_list(lst, n):
    """Split ``lst`` into ``n`` consecutive chunks whose lengths differ by at most one.

    The first ``len(lst) % n`` chunks receive one extra element.  When ``n`` is
    larger than ``len(lst)`` the trailing chunks are empty.  ``n == 0`` raises
    ``ZeroDivisionError`` (from ``divmod``).
    """
    base_size, remainder = divmod(len(lst), n)
    chunks = []
    cursor = 0
    for index in range(n):
        # The leading `remainder` chunks each absorb one of the leftover items.
        size = base_size + 1 if index < remainder else base_size
        chunks.append(lst[cursor:cursor + size])
        cursor += size
    return chunks



def _ensure_parent_dir(path):
    """Create the directory that will contain ``path`` if it does not exist yet."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def search_and_save_all_pubmed(disease, filename, json_filename, abnormal_file, start_year=2015,
                               abnormal_threshold=1000, disease_dir='./pubmed_search/all_diseases'):
    """Search PubMed for EHR/EMR articles about ``disease`` and save the MEDLINE records.

    The function
      1. builds the query ``(EHR/EMR synonyms) AND (<disease>) AND <date range>``,
      2. runs it with ``Entrez.esearch`` (history server enabled),
      3. appends the query and its hit count to the JSON summary ``json_filename``,
      4. if the hit count is at or above ``abnormal_threshold``, logs the disease
         to ``abnormal_file`` and stops without downloading anything,
      5. otherwise downloads all hits in MEDLINE text format and writes them to
         ``<disease_dir>/<sanitized disease name>.txt`` (overwriting any previous file).

    Parameters
    ----------
    disease : str
        Disease name; it is lower-cased before being inserted in the query.
    filename : str
        Currently unused (the code that appended all records to one combined
        file is disabled); kept so existing callers keep working.
    json_filename : str
        Path of the JSON summary.  It is created when missing and has the keys
        ``"count"`` (sum of all hit counts), ``"true_count"`` (sum of hit counts
        below the threshold only) and ``"results"`` (one entry per search).
    abnormal_file : str
        Text file to which ``"<disease>: <count>"`` lines are appended for
        searches whose hit count reaches the threshold.
    start_year : int, optional
        First publication year included in the date filter (default 2015).
    abnormal_threshold : int, optional
        Hit count at or above which a search is treated as abnormal (default 1000).
    disease_dir : str, optional
        Directory receiving the per-disease MEDLINE files.

    Returns
    -------
    None
    """
    # Base search term: EHR/EMR synonyms combined with the disease name.
    base_search_term = (
        "(electronic health record OR electronic health records OR electronic medical record OR electronic medical records OR EHR OR EMR) "
        "AND ({rare_disease}) "
        # Optional NLP/ML filter, currently disabled:
        #"AND (large language model OR language model OR NLP OR natural language processing OR language processing OR machine learning OR artificial intelligence OR predictive modeling OR deep learning)"
    )
    search_term = base_search_term.format(rare_disease=disease.lower())

    # Publication-date filter: from start_year up to an open-ended upper bound.
    date_range = f"{start_year}/01/01:3000/12/31[dp]"
    search_query = f"{search_term} AND {date_range}"

    # usehistory="y" stores the result set on the NCBI history server so it can
    # be fetched below through WebEnv/QueryKey.
    with Entrez.esearch(db="pubmed", term=search_query, retmax=100000, usehistory="y") as search_handle:
        search_results = Entrez.read(search_handle)

    total_count = int(search_results["Count"])
    is_abnormal = total_count >= abnormal_threshold

    # Append the number of search results to the JSON summary.
    if os.path.exists(json_filename):
        with open(json_filename, "r", encoding="utf-8") as json_file:
            existing_data = json.load(json_file)
    else:
        existing_data = {"true_count": 0, "count": 0, "results": []}

    existing_data["results"].append({"disease_name": disease, "search_query": search_query, "count": total_count})

    # "count" accumulates every hit; "true_count" leaves out abnormal searches,
    # i.e. it only counts the records that are actually downloaded.
    existing_data["count"] += total_count
    if not is_abnormal:
        existing_data["true_count"] += total_count

    # The summary is written before any download, so make sure its folder exists.
    _ensure_parent_dir(json_filename)
    with open(json_filename, "w", encoding="utf-8") as json_file:
        json.dump(existing_data, json_file, indent=4)

    if total_count == 0:
        return

    # Record abnormal searches and skip the download.
    if is_abnormal:
        print(f"Abnormal count detected for {disease}: {total_count}.")
        _ensure_parent_dir(abnormal_file)
        # Explicit UTF-8: disease names may contain non-ASCII characters.
        with open(abnormal_file, "a", encoding="utf-8") as ab_file:
            ab_file.write(f"{disease}: {total_count}\n")
        return

    # Use WebEnv and QueryKey for batch fetching.
    webenv = search_results["WebEnv"]
    query_key = search_results["QueryKey"]

    batch_size = 10000
    batches = []
    for start in range(0, total_count, batch_size):
        with Entrez.efetch(
            db="pubmed",
            rettype="medline",
            retmode="text",
            retstart=start,
            retmax=batch_size,
            webenv=webenv,
            query_key=query_key
        ) as fetch_handle:
            batches.append(fetch_handle.read())
    all_data = "".join(batches)

    # Write the records to a separate file per disease.
    os.makedirs(disease_dir, exist_ok=True)
    disease_file = os.path.join(disease_dir, f"{sanitize_filename(disease)}.txt")
    # Explicit UTF-8: MEDLINE records contain non-ASCII text, and the platform
    # default encoding may not be able to represent it.
    with open(disease_file, "w", encoding="utf-8") as file:
        file.write(all_data)


In [3]:
# Location of the Orphanet rare-disease export.
path_file = "./disease_json/data.json"

# Parse the file and keep only the "Name" field of every entry listed under "disease".
with open(path_file, mode="r", encoding="utf-8") as file:
    disease_data = json.loads(file.read())

rare_diseases = [entry["Name"] for entry in disease_data["disease"]]

In [None]:
# One timestamp shared by every output file of this run, so they can be matched up.
run_timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

# The search function writes the JSON summary before it creates any folder,
# so the output directory has to exist up front.
os.makedirs("./pubmed_search", exist_ok=True)

output_json_filename = f"./pubmed_search/pubmed_search_summary_{run_timestamp}.json"
# Combined-results file; the search function accepts this path but does not write to it at present.
output_filename = f"./pubmed_search/pubmed_search_results_{run_timestamp}.txt"
# Log of diseases whose hit count was too large to download.
abnormal_filename = f"./pubmed_search/pubmed_search_abnormal_{run_timestamp}.txt"

for disease in rare_diseases:
    search_and_save_all_pubmed(disease, output_filename, output_json_filename, abnormal_filename)
