#### 1. install EDirect
sh -c "$(curl -fsSL https://ftp.ncbi.nlm.nih.gov/entrez/entrezdirect/install-edirect.sh)"

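#### 2. Fetch all matching PubMed IDs (the notebook below reads them from `data/pmids.csv`, so move or redirect the output there)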
esearch -db pubmed -query "intelligence[tiab] AND 2013:2023[dp] AND hasabstract" | efetch -format uid > pmids.csv

#### 3. Fetch the abstract of each article by its PMID, as shown below

In [3]:
import os

import pandas as pd
from Bio import Entrez, Medline
from opensearchpy import OpenSearch


# --- Configuration ----------------------------------------------------------
# The contact e-mail and the OpenSearch connection settings can be overridden
# through environment variables, so they do not have to be edited (or
# committed) in the notebook. The fallbacks are the values that were
# previously hard-coded here.

# Contact address attached to the Entrez requests made through Biopython.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "huqiaowen0104@gmail.com")

# CSV file holding the PubMed IDs to download.
pmids_file_path = 'data/pmids.csv'

# OpenSearch index that receives the PubMed documents.
index_name = "pubmed_intelligence"

host = os.environ.get("OPENSEARCH_HOST", '127.0.0.1')
port = int(os.environ.get("OPENSEARCH_PORT", "9200"))
username = os.environ.get("OPENSEARCH_USER", 'admin')
password = os.environ.get("OPENSEARCH_PASSWORD", 'admin')

# NOTE(review): TLS is on, but certificate and hostname verification are
# switched off and the related warning is silenced. That looks intended for a
# local development cluster; confirm before pointing this at anything else.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_auth=(username, password),
    use_ssl=True,
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
    timeout=30,
)

In [None]:
def fetch_pubmed_record(pmid):
    """Download one PubMed article in MEDLINE format and flatten it to a dict.

    Args:
        pmid: PubMed identifier of the article to fetch.

    Returns:
        A dict with the keys ``PMID``, ``Title``, ``Abstract``, ``Keywords``,
        ``Authors``, ``PubDateEDAT``, ``ArticleDate`` and ``Journal``.
        Missing MEDLINE fields become ``""`` (or ``[]`` for the list-valued
        ``Keywords`` / ``Authors``). Returns ``None`` when the record cannot
        be fetched or parsed; the error is printed rather than raised.
    """
    handle = None
    try:
        # Fetch the record from PubMed
        handle = Entrez.efetch(db="pubmed", id=pmid, rettype="medline", retmode="text")
        record = next(Medline.parse(handle))

        # EDAT is a single string such as "2015/12/17 06:00"; keep the date part.
        entrez_date = record.get("EDAT", "")

        # CRDT is indexed with [0] below, i.e. it arrives as a list of strings.
        # Default to an empty list so a record without CRDT is kept with an
        # empty ArticleDate instead of being discarded by an IndexError.
        created = record.get("CRDT", [])
        if isinstance(created, str):
            # Defensive: tolerate a parser that hands CRDT back as plain text.
            created = [created]
        article_date = created[0].split(" ")[0] if created else ""

        return {
            'PMID': record.get("PMID", ""),
            'Title': record.get("TI", ""),
            'Abstract': record.get("AB", ""),
            'Keywords': record.get("OT", []),
            'Authors': record.get("AU", []),
            'PubDateEDAT': entrez_date.split(" ")[0],
            'ArticleDate': article_date,
            'Journal': record.get("JT", ""),
        }
    except Exception as e:
        # Best-effort by design: report the problem and let the caller decide
        # what to do with a missing record.
        print(f"Error: {e}")
        return None
    finally:
        # Release the HTTP response whether or not parsing succeeded.
        if handle is not None:
            handle.close()

In [2]:
# Create the index, using the JSON file below as the request body.
with open("pubmed_intelligence_mappings.json", "r") as mapping_file:
    mapping_json = mapping_file.read()

try:
    # Pass the index name by keyword: this works on every opensearch-py
    # release, whereas newer releases appear to make the parameter
    # keyword-only (TODO confirm against the installed version).
    response = client.indices.create(index=index_name, body=mapping_json)
    print("Creating index:")
    print(response)
except Exception as e:
    # Best-effort: report the failure (for example, the index may already
    # exist) and let the rest of the notebook continue.
    print(e)


Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'pubmed_intelligence'}


In [None]:
from datetime import datetime

# Download every PMID listed in the CSV file and index the records into
# OpenSearch, one bulk request per batch of PMIDs.
pmids_df = pd.read_csv(pmids_file_path,header=None,names=['pmid'])
print('number or docs:',len(pmids_df))

batch_size = 100
for i in range(0, len(pmids_df), batch_size):
    pmids = pmids_df['pmid'][i:i+batch_size].tolist()
    start_t = datetime.now()

    # The bulk body alternates an action line with the document it applies to.
    data_batch = []
    for pmid in pmids:
        record_data = fetch_pubmed_record(pmid)
        if record_data is None:
            # fetch_pubmed_record returns None on failure; skipping keeps one
            # bad record from aborting the whole run with a TypeError.
            print(f"Skipping PMID {pmid}: record could not be fetched")
            continue
        data_batch.append({"index": {"_index": index_name, "_id": record_data["PMID"]}})
        data_batch.append(record_data)

    if not data_batch:
        # Nothing fetched for this batch, so there is nothing to send.
        continue

    try:
        # Bulk index the data
        response = client.bulk(body=data_batch, index=index_name)
        end_t = datetime.now()
        if response.get("errors"):
            # A bulk request can succeed as a whole while individual items
            # are rejected; count those instead of reporting success.
            failed = sum(
                1 for item in response.get("items", [])
                if item.get("index", {}).get("error")
            )
            print(f"Error: {failed} record(s) in the batch starting from {i} were rejected by opensearch")
        else:
            print(f"Records starting from {i}+ are stored into opensearch. Cost {(end_t-start_t).total_seconds()} seconds")
    except Exception as e:
        print(f"Error: {e}")

In [4]:
import json

# Load the PubMed records from a local JSON export (a list of record dicts,
# each carrying a "PMID" key) and bulk-index them into OpenSearch.
file_path = 'data/pubmed_intelligence.json'

# Open the file and load its contents (JSON text is UTF-8 by specification,
# so do not rely on the platform's default encoding).
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

print(len(data))
if data:
    # Show one sample record; guarded so an empty export does not crash.
    print(data[0])

batch_size = 10000
for i in range(0, len(data), batch_size):
    # The bulk body alternates an action line with the document it applies to.
    data_batch = []
    for record_data in data[i:i+batch_size]:
        data_batch.append({"index": {"_index": index_name, "_id": record_data["PMID"]}})
        data_batch.append(record_data)

    try:
        # Bulk index the data
        response = client.bulk(body=data_batch, index=index_name)
        if response.get("errors"):
            # A bulk request can succeed as a whole while individual items
            # are rejected; count those instead of reporting success.
            failed = sum(
                1 for item in response.get("items", [])
                if item.get("index", {}).get("error")
            )
            print(f"Error: {failed} record(s) in the batch starting from {i} were rejected by opensearch")
        else:
            print(f"Records starting from {i}+ are stored into opensearch.")
    except Exception as e:
        print(f"Error: {e}")

58730
{'PMID': '26665339', 'Title': 'Using perioperative analytics to optimize OR performance.', 'Abstract': 'In the past, the data hospitals gleaned from operating rooms (ORs) tended to be static and lacking in actionable information. Hospitals can improve OR performance by applying OR analytics, such as evaluation of turnover times and expenses, which provide useful intelligence. Having the information is important, but success depends on aligning staff behavior to effectively achieve improvement strategies identified using the analytics.', 'Keywords': [], 'Authors': ['Rempfer D'], 'PubDateEDAT': '2015/12/17', 'ArticleDate': '2015/12/16', 'Journal': 'Healthcare financial management : journal of the Healthcare Financial Management Association'}
Records starting from 0+ are stored into opensearch.
Records starting from 10000+ are stored into opensearch.
Records starting from 20000+ are stored into opensearch.
Records starting from 30000+ are stored into opensearch.
Records starting fro