In [25]:
import requests
import time
import pandas as pd
import numpy as np
import os
from requests.exceptions import ConnectionError, Timeout

# Function to fetch PubMed article details for a single PubMed ID with retry mechanism
def fetch_pubmed_details(pubmed_id, retries=3, delay=8):
    """Fetch the EFetch XML record for one PubMed ID, retrying transient failures.

    Args:
        pubmed_id: PubMed identifier sent as the ``id`` query parameter.
        retries: Maximum number of HTTP attempts.
        delay: Seconds to wait before the next attempt. It is doubled after
            every HTTP 429 response and left unchanged after a connection
            error or timeout.

    Returns:
        The response body text on HTTP 200. ``None`` when the server answers
        with any status other than 200/429 (no retry is attempted) or when
        every attempt failed.
    """
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    params = {
        'db': 'pubmed',
        'id': pubmed_id,
        'retmode': 'xml',
        'rettype': 'abstract'
    }

    for attempt in range(retries):
        # Sleeping only makes sense when another request will follow; waiting
        # after the final attempt just delays the failure report.
        has_attempts_left = attempt + 1 < retries

        try:
            response = requests.get(efetch_url, params=params, timeout=10)
        except (ConnectionError, Timeout) as e:
            if not has_attempts_left:
                print(f"Connection error for PubMed ID {pubmed_id}: {e}.")
                break
            print(f"Connection error for PubMed ID {pubmed_id}: {e}. Retrying in {delay} seconds... (Attempt {attempt+1}/{retries})")
            time.sleep(delay)
            continue

        if response.status_code == 200:
            return response.text

        if response.status_code != 429:
            # Any other status is treated as permanent: report and give up.
            print(f"Error fetching details for PubMed ID {pubmed_id}: {response.status_code}")
            return None

        if not has_attempts_left:
            print(f"Rate limit hit for PubMed ID {pubmed_id}.")
            break
        print(f"Rate limit hit for PubMed ID {pubmed_id}. Retrying in {delay} seconds...")
        time.sleep(delay)
        delay *= 2  # Exponential backoff

    print(f"Failed to fetch details for PubMed ID {pubmed_id} after {retries} attempts.")
    return None

# Function to fetch details for the PubMed IDs in the `pmid_list` column
def _is_missing_scalar(value):
    """Return True when ``value`` is a scalar missing marker (None, NaN, pd.NA, NaT)."""
    return pd.api.types.is_scalar(value) and bool(pd.isna(value))


def fetch_details_for_pmid_list(row, processed_papers_counter):
    """Fetch PubMed details for every usable ID in ``row['pmid_list']``.

    Args:
        row: Mapping-like object exposing ``.get`` (e.g. a ``pd.Series`` row or
            a dict) whose ``'pmid_list'`` entry holds the PubMed IDs.
        processed_papers_counter: One-element list used as a mutable counter;
            element 0 is incremented once per ID that was actually requested,
            whether or not the request succeeded.

    Returns:
        A list with the fetched details of every successful request, or
        ``None`` when the row has no usable IDs or no request succeeded.
    """
    pmid_list = row.get('pmid_list', [])
    if isinstance(pmid_list, (np.ndarray, pd.Series)):
        pmid_list = pmid_list.tolist()

    # A missing cell can arrive as NaN / pd.NA rather than an empty list.
    # NaN is truthy (and pd.NA refuses truth-testing), so it has to be
    # rejected explicitly before the emptiness check and the iteration below.
    if _is_missing_scalar(pmid_list) or not pmid_list:
        return None

    paper_details = []
    for pubmed_id in pmid_list:
        # Skip placeholders: missing markers, empty values and the literal 'None'.
        if _is_missing_scalar(pubmed_id) or not pubmed_id or pubmed_id == 'None':
            continue

        details = fetch_pubmed_details(pubmed_id)
        if details:
            paper_details.append(details)

        # Increment processed paper counter
        processed_papers_counter[0] += 1
        print(f"Processed {processed_papers_counter[0]} papers so far.")

        # Respect API rate limits with a delay
        time.sleep(0.5)

    return paper_details if paper_details else None

# Function to save progress to a Parquet file
def save_progress_parquet(df, file_path):
    """Write ``df`` to a Parquet checkpoint without endangering the previous one.

    The data is first written to a sibling ``<name>.parquet.tmp`` file and then
    moved over the destination with ``os.replace``. A failed or interrupted
    write therefore leaves the last good checkpoint untouched.

    Args:
        df: Object exposing ``to_parquet(path, index=False)`` (a DataFrame).
        file_path: Destination path; whatever extension it carries is replaced
            by ``.parquet``.

    Returns:
        None. Failures are reported on stdout instead of being raised, so a
        long-running loop is not aborted by a failed auto-save.
    """
    # Ensure file path ends with .parquet
    parquet_file_path = os.path.splitext(file_path)[0] + '.parquet'
    temp_file_path = parquet_file_path + '.tmp'

    try:
        df.to_parquet(temp_file_path, index=False)
        # Swap the finished file into place in a single step.
        os.replace(temp_file_path, parquet_file_path)
        print(f"Auto-saved progress to {parquet_file_path}")
    except Exception as e:
        print(f"Failed to save progress to {parquet_file_path}. Error: {e}")
        # Best-effort cleanup of the partial temp file; it may not exist.
        try:
            os.remove(temp_file_path)
        except OSError:
            pass

# Main processing loop
def process_and_save_pubmed_details(df_paper, file_path, start_idx, save_interval):
    """Fetch PubMed details row by row and checkpoint the frame periodically.

    For each row from position ``start_idx`` onwards, the details returned by
    ``fetch_details_for_pmid_list`` are stored in columns ``Paper_Details_1``,
    ``Paper_Details_2``, ... of that same row. ``df_paper`` is modified in
    place.

    Args:
        df_paper: DataFrame holding one trial per row.
        file_path: Destination handed to ``save_progress_parquet``.
        start_idx: Zero-based row position at which to start (or resume).
        save_interval: Save after every ``save_interval`` row positions. A
            falsy value (0 / None) disables periodic saves; the final save
            still happens.

    Returns:
        None.
    """
    processed_papers_counter = [0]  # Counter for processed papers
    total_trials = len(df_paper)

    # Start processing from `start_idx`
    print(f"Resuming from trial {start_idx + 1}")
    for idx in range(start_idx, total_trials):
        row = df_paper.iloc[idx]
        paper_details = fetch_details_for_pmid_list(row, processed_papers_counter)

        # Save details into separate columns if available
        if paper_details:
            # `idx` is a position, while `.at` addresses rows by label.
            # Translate it so the write lands on the row that was just read,
            # even when the index is not a default RangeIndex.
            row_label = df_paper.index[idx]
            for i, details in enumerate(paper_details):
                df_paper.at[row_label, f'Paper_Details_{i+1}'] = details

        # Save progress every `save_interval` trials
        if save_interval and (idx + 1) % save_interval == 0:
            save_progress_parquet(df_paper, file_path)

    # Final save at the end of processing
    save_progress_parquet(df_paper, file_path)


In [26]:
# Destination of the Parquet checkpoint / final output for the run below.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable project/data directory so the notebook runs elsewhere.
file_path = "/Users/jiazhengli/Desktop/project_root/trials_pmid_papers.parquet"

# Process every row from the first one, checkpointing every 5000 rows.
# NOTE(review): `df` is not defined in the cells visible here — presumably it
# is loaded in an earlier cell; confirm the notebook survives Restart & Run All.
process_and_save_pubmed_details(df, file_path, start_idx=0, save_interval=5000)

 so far.
Processed 93994 papers so far.
Processed 93995 papers so far.
Processed 93996 papers so far.
Processed 93997 papers so far.
Processed 93998 papers so far.
Processed 93999 papers so far.
Processed 94000 papers so far.
Processed 94001 papers so far.
Processed 94002 papers so far.
Processed 94003 papers so far.
Processed 94004 papers so far.
Processed 94005 papers so far.
Processed 94006 papers so far.
Processed 94007 papers so far.
Processed 94008 papers so far.
Processed 94009 papers so far.
Processed 94010 papers so far.
Processed 94011 papers so far.
Processed 94012 papers so far.
Processed 94013 papers so far.
Processed 94014 papers so far.
Processed 94015 papers so far.
Processed 94016 papers so far.
Processed 94017 papers so far.
Processed 94018 papers so far.
Processed 94019 papers so far.
Processed 94020 papers so far.
Processed 94021 papers so far.
Processed 94022 papers so far.
Processed 94023 papers so far.
Processed 94024 papers so far.
Processed 94025 papers so far.

In [44]:
# Normalise the column: every numpy array becomes a plain Python list,
# anything else is kept untouched.
df['pmid_list'] = df['pmid_list'].apply(
    lambda cell: cell.tolist() if isinstance(cell, np.ndarray) else cell
)

# Number of PubMed IDs per row (non-list cells count as zero).
pmids_per_row = df['pmid_list'].apply(
    lambda cell: len(cell) if isinstance(cell, list) else 0
)

# Rows holding at least one ID, and the grand total of IDs across all rows.
num_rows_with_pmid_list = (pmids_per_row > 0).sum()
num_total_pmids = pmids_per_row.sum()

print(f"Number of rows that have a non-empty pmid_list: {num_rows_with_pmid_list}")
print(f"Total number of PubMed IDs: {num_total_pmids}")


Number of rows that have a non-empty pmid_list: 136928
Total number of PubMed IDs: 708181


In [46]:
# Histogram of pmid_list sizes: how many rows carry 0, 1, 2, ... PubMed IDs
# (cells that are not lists are counted as size 0).
pmids_per_row = df['pmid_list'].apply(
    lambda cell: len(cell) if isinstance(cell, list) else 0
)
pmid_length_counts = pmids_per_row.value_counts().sort_index()

# Report one line per distinct size, in ascending order of size.
for n_pmids, n_rows in pmid_length_counts.items():
    print(f"pmid_list with {n_pmids} PMIDs: {n_rows}")


pmid_list with 0 PMIDs: 262471
pmid_list with 1 PMIDs: 59953
pmid_list with 2 PMIDs: 18397
pmid_list with 3 PMIDs: 11789
pmid_list with 4 PMIDs: 7882
pmid_list with 5 PMIDs: 6135
pmid_list with 6 PMIDs: 4423
pmid_list with 7 PMIDs: 3522
pmid_list with 8 PMIDs: 2831
pmid_list with 9 PMIDs: 2382
pmid_list with 10 PMIDs: 2171
pmid_list with 11 PMIDs: 1798
pmid_list with 12 PMIDs: 1592
pmid_list with 13 PMIDs: 1325
pmid_list with 14 PMIDs: 1251
pmid_list with 15 PMIDs: 1025
pmid_list with 16 PMIDs: 918
pmid_list with 17 PMIDs: 804
pmid_list with 18 PMIDs: 714
pmid_list with 19 PMIDs: 661
pmid_list with 20 PMIDs: 608
pmid_list with 21 PMIDs: 550
pmid_list with 22 PMIDs: 423
pmid_list with 23 PMIDs: 451
pmid_list with 24 PMIDs: 426
pmid_list with 25 PMIDs: 378
pmid_list with 26 PMIDs: 355
pmid_list with 27 PMIDs: 301
pmid_list with 28 PMIDs: 290
pmid_list with 29 PMIDs: 227
pmid_list with 30 PMIDs: 237
pmid_list with 31 PMIDs: 207
pmid_list with 32 PMIDs: 205
pmid_list with 33 PMIDs: 198
pmi