In [1]:
from Bio import Entrez
import time
import pandas as pd

In [4]:
# 设置Entrez的email
Entrez.email = 'xc2611@nyu.edu'

def search(query, start_year=None, end_year=None):
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='100000',
                            retmode='xml',
                            term=query,
                            datetype='pdat' if start_year and end_year else 'none',
                            mindate=start_year if start_year else '',
                            maxdate=end_year if end_year else '')
    results = Entrez.read(handle)
    return results

def fetch_details(id_list, max_retries=3, retry_delay=5):
    ids = ','.join(id_list)
    attempt = 0
    while attempt < max_retries:
        try:
            handle = Entrez.efetch(db='pubmed',
                                   retmode='xml',
                                   id=ids)
            results = Entrez.read(handle)
            return results
        except Exception as e:
            print(f"Error fetching details, attempt {attempt + 1}/{max_retries}: {e}")
            attempt += 1
            time.sleep(retry_delay)
    raise Exception("Failed to fetch details after multiple attempts")

In [5]:
# 搜索研究
query = "Crohn's disease"
studies = search(query)
total_records = int(studies['Count'])
print(total_records)

69527


In [6]:
# 定义时间段
time_periods = [(1950, 1990), (1991, 2000), (2001, 2010),
                (2011, 2015), (2016, 2020), (2021, 2025)]



# 初始化数据列表
title_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
doi_list = []

if total_records <= 10000:
    print("Fetching all records...")
    id_list = studies["IdList"]
    # 直接提取所有结果
    chunk_size = 1000  # 每批次请求的文章数量
    for chunk_i in range(0, len(id_list), chunk_size):
        chunk = id_list[chunk_i:chunk_i + chunk_size]
        try:
            papers = fetch_details(chunk)
            for paper in papers['PubmedArticle']:
                paper_id = paper['MedlineCitation']['PMID']
                title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                try:
                    abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                except KeyError:
                    abstract_list.append('No Abstract')
                journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                try:
                    pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                except KeyError:
                    pubdate_year_list.append('No Data')
                try:
                    doi = next((elocation_id for elocation_id in paper['MedlineCitation']['Article']['ELocationID'] 
                               if elocation_id.attributes['EIdType'] == 'doi'), 'No DOI')
                except KeyError:
                    doi = 'No DOI'
                doi_list.append(doi)
        except Exception as e:
            print(f"Error fetching details for chunk {chunk_i}: {e}")

else:
    print("Too many hits. Splitting search by time periods...")
    for start_year, end_year in time_periods:
        print(f"Fetching abstracts for {start_year}-{end_year}...")
        # 按时间段搜索
        studies = search(query=query, start_year=start_year, end_year=end_year)
        total_records = int(studies['Count'])
        if total_records == 0:
            continue

        id_list = studies["IdList"]
        print(f"# of hits for {start_year}-{end_year}: {total_records}")

        # 分批次提取数据
        chunk_size = 1000  # 每批次请求的文章数量
        for chunk_i in range(0, len(id_list), chunk_size):
            chunk = id_list[chunk_i:chunk_i + chunk_size]
            try:
                papers = fetch_details(chunk)
                for paper in papers['PubmedArticle']:
                    paper_id = paper['MedlineCitation']['PMID']
                    title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                    try:
                        abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                    except KeyError:
                        abstract_list.append('No Abstract')
                    journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                    language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                    try:
                        pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                    except KeyError:
                        pubdate_year_list.append('No Data')
                    try:
                        doi = next((elocation_id for elocation_id in paper['MedlineCitation']['Article']['ELocationID'] 
                                   if elocation_id.attributes['EIdType'] == 'doi'), 'No DOI')
                    except KeyError:
                        doi = 'No DOI'
                    doi_list.append(doi)
            except Exception as e:
                print(f"Error fetching details for chunk {chunk_i}: {e}")

# 创建并保存DataFrame
df = pd.DataFrame({
    'Title': title_list,
    'Abstract': abstract_list,
    'Journal': journal_list,
    'Language': language_list,
    'Year': pubdate_year_list,
    'DOI': doi_list
})


Too many hits. Splitting search by time periods...
Fetching abstracts for 1950-1990...
# of hits for 1950-1990: 11173




Fetching abstracts for 1991-2000...
# of hits for 1991-2000: 7265
Fetching abstracts for 2001-2010...
# of hits for 2001-2010: 14501
Fetching abstracts for 2011-2015...
# of hits for 2011-2015: 11412
Fetching abstracts for 2016-2020...
# of hits for 2016-2020: 13911
Fetching abstracts for 2021-2025...
# of hits for 2021-2025: 12423


In [7]:
df['Month'].replace('Jan', '01', inplace=True)
df['Month'].replace('Feb', '02', inplace=True)
df['Month'].replace('Mar', '03', inplace=True)
df['Month'].replace('Apr', '04', inplace=True)
df['Month'].replace('May', '05', inplace=True)
df['Month'].replace('Jun', '06', inplace=True)
df['Month'].replace('Jul', '07', inplace=True)
df['Month'].replace('Aug', '08', inplace=True)
df['Month'].replace('Sep', '09', inplace=True)
df['Month'].replace('Oct', '10', inplace=True)
df['Month'].replace('Nov', '11', inplace=True)
df['Month'].replace('Dec', '12', inplace=True)
df['Month'].replace('No Data', np.nan, inplace=True)

KeyError: 'Month'

In [8]:
print(df)

                                                   Title  \
0            A simple index of Crohn's-disease activity.   
1                                       Crohn's disease.   
2                         Crohn's disease and pregnancy.   
3                         Crohn's disease and pregnancy.   
4                             Anorectal Crohn's disease.   
...                                                  ...   
57228  Association between inflammatory bowel disease...   
57229  Is Occupation a Risk Factor for Developing Inf...   
57230  Changes in the Penetration Rate of Biosimilar ...   
57231  Transition Readiness in Youth with Inflammator...   
57232  HIF-Dependent <i>NFATC1</i> Activation Upregul...   

                                                Abstract  \
0                                            No Abstract   
1                                            No Abstract   
2      Seventy-eight pregnancies in 50 patients were ...   
3      This paper reports the outcome o

In [9]:
# 导出为CSV文件
csv_file_path = '/Users/xr/Desktop/Crohns_disease_abstract.csv'
df.to_csv(csv_file_path, index=False)
print(f"DataFrame has been exported to {csv_file_path}")

DataFrame has been exported to /Users/xr/Desktop/Crohns_disease_abstract.csv
