In [13]:
from Bio import Entrez
import time
import pandas as pd

In [14]:
# Set the Entrez e-mail address.
# NOTE(review): left empty here — presumably a real contact address has to be
# filled in before running; confirm against NCBI's usage guidelines.
Entrez.email = ''
# Optional Entrez settings, currently disabled:
# Entrez.api_key=''
# Entrez.max_tries=5
# Entrez.sleep_between_tries=180
# Entrez.timeout=10

def search(query, start_year=None, end_year=None):
    """Run a PubMed ESearch for ``query`` and return the parsed response.

    Args:
        query: Search term, sent to ESearch as ``term``.
        start_year: First publication year of the optional date filter.
        end_year: Last publication year of the optional date filter.

    Returns:
        The structure produced by ``Entrez.read`` for the ESearch response.
        Callers in this notebook read its ``'Count'`` and ``'IdList'`` entries.

    The publication-date filter (``datetype='pdat'``) is applied only when
    both ``start_year`` and ``end_year`` are given. Otherwise no date
    parameters are sent at all, instead of ``datetype='none'`` together with
    empty ``mindate``/``maxdate`` values.
    """
    params = {
        'db': 'pubmed',
        'sort': 'relevance',
        # NOTE(review): NCBI documents a 10,000-record ceiling for PubMed
        # ESearch, so this value is presumably never reached — confirm.
        'retmax': '100000',
        'retmode': 'xml',
        'term': query,
    }
    if start_year and end_year:
        params.update(datetype='pdat', mindate=start_year, maxdate=end_year)

    handle = Entrez.esearch(**params)
    try:
        return Entrez.read(handle)
    finally:
        # Release the connection even when parsing fails.
        handle.close()

def fetch_details(id_list, max_retries=5, retry_delay=5):
    """Fetch PubMed records for ``id_list`` via EFetch, retrying on failure.

    Args:
        id_list: Iterable of PubMed IDs. Each one is converted to ``str`` and
            the IDs are sent as a single comma-separated ``id`` parameter.
        max_retries: Maximum number of EFetch attempts.
        retry_delay: Seconds to wait between two consecutive attempts.

    Returns:
        The structure produced by ``Entrez.read`` for the EFetch response.

    Raises:
        Exception: If every attempt fails. The last underlying error is
            chained as ``__cause__``.
    """
    ids = ','.join(str(pmid) for pmid in id_list)
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            handle = Entrez.efetch(db='pubmed', retmode='xml', id=ids)
            try:
                return Entrez.read(handle)
            finally:
                # Release the connection whether or not parsing succeeded.
                handle.close()
        except Exception as e:
            last_error = e
            print(f"Error fetching details, attempt {attempt}/{max_retries}: {e}")
            # No point in waiting once the final attempt has failed.
            if attempt < max_retries:
                time.sleep(retry_delay)
    raise Exception("Failed to fetch details after multiple attempts") from last_error

In [None]:
# Search PubMed for the query and print the total number of matching records
query = "" # e.g.: Inflammatory Bowel Disease or Ulcerative colitis or Crohn's disease
studies = search(query)
total_records = int(studies['Count'])
print(total_records)

In [None]:
# Publication-year ranges, as (first_year, last_year) pairs, used to split the search
time_periods = [
    (1900, 1949),
    (1950, 1990),
    (1991, 2000),
    (2001, 2010),
    (2011, 2015),
    (2016, 2020),
    (2021, 2025),
]

# Split a period into consecutive sub-periods
def split_period(start_year, end_year, step=3):
    """Split the inclusive range ``start_year``..``end_year`` into sub-periods.

    Args:
        start_year: First year of the range.
        end_year: Last year of the range (inclusive).
        step: Maximum number of years per sub-period. Defaults to 3, the
            value that used to be hard-coded.

    Returns:
        List of ``(first_year, last_year)`` tuples in ascending order. The
        final sub-period is shortened so it never runs past ``end_year``.
        The list is empty when ``start_year > end_year``.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be a positive number of years")
    return [(year, min(year + step - 1, end_year))
            for year in range(start_year, end_year + 1, step)]

# Finer split: one sub-period per calendar year
def small_split_period(start_year, end_year):
    """Return a ``(year, year)`` tuple for each year from ``start_year`` to ``end_year`` inclusive."""
    return [(single_year, single_year)
            for single_year in range(start_year, end_year + 1)]

# Initialise the data lists: seven independent empty lists, one per output field
(title_list, abstract_list, journal_list, language_list,
 pubdate_year_list, pubdate_month_list, doi_list) = ([] for _ in range(7))

def _extract_record(paper):
    """Pull the seven exported fields out of one ``PubmedArticle`` entry.

    Returns:
        Tuple ``(title, abstract, journal, language, year, month, doi)``.
        Optional fields fall back to the placeholders 'No Abstract',
        'No Data' and 'No DOI'.

    Raises:
        KeyError, IndexError: If the title, journal title or language is missing.
    """
    article = paper['MedlineCitation']['Article']
    title = article['ArticleTitle']

    try:
        abstract_parts = article['Abstract']['AbstractText']
    except KeyError:
        abstract_parts = []
    # Structured abstracts come as several AbstractText parts; keep all of
    # them rather than only the first section.
    abstract = ' '.join(str(part) for part in abstract_parts) or 'No Abstract'

    journal_info = article['Journal']
    journal = journal_info['Title']
    language = article['Language'][0]

    try:
        pub_date = journal_info['JournalIssue']['PubDate']
    except KeyError:
        pub_date = {}
    year = pub_date.get('Year', 'No Data')
    month = pub_date.get('Month', 'No Data')

    try:
        doi = next((elocation_id for elocation_id in article['ELocationID']
                    if elocation_id.attributes['EIdType'] == 'doi'), 'No DOI')
    except KeyError:
        doi = 'No DOI'

    return title, abstract, journal, language, year, month, doi


def process_chunk(id_list):
    """Fetch details for ``id_list`` in batches and append them to the global lists.

    Each successfully parsed article contributes exactly one value to each of
    ``title_list``, ``abstract_list``, ``journal_list``, ``language_list``,
    ``pubdate_year_list``, ``pubdate_month_list`` and ``doi_list``, so the
    lists always stay the same length. A batch that cannot be fetched and an
    article that cannot be parsed are both reported and skipped without
    affecting the remaining records.

    Args:
        id_list: Sequence of PubMed IDs. An empty sequence is a no-op.
    """
    chunk_size = 1000  # number of articles requested per batch
    for chunk_i in range(0, len(id_list), chunk_size):
        chunk = id_list[chunk_i:chunk_i + chunk_size]
        try:
            papers = fetch_details(chunk)
            articles = papers['PubmedArticle']
        except Exception as e:
            print(f"Error fetching details for chunk {chunk_i}: {e}")
            continue

        for paper in articles:
            # Extract every field before touching the lists so that a failure
            # half-way through a record cannot leave them misaligned.
            try:
                record = _extract_record(paper)
            except Exception as e:
                print(f"Skipping malformed record in chunk {chunk_i}: {e!r}")
                continue

            title, abstract, journal, language, year, month, doi = record
            title_list.append(title)
            abstract_list.append(abstract)
            journal_list.append(journal)
            language_list.append(language)
            pubdate_year_list.append(year)
            pubdate_month_list.append(month)
            doi_list.append(doi)

def fetch_data_for_period(start_year, end_year):
    """Search one period and process its hits, narrowing the range when needed.

    Uses the module-level ``query``. If the period has more than 10,000 hits
    it is split with ``split_period``. Any sub-period that is still over the
    limit is split into single years with ``small_split_period``. Every range
    that fits under the limit is handed to ``process_chunk``.

    Args:
        start_year: First year of the period.
        end_year: Last year of the period (inclusive).
    """
    max_hits = 10000  # largest result set handled with a single search

    def hits_for(first_year, last_year):
        # Search one range, report the hit count and return (count, ids).
        found = search(query=query, start_year=first_year, end_year=last_year)
        count = int(found['Count'])
        print(f"# of hits for {first_year}-{last_year}: {count}")
        return count, found["IdList"]

    total_records, id_list = hits_for(start_year, end_year)
    if total_records <= max_hits:
        process_chunk(id_list)
        return

    for sub_start_year, sub_end_year in split_period(start_year, end_year):
        sub_total_records, id_list = hits_for(sub_start_year, sub_end_year)
        if sub_total_records == 0:
            continue
        if sub_total_records <= max_hits:
            process_chunk(id_list)
            continue

        for s_sub_start_year, s_sub_end_year in small_split_period(sub_start_year, sub_end_year):
            s_sub_total_records, id_list = hits_for(s_sub_start_year, s_sub_end_year)
            # Compare the single-year count, not the enclosing sub-period's
            # count: the latter is always above the limit in this branch, so
            # checking it meant no single-year result was ever processed.
            if s_sub_total_records <= max_hits:
                process_chunk(id_list)
            else:
                print(f"Warning: {s_sub_total_records} hits for "
                      f"{s_sub_start_year}-{s_sub_end_year} exceed the "
                      f"{max_hits}-record limit; skipping this range")

if total_records > 10000:
    # Too many hits for one request: walk through the predefined periods
    print("Too many hits. Splitting search by time periods...")
    for start_year, end_year in time_periods:
        fetch_data_for_period(start_year, end_year)
else:
    # The initial search already holds every ID
    print("Fetching all records...")
    id_list = studies["IdList"]
    process_chunk(id_list)

# Build the DataFrame from the collected columns
df = pd.DataFrame(dict(
    Title=title_list,
    Abstract=abstract_list,
    Journal=journal_list,
    Language=language_list,
    Year=pubdate_year_list,
    Month=pubdate_month_list,
    DOI=doi_list,
))

In [None]:
# Convert three-letter month abbreviations to two-digit month numbers.
# The result is assigned back to the column: calling an in-place method on
# the df['Month'] selection is chained assignment, which does not update the
# DataFrame when pandas Copy-on-Write is active.
MONTH_NUMBERS = {
    'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04',
    'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08',
    'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12',
}
df['Month'] = df['Month'].replace(MONTH_NUMBERS)


In [None]:
# Print the plain-text representation of the DataFrame
print(df)

In [None]:
# Export the DataFrame to a CSV file (without the index column).
# NOTE(review): the path is left empty here — presumably it has to be filled
# in before this cell is run.
csv_file_path = ''
df.to_csv(csv_file_path, index=False)
print(f"DataFrame has been exported to {csv_file_path}")