In [None]:
!pip install biopython

In [48]:
from Bio import Entrez
import pandas as pd
import numpy as np
import time

In [49]:
def search(query,mindate=None, maxdate=None):
    Entrez.email = 'youremail@'
    handle = Entrez.esearch(db='pubmed',
                            sort='relevance',
                            retmax='250000',
                            retmode='xml',
                            term=query,
                            mindate=mindate,
                            maxdate=maxdate)
    results = Entrez.read(handle)
    return results

In [72]:
query='keywords'
mindate=None
maxdate = None
studies = search(query, mindate, maxdate)
studiesIdList = studies['IdList']
print(len(studiesIdList))

294


In [73]:

def fetch_details(id_list, max_retries=3, retry_delay=5):
    ids = ','.join(id_list)
    Entrez.email = 'youremail@'
    attempt = 0
    while attempt < max_retries:
        try:
            handle = Entrez.efetch(db='pubmed',
                                   retmode='xml',
                                   id=ids)
            results = Entrez.read(handle)
            return results
        except Exception as e:
            print(f"Error fetching details, attempt {attempt + 1}/{max_retries}: {e}")
            attempt += 1
            time.sleep(retry_delay)
    raise Exception("Failed to fetch details after multiple attempts")

In [74]:
# 使用Set存储已提取的ID
extracted_ids = set()

# 分批次提取数据
title_list = []
abstract_list = []
journal_list = []
language_list = []
pubdate_year_list = []
pubdate_month_list = []
doi_list = []

chunk_size = 1000  # 每批次请求的文章数量
for chunk_i in range(0, len(studiesIdList), chunk_size):
    chunk = studiesIdList[chunk_i:chunk_i + chunk_size]
    try:
        papers = fetch_details(chunk)
        for paper in papers['PubmedArticle']:
            paper_id = paper['MedlineCitation']['PMID']
            if paper_id not in extracted_ids:
                extracted_ids.add(paper_id)
                title_list.append(paper['MedlineCitation']['Article']['ArticleTitle'])
                try:
                    abstract_list.append(paper['MedlineCitation']['Article']['Abstract']['AbstractText'][0])
                except KeyError:
                    abstract_list.append('No Abstract')
                journal_list.append(paper['MedlineCitation']['Article']['Journal']['Title'])
                language_list.append(paper['MedlineCitation']['Article']['Language'][0])
                try:
                    pubdate_year_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Year'])
                except KeyError:
                    pubdate_year_list.append('No Data')
                try:
                    pubdate_month_list.append(paper['MedlineCitation']['Article']['Journal']['JournalIssue']['PubDate']['Month'])
                except KeyError:
                    pubdate_month_list.append('No Data')
                # 提取DOI
                try:
                    elocation_ids = paper['MedlineCitation']['Article']['ELocationID']
                    doi = next((elocation_id for elocation_id in elocation_ids if elocation_id.attributes['EIdType'] == 'doi'), 'No DOI')
                except KeyError:
                    doi = 'No DOI'
                doi_list.append(doi)
    except Exception as e:
        print(f"Error fetching details for chunk {chunk_i}: {e}")

# 创建DataFrame
df = pd.DataFrame(list(zip(title_list, abstract_list, journal_list, language_list, pubdate_year_list, pubdate_month_list, doi_list)),
                  columns=['Title', 'Abstract', 'Journal', 'Language', 'Year', 'Month', 'DOI'])


In [76]:
df['Month'].replace('Jan', '01', inplace=True)
df['Month'].replace('Feb', '02', inplace=True)
df['Month'].replace('Mar', '03', inplace=True)
df['Month'].replace('Apr', '04', inplace=True)
df['Month'].replace('May', '05', inplace=True)
df['Month'].replace('Jun', '06', inplace=True)
df['Month'].replace('Jul', '07', inplace=True)
df['Month'].replace('Aug', '08', inplace=True)
df['Month'].replace('Sep', '09', inplace=True)
df['Month'].replace('Oct', '10', inplace=True)
df['Month'].replace('Nov', '11', inplace=True)
df['Month'].replace('Dec', '12', inplace=True)
df['Month'].replace('No Data', np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Month'].replace('Jan', '01', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Month'].replace('Feb', '02', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values

In [77]:
print(df)

                                                 Title  \
0    A Systematic Review of Monogenic Inflammatory ...   
1    Interferon-induced IL-10 drives systemic T-cel...   
2    M2 macrophage-related gene signature in chroni...   
3    An HDAC6 inhibitor reverses chemotherapy-induc...   
4    The expression of IL10RA in colorectal cancer ...   
..                                                 ...   
289  Prior transfusion of umbilical cord mesenchyma...   
290  [Interleukin-10 receptor mutations in children...   
291  Diminished expression of ICOS, GITR and CTLA-4...   
292  [Relationship between gene polymorphism at rs2...   
293  [Differential expression of microRNA in eutopi...   

                                              Abstract  \
0    Advances in genomic technologies have led to i...   
1    Patients with chronic liver disease (CLD), inc...   
2    Chronic rhinosinusitis with nasal polyps (CRSw...   
3    Chemotherapy-induced peripheral neuropathy (CI...   
4    Despite 

In [78]:
# 导出为CSV文件
csv_file_path = ''
df.to_csv(csv_file_path, index=False)
print(f"DataFrame has been exported to {csv_file_path}")

DataFrame has been exported to C:\Users\86187\OneDrive\桌面\ailomics\data\IL10RA_abstract.csv
