In [1]:
import os
import re
import pandas as pd
import logging
from Bio import Entrez, Medline

# 로깅 설정
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
log = logging.getLogger(__name__)

# PubMed 데이터베이스 접속
def search_pubmed(org_name, year, month):
    """
    Searches PubMed for papers that meet the org and date criteria.
    """
    Entrez.email = "dylan@grkcon.com"  # API 아이디 입력
    try:
        handle = Entrez.esearch(db="pubmed", 
                                term=f"{org_name}[affil] AND {year}/{month}/01:{year}/{str(int(month) + 1)}/01[dp]", 
                                retmax=9999)
        record = Entrez.read(handle)
        log.debug("PubMed 접속 완료")
        return record['IdList']
    except Exception as e:
        log.error("PubMed 데이터베이스 접속 오류: %s", e)
        return []

# PubMed ID 리스트에 대한 상세 데이터 가져오기
def fetch_pubmed_details(ids):
    """
    Returns PubMed details from ids.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
        records = handle.read()
        return records
    except Exception as e:
        log.error("PubMed 세부 정보 가져오기 오류: %s", e)
        return ''

# PubMed 데이터를 파싱하여 데이터 프레임에 저장
def parse_records(records):
    """
    Parses the raw medline records into a list of dictionaries.
    """
    from io import StringIO

    records_io = StringIO(records)
    parser = Medline.parse(records_io)
    return list(parser)

# 데이터를 추출하여 저장하는 함수
def pubmed_extraction(org_name, data_file, year, month):
    """
    Extracts data from PubMed and saves it to a CSV file.
    """
    id_list = search_pubmed(org_name, year, month)
    
    if not id_list:
        log.debug("검색된 PubMed ID가 없습니다.")
        return
    
    log.debug(f"{len(id_list)}개의 PubMed ID 검색됨.")
    
    records = fetch_pubmed_details(id_list)
    
    if not records:
        log.debug("PubMed 레코드가 없습니다.")
        return

    parsed_records = parse_records(records)
    log.debug(f"{len(parsed_records)}개의 레코드 파싱됨.")

    citation_list = []
    
    for record in parsed_records:
        try:
            pmid = record.get('PMID', '-')
            title = record.get('TI', '-')
            journal = record.get('JT', '-')
            pub_year = record.get('DP', '-')[:4]
            authors = record.get('AU', [])
            author_str = ', '.join(authors)
            num_authors = len(authors)
            mesh_terms = record.get('MH', [])
            mesh_str = ', '.join(mesh_terms)
            doi = record.get('AID', ['-'])[0].split(' ')[0]
            issn = record.get('IS', '-')
            abstract = record.get('AB', '-')
            affiliations = record.get('AD', [])
            
            department = '-'
            hospital = '-'
            
            # Check if the first author or corresponding author is from Chungbuk National University
            first_author_affil = affiliations[0] if affiliations else ''
            corres_author_affil = [affil for affil in affiliations if "@" in affil]
            corres_author_affil = corres_author_affil[0] if corres_author_affil else ''
            
            if org_name in first_author_affil or org_name in corres_author_affil:
                if org_name in first_author_affil:
                    department = first_author_affil.split(',')[0]
                    hospital = ', '.join([a for a in first_author_affil.split(',') if org_name in a])
                elif org_name in corres_author_affil:
                    department = corres_author_affil.split(',')[0]
                    hospital = ', '.join([a for a in corres_author_affil.split(',') if org_name in a])

                citation = [department, hospital, title, journal, pub_year, author_str, num_authors, mesh_str, doi, issn, abstract, mesh_str]
                citation_list.append(citation)
        
        except Exception as e:
            log.error(f"레코드 처리 중 오류 발생: {e}")
            continue

    columns = ["Department", "Hospital", "Title", "Journal", "Year", "Author", "Number of Authors", "MeSH", "DOI", "ISSN", "Abstract", "Keywords"]
    citation_df = pd.DataFrame(citation_list, columns=columns)
    
    # 디렉토리가 존재하지 않으면 생성
    os.makedirs(data_file, exist_ok=True)
    
    output_file = os.path.join(data_file, f"{org_name}_{year}_{month}-{str(int(month) + 1)}.csv")
    citation_df.to_csv(output_file, index=False, encoding="utf-8-sig")
    log.debug(f"데이터가 {output_file}에 저장되었습니다.")

# 주요 흐름
for i in range(1,13): 
    pubmed_extraction("Chungbuk National University Hospital", r"C:\Users\오현택\Desktop\python\0518", "2022", i)


2024-05-20 10:16:09,717 - DEBUG - PubMed 접속 완료
2024-05-20 10:16:09,718 - DEBUG - 76개의 PubMed ID 검색됨.
2024-05-20 10:16:14,089 - DEBUG - 76개의 레코드 파싱됨.
2024-05-20 10:16:14,119 - DEBUG - 데이터가 C:\Users\오현택\Desktop\python\0518\Chungbuk National University Hospital_2022_1-2.csv에 저장되었습니다.
2024-05-20 10:16:15,339 - DEBUG - PubMed 접속 완료
2024-05-20 10:16:15,341 - DEBUG - 36개의 PubMed ID 검색됨.
2024-05-20 10:16:18,795 - DEBUG - 36개의 레코드 파싱됨.
2024-05-20 10:16:18,802 - DEBUG - 데이터가 C:\Users\오현택\Desktop\python\0518\Chungbuk National University Hospital_2022_2-3.csv에 저장되었습니다.
2024-05-20 10:16:20,249 - DEBUG - PubMed 접속 완료
2024-05-20 10:16:20,251 - DEBUG - 33개의 PubMed ID 검색됨.
2024-05-20 10:16:23,096 - DEBUG - 33개의 레코드 파싱됨.
2024-05-20 10:16:23,113 - DEBUG - 데이터가 C:\Users\오현택\Desktop\python\0518\Chungbuk National University Hospital_2022_3-4.csv에 저장되었습니다.
2024-05-20 10:16:24,424 - DEBUG - PubMed 접속 완료
2024-05-20 10:16:24,426 - DEBUG - 33개의 PubMed ID 검색됨.
2024-05-20 10:16:26,672 - DEBUG - 33개의 레코드 파싱됨.
2024-