# In [None]:  (notebook cell marker)
import os
import logging
from Bio import Entrez, Medline
import pandas as pd

# Logging configuration: root logger at DEBUG with "time - level - message" lines
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
log = logging.getLogger(__name__)

# Connect to PubMed and search for matching article IDs
def search_pubmed(org_name, start_date, end_date, retmax=9999):
    """
    Searches PubMed for papers that meet the org and date criteria.

    Args:
        org_name: Organisation name matched against the ``[Affiliation]`` field.
        start_date: Start of the publication-date range, inserted verbatim
            into the query.
        end_date: End of the publication-date range, inserted verbatim
            into the query.
        retmax: Maximum number of IDs requested from ESearch (default 9999).

    Returns:
        The ``IdList`` entry of the parsed ESearch result, or an empty list
        when the request or parsing fails (the error is logged).
    """
    Entrez.email = "dylan@grkcon.com"  # contact e-mail set on the Entrez module
    query = (
        f"{org_name}[Affiliation] AND "
        f"({start_date}[Date - Publication] : {end_date}[Date - Publication])"
    )
    try:
        handle = Entrez.esearch(db="pubmed", term=query, retmax=retmax)
        try:
            record = Entrez.read(handle)
        finally:
            # Always release the HTTP response, even if parsing fails.
            handle.close()
        log.debug("PubMed 접속 완료")
        return record['IdList']
    except Exception as e:
        # Best-effort: callers treat an empty list as "nothing found".
        log.error("PubMed 데이터베이스 접속 오류: %s", e)
        return []

# Fetch detailed records for a list of PubMed IDs
def fetch_pubmed_details(ids):
    """
    Returns PubMed details from ids.

    Args:
        ids: PubMed IDs to fetch, passed straight to ``Entrez.efetch`` as ``id``.

    Returns:
        The raw MEDLINE-format text read from the EFetch response, or an
        empty string when ``ids`` is empty or the request fails (the error
        is logged).
    """
    if not ids:
        # Nothing to fetch; skip the network round-trip entirely.
        return ''
    try:
        handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
        try:
            return handle.read()
        finally:
            # Always release the HTTP response, even if reading fails.
            handle.close()
    except Exception as e:
        # Best-effort: callers treat an empty string as "no records".
        log.error("PubMed 세부 정보 가져오기 오류: %s", e)
        return ''

# Parse the raw PubMed (MEDLINE) text into a list of records
def parse_records(records):
    """
    Turn raw MEDLINE-format text into a list of parsed records.

    The text is wrapped in an in-memory text stream, handed to
    ``Medline.parse``, and every record it yields is collected into a list.
    """
    import io

    stream = io.StringIO(records)
    return [entry for entry in Medline.parse(stream)]

# Extract the data and save it to a CSV file
# Column order of the exported CSV; must match the row built by _build_citation_row.
_CITATION_COLUMNS = [
    "First Author Department", "First Author Hospital",
    "Corresponding Author Department", "Corresponding Author Hospital",
    "Co-Author Affiliations", "Title", "Journal", "Year", "Author",
    "Number of Authors", "MeSH", "DOI", "ISSN", "Abstract",
    "Diseases", "Treatments", "Departments",
]


def _join_or_dash(items, sep=', '):
    """Join *items* with *sep*, or return the '-' placeholder when empty."""
    return sep.join(items) if items else '-'


def _org_parts(affiliation, org_name):
    """Return the comma-separated parts of *affiliation* that mention *org_name*.

    Parts are stripped of surrounding whitespace; '-' is returned when no part
    matches, consistent with the placeholder used for other missing values.
    """
    needle = org_name.lower()
    hits = [part.strip() for part in affiliation.split(',') if needle in part.lower()]
    return _join_or_dash(hits)


def _extract_doi(record):
    """Return the DOI among the record's ``AID`` entries, or '-' if none.

    ``AID`` may hold several identifiers (e.g. a ``[pii]`` entry before the
    ``[doi]`` one), so the entry is selected by its tag rather than by position.
    """
    tag = '[doi]'
    for article_id in record.get('AID', []):
        if article_id.endswith(tag):
            return article_id[:-len(tag)].strip()
    return '-'


def _build_citation_row(record, org_name):
    """Build one CSV row (ordered as ``_CITATION_COLUMNS``) from a parsed record."""
    authors = record.get('AU', [])
    mesh_terms = record.get('MH', [])
    affiliations = record.get('AD', [])
    needle = org_name.lower()

    # Every affiliation string that mentions the organisation.
    org_affils = [affil for affil in affiliations if needle in affil.lower()]

    # First-author columns are taken from the first affiliation entry.
    department_first_author = hospital_first_author = '-'
    if affiliations:
        first_affil = affiliations[0]
        department_first_author = first_affil.split(',')[0]
        hospital_first_author = _org_parts(first_affil, org_name)

    # Heuristic: the first affiliation containing an e-mail address ('@')
    # is treated as the corresponding author's.
    department_corres_author = hospital_corres_author = '-'
    corres_affil = next((affil for affil in affiliations if '@' in affil), '')
    if corres_affil:
        department_corres_author = corres_affil.split(',')[0]
        hospital_corres_author = _org_parts(corres_affil, org_name)

    def mesh_containing(keyword):
        # MeSH terms whose text contains the keyword (case-insensitive).
        return _join_or_dash([term for term in mesh_terms if keyword in term.lower()])

    return [
        department_first_author, hospital_first_author,
        department_corres_author, hospital_corres_author,
        _join_or_dash(org_affils, '; '),
        record.get('TI', '-'),
        record.get('JT', '-'),
        record.get('DP', '-')[:4],  # first four characters of the publication date
        ', '.join(authors),
        len(authors),
        ', '.join(mesh_terms),
        _extract_doi(record),
        record.get('IS', '-'),
        record.get('AB', '-'),
        mesh_containing('disease'),
        mesh_containing('therapy'),
        mesh_containing('specialty'),
    ]


def pubmed_extraction(org_name, data_file, start_date, end_date):
    """
    Extracts data from PubMed and saves it to a CSV file.

    Searches PubMed for ``org_name`` within the date range, fetches and parses
    the matching records, builds one row per record and writes them to
    ``<data_file>/<org_name with spaces as underscores>_<period>.csv``, where
    ``<period>`` is the first four characters of ``start_date`` (or
    ``start-end`` when the two differ).

    Args:
        org_name: Organisation name used for the search and for matching
            affiliation strings.
        data_file: Output *directory* (created if missing), despite the name.
        start_date: Start of the publication-date range passed to the search.
        end_date: End of the publication-date range passed to the search.

    Returns:
        None. Returns early, without writing a file, when the search yields
        no IDs or the fetch yields no text.
    """
    id_list = search_pubmed(org_name, start_date, end_date)
    if not id_list:
        log.debug("검색된 PubMed ID가 없습니다.")
        return

    log.debug("%d개의 PubMed ID 검색됨.", len(id_list))

    records = fetch_pubmed_details(id_list)
    if not records:
        log.debug("PubMed 레코드가 없습니다.")
        return

    parsed_records = parse_records(records)
    log.debug("%d개의 레코드 파싱됨.", len(parsed_records))

    citation_list = []
    for record in parsed_records:
        try:
            citation_list.append(_build_citation_row(record, org_name))
        except Exception as e:
            # Best-effort: one malformed record must not abort the whole export.
            log.error("레코드 처리 중 오류 발생: %s", e)

    citation_df = pd.DataFrame(citation_list, columns=_CITATION_COLUMNS)

    # Create the output directory if it does not exist yet.
    os.makedirs(data_file, exist_ok=True)

    # Label the file with the queried year(s) instead of a fixed "2022".
    start_year, end_year = start_date[:4], end_date[:4]
    period = start_year if start_year == end_year else f"{start_year}-{end_year}"

    output_file = os.path.join(data_file, f"{org_name.replace(' ', '_')}_{period}.csv")
    citation_df.to_csv(output_file, index=False, encoding="utf-8-sig")
    log.debug("데이터가 %s에 저장되었습니다.", output_file)

# Main flow: run the extraction for one hospital over the 2022 date range.
# NOTE(review): this executes whenever the file is run or imported; consider
# an `if __name__ == "__main__":` guard if the module is ever imported.
org_name = "Chungbuk National University Hospital"
start_date, end_date = "2022/01/01", "2022/12/31"
pubmed_extraction(
    org_name=org_name,
    data_file=r"C:\Users\오현택\Desktop\python\0525_2",
    start_date=start_date,
    end_date=end_date,
)
