In [3]:
from argparse import ArgumentParser
from ast import literal_eval
from bs4 import BeautifulSoup
import requests
import urllib.request, urllib.parse, urllib.error
import numpy as np
import pandas as pd
import metapub
from metapub import PubMedFetcher, PubMedAuthor
import sys
from Bio import Entrez
from pathlib import Path
import os


import logging
import re

In [4]:
# 로깅 설정
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
log.addHandler(stream_handler)

# Pubmed 데이터베이스 접속
def search_pubmed(org_name, year, month):
    """
    Searches PubMed for papers that meet the org and date criteria.
    """
    Entrez.email = "dylan@grkcon.com"  # API 아이디 입력, API 코드는 NCBI 개발자 환경에서 받을 수 있음
    try:
        handle = Entrez.esearch(db="pubmed",
                                term=f"{org_name}[affil] AND {year}/{month}/01:{year}/{str(int(month) + 1)}/01[dp]",
                                retmax=9999)
        record = Entrez.read(handle)
        log.debug("PubMed database connection successful.")
        return record['IdList']
    except Exception as e:
        log.error("Error connecting to PubMed database: %s", e)
        return []

# 검색 예시
search_pubmed("Chungbuk National University Hospital", "2024", "05")

2024-05-14 11:59:47,207 - DEBUG - PubMed database connection successful.
2024-05-14 11:59:47,207 - DEBUG - PubMed database connection successful.


['38738926', '38730344', '38728497', '38720697', '38713078', '38700863', '38568686', '38556482', '38547767', '38411317', '38336490', '38331252']

In [5]:
org_name = "Chungbuk National University Hospital"
data_file =  r"C:\Users\오현택\Desktop\python\0514"
year = 2022
month = 1

# 로깅 설정
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')

# 로그를 출력할 로거(logger) 생성
log = logging.getLogger(__name__)


#Pubmed 데이터 베이스 접속
def search_pubmed(org_name, year, month):
    """
    Searches PubMed for papers that meet the org and date criteria.
    """
    Entrez.email = "dylan@grkcon.com" #API 아이디 입력, API 코드는 NCBI 개발자 환경에서 받을 수 있음
    try:
        handle = Entrez.esearch(db="pubmed", 
                            term=f"{org_name}[Chungbuk National University Hospital] AND {year}/{month}/01:{year}/{str(int(month) + 1)}/01[dp]", 
                            retmax=9999)
        record = Entrez.read(handle)
        log.debug("PubMed 접속 완료")
        return record['IdList']
    except Exception as e:
        log.error("Error connecting to PubMed database: %s", e)
        return []
search_pubmed("Chungbuk National University Hospital", "2022", "01")


2024-05-14 11:59:57,236 - DEBUG - PubMed 접속 완료
2024-05-14 11:59:57,236 - DEBUG - PubMed 접속 완료


['36726749', '36704547', '36620625', '36405028', '36398236', '36341128', '36339930', '36317398', '36267638', '36262197', '36213459', '36211416', '36177347', '36124238', '36124066', '36121802', '36105055', '36017038', '35992880', '35992188', '35989920', '35847886', '35837570', '35783824', '35720840', '35719633', '35702115', '35702109', '35666742', '35573830', '35558886', '35510249', '35479279', '35444990', '35421154', '35359630', '35350572', '35281880', '35274010', '35252005', '35247953', '35237619', '35237059', '35232673', '35221600', '35221590', '35209705', '35203512', '35160167', '35158207', '35135065', '35128850', '35116221', '35103535', '35093635', '35089523', '35089229', '35070944', '35070940', '35069503', '35050964', '35043006', '35039576', '35034023', '35031884', '35021289', '35016269', '35015188', '35007247', '34997880', '34996990', '34993890', '34980137', '34980097', '34973183', '34973069', '34964278', '34862341', '34826609', '34800078', '34773897', '34768012', '34711784', '34

In [7]:
#정보 확인 및 데이터 검색 확인

def fetch_pubmed_details(ids):
    """
    Returns pubmed details from ids.
    """
    try:
        handle = Entrez.efetch(db="pubmed", id=ids, rettype="medline", retmode="text")
        records = handle.read()
        return records
    except Exception as e:
        print(f"Error fetching PubMed details: {e}")
        return ''

fetch_pubmed_details('36726749')


"\nPMID- 36726749\nOWN - NLM\nSTAT- PubMed-not-MEDLINE\nLR  - 20230203\nIS  - 1664-2295 (Print)\nIS  - 1664-2295 (Electronic)\nIS  - 1664-2295 (Linking)\nVI  - 13\nDP  - 2022\nTI  - The mortality of patients with Parkinson's disease with deep brain stimulation.\nPG  - 1099862\nLID - 10.3389/fneur.2022.1099862 [doi]\nLID - 1099862\nAB  - BACKGROUND: Deep brain stimulation (DBS) of the subthalamic nucleus (STN) is \n      effective in improving motor function in patients with Parkinson's disease (PD). \n      This study aimed to investigate mortality associated with bilateral STN DBS in \n      patients with PD and to assess the factors associated with mortality and causes \n      of death after DBS. METHODS: We reviewed the medical records of 257 patients with \n      PD who underwent bilateral STN DBS at the Movement Disorder Center at Seoul \n      National University Hospital between March 2005 and November 2018. Patients were \n      evaluated preoperatively, at 3, 6, and 12 months 

In [9]:
#데이터 저장

def pubmed_extraction(org_name, data_file, year, month):
    """
    Extracts data from PubMed and saves it to a CSV file.
    """

    idlist = search_pubmed(org_name, year, month)

    fetch = PubMedFetcher()

    citation_list = []

    n = 0
    for id in idlist:
        article = fetch.article_by_pmid(id)
        mesh_terms = article.mesh.values()
        records = fetch_pubmed_details(id)
        affiliation = re.findall(r'AD  - (.+)', records)

        # Extract the affiliation
        full_affil = [''.join(aff) for aff in affiliation]
        corres_affil = [a for a in full_affil if "@" in a]

        # Extract mesh terms if available
        mesh = [v['descriptor_name'] for v in mesh_terms] if mesh_terms else []

        abstract = [article.abstract if article.abstract else ""]

        n += 1
        print(f"-----------{year}/{month} Completed: {n}/{len(idlist)}!-----------")

        try:
            auth_affil = ''.join(affiliation[0])
            # Check if the organization name is in the affiliation
            if org_name in auth_affil:
                department = auth_affil.split(',')[0]
                hospital = [a for a in auth_affil.split(',') if org_name in a]
                citation = [department, *hospital, article.title, article.journal, article.year, article.authors[0], len(article.authors), ", ".join(mesh), article.doi, article.issn, ", ".join(abstract)]
                citation_list.append(citation)
            elif org_name in corres_affil[0]:
                department = corres_affil[0].split(',')[0]
                hospital = [a for a in corres_affil[0].split(',') if org_name in a]
                citation = [department, *hospital, article.title, article.journal, article.year, article.authors[0], len(article.authors), ", ".join(mesh), article.doi, article.issn, ", ".join(abstract)]
                citation_list.append(citation)
        except IndexError:
            continue

    citation_df = pd.DataFrame(citation_list)[1:10]
    citation_df.columns = ["Department", "Hospital", "Title", "Journal", "Year", "Author", "Number of Authors", "MeSH", "DOI", "ISSN", "Abstract"]
    citation_df.to_csv(f"{data_file}/{org_name}_{year}_{month}-{str(int(month) + 1)}.csv", index=False, encoding="utf-8-sig")

pubmed_extraction("Chungbuk National University Hospital", r"C:\Users\오현택\Desktop\python\0514", "2022", "01")


2024-05-14 12:04:42,950 - DEBUG - PubMed 접속 완료
2024-05-14 12:04:42,950 - DEBUG - PubMed 접속 완료


-----------2022/01 Completed: 1/97!-----------
-----------2022/01 Completed: 2/97!-----------
-----------2022/01 Completed: 3/97!-----------
-----------2022/01 Completed: 4/97!-----------
-----------2022/01 Completed: 5/97!-----------
-----------2022/01 Completed: 6/97!-----------
-----------2022/01 Completed: 7/97!-----------
-----------2022/01 Completed: 8/97!-----------
-----------2022/01 Completed: 9/97!-----------
-----------2022/01 Completed: 10/97!-----------
-----------2022/01 Completed: 11/97!-----------
-----------2022/01 Completed: 12/97!-----------
-----------2022/01 Completed: 13/97!-----------
-----------2022/01 Completed: 14/97!-----------
-----------2022/01 Completed: 15/97!-----------
-----------2022/01 Completed: 16/97!-----------
-----------2022/01 Completed: 17/97!-----------
-----------2022/01 Completed: 18/97!-----------
-----------2022/01 Completed: 19/97!-----------
-----------2022/01 Completed: 20/97!-----------
-----------2022/01 Completed: 21/97!-----------
-

In [10]:
import pandas as pd
from argparse import ArgumentParser
from ast import literal_eval
import glob
import os
import re
from rapidfuzz import process, fuzz
import logging
from pathlib import Path

log = logging.getLogger(__name__)

In [18]:
def get_files(data_path):
    files = glob.glob('{}/*.{}'.format(data_path, 'csv'))
    df = pd.concat([pd.read_csv(file, usecols=["Department", "Hospital", "Title", "Journal", "Year", "Author", "Number of Authors", "MeSH", "DOI", "ISSN", "Abstract"]) for file in files])
    return df

department_mapping = {
    'Gastroenterology': '소화기내과',
    'Cardiology': '순환기내과',
    'Pulmonology': '호흡기내과',
    'Nephrology': '신장내과',
    'Rheumatology': '류마티스내과',
    'Endocrinology': '내분비내과',
    'Infectious': '감염내과',
    'Hematology': '혈액종양내과',
    'Oncology': '혈액종양내과',
    'Internal': '내과',
    'Upper Gastrointestinal': '위장관외과(상부)',
    'Hepatobiliary': '간담췌외과',
    'Colorectal': '대장항문외과',
    'Endocrine': '유방내분비외과',
    'Vascular': '이식혈관외과',
    'Trauma': '중환자외상외과',
    'Surgery': '외과',
    'Pediatric': '소아외과',
    'Cardiothoracic': '심장혈관흉부외과',
    'Orthopedic': '정형외과',
    'Neurosurgery': '신경외과',
    'Plastic': '성형외과',
    'Obstetrics and Gynecology': '산부인과',
    'Pediatrics': '소아과',
    'Psychiatry': '정신건강의학과',
    'Neurology': '신경과',
    'Oral': '치과',
    'Ophthalmology': '안과',
    'Otorhinolaryngology': '이비인후과',
    'Dermatology': '피부과',
    'Urology': '비뇨의학과',
    'Rehabilitation': '재활의학과',
    'Physical': '재활의학과',
    'Family Medicine': '가정의학과',
    'Dentistry': '치과',
    'Laboratory Medicine': '진단검사의학과',
    'Radiation Oncology': '방사선종양학과',
    'Radiation': '방사선종양학과',
    'Anesthesiology': '마취통증의학과',
    'Pathology': '병리과',
    'Radiology': '영상의학과',
    'Emergency Medicine': '응급의학과',
    'Nuclear Medicine': '핵의학과',
    'Clinical Pharmacology': '임상약리학과',
    'Health Screening': '건진센터',
    'Thyroid': '갑상선센터',
    'International': '국제진료센터',
    'Care': '생활치료센터',
    'Screening': '선별진료소과(소)',
    'Transplant': '이식외과',
    'COVID-19': '코로나19',
    'Division of Cardiology' : '심장내과',
    'Internal Medicine' : '내과'
}

In [34]:
df = get_files(r"C:\Users\오현택\Desktop\python\0514")

In [35]:
# "Department" 열의 값을 변경하여 "Department of"를 삭제
df['Department'] = df['Department'].str.replace('Department of ', '')

In [36]:
#진료과 한국어로 변경

for i, department in enumerate(df['Department']):
    kor_department = department_mapping.get(department, None)
    if kor_department:
        df.loc[i, 'Department_KOR'] = kor_department
    else:
        print(f"Error: Unrecognized department name '{department}'")

# Department_KOR 열이 없는 경우 빈 값으로 채우기
df['Department_KOR'] = df['Department_KOR'].fillna('')

In [38]:
cwts = pd.read_csv("C:/Users/오현택/Desktop/python/cwts.csv")
cwts_year = pd.pivot_table(cwts, values = 'SNIP', index = ['Source title', 'Electronic ISSN'], columns = "Year").reset_index()

In [39]:
years = [str(i) for i in range(1999, 2023)]

cwts_year.columns = list(["Full Journal", "ISSN", *years])
cwts_year = cwts_year[["Full Journal", "ISSN", "2022"]]
dictionary = cwts_year[["2022", "ISSN"]].set_index("ISSN").to_dict()['2022']

In [40]:
def output_IF(org_name, out_path):
    dictionary = create_dictionary()

    ######20240202 cyw ISSN 번호에 따라 IF 번호 저장
    df = pd.read_csv(f"{out_path}/{org_name}_KOR.csv", encoding="utf-8-sig")
    df['IF'] = df['ISSN'].apply(lambda x: dictionary.get(x, None))

    df[['Department', 'KOR', 'Year', 'Journal', 'DOI', 'IF']].drop_duplicates().to_csv(f"{out_path}/{org_name}_IF_stats.csv", encoding='utf-8-sig', index = False)

In [41]:
df['IF'] = df['ISSN'].map(dictionary)

    # 중복 행을 제거하고 DataFrame을 CSV 파일로 저장합니다.
df[['Department', 'Year', 'Journal', 'DOI', 'IF']].drop_duplicates()
df['IF'] = df['IF'].fillna(np.nan)

df

Unnamed: 0,Department,Hospital,Title,Journal,Year,Author,Number of Authors,MeSH,DOI,ISSN,Abstract,Department_KOR,IF
0,Cardiology,Chungbuk National University Hospital,Deep Learning Model for Predicting Rhythm Outc...,J Healthc Eng,2022,Lee DI,4,"Atrial Fibrillation, Catheter Ablation, Deep L...",10.1155/2022/2863495,2040-2309,Current guidelines on atrial fibrillation (AF)...,순환기내과,0.91
1,Otorhinolaryngology,Chungbuk National University Hospital,Effects of CoQ10 Replacement Therapy on the Au...,Biomed Res Int,2022,Nam DW,15,"Ataxia, Deafness, Hearing Loss, Sensorineural,...",10.1155/2022/5250254,2314-6141,Primary coenzyme Q10 (CoQ10) deficiency refers...,이비인후과,0.97
2,Surgery,Chungbuk National University Hospital,"The Risk Factors, Incidence and Prognosis of P...",Front Oncol,2022,Park S,13,,10.3389/fonc.2022.889433,2234-943X,The term 'pregnancy-associated breast cancer' ...,외과,
3,Division of Cardiology,Chungbuk National University Hospital,Case Report: Delayed Ventricular Pseudoaneurys...,Front Cardiovasc Med,2022,Kim M,8,,10.3389/fcvm.2022.887190,2297-055X,A 74-year-old woman presented with incessant w...,심장내과,1.44
4,Radiation Oncology,Chungbuk National University Hospital,"Fused toes homolog, a potential molecular regu...",PLoS One,2022,D S P,6,"Cell Line, Tumor, Female, Human papillomavirus...",10.1371/journal.pone.0266532,1932-6203,Human papillomavirus type 16 (HPV16) plays a m...,방사선종양학과,
5,Obstetrics and Gynecology,Chungbuk National University Hospital,"Awareness, intentions and attitudes towards pl...",Reprod Biomed Online,2022,Kim SM,7,"Attitude, Cross-Sectional Studies, Cryopreserv...",10.1016/j.rbmo.2022.01.002,1472-6491,"RESEARCH QUESTION: What are the awareness, int...",산부인과,1.27
6,Dermatology,Chungbuk National University Hospital,A Case of Primary Cutaneous Extraskeletal Ewin...,Ann Dermatol,2022,Lee JW,4,,10.5021/ad.2022.34.1.72,2005-3894,Primary cutaneous extraskeletal Ewing sarcoma ...,피부과,0.6
7,Pediatrics,Chungbuk National University Hospital,Association of Tandem Repeat Number Variabilit...,J Clin Med,2022,Lee JK,4,,10.3390/jcm11030715,2077-0383,,소아과,1.18
8,Internal Medicine,Chungbuk National University Hospital,A case report of nilotinib-induced irreversibl...,Medicine (Baltimore),2022,Cho JY,5,"Adrenal Cortex Hormones, Biopsy, Cough, Humans...",10.1097/MD.0000000000028701,1536-5964,RATIONALE: Nilotinib is a second line tyrosine...,내과,0.8


In [45]:
df.columns

Index(['Department', 'Hospital', 'Title', 'Journal', 'Year', 'Author',
       'Number of Authors', 'MeSH', 'DOI', 'ISSN', 'Abstract',
       'Department_KOR', 'IF'],
      dtype='object')

In [46]:
# 컬럼순서 정리
desired_columns = ['Department', 'Department_KOR', 'Hospital', 'Title', 'Journal', 'Year', 'Author','ISSN', 'IF','Number of Authors', 'MeSH', 'DOI', 'Abstract']


df = df.reindex(columns=desired_columns)


In [47]:
df

Unnamed: 0,Department,Department_KOR,Hospital,Title,Journal,Year,Author,ISSN,IF,Number of Authors,MeSH,DOI,Abstract
0,Cardiology,순환기내과,Chungbuk National University Hospital,Deep Learning Model for Predicting Rhythm Outc...,J Healthc Eng,2022,Lee DI,2040-2309,0.91,4,"Atrial Fibrillation, Catheter Ablation, Deep L...",10.1155/2022/2863495,Current guidelines on atrial fibrillation (AF)...
1,Otorhinolaryngology,이비인후과,Chungbuk National University Hospital,Effects of CoQ10 Replacement Therapy on the Au...,Biomed Res Int,2022,Nam DW,2314-6141,0.97,15,"Ataxia, Deafness, Hearing Loss, Sensorineural,...",10.1155/2022/5250254,Primary coenzyme Q10 (CoQ10) deficiency refers...
2,Surgery,외과,Chungbuk National University Hospital,"The Risk Factors, Incidence and Prognosis of P...",Front Oncol,2022,Park S,2234-943X,,13,,10.3389/fonc.2022.889433,The term 'pregnancy-associated breast cancer' ...
3,Division of Cardiology,심장내과,Chungbuk National University Hospital,Case Report: Delayed Ventricular Pseudoaneurys...,Front Cardiovasc Med,2022,Kim M,2297-055X,1.44,8,,10.3389/fcvm.2022.887190,A 74-year-old woman presented with incessant w...
4,Radiation Oncology,방사선종양학과,Chungbuk National University Hospital,"Fused toes homolog, a potential molecular regu...",PLoS One,2022,D S P,1932-6203,,6,"Cell Line, Tumor, Female, Human papillomavirus...",10.1371/journal.pone.0266532,Human papillomavirus type 16 (HPV16) plays a m...
5,Obstetrics and Gynecology,산부인과,Chungbuk National University Hospital,"Awareness, intentions and attitudes towards pl...",Reprod Biomed Online,2022,Kim SM,1472-6491,1.27,7,"Attitude, Cross-Sectional Studies, Cryopreserv...",10.1016/j.rbmo.2022.01.002,"RESEARCH QUESTION: What are the awareness, int..."
6,Dermatology,피부과,Chungbuk National University Hospital,A Case of Primary Cutaneous Extraskeletal Ewin...,Ann Dermatol,2022,Lee JW,2005-3894,0.6,4,,10.5021/ad.2022.34.1.72,Primary cutaneous extraskeletal Ewing sarcoma ...
7,Pediatrics,소아과,Chungbuk National University Hospital,Association of Tandem Repeat Number Variabilit...,J Clin Med,2022,Lee JK,2077-0383,1.18,4,,10.3390/jcm11030715,
8,Internal Medicine,내과,Chungbuk National University Hospital,A case report of nilotinib-induced irreversibl...,Medicine (Baltimore),2022,Cho JY,1536-5964,0.8,5,"Adrenal Cortex Hormones, Biopsy, Cough, Humans...",10.1097/MD.0000000000028701,RATIONALE: Nilotinib is a second line tyrosine...


In [48]:
df.to_excel("C:\\Users\\오현택\\Desktop\\python\\IF_output0514.xlsx")

In [26]:
dictionary

{'-': 1.23,
 '2053-1583': 1.13,
 '2190-5738': 0.81,
 '2329-7670': 1.1,
 '2092-6731': 1.6,
 '2550-2247': 0.83,
 '2013-3294': 0.45,
 '1614-2411': 1.51,
 '1662-8667': 0.24,
 '1984-4182': 0.64,
 '2575-3126': 0.65,
 '2214-7233': nan,
 '1477-3848': 2.07,
 '2376-0605': 0.45,
 '1844-9166': 0.46,
 '1559-7776': 0.49,
 '2162-5239': 0.38,
 '2375-5717': 0.14,
 '1825-1242': 0.32,
 '2210-738X': nan,
 '1530-9932': 0.88,
 '2515-9321': 0.59,
 '2330-5517': 0.3,
 '2330-5525': 0.0,
 '2145-4493': 0.43,
 '2326-3253': 0.0,
 '1932-2240': 0.17,
 '1557-7341': 7.15,
 '2153-2192': 1.19,
 '1550-4840': 1.01,
 '1551-3688': 1.78,
 '1558-1160': nan,
 '1936-7236': 1.23,
 '1549-6333': 1.71,
 '1544-3965': 0.84,
 '1558-3430': nan,
 '2375-4702': 1.11,
 '1556-4703': 1.31,
 '1942-3462': 0.71,
 '1557-945X': 1.07,
 '1557-7333': 1.2,
 '1557-7325': 2.26,
 '1946-6226': 2.07,
 '2637-8051': 1.0,
 '2378-9638': 1.2,
 '1557-4644': 1.55,
 '1557-7309': 0.83,
 '2167-8383': 1.48,
 '1557-7368': 3.05,
 '2573-9522': 2.12,
 '1558-2868': 3.49,


In [24]:
cwts_year

Year,Source title,Electronic ISSN,1999,2000,2001,2002,2003,2004,2005,2006,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,2000 IEEE Wireless Communications and Networki...,-,,,0.64,1.25,1.27,0.90,0.64,,...,,,,,,,,,,
1,21st Century Music,-,,,,,,0.00,0.00,0.00,...,,,,,,,,,,
2,2D Materials,2053-1583,,,,,,,,,...,,,1.28,1.04,1.22,1.33,1.46,1.42,1.33,1.13
3,3 Biotech,2190-5738,,,,,,,,,...,0.62,0.00,0.14,1.17,1.11,1.17,0.95,0.96,0.86,0.81
4,3D Printing and Additive Manufacturing,2329-7670,,,,,,,,,...,,,1.63,1.39,1.33,1.42,1.35,1.39,1.20,1.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33970,npj Systems Biology and Applications,2056-7189,,,,,,,,,...,,,,1.20,1.12,0.99,1.22,1.12,1.08,0.97
33971,npj Vaccines,2059-0105,,,,,,,,,...,,,,,0.40,1.30,1.34,1.62,1.99,1.63
33972,"primary care companion for CNS disorders, The",2155-7780,,0.35,0.08,0.15,0.15,0.41,0.57,0.66,...,0.70,0.58,0.58,0.55,0.59,0.58,0.65,0.59,0.48,0.25
33973,rEFLections,2651-1479,,,,,,,,,...,,,,,,,,,0.48,0.73
