## Dependencies

In [140]:
# Install a pip package in the current Jupyter kernel
#!python -m pip install --upgrade pip pandas jupyterthemes openpyxl xlsxwriter
#!jt -t monokai -fs 12 -cellw 80% -T -N -kl
import re
import datetime
import pandas as pd
# PubMed identifiers (PMIDs) of the articles of interest, kept as strings.
# These are the same six values that appear in the PMID column of the exported table.
pmids = ["36058839", "34611887", "35123919", "32951293", "34043444", "28160076"]

In [233]:
%%time
# First three letters (lower-cased) of a month or season name -> month number.
# Seasons map to a fixed month: winter=12, spring=03, summer=06, autumn/fall=09.
_MONTH_NUMBERS = {
    "jan": "01", "feb": "02", "mar": "03", "apr": "04", "may": "05", "jun": "06",
    "jul": "07", "aug": "08", "sep": "09", "oct": "10", "nov": "11", "dec": "12",
    "win": "12", "spr": "03", "sum": "06", "aut": "09", "fal": "09",
}

# One separator character between date parts. Inside a character class "|" is a
# literal pipe; it is kept so every input accepted before is still accepted.
_DIVIDER = r"[\s|-]"

# Raw strings: "\d", "\s" and "\D" are regex classes, not Python string escapes.
_YEAR_ONLY = re.compile(r"^\d{4}$")                                              # 2023
_YEAR_MONTH = re.compile(r"^(\d{4})" + _DIVIDER + r"(\d{2})$")                   # 2023-06
_YEAR_NAME = re.compile(r"^(\d{4})" + _DIVIDER + r"(\D+)$")                      # 2023-Jun
_YEAR_NAME_DAY = re.compile(
    r"^(\d{4})" + _DIVIDER + r"(\D+)" + _DIVIDER + r"(\d{1,2})$"                 # 2023-Jun-23 / 2023 Jun 5
)
_MONTH_DAY_YEAR = re.compile(r"^\d{1,2}/\d{1,2}/\d{4}$")                         # 6/23/2023
_FIVE_DIGITS = re.compile(r"^\d{5}$")                                            # 44927


def _month_number(name):
    """Return the two-digit month for a month/season name, or None if unknown."""
    return _MONTH_NUMBERS.get(name[:3].lower())


def parse_date_as_daytime(pub_date):
    """Normalise a loosely formatted date string to ``"YYYY-MM-DD HH:MM:SS"``.

    Recognised date layouts (the separator may be whitespace, ``-`` or ``|``):

    * ``YYYY``                 -> first day of that year
    * ``YYYY-MM``              -> first day of that month
    * ``YYYY-Mon``             -> first day of the named month or season
    * ``YYYY-Mon-D[D]``        -> that day of the named month
    * ``M/D/YYYY``             -> re-ordered to ``YYYY-M-D`` (not zero padded)
    * five digits              -> number of days added to 1900-01-01

    A second space-separated token is treated as the time of day only when it
    contains a colon; otherwise the whole string is parsed as a date, so
    space-separated dates such as ``"2012 Apr 5"`` are handled. When no time is
    present ``" 00:00:00"`` is appended. Strings that match none of the layouts
    (or whose month name is unknown) are passed through with the time appended.

    Parameters
    ----------
    pub_date : str
        The raw date text. May be empty.

    Returns
    -------
    str or datetime.datetime
        The normalised string, or ``datetime.datetime.min`` when ``pub_date``
        is empty/falsy.
    """
    if not pub_date:
        return datetime.datetime.min

    time = " 00:00:00"
    tokens = pub_date.split(" ")
    # Only peel off the second token when it looks like a clock time; a bare
    # split would swallow the month of a space-separated date ("2012 Apr 5").
    if len(tokens) > 1 and ":" in tokens[1]:
        pub_date = tokens[0]
        time = " " + tokens[1]

    year_month = _YEAR_MONTH.search(pub_date)
    year_name = _YEAR_NAME.search(pub_date)
    year_name_day = _YEAR_NAME_DAY.search(pub_date)

    if _YEAR_ONLY.search(pub_date):
        pub_date = pub_date + "-01-01"
    elif year_month:
        year, month = year_month.groups()
        pub_date = year + "-" + month + "-01"
    elif year_name:
        year, name = year_name.groups()
        month = _month_number(name)
        if month:  # unknown names are left untouched
            pub_date = year + "-" + month + "-01"
    elif year_name_day:
        year, name, day = year_name_day.groups()
        month = _month_number(name)
        if month:  # unknown names are left untouched
            pub_date = year + "-" + month + "-" + day.zfill(2)
    elif _MONTH_DAY_YEAR.search(pub_date):
        month, day, year = pub_date.split("/")
        pub_date = year + "-" + month + "-" + day
    elif _FIVE_DIGITS.search(pub_date):
        print("Ordinal date: " + pub_date)
        # NOTE(review): if these values are spreadsheet serial numbers the usual
        # epoch is 1899-12-30, which would make this two days late - confirm.
        offset = datetime.datetime(1900, 1, 1)
        pub_date = str((offset + datetime.timedelta(days=int(pub_date))).date())
        print("Ordinal date: " + pub_date)

    return pub_date + time

def _is_usable_date(value):
    """Return True when ``value`` is a real date (not None, empty, or NaT/NaN)."""
    return value is not None and not pd.isna(value) and bool(value)


def _citation_date(value):
    """Format a date as ``YYYY Mon D`` with no leading zero on the day.

    Uses ``value.day`` instead of the platform-specific ``%-d`` / ``%#d``
    strftime flags, so the result is the same on every operating system.
    """
    return value.strftime("%Y %b ") + str(value.day)


def get_journal_source(row):
    """Build a PubMed-style journal source string for one citation row.

    Example of the target layout:
    ``Methods Inf Med. 2012;51(3):189-98. doi: 10.3414/ME11-01-0055. Epub 2012 Apr 5.``
    See https://www.nlm.nih.gov/bsd/licensee/journal_source.html

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide ``ISOAbbreviation``. When that is non-empty the keys
        ``Original_PubDate``, ``Volume``, ``Issue``, ``Pagination``,
        ``PubModel``, ``ELocationID``, ``PubDate`` and
        ``ArticleDate_Electronic`` are read as well. The two date values are
        expected to be datetime-like or missing (``NaT``/empty).

    Returns
    -------
    str
        The assembled source string ending in exactly one period, or ``""``
        when ``ISOAbbreviation`` is empty.
    """
    if not row["ISOAbbreviation"]:
        return ""

    iso_abbreviation = row["ISOAbbreviation"]
    original_pub_date = " ".join(row["Original_PubDate"].split("-"))
    volume = row["Volume"]
    issue = row["Issue"]
    pagination = row["Pagination"]
    is_ecollection = row["PubModel"] == "Electronic-eCollection"
    pub_date = row["PubDate"]
    article_date = row["ArticleDate_Electronic"]
    has_article_date = _is_usable_date(article_date)

    # ELocationID holds "||"-separated entries; keep the last doi / pii entry.
    doi = ""
    pii = ""
    for entry in row["ELocationID"].strip("{").strip("}").split("||"):
        if "doi" in entry:
            doi = entry
        if "pii" in entry:
            pii = entry

    source = str(iso_abbreviation) + "."

    # eCollections lead with the electronic article date; everything else
    # leads with the publication date exactly as it was supplied.
    if original_pub_date and not is_ecollection:
        source += " " + str(original_pub_date)
    elif original_pub_date and is_ecollection and has_article_date:
        source += " " + _citation_date(article_date)

    if volume:
        source += ";" + str(volume)
    if issue:
        source += "(" + str(issue) + ")"
    if pagination:
        source += ":" + str(pagination) + "."
    elif pii:
        source += ":" + str(pii) + "."
    if doi:
        source += " " + str(doi) + "."

    # See: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#articledate
    # Add "Epub" only when the electronic date precedes the publication date.
    if (
        has_article_date
        and _is_usable_date(pub_date)
        and article_date < pub_date
        and not is_ecollection
    ):
        source += " Epub " + _citation_date(article_date)
    if is_ecollection:
        source += " eCollection " + original_pub_date

    # Pagination and DOI already end with a period; avoid emitting "..".
    if not source.endswith("."):
        source += "."
    return source

CPU times: total: 0 ns
Wall time: 0 ns


In [267]:
%%time
def _strip_label(piece, label):
    """Remove a leading ``"<label>:"`` tag and surrounding whitespace from ``piece``.

    ``str.strip("doi: ")`` is not used because it strips a *set of characters*
    from both ends and would eat trailing d/o/i characters of the identifier.
    """
    text = piece.strip()
    prefix = label + ":"
    if text.startswith(prefix):
        text = text[len(prefix):].strip()
    return text


def _extract_labelled(field, separator, label):
    """Join, without the label, every ``separator``-delimited piece of ``field`` that contains ``label``."""
    return "".join(
        _strip_label(piece, label) for piece in field.split(separator) if label in piece
    )


xml_file = "pubmed_data3.xml"
xslt_file = "PubMed_XML_to_Pandas_Transformer.xsl"

# Read everything as text; missing values become "" so the string helpers work.
df = pd.read_xml(xml_file, xpath="/Citation", stylesheet=xslt_file, dtype=str)
date_columns = df.filter(like='Date').columns.to_list()
df = df.fillna(value="")

# Keep the raw date text in "Original_<column>" and parse the column itself;
# values that cannot be parsed become NaT (errors="coerce").
for column in date_columns:
    df["Original_" + column] = df[column].copy()
    df[column] = df[column].apply(lambda x: pd.to_datetime(parse_date_as_daytime(x), errors="coerce"))

df["Journal_Source"] = df.apply(get_journal_source, axis=1)
df["DOI"] = df["ELocationID"].apply(lambda x: _extract_labelled(x, "||", "doi"))
df["PMC"] = df["ArticleIdList"].apply(lambda x: _extract_labelled(x, "\\", "pmc"))

columns_to_keep = ["PMID", "AuthorList_Fullnames", "PubDate", "Grant_Number", "Journal_Source", "DOI", "Abstract", "ISSN_Electronic", "PMC", "JournalTitle", "CopyrightInformation", "ArticleTitle", "PublicationTypeList"]
df = df[columns_to_keep]

# The context manager closes the workbook even if writing fails part-way.
with pd.ExcelWriter("Transformed_PubMed_Data.xlsx",
                    engine="xlsxwriter",
                    datetime_format="yyyy-mm-dd hh:mm:ss",
                    date_format="yyyy-mm-dd") as writer:
    # Convert the dataframe to an XlsxWriter Excel object.
    df.to_excel(writer, sheet_name="Sheet1", index=False, header=True)
    workbook = writer.book
    worksheet = writer.sheets["Sheet1"]
    (max_row, max_col) = df.shape
    # NOTE(review): columns are zero-indexed, so this leaves the first column
    # at default width and extends one past the last - confirm that is intended.
    worksheet.set_column(1, max_col, 20)
df
#df["PMC"]#[2]
#Front Genet. 2013 Jan 25;3:330. doi: 10.3389/fgene.2012.00330. eCollection 2012.

CPU times: total: 109 ms
Wall time: 106 ms


Unnamed: 0,PMID,AuthorList_Fullnames,PubDate,Grant_Number,Journal_Source,DOI,Abstract,ISSN_Electronic,PMC,JournalTitle,CopyrightInformation,ArticleTitle,PublicationTypeList
0,36058839,"Singhal, Surbhi||Walter, Louise C||Smith, Alex...",2023-03-01,K24 AG068312\AG\NIA NIH HHS\United States||K76...,J Geriatr Oncol. 2023 Mar;14(2):101366. doi: 1...,10.1016/j.jgo.2022.08.015,INTRODUCTION: Functional outcomes during non-s...,1879-4076,PMC9974579,Journal of geriatric oncology,Copyright © 2022 The Authors. Published by Els...,Change in four measures of physical function a...,Journal Article
1,34611887,"Wong, Melisa L||Shi, Ying||Smith, Alexander K|...",2022-01-01,KL2 TR001870\TR\NCATS NIH HHS\United States||K...,J Am Geriatr Soc. 2022 Jan;70(1):136-149. doi:...,10.1111/jgs.17474,BACKGROUND: Maintenance of function during can...,1532-5415,PMC8742783,Journal of the American Geriatrics Society,© 2021 The American Geriatrics Society.,Changes in older adults' life space during lun...,"Journal Article||Research Support, N.I.H., Ext..."
2,35123919,"Wong, Melisa L||Nicosia, Francesca M||Smith, A...",2022-06-01,K24 AG068312\AG\NIA NIH HHS\United States||K76...,J Geriatr Oncol. 2022 Jun;13(5):606-613. doi: ...,10.1016/j.jgo.2022.01.014,BACKGROUND: Shared decision making (SDM) is es...,1879-4076,PMC9232997,Journal of geriatric oncology,Copyright © 2022 Elsevier Ltd. All rights rese...,"""You have to be sure that the patient has the ...","Journal Article||Research Support, N.I.H., Ext..."
3,32951293,"Wong, Melisa L||Gao, Junheng||Thanarajasingam,...",2021-03-01,KL2 TR001870\TR\NCATS NIH HHS\United States||K...,Oncologist. 2021 Mar;26(3):e435-e444. doi: 10....,10.1002/onco.13527,BACKGROUND: Prior comparisons of chemotherapy ...,1549-490X,PMC7930405,The oncologist,© AlphaMed Press 2020.,Expanding Beyond Maximum Grade: Chemotherapy T...,"Journal Article||Research Support, N.I.H., Ext..."
4,34043444,"Presley, Carolyn J||Gomes, Fabio||Burd, Christ...",2021-07-01,R01 AG059711\AG\NIA NIH HHS\United States||K76...,J Clin Oncol. 2021 Jul 01;39(19):2115-2127. do...,10.1200/JCO.21.00138,,1527-7755,PMC8260908,Journal of clinical oncology : official journa...,,Immunotherapy in Older Adults With Cancer.,"Journal Article||Research Support, N.I.H., Ext..."
5,28160076,"Wong, Melisa L||Paul, Steven M||Cooper, Bruce ...",2017-06-01,K05 CA168960\CA\NCI NIH HHS\United States||UL1...,Support Care Cancer. 2017 Jun;25(6):1931-1939....,10.1007/s00520-017-3593-z,PURPOSE: Few studies have examined interindivi...,1433-7339,PMC5433346,Supportive care in cancer : official journal o...,,Predictors of the multidimensional symptom exp...,Journal Article
