## Dependencies

In [5]:
# Install a pip package in the current Jupyter kernel
#!python -m pip install --upgrade pip pandas jupyterthemes biopython lxml
#!jt -t monokai -fs 12 -cellw 80% -T -N -kl
import re
import datetime
import pandas as pd
pmids = ["36058839", "34611887", "35123919", "32951293", "34043444", "28160076"]

In [214]:
%%time
def parse_pub_date(pub_date):
    if not pub_date:
        pub_date = ""
    begin = "^"
    end = "$"
    YYYY = "\d{4}"
    MM_DD ="\d{2}"
    MM_ABRV = "\D+"
    divider = "[\s|-]{1}"
    YYYY_MM = begin + YYYY + divider + MM_DD + end
    YYYY_ABRV = begin + YYYY + divider + MM_ABRV + end
    YYYY_ABRV_DD = begin + YYYY + divider + MM_ABRV + divider + MM_DD + end
    months = {"[Jj][Aa][Nn].*": "01", "[Ff][Ee][Bb].*": "02", "[Mm][Aa][Rr].*": "03",
              "[Aa][Pp][Rr].*": "04", "[Mm][Aa][Yy].*": "05", "[Jj][Uu][Nn].*": "06",
              "[Jj][Uu][Ll].*": "07", "[Aa][Uu][Gg].*": "08", "[Ss][Ee][Pp].*": "09",
              "[Oo][Cc][Tt].*": "10", "[Nn][Oo][Vv].*": "11", "[Dd][Ee][Cc].*": "12",
              "[Ww][Ii][Nn].*": "12", "[Ss][Pp][Rr].*": "03", "[Ss][Uu][Mm].*": "06",
              "[Aa][Uu][Tt].*": "09", "[Ff][Aa][Ll].*": "09"}
    MM_DD_YYYY = begin + "\d{1,2}\/{1}\d{1,2}\/{1}\d{4}" + end
    ORDINAL_DATE = begin + "\d{5}" + end
    if re.search(begin + YYYY + end, pub_date): #2023
        pub_date = pub_date + "-01-01"
    elif re.search(YYYY_MM, pub_date): #2023-06
        test = pub_date
        pub_date = pub_date + "-01"
    elif re.search(YYYY_ABRV, pub_date): #2023-Jun or 2023-Jun-23
        for month_re, month_number in months.items():
            YYYYMM_ABRV = begin + YYYY + divider + month_re + end
            if re.search(YYYYMM_ABRV, pub_date):
                pub_date = pub_date[:4] + "-" + month_number + "-01"
    elif re.search(YYYY_ABRV_DD, pub_date):
        for month_re, month_number in months.items():
            YYYYMM_ABRVDD = begin + YYYY + divider + month_re + divider + MM_DD + end
            if re.search(YYYYMM_ABRVDD, pub_date):
                pub_date = pub_date[:4] + "-" + month_number + "-" + pub_date[-2:]
    elif re.search(MM_DD_YYYY, pub_date): #
        pub_date = pub_date.split("/")
        pub_date = pub_date[2] + "-" + pub_date[0] + "-" + pub_date[1]
    elif re.search(ORDINAL_DATE, pub_date):
        print("Ordinal date: " + pub_date)
        offset = datetime.datetime(1900, 1, 1)
        pub_date = str((offset + datetime.timedelta(days=int(pub_date))).date())
        #pub_date = str(datetime.date.fromordinal(int(pub_date)))
        print("Ordinal date: " + pub_date)
    
    return pub_date

def get_journal_source(row):
    ISOAbbreviation = row["ISOAbbreviation"]
    Original_PubDate = " ".join(row["Original_PubDate"].split("-"))
    Volume = row["Volume"]
    Issue = row["Issue"]
    Pagination = row["Pagination"]
    PubModel = row["PubModel"]
    doi = ""
    pii = ""
    ELocationID = row["ELocationID"].strip("{").strip("}").split("||")
    for ELID in ELocationID:
        if "doi" in ELID:
            doi = ELID
        if "pii" in ELID:
            pii = ELID
    DT_PubDate = row["PubDate"]
    print(type(DT_PubDate))
    DT_ArticleDate = row["ArticleDate_Electronic"]
#     DT_ArticleDate = pd.to_datetime(parse_pub_date(ArticleDate_Electronic)).dt.strftime('%Y-%m-%d')
    #Methods Inf Med. 2012;51(3):189-98. doi: 10.3414/ME11-01-0055. Epub 2012 Apr 5.
    source = str(ISOAbbreviation) + "." 
    #See: https://www.nlm.nih.gov/bsd/licensee/journal_source.html
    if Original_PubDate:
        source = source + " " + str(Original_PubDate)
    if Volume:
        source = source + ";" + str(Volume)
    if Issue:
        source = source + "(" + str(Issue) + ")"
    if Pagination:
        source = source + ":" + str(Pagination) + "."
    elif pii:
        source = source + ":" + str(pii) + "."
    if doi:
        source = source + " " + str(doi) + "."
    # See:  https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#articledate
    if DT_ArticleDate and (DT_ArticleDate < DT_PubDate) and PubModel != "Electronic-eCollection": #if articledate is older/lower than print day
        try: #To remove trailing zero in Unix (Linux, OS X) use -
            source = source + " Epub " + DT_ArticleDate.strftime('%Y %b %-d') + "."
        except ValueError: #To remove trailing zero in Windows use #
            source = source + " Epub " + DT_ArticleDate.strftime('%Y %b %#d') + "."
    if PubModel == "Electronic-eCollection":
        source = source + " eCollection " + Original_PubDate + "."
        
    return source

CPU times: total: 0 ns
Wall time: 927 µs


In [217]:
%%time
xml_file ="pubmed_data.xml"
xslt_file = "PubMed_XML_to_Pandas_Transformer.xsl"
#dateparse = lambda x: datetime.strptime(x, '%Y-%m-%d') #Perhaps for the future
df = pd.read_xml(xml_file, xpath="/Citation", stylesheet=xslt_file, dtype=str)
date_columns = df.filter(like='Date').columns.to_list()
df["Original_PubDate"] = df["PubDate"].copy()
for column in date_columns:
    df[column] = pd.to_datetime(df[column].apply(lambda x: parse_pub_date(x)))#.dt.strftime('%Y-%m-%d')
df = df.fillna(value="")
df["Journal_Source"] = df.apply(lambda x: get_journal_source(x), axis=1)
#df['PMID'] = df['PMID'].astype(int)
df.to_excel("Transformed_PubMed_Data.xlsx", index=False, header=True)
df#["Language"][2]
columns_to_keep = ["PMID", "Original_PubDate", "PubDate", "PubModel", "Journal_Source"]
df = df[columns_to_keep]
df
#df["Journal_Source"][0]
#Oncologist. 2021 Mar;26(3):e435-e444. doi: 10.1002/onco.13527. Epub 2020 Oct 1.

<class 'pandas._libs.tslibs.timestamps.Timestamp'>


AttributeError: 'NoneType' object has no attribute 'split'

In [180]:

                            elif header == "Source":
                                citation_H = i
                            elif header == "Location Identifier": # needs iteration
                                doi_H = i
                            elif header == "Abstract":
                                abstract_H = i
                            elif header == "ISSN":
                                issn_H = i
                            elif header == "PMC ID":
                                pmc_H = i
                            elif header == "Journal Title":
                                journal_H = i
                            elif header == "Copyright Information":
                                copyright_H = i
                            elif header == "Title":
                                title_H = i
                            elif header == "Publication Type":
                                pub_type_H = i

SyntaxError: invalid syntax (3724410098.py, line 1)