In [2]:
import xml.etree.ElementTree as ET
import pandas as pd
# Location of the raw PubMed XML file, relative to the notebook's working directory.
PUBMED_XML_PATH = 'data/raw/pubmed22n1113.xml'

# Parse the whole file into memory; `root` is the top-level element whose
# children are iterated over later in the notebook.
tree = ET.parse(PUBMED_XML_PATH)
root = tree.getroot()

In [27]:
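# Sample <PubmedArticle> record from the file, kept as a reference for the element paths used below:
# <PubmedArticle>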
#     <MedlineCitation Status="In-Data-Review" Owner="NLM">
#       <PMID Version="1">34848605</PMID>
#       <DateRevised>
#         <Year>2021</Year>
#         <Month>12</Month>
#         <Day>01</Day>
#       </DateRevised>
#       <Article PubModel="Print">
#         <Journal>
#           <ISSN IssnType="Electronic">1572-0241</ISSN>
#           <JournalIssue CitedMedium="Internet">
#             <Volume>116</Volume>
#             <Issue>Suppl 1</Issue>
#             <PubDate>
#               <Year>2021</Year>
#               <Month>Dec</Month>
#               <Day>01</Day>
#             </PubDate>
#           </JournalIssue>
#           <Title>The American journal of gastroenterology</Title>
#           <ISOAbbreviation>Am J Gastroenterol</ISOAbbreviation>
#         </Journal>
#         <ArticleTitle>P018 How to Improve Transition of Pediatric IBD Patients Through Use of EMR.</ArticleTitle>
#         <Pagination>
#           <MedlinePgn>S4-S5</MedlinePgn>
#         </Pagination>
#         <ELocationID EIdType="doi" ValidYN="Y">10.14309/01.ajg.0000798672.52636.9c</ELocationID>
#         <AuthorList CompleteYN="Y">
#           <Author ValidYN="Y">
#             <LastName>Guylda</LastName>
#             <ForeName>Johnson</ForeName>
#             <Initials>J</Initials>
#             <AffiliationInfo>
#               <Affiliation>University of Rochester Medical Center, Rochester, Minnesota, United States.</Affiliation>
#             </AffiliationInfo>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Christopher</LastName>
#             <ForeName>Walker</ForeName>
#             <Initials>W</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>John</LastName>
#             <ForeName>Miller</ForeName>
#             <Initials>M</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Ashley</LastName>
#             <ForeName>Steiger</ForeName>
#             <Initials>S</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Krystle</LastName>
#             <ForeName>Bittner</ForeName>
#             <Initials>B</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Rebecca</LastName>
#             <ForeName>Abell</ForeName>
#             <Initials>A</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Danielle</LastName>
#             <ForeName>Marino</ForeName>
#             <Initials>M</Initials>
#           </Author>
#         </AuthorList>
#         <Language>eng</Language>
#         <PublicationTypeList>
#           <PublicationType UI="D016428">Journal Article</PublicationType>
#         </PublicationTypeList>
#       </Article>
#       <MedlineJournalInfo>
#         <Country>United States</Country>
#         <MedlineTA>Am J Gastroenterol</MedlineTA>
#         <NlmUniqueID>0421030</NlmUniqueID>
#         <ISSNLinking>0002-9270</ISSNLinking>
#       </MedlineJournalInfo>
#       <CitationSubset>IM</CitationSubset>
#     </MedlineCitation>
#     <PubmedData>
#       <History>
#         <PubMedPubDate PubStatus="received">
#           <Year>2021</Year>
#           <Month>10</Month>
#           <Day>14</Day>
#         </PubMedPubDate>
#         <PubMedPubDate PubStatus="entrez">
#           <Year>2021</Year>
#           <Month>12</Month>
#           <Day>1</Day>
#           <Hour>6</Hour>
#           <Minute>7</Minute>
#         </PubMedPubDate>
#         <PubMedPubDate PubStatus="pubmed">
#           <Year>2021</Year>
#           <Month>12</Month>
#           <Day>2</Day>
#           <Hour>6</Hour>
#           <Minute>0</Minute>
#         </PubMedPubDate>
#         <PubMedPubDate PubStatus="medline">
#           <Year>2021</Year>
#           <Month>12</Month>
#           <Day>2</Day>
#           <Hour>6</Hour>
#           <Minute>0</Minute>
#         </PubMedPubDate>
#       </History>
#       <PublicationStatus>ppublish</PublicationStatus>
#       <ArticleIdList>
#         <ArticleId IdType="pubmed">34848605</ArticleId>
#         <ArticleId IdType="doi">10.14309/01.ajg.0000798672.52636.9c</ArticleId>
#         <ArticleId IdType="pii">00000434-202112001-00019</ArticleId>
#       </ArticleIdList>
#     </PubmedData>
#   </PubmedArticle>

In [35]:
# For each article we want to extract the following fields
# (paths are relative to the <PubmedArticle> element):
# - title => MedlineCitation/Article/ArticleTitle
# - doi => PubmedData/ArticleIdList/ArticleId[@IdType='doi']
# - pubmed_id => PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']
# - year_published => MedlineCitation/Article/Journal/JournalIssue/PubDate/Year
# - month_published => MedlineCitation/Article/Journal/JournalIssue/PubDate/Month
# - day_published => MedlineCitation/Article/Journal/JournalIssue/PubDate/Day
# - abstract => MedlineCitation/Article/Abstract/AbstractText

In [43]:
# Column-oriented accumulator: one independent, initially empty list per
# extracted field, in the order the columns should appear in the DataFrame.
ARTICLE_COLUMNS = (
    'title',
    'doi',
    'pubmed_id',
    'year_published',
    'month_published',
    'day_published',
    'abstract',
)
articles = {column: [] for column in ARTICLE_COLUMNS}

In [45]:
def _element_text(parent, path, default=None):
    """Return all text inside the first element matching ``path`` under ``parent``.

    ``Element.text`` only holds the text that precedes the first child element,
    so a value containing nested markup would be cut short (or be ``None``).
    This joins every text fragment inside the element instead.

    Args:
        parent: Element to search from.
        path: ElementTree path expression, relative to ``parent``.
        default: Value returned when nothing matches or the element has no text.

    Returns:
        The concatenated text of the matched element, or ``default``.
    """
    node = parent.find(path)
    if node is None:
        return default
    text = ''.join(node.itertext())
    return text if text else default


ARTICLE_PATH = 'MedlineCitation/Article'
PUB_DATE_PATH = ARTICLE_PATH + '/Journal/JournalIssue/PubDate'
ARTICLE_ID_PATH = 'PubmedData/ArticleIdList/ArticleId'

# Start from empty columns so that re-running this cell does not append every
# article a second time.
for column_values in articles.values():
    column_values.clear()

# Restrict the loop to <PubmedArticle> children: any other kind of element under
# the root lacks the paths read below and would otherwise raise AttributeError.
for pubmed_article in root.findall('PubmedArticle'):
    title = _element_text(pubmed_article, ARTICLE_PATH + '/ArticleTitle')

    # A missing DOI is recorded as None (unchanged from the previous behaviour).
    doi = _element_text(pubmed_article, ARTICLE_ID_PATH + '[@IdType="doi"]')

    # Fall back to MedlineCitation/PMID when the ArticleIdList entry is absent.
    # NOTE(review): assumes PMID equals the "pubmed" ArticleId, as in the sample
    # record shown earlier in this notebook -- confirm.
    pubmed_id = (
        _element_text(pubmed_article, ARTICLE_ID_PATH + '[@IdType="pubmed"]')
        or _element_text(pubmed_article, 'MedlineCitation/PMID')
    )

    # Missing date parts are recorded as '' (unchanged from the previous behaviour).
    year_published = _element_text(pubmed_article, PUB_DATE_PATH + '/Year', default='')
    month_published = _element_text(pubmed_article, PUB_DATE_PATH + '/Month', default='')
    day_published = _element_text(pubmed_article, PUB_DATE_PATH + '/Day', default='')

    # An abstract may be split across several <AbstractText> elements; keep all
    # of them (space-separated) rather than only the first. No abstract -> ''.
    abstract_sections = (
        ''.join(section.itertext())
        for section in pubmed_article.findall(ARTICLE_PATH + '/Abstract/AbstractText')
    )
    abstract = ' '.join(text for text in abstract_sections if text)

    articles['title'].append(title)
    articles['doi'].append(doi)
    articles['pubmed_id'].append(pubmed_id)
    articles['year_published'].append(year_published)
    articles['month_published'].append(month_published)
    articles['day_published'].append(day_published)
    articles['abstract'].append(abstract)

In [50]:
# TODO: Each article title is prefixed with an identifier of the form PXXX
#       (e.g. "P018"); we need to remove this prefix.

# TODO: Instead of empty strings, we should fill in the missing values with NaN

In [48]:
# Turn the per-column lists into a table with one row per article.
df = pd.DataFrame(data=articles)

# Preview the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract
0,P018 How to Improve Transition of Pediatric IB...,10.14309/01.ajg.0000798672.52636.9c,34848605,2021,Dec,1,
1,P019 Not All Fecal Calprotectin is Specific fo...,10.14309/01.ajg.0000798676.11183.dd,34848606,2021,Dec,1,
2,P021 WITHDRAWN.,10.14309/01.ajg.0000798684.47571.9e,34848608,2021,Dec,1,
3,P020 Proton Pump Inhibitor Therapy: Providing ...,10.14309/01.ajg.0000798680.74472.80,34848607,2021,Dec,1,
4,P022 Biologic Therapy and Therapeutic Drug Mon...,10.14309/01.ajg.0000798688.16985.68,34848609,2021,Dec,1,
