In [87]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
tree = ET.parse('data/raw/pubmed22n1113.xml')
root = tree.getroot()

In [2]:
# <PubmedArticle>
#     <MedlineCitation Status="In-Data-Review" Owner="NLM">
#       <PMID Version="1">34848605</PMID>
#       <DateRevised>
#         <Year>2021</Year>
#         <Month>12</Month>
#         <Day>01</Day>
#       </DateRevised>
#       <Article PubModel="Print">
#         <Journal>
#           <ISSN IssnType="Electronic">1572-0241</ISSN>
#           <JournalIssue CitedMedium="Internet">
#             <Volume>116</Volume>
#             <Issue>Suppl 1</Issue>
#             <PubDate>
#               <Year>2021</Year>
#               <Month>Dec</Month>
#               <Day>01</Day>
#             </PubDate>
#           </JournalIssue>
#           <Title>The American journal of gastroenterology</Title>
#           <ISOAbbreviation>Am J Gastroenterol</ISOAbbreviation>
#         </Journal>
#         <ArticleTitle>P018 How to Improve Transition of Pediatric IBD Patients Through Use of EMR.</ArticleTitle>
#         <Pagination>
#           <MedlinePgn>S4-S5</MedlinePgn>
#         </Pagination>
#         <ELocationID EIdType="doi" ValidYN="Y">10.14309/01.ajg.0000798672.52636.9c</ELocationID>
#         <AuthorList CompleteYN="Y">
#           <Author ValidYN="Y">
#             <LastName>Guylda</LastName>
#             <ForeName>Johnson</ForeName>
#             <Initials>J</Initials>
#             <AffiliationInfo>
#               <Affiliation>University of Rochester Medical Center, Rochester, Minnesota, United States.</Affiliation>
#             </AffiliationInfo>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Christopher</LastName>
#             <ForeName>Walker</ForeName>
#             <Initials>W</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>John</LastName>
#             <ForeName>Miller</ForeName>
#             <Initials>M</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Ashley</LastName>
#             <ForeName>Steiger</ForeName>
#             <Initials>S</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Krystle</LastName>
#             <ForeName>Bittner</ForeName>
#             <Initials>B</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Rebecca</LastName>
#             <ForeName>Abell</ForeName>
#             <Initials>A</Initials>
#           </Author>
#           <Author ValidYN="Y">
#             <LastName>Danielle</LastName>
#             <ForeName>Marino</ForeName>
#             <Initials>M</Initials>
#           </Author>
#         </AuthorList>
#         <Language>eng</Language>
#         <PublicationTypeList>
#           <PublicationType UI="D016428">Journal Article</PublicationType>
#         </PublicationTypeList>
#       </Article>
#       <MedlineJournalInfo>
#         <Country>United States</Country>
#         <MedlineTA>Am J Gastroenterol</MedlineTA>
#         <NlmUniqueID>0421030</NlmUniqueID>
#         <ISSNLinking>0002-9270</ISSNLinking>
#       </MedlineJournalInfo>
#       <CitationSubset>IM</CitationSubset>
#     </MedlineCitation>
#     <PubmedData>
#       <History>
#         <PubMedPubDate PubStatus="received">
#           <Year>2021</Year>
#           <Month>10</Month>
#           <Day>14</Day>
#         </PubMedPubDate>
#         <PubMedPubDate PubStatus="entrez">
#           <Year>2021</Year>
#           <Month>12</Month>
#           <Day>1</Day>
#           <Hour>6</Hour>
#           <Minute>7</Minute>
#         </PubMedPubDate>
#         <PubMedPubDate PubStatus="pubmed">
#           <Year>2021</Year>
#           <Month>12</Month>
#           <Day>2</Day>
#           <Hour>6</Hour>
#           <Minute>0</Minute>
#         </PubMedPubDate>
#         <PubMedPubDate PubStatus="medline">
#           <Year>2021</Year>
#           <Month>12</Month>
#           <Day>2</Day>
#           <Hour>6</Hour>
#           <Minute>0</Minute>
#         </PubMedPubDate>
#       </History>
#       <PublicationStatus>ppublish</PublicationStatus>
#       <ArticleIdList>
#         <ArticleId IdType="pubmed">34848605</ArticleId>
#         <ArticleId IdType="doi">10.14309/01.ajg.0000798672.52636.9c</ArticleId>
#         <ArticleId IdType="pii">00000434-202112001-00019</ArticleId>
#       </ArticleIdList>
#     </PubmedData>
#   </PubmedArticle>

In [3]:
# For each article we want to extract the following information:
# - Title => MedlineCitation/Article/ArticleTitle
# - doi => PubmedData/ArticleIdList/ArticleId[@IdType='doi']
# - pubmed_id => PubmedData/ArticleIdList/ArticleId[@IdType='pubmed']
# - year_published => MedlineCitation/Article/Journal/JournalIssue/PubDate/Year
# - month_published => MedlineCitation/Article/Journal/JournalIssue/PubDate/Month
# - day_published => MedlineCitation/Article/Journal/JournalIssue/PubDate/Day
# - Abstract => MedlineCitation/Article/Abstract/AbstractText

In [111]:
articles = {
    'title': [],
    'doi': [],
    'pubmed_id': [],
    'year_published': [],
    'month_published': [],
    'day_published': [],
    'abstract': []
}

In [112]:
skipped_articles_types = []

for pubmed_article in root:
    # Check that:
    # <PublicationTypeList>
    #      <PublicationType UI="D016428">Journal Article</PublicationType>
    #    </PublicationTypeList>
    # the PublicationType is Journal Article
    if pubmed_article.find('MedlineCitation/Article/PublicationTypeList/PublicationType').get('UI') != 'D016428':
        skipped_articles_types.append(pubmed_article.find('MedlineCitation/Article/PublicationTypeList/PublicationType').get('UI'))
        continue # skip this article

    title = pubmed_article.find('MedlineCitation/Article/ArticleTitle')
    title = ET.tostring(title, encoding='utf-8').decode('utf-8')
    title = title.replace('<ArticleTitle>', '').replace('</ArticleTitle>', '')
    # Remove the \n
    title = title.replace('\n', '')
    # Skip the first P*** in the title
    # Check if the title starts with a P followed by 3 numbers 0-9
    if title[0] == 'P' and title[1:4].isdigit():
        title = title[5:]

    doi = pubmed_article.find('PubmedData/ArticleIdList/ArticleId[@IdType="doi"]')
    doi = doi.text if doi is not None else np.nan
    
    pubmed_id = pubmed_article.find('PubmedData/ArticleIdList/ArticleId[@IdType="pubmed"]').text

    year_pub = pubmed_article.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Year')
    month_pub = pubmed_article.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Month')
    day_pub = pubmed_article.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Day')

    year_published = pubmed_article.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Year').text if year_pub is not None else np.nan
    month_published = pubmed_article.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Month').text if month_pub is not None else np.nan
    day_published = pubmed_article.find('MedlineCitation/Article/Journal/JournalIssue/PubDate/Day').text if day_pub is not None else np.nan
    
    ab = pubmed_article.find('MedlineCitation/Article/Abstract/AbstractText')
    if ab is not None:
        abstract = ab.text
    else:
        abstract = np.nan

    # print('Title:', title)
    # print('doi:', doi)
    # print('pubmed_id:', pubmed_id)
    # print('year_published:', year_published)
    # print('month_published:', month_published)
    # print('day_published:', day_published)
    # print('abstract:', abstract)
    # print()
    # break
    articles['title'].append(title)
    articles['doi'].append(doi)
    articles['pubmed_id'].append(pubmed_id)
    articles['year_published'].append(year_published)
    articles['month_published'].append(month_published)
    articles['day_published'].append(day_published)
    articles['abstract'].append(abstract)

print('Skipped articles:', len(skipped_articles_types))
print('Skipped articles types:', list(set(skipped_articles_types))) # TODO: Check if some of these are actually valid

Skipped articles: 3466
Skipped articles types: ['D017065', 'D016433', 'D016422', 'D017203', 'D029282', 'D016430', 'D017418', 'D054711', 'D000075742', 'D016440', 'D000076942', 'D016421', 'D016425', 'D013485', 'D004740', 'D059040', 'D016423', 'D000078182', 'D016439', 'D002363']


In [113]:
df = pd.DataFrame(articles)
print(df.shape)
df.head()

(26534, 7)


Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract
0,How to Improve Transition of Pediatric IBD Pat...,10.14309/01.ajg.0000798672.52636.9c,34848605,2021,Dec,1,
1,Not All Fecal Calprotectin is Specific for Inf...,10.14309/01.ajg.0000798676.11183.dd,34848606,2021,Dec,1,
2,WITHDRAWN.,10.14309/01.ajg.0000798684.47571.9e,34848608,2021,Dec,1,
3,"Proton Pump Inhibitor Therapy: Providing ""Acid...",10.14309/01.ajg.0000798680.74472.80,34848607,2021,Dec,1,
4,Biologic Therapy and Therapeutic Drug Monitori...,10.14309/01.ajg.0000798688.16985.68,34848609,2021,Dec,1,


In [114]:
# Count the number of articles with NaN values
df.isna().sum()

title                 0
doi                 351
pubmed_id             0
year_published      519
month_published    5254
day_published      8647
abstract           3202
dtype: int64

In [115]:
# Remove all rows with NaN abstract
df.dropna(subset=['abstract'], inplace=True)

In [116]:
# Count the number of articles with NaN values
df.isna().sum()

title                 0
doi                 305
pubmed_id             0
year_published      397
month_published    4321
day_published      7092
abstract              0
dtype: int64

In [117]:
# Find the min lenght of the title
min_title_length = df['title'].str.len().min()
min_title_length

# Print the titles with the min length
df[df['title'].str.len() == min_title_length]

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract
2104,GenBank.,10.1093/nar/gkab1135,34850943,2021,Dec,1,GenBank® (https://www.ncbi.nlm.nih.gov/genbank...


df[df['title'] == "REMOVED."]

In [118]:
df[df['title'].str.len() == min_title_length+1]

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract


In [119]:
df[df['title'].str.len() == min_title_length+2]

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract


In [120]:
df[df['title'].str.len() == min_title_length+3]

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract
6267,Gout Storm.,10.12659/AJCR.932683,34855717,2021,Sep,20,BACKGROUND Gout is a chronic disease character...


In [121]:
df[df['title'].str.len() == min_title_length+4]

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract


In [122]:
df[df['title'].str.len() == min_title_length+5]

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract


In [123]:
df[df['title'].str.len() == min_title_length+6]

Unnamed: 0,title,doi,pubmed_id,year_published,month_published,day_published,abstract
23558,Seaweed rafts.,10.1016/j.cub.2021.10.053,34875235,2021,Dec,6,Jonathan Waters provides an introduction to se...
