In [None]:
import xml.etree.ElementTree as ET
import os, csv, gzip, glob
from ftplib import FTP

In [None]:
# Set the working directory (replace the placeholder with your own path).
current_dir = "YourWorkingDirectory"
raw_dir = os.path.join(current_dir, "RawData")

# Create the raw-data directory. exist_ok=True keeps this cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test;
# it also fails loudly if the path exists but is not a directory.
os.makedirs(raw_dir, exist_ok=True)

In [None]:
# Open a connection to PubMed and download files from the MEDLINE baseline.
# Data accessed in December 2017.
#
# File #533 (hardcoded) is the first file with a publication date in 2005.
# Note: at the time of data access there were roughly 400 files to download.
FIRST_FILE_NUMBER = 533

# The context manager ends the FTP session even if a transfer raises.
with FTP("ftp.ncbi.nlm.nih.gov") as ftp:
    ftp.login()
    ftp.cwd("pubmed")
    ftp.cwd("baseline")
    file_names = ftp.nlst()

    # Characters 9-12 of the archive name are parsed as its sequence number.
    # NOTE(review): assumes names shaped like "pubmed18n0533.xml.gz" -- confirm
    # against the current directory listing before re-running.
    good_files = [file for file in file_names
                  if file.endswith(".gz") and int(file[9:13]) >= FIRST_FILE_NUMBER]

    for file in good_files:
        print("Writing: " + file)
        # Close every local file as soon as its transfer finishes so the data
        # is flushed to disk and no file handles are leaked.
        with open(os.path.join(raw_dir, file), "wb") as local_file:
            ftp.retrbinary("RETR " + file, local_file.write)

In [None]:
# Returns node text if there is text, else returns empty string
def none_text(node):
    """Return the text of an XML element, or "" when there is none.

    Args:
        node: an ElementTree element, or None (what find() returns when the
            requested path does not exist).

    Returns:
        node.text when it is a non-empty string; otherwise the empty string.
    """
    if node is None:
        return ""
    # An element that exists but is empty (e.g. <ForeName/>) has text None;
    # normalise that to "" so callers always receive a string.
    return node.text or ""

# Pulls an author's first name (fname), last name (lname), and affiliation
def extract_author_info(obj):
    """Build a small record describing one author element.

    Args:
        obj: an element that is searched for "ForeName", "LastName" and
            "AffiliationInfo/Affiliation" children.

    Returns:
        A dict with the keys "fname", "lname" and "affiliation"; each value
        is whatever none_text() yields for the corresponding child.
    """
    field_paths = (
        ("fname", "ForeName"),
        ("lname", "LastName"),
        ("affiliation", "AffiliationInfo/Affiliation"),
    )
    record = {}
    for key, path in field_paths:
        record[key] = none_text(obj.find(path))
    return record

# Extracts and writes journal name, publication year, article title,
# first author's first and last name, last author's first and last name,
# first author's affiliation, article type, middle authors, and whether there is an abstract
def write_journal_csv(csvwriter, node):
    """Write one CSV row describing the article held in `node`.

    The row is, in order: journal, year, title, first author's first name,
    first author's last name, last author's first name, last author's last
    name, first author's affiliation, article type, middle authors, abstract.

    Args:
        csvwriter: object with a writerow() method (e.g. a csv.writer).
        node: element searched with paths relative to it, such as
            "Article/Journal/Title" and "Article/AuthorList".

    Returns:
        None. Nothing is written when the article has no AuthorList element
        or when that list contains no Author children.
    """
    journal        = none_text(node.find("Article/Journal/Title"))
    year           = none_text(node.find("Article/Journal/JournalIssue/PubDate/Year"))
    title          = none_text(node.find("Article/ArticleTitle"))
    # find() returns the first match, so only the first listed type is kept.
    article_type   = none_text(node.find("Article/PublicationTypeList/PublicationType"))
    # Boolean flag: True when the article carries an Abstract element.
    abstract       = node.find("Article/Abstract") is not None

    author_list    = node.find("Article/AuthorList")
    if author_list is None:
        return

    author_info    = [extract_author_info(author) for author in author_list.findall("Author")]
    if not author_info:
        # An AuthorList without Author children would otherwise raise
        # IndexError below; treat it the same as a missing list.
        return

    first_author   = author_info[0]
    fname_first    = first_author.get("fname")
    lname_first    = first_author.get("lname")
    affiliation    = first_author.get("affiliation")

    # With a single author the "last author" fields stay empty.
    fname_last     = ""
    lname_last     = ""
    middle_authors = []
    if len(author_info) > 1:
        last_author    = author_info[-1]
        fname_last     = last_author.get("fname")
        lname_last     = last_author.get("lname")
        # First names of everyone between the first and the last author.
        middle_authors = [author.get("fname") for author in author_info[1:-1]]

    csvwriter.writerow([journal, year, title, fname_first, lname_first, fname_last,
                        lname_last, affiliation, article_type, middle_authors, abstract])

# Given a gzipped XML file, writes every article it contains to "<file>.csv"
def process_xml_file(file):
    """Convert one gzipped PubMed XML file into a CSV file next to it.

    The output path is the input path with ".csv" appended. The first row is
    a header whose column order matches the rows produced by
    write_journal_csv(): first author's first and last name, then last
    author's first and last name.

    Args:
        file: path to a gzip-compressed XML file.

    Returns:
        None. Prints "Done: <output path>" once the CSV has been written.
    """
    # Parse before creating the output so an unreadable archive does not
    # leave a header-only CSV behind; the gzip handle is closed right after.
    with gzip.open(file) as xml_stream:
        root = ET.parse(xml_stream).getroot()
    nodes = root.findall("PubmedArticle/MedlineCitation")

    # newline="" is what the csv module requires to avoid blank lines on
    # Windows; UTF-8 keeps non-ASCII names and titles writable on any locale.
    with open(file + ".csv", "w", newline="", encoding="utf-8") as output:
        csvwriter = csv.writer(output)
        csvwriter.writerow(["journal", "year", "title", "fname_first", "lname_first",
                            "fname_last", "lname_last", "affiliation", "article_type",
                            "middle_authors", "abstract"])
        for node in nodes:
            write_journal_csv(csvwriter, node)

    print("Done: " + file + ".csv")

In [None]:
# Find every gzipped XML archive downloaded from PubMed and convert each to CSV
xml_pattern = os.path.join(raw_dir, "*.xml.gz")
file_names = glob.glob(xml_pattern)
for file in file_names:
    process_xml_file(file)