In [203]:
import os
import datetime
import csv
pmcdir = 'pmc_articles'
from plos_corpus import (corpusdir, get_article_pubdate, check_if_uncorrected_proof, listdir_nohidden,
                         get_article_xml, file_to_doi, doi_to_file)

from samples.corpus_analysis import (get_plos_article_type, get_article_dtd, get_random_list_of_dois, 
                                     get_related_retraction_article, check_article_type)


In [198]:
def get_article_metadata(article_file, size='small'):
    """
    For an individual article in the PLOS corpus, create a tuple of a set of metadata fields sbout that corpus.
    Make it small, medium, or large depending on number of fields desired.
    :param article_file: individual local PLOS XML article
    :param size: small, medium or large, aka how many fields to return for each article
    :return: tuple of metadata fields
    """
    doi = file_to_doi(article_file)
    filename = os.path.basename(article_file).rstrip('.xml')
    title = get_article_title(article_file)
    journal = get_plos_journal(article_file)
    jats_article_type = check_article_type(article_file)
    plos_article_type = get_plos_article_type(article_file)
    dtd_version = get_article_dtd(article_file)
    dates = get_article_dates(article_file, string=True)
    (pubdate, collection, received, accepted) = ('','','','')
    pubdate = dates['epub']
    try:
        collection = dates['collection']
    except KeyError:
        pass
    try:
        received = dates['received']
    except KeyError:
        pass
    try:
        accepted = dates['accepted']
    except KeyError:
        pass
    metadata = [doi, filename, title, journal, jats_article_type, plos_article_type, dtd_version, pubdate,
                received, accepted, collection]
    metadata = tuple(metadata)
    if len(metadata) == 11:
        return metadata
    else:
        print('Error in {}: {} items'.format(article_file, len(article_file)))
        return False

In [205]:
def get_plos_journal(article_file, caps_fixed=True):
    """
    For an individual PLOS article, get the journal it was published in.
    :param article_file: individual local PLOS XML article
    :param caps_fixed: whether to render the journal name correctly or as-is
    :return: PLOS journal at specified xpath location
    """
    journal = get_article_xml(article_file=article_file,
                              tag_path_elements=["/",
                                                 "article",
                                                 "front",
                                                 "journal-meta",
                                                 "journal-title-group",
                                                 "journal-title"])
    try:
        journal = journal[0].text
    except IndexError:
        print('Error in journal name for {}'.format(article_file))
    if caps_fixed:
        journal = journal.split()
        if journal[0].lower() == 'plos':
            journal[0] = "PLOS"
        journal = (' ').join(journal)
    return journal

In [175]:
article_file = listdir_nohidden(corpusdir)[4638]
get_plos_journal(article_file)

'PLOS Computational Biology'

In [166]:
def get_article_title(article_file):
    """
    For an individual PLOS article, get its title.
    :param article_file: individual local PLOS XML article
    :return: article title at specified xpath location
    """
    title = get_article_xml(article_file=article_file,
                            tag_path_elements=["/",
                                               "article",
                                               "front",
                                               "article-meta",
                                               "title-group",
                                               "article-title"])
    return title[0].text

In [200]:
def parse_article_date(date_element, date_format='%d %m %Y'):
    """
    For an article date element, convert XML to a datetime object
    :param date_format: string format used to convert to datetime object
    :return: datetime object
    """
    day = ''
    month = ''
    year = ''
    for item in date_element.getchildren():
        if item.tag == 'day':
            day = item.text
        if item.tag == 'month':
            month = item.text
        if item.tag == 'year':
            year = item.text
    if day:
        date = (day, month, year)
        string_date = ' '.join(date)
        date = datetime.datetime.strptime(string_date, date_format)
    elif month:
        date = (month, year)
        string_date = ' '.join(date)
        date = datetime.datetime.strptime(string_date, '%m %Y')
    elif year:
        date = year
        date = datetime.datetime.strptime(date, '%Y')
    else:
        print('date error')
        date = ''
    return date

def get_article_dates(article_file, string=False):
    """
    For an individual article, get all of its dates
    :param article_file: file path/DOI of the article
    :return: dictionary of date types mapped to datetime objects for that article
    """
    dates = {}

    tag_path_1 = ["/",
                  "article",
                  "front",
                  "article-meta",
                  "pub-date"]
    raw_xml_1 = get_article_xml(article_file=article_file,
                              tag_path_elements=tag_path_1)
    for element in raw_xml_1:
        pub_type = element.get('pub-type')
        date = parse_article_date(element)
        dates[pub_type] = date

    tag_path_2 = ["/",
                  "article",
                  "front",
                  "article-meta",
                  "history"]        
    raw_xml_2 = get_article_xml(article_file=article_file,
                              tag_path_elements=tag_path_2)
    for element in raw_xml_2:
        for part in element:
            date_type = part.get('date-type')
            date = parse_article_date(part)
            dates[date_type] = date
    if 'received' in dates and 'accepted' in dates:
        if not dates['received'] <= dates['accepted'] <= dates['epub']:
            print('{} dates not in correct order: {}'.format(article_file, dates))
    if string:
        for key, value in dates.items():
            dates[key] = value.strftime('%Y-%m-%d')

    return dates

In [182]:
def get_corpus_metadata(article_list=None):
    """
    Run get_article_metadata() on a list of files, by default every file in corpusdir
    :param article_list: list of articles to run it on
    :return: list of tuples for each article
    """
    if article_list is None:
        article_list = listdir_nohidden(corpusdir)
    corpus_metadata = [get_article_metadata(article) for article in article_list]
    return corpus_metadata

In [188]:
def corpus_metadata_to_csv(corpus_metadata=None):
    """
    Convert list of tuples from get_article_metadata to csv
    :param corpus_metadata: the list of tuples, defaults to creating from corpusdir
    :return: None
    """
    if corpus_metadata is None:
        corpus_metadata = get_corpus_metadata()
    with open('allofplos_metadata.csv', 'w') as out:
        csv_out = csv.writer(out)
        csv_out.writerow(['doi', 'filename', 'title', 'journal', 'jats_article_type', 'plos_article_type',
                          'dtd_version', 'pubdate', 'received', 'accepted', 'collection'])
        for row in corpus_metadata:
            csv_out.writerow(row)

In [204]:
article_files = [doi_to_file(doi) for doi in get_random_list_of_dois(count=1000)]


IndexError: list index out of range

In [206]:
corpus_metadata = get_corpus_metadata(article_files)

Error in journal name for allofplos_xml/journal.pone.0043777.xml


AttributeError: 'list' object has no attribute 'split'

In [None]:
corpus_metadata_to_csv(corpus_metadata)