In [None]:
import os
import datetime
import lxml.etree as et
import csv
pmcdir = 'pmc_articles'
from plos_corpus import (corpusdir, get_article_pubdate, check_if_uncorrected_proof, listdir_nohidden,
                         get_article_xml, file_to_doi, doi_to_file)

from samples.corpus_analysis import (get_plos_article_type, get_article_dtd, get_random_list_of_dois, 
                                     get_related_retraction_article, check_article_type, get_plos_journal,
                                     get_article_title, parse_article_date, get_corpus_metadata,
                                     get_article_abstract, corpus_metadata_to_csv)


In [None]:
# Request a random list of DOIs (count=100) and convert each DOI to its article file.
article_files = [
    doi_to_file(random_doi)
    for random_doi in get_random_list_of_dois(count=100)
]


In [None]:
# Gather metadata for the sampled article files; the result is handed to
# corpus_metadata_to_csv in a later cell.
corpus_metadata = get_corpus_metadata(article_files)

In [None]:
# Spot-check: print the abstracts of the sampled articles at indices 30-39.
# NOTE: on a fresh top-to-bottom run this uses the get_article_abstract imported
# from samples.corpus_analysis, because the notebook's own definition only
# appears in a later cell.
for article in article_files[30:40]:
    abstract = get_article_abstract(article)
    print(abstract)

In [None]:
def get_article_abstract(article_file):
    """
    For an individual article in the PLOS corpus, return its abstract as plain text.

    The first element found at /article/front/article-meta/abstract is serialized
    with method='text', then every run of whitespace (spaces, indentation, line
    breaks) is collapsed to a single space and the ends are trimmed.
    :param article_file: individual local PLOS XML article
    :return: plain-text string of content in abstract; '' if no abstract element is found
    """
    abstract = get_article_xml(article_file, tag_path_elements=["/",
                                                                "article",
                                                                "front",
                                                                "article-meta",
                                                                "abstract"])
    try:
        abstract_text = et.tostring(abstract[0], encoding='unicode', method='text')
    except IndexError:
        # No abstract element was returned. That is only reported when the article is a
        # research article by both its JATS type and its PLOS type.
        jats_article_type = check_article_type(article_file)
        if jats_article_type == 'research-article' and \
          get_plos_article_type(article_file) == 'Research Article':
            print(jats_article_type, article_file)
        return ''

    # clean up text: rem white space, new line marks.
    # str methods return a new string, so the cleaned value has to be the one returned;
    # split() with no argument splits on any whitespace run and drops leading/trailing runs.
    return ' '.join(abstract_text.split())

In [None]:
# Hand the sampled-corpus metadata to the project's CSV export helper.
# NOTE(review): the output location is decided inside the helper and is not visible here.
corpus_metadata_to_csv(corpus_metadata=corpus_metadata)

In [None]:
# get_article_abstract needs an article file; calling it with no argument raises
# TypeError. Use the first sampled article as the example input.
abstract = get_article_abstract(article_files[0])
print(abstract)

In [None]:
# Scratch cell: reset `abstract` to an empty text string
# (decoding empty bytes as UTF-8 yields exactly this value).
abstract = ''

In [None]:
def get_article_metadata(article_file, size='small'):
    """
    For an individual article in the PLOS corpus, create a tuple of metadata fields about that article.

    Fields, in order: doi, filename (without extension), title, journal, JATS article type,
    PLOS article type, DTD version, pubdate, received, accepted, collection, abstract.
    The received, accepted and collection dates are '' when the article does not have them.
    :param article_file: individual local PLOS XML article
    :param size: small, medium or large, aka how many fields to return for each article.
        Currently ignored: all twelve fields are always returned.
    :return: tuple of the twelve metadata fields listed above
    :raises KeyError: if the article's dates have no 'epub' entry
    """
    doi = file_to_doi(article_file)
    # splitext removes only the final extension. str.rstrip('.xml') strips any trailing run
    # of the characters '.', 'x', 'm', 'l' and can eat the end of the name itself.
    filename = os.path.splitext(os.path.basename(article_file))[0]
    title = get_article_title(article_file)
    journal = get_plos_journal(article_file)
    jats_article_type = check_article_type(article_file)
    plos_article_type = get_plos_article_type(article_file)
    dtd_version = get_article_dtd(article_file)
    # NOTE(review): get_article_dates is not among this notebook's visible imports -
    # confirm it is in scope before calling this function.
    dates = get_article_dates(article_file, string=True)
    # The publication date is mandatory (KeyError if absent); the other dates are optional.
    pubdate = dates['epub']
    abstract = get_article_abstract(article_file)
    collection = dates.get('collection', '')
    received = dates.get('received', '')
    accepted = dates.get('accepted', '')
    # The tuple is built from a fixed set of twelve values, so the former length check
    # (and its `return False` branch) could never trigger and has been dropped.
    return (doi, filename, title, journal, jats_article_type, plos_article_type, dtd_version,
            pubdate, received, accepted, collection, abstract)

In [None]:
# get_article_abstract takes a local article file, not a DOI, so convert the DOI first
# (the same doi_to_file conversion used to build article_files).
get_article_abstract(doi_to_file('10.1371/journal.pone.0118395'))

In [None]:
# Keep only the corpus directory entries that contain the substring 'pone'.
one_list = [
    entry
    for entry in listdir_nohidden(corpusdir)
    if 'pone' in entry
]
print(len(one_list))
# The first 1000 of those entries, kept as a smaller working sample.
sample_list = one_list[:1000]

def assemble_diabetes_corpus(article_list):
    """
    Find all articles that say something about both diabetes and technology in the abstract.

    An article is kept when its lower-cased abstract contains both substrings
    'diabet' and 'techno'.
    NOTE(review): a later cell defines assemble_diabetes_corpus again (diabetes only); when
    the notebook is run top to bottom that later definition replaces this one.
    :param article_list: iterable of local PLOS XML article files
    :return: list of the matching article files, in input order
    """
    required_terms = ('diabet', 'techno')
    diabetes_article_list = []
    for article in article_list:
        # Fetch and lower-case the abstract once per article rather than once per search term.
        abstract = get_article_abstract(article).lower()
        if all(term in abstract for term in required_terms):
            diabetes_article_list.append(article)
    return diabetes_article_list

In [None]:
def assemble_diabetes_corpus(article_list):
    """
    Select the articles whose abstract mentions diabetes.

    An article qualifies when its lower-cased abstract contains the substring 'diabet'.
    :param article_list: iterable of local PLOS XML article files
    :return: list of the qualifying article files, in input order
    """
    matches = []
    for candidate in article_list:
        if 'diabet' in get_article_abstract(candidate).lower():
            matches.append(candidate)
    return matches

In [None]:
# Filter the 'pone' article list down to the diabetes-related articles. When the notebook
# is run top to bottom, the most recently executed assemble_diabetes_corpus definition
# (abstract contains 'diabet') is the one in effect here.
diabetes_article_list = assemble_diabetes_corpus(one_list)

In [None]:
# Gather metadata for just the selected diabetes articles.
diabetes_metadata = get_corpus_metadata(article_list=diabetes_article_list)

In [None]:
# Export the diabetes metadata through the same helper used for the random sample.
# NOTE(review): if the helper writes to a fixed default path, this would overwrite the
# earlier export - confirm against corpus_metadata_to_csv.
corpus_metadata_to_csv(diabetes_metadata)

In [None]:
# Dump the whole diabetes metadata object for inspection.
# NOTE(review): this prints every record, abstracts included - consider showing a slice.
print(diabetes_metadata)

In [None]:
# Write DOI, title and abstract of every selected article to a plain-text file.
# Each field is preceded by its own sentinel line: '@@@' DOI, '$$$' title, '###' abstract.
# The encoding is explicit because titles and abstracts may contain non-ASCII characters
# and the platform default encoding is not UTF-8 everywhere.
with open('diabetes.txt', 'w', encoding='utf-8') as f:
    for article in diabetes_article_list:
        f.write('@@@\n')
        f.write("%s\n" % file_to_doi(article))
        f.write('$$$\n')
        f.write("%s\n" % get_article_title(article))
        f.write('###\n')
        f.write("%s\n" % get_article_abstract(article))

In [None]:
# Plain-text abstracts of the selected diabetes articles, in the same order as the list.
diabetes_abstract_list = [
    get_article_abstract(selected_file)
    for selected_file in diabetes_article_list
]