In [1]:
import os
import datetime
import lxml.etree as et
import csv
# Presumably the directory name for PMC article files; not referenced by any other visible cell.
pmcdir = 'pmc_articles'
from plos_corpus import (corpusdir, get_article_pubdate, check_if_uncorrected_proof, listdir_nohidden,
                         get_article_xml, file_to_doi, doi_to_file)

from samples.corpus_analysis import (get_plos_article_type, get_article_dtd, get_random_list_of_dois, 
                                     get_related_retraction_article, check_article_type, get_plos_journal,
                                     get_article_title, parse_article_date, get_corpus_metadata,
                                     get_article_abstract, corpus_metadata_to_csv)


In [15]:
# Working sample: ask for 1000 DOIs and convert each one with doi_to_file.
article_files = list(map(doi_to_file, get_random_list_of_dois(count=1000)))


In [17]:
# Collect metadata for every sampled article file
# (get_corpus_metadata is imported from samples.corpus_analysis).
corpus_metadata = get_corpus_metadata(article_files)

100% (1000 of 1000) |#####################| Elapsed Time: 0:00:26 Time: 0:00:26


In [16]:
# Eyeball the abstracts of ten sampled articles (list positions 30 through 39).
for article in article_files[30:40]:
    print(get_article_abstract(article))


Tumorigenesis requires the re-organization of metabolism to support malignant proliferation. We examine how the altered metabolism of cancer cells is reflected in the rewiring of co-expression patterns among metabolic genes. Focusing on breast and clear-cell kidney tumors, we report the existence of key metabolic genes which act as hubs of differential co-expression, showing significantly different co-regulation patterns between normal and tumor states. We compare our findings to those from classical differential expression analysis, and counterintuitively observe that the extent of a gene's differential co-expression only weakly correlates with its differential expression, suggesting that the two measures probe different features of metabolism. Focusing on this discrepancy, we use changes in co-expression patterns to highlight the apparent loss of regulation by the transcription factor HNF4A in clear cell renal cell carcinoma, despite no differential expression of HNF4A. Finally, we 

In [None]:
def get_article_abstract(article_file):
    """
    Return the abstract of an individual article in the PLOS corpus as plain text.

    The <abstract> element is looked up under /article/front/article-meta and
    flattened to text. Every run of whitespace (spaces, newlines, and other
    Unicode whitespace) is collapsed to a single space and the result is trimmed.
    :param article_file: individual local PLOS XML article
    :return: plain-text string of content in abstract; '' when no abstract element is found
    """
    abstract = get_article_xml(article_file, tag_path_elements=["/",
                                                                "article",
                                                                "front",
                                                                "article-meta",
                                                                "abstract"])
    try:
        abstract_text = et.tostring(abstract[0], encoding='unicode', method='text')
    except IndexError:
        # No <abstract> element was returned. Report the file only when it is a
        # research article, where an abstract is expected.
        jats_article_type = check_article_type(article_file)
        if jats_article_type == 'research-article' and \
                get_plos_article_type(article_file) == 'Research Article':
            print(jats_article_type, article_file)
        return ''

    # Clean up text: strings are immutable, so the normalised value has to be
    # the one returned. split() with no argument also drops newlines and
    # leading/trailing whitespace.
    return ' '.join(abstract_text.split())

In [18]:
# Pass the collected sample metadata to corpus_metadata_to_csv
# (presumably writes a CSV file; the output path is not visible in this notebook).
corpus_metadata_to_csv(corpus_metadata=corpus_metadata)

In [None]:
# NOTE(review): called without an argument. The get_article_abstract defined in this
# notebook requires `article_file`, so under that definition this raises TypeError.
# Pass an article file (or delete this scratch cell) before re-running.
abstract = get_article_abstract()
print(abstract)

In [None]:
# Scratch check: decoding an empty byte string as UTF-8 yields an empty str.
abstract = bytes().decode("utf-8")

In [10]:
def get_article_metadata(article_file, size='small'):
    """
    For an individual article in the PLOS corpus, create a tuple of metadata fields about that article.

    The tuple holds, in order: doi, filename (without the '.xml' suffix), title,
    journal, JATS article type, PLOS article type, DTD version, pubdate,
    received, accepted, collection, abstract.
    :param article_file: individual local PLOS XML article
    :param size: small, medium or large, aka how many fields to return for each article.
                 Currently unused: the same 12 fields are always returned.
    :return: tuple of metadata fields, or False if the tuple does not have 12 items
    :raises KeyError: if the article's dates have no 'epub' entry
    """
    doi = file_to_doi(article_file)
    # Remove the '.xml' suffix only. str.rstrip('.xml') would strip any trailing
    # run of the characters '.', 'x', 'm' and 'l', eating into the name itself.
    filename = os.path.basename(article_file)
    if filename.endswith('.xml'):
        filename = filename[:-len('.xml')]
    title = get_article_title(article_file)
    journal = get_plos_journal(article_file)
    jats_article_type = check_article_type(article_file)
    plos_article_type = get_plos_article_type(article_file)
    dtd_version = get_article_dtd(article_file)
    # NOTE(review): get_article_dates is not among the names imported in the
    # visible import cell -- confirm it is in scope before running.
    dates = get_article_dates(article_file, string=True)
    # The publication date is required; the other dates default to ''.
    pubdate = dates['epub']
    collection = dates.get('collection', '')
    received = dates.get('received', '')
    accepted = dates.get('accepted', '')
    abstract = get_article_abstract(article_file)
    metadata = (doi, filename, title, journal, jats_article_type, plos_article_type, dtd_version, pubdate,
                received, accepted, collection, abstract)
    if len(metadata) == 12:
        return metadata
    else:
        # Report the number of metadata items, not the length of the file path.
        print('Error in {}: {} items'.format(article_file, len(metadata)))
        return False

In [None]:
# Spot-check one article. NOTE(review): the argument is a DOI, whereas the sampling
# cell converts DOIs with doi_to_file first -- confirm the function in scope accepts DOIs.
get_article_abstract('10.1371/journal.pone.0118395')

In [118]:
# Keep only the corpus entries whose path contains 'pone'.
one_list = [
    path
    for path in listdir_nohidden(corpusdir)
    if 'pone' in path
]

def assemble_diabetes_corpus(article_list):
    """
    Select the articles whose abstract mentions diabetes.

    An article is kept when its lower-cased abstract contains the substring 'diabet'.
    :param article_list: iterable of articles accepted by get_article_abstract
    :return: list of the matching articles, in input order
    """
    matches = []
    for candidate in article_list:
        abstract_text = get_article_abstract(candidate).lower()
        if 'diabet' in abstract_text:
            matches.append(candidate)
    return matches

In [119]:
# Filter the 'pone' entries down to those whose abstract contains 'diabet'.
# The lines printed under this cell have the same shape as the "research article
# without an abstract" report in this notebook's get_article_abstract.
diabetes_article_list = assemble_diabetes_corpus(one_list)

research-article allofplos_xml/journal.pone.0150341.xml
research-article allofplos_xml/journal.pone.0160248.xml
research-article allofplos_xml/journal.pone.0163841.xml
research-article allofplos_xml/journal.pone.0173427.xml
research-article allofplos_xml/journal.pone.0174259.xml
research-article allofplos_xml/journal.pone.0184204.xml


In [120]:
# Collect metadata for every article selected by the diabetes filter.
diabetes_metadata = get_corpus_metadata(article_list=diabetes_article_list)

100% (6196 of 6196) |#####################| Elapsed Time: 0:02:43 Time: 0:02:43


In [135]:
# NOTE(review): diabetes_meta_truncated is assigned in a cell further down the notebook
# (execution count 129); on a top-to-bottom run this line raises NameError.
corpus_metadata_to_csv(diabetes_meta_truncated)

In [None]:
# NOTE(review): prints the entire metadata list; the progress output of the cell that
# built it reports 6196 articles, so consider showing a small slice instead.
print(diabetes_metadata)

In [6]:
# Dump DOI, title and abstract of every selected article to diabetes.txt.
# Each record is six lines: '@@@', DOI, '$$$', title, '###', abstract.
# The encoding is set explicitly because abstracts contain non-ASCII characters
# (e.g. hair spaces and en dashes), which a platform-default encoding may reject.
with open('diabetes.txt', 'w', encoding='utf-8') as f:
    for article in diabetes_article_list:
        record = ('@@@', file_to_doi(article),
                  '$$$', get_article_title(article),
                  '###', get_article_abstract(article))
        f.writelines("%s\n" % field for field in record)

In [110]:
# Check the 'diabet' substring test on a few known articles.
# NOTE(review): diabetes_test_list is not defined in any visible cell.
for article in diabetes_test_list:
    print('diabet' in get_article_abstract(article).lower())

True
True
True


In [111]:
# Two hand-picked DOIs kept as a small test list.
diab_techno_test_list = [
    '10.1371/journal.pone.0105181',
    '10.1371/journal.pone.0005124',
]

['10.1371/journal.pone.0066299',
 '10.1371/journal.pone.0142480',
 '10.1371/journal.pone.0175096']

'\nCognitive impairment occurs in both schizophrenia and diabetes. There is currently limited understanding whether schizophrenia with diabetes has more serious cognitive deficits than schizophrenia without diabetes or diabetes only. This study assessed cognitive performance in 190 healthy controls, 106 diabetes only, 127 schizophrenia without diabetes and 55 schizophrenia with diabetes. This study was conducted from January 2008 to December 2010. Compared to healthy controls, all patient groups had significantly decreased total and five index RBANS scores (all p<0.01–p<0.001), except for the visuospatial/constructional index. Schizophrenia with diabetes performed worse than schizophrenia without diabetes in immediate memory (p<0.01) and total RBANS scores (<0.05), and showed a trend for decreased attention (p\u200a=\u200a0.052) and visuospatial/constructional capacity (p\u200a=\u200a0.063). Schizophrenia with diabetes performed worse than diabetes only in immediate memory (p<0.001) an

In [89]:
# Lower-cased abstract of one known article, kept for inspection.
abstract_lower = get_article_abstract('10.1371/journal.pone.0066299').lower()

In [90]:
# Display the lower-cased abstract.
abstract_lower

'\ncognitive impairment occurs in both schizophrenia and diabetes. there is currently limited understanding whether schizophrenia with diabetes has more serious cognitive deficits than schizophrenia without diabetes or diabetes only. this study assessed cognitive performance in 190 healthy controls, 106 diabetes only, 127 schizophrenia without diabetes and 55 schizophrenia with diabetes. this study was conducted from january 2008 to december 2010. compared to healthy controls, all patient groups had significantly decreased total and five index rbans scores (all p<0.01–p<0.001), except for the visuospatial/constructional index. schizophrenia with diabetes performed worse than schizophrenia without diabetes in immediate memory (p<0.01) and total rbans scores (<0.05), and showed a trend for decreased attention (p\u200a=\u200a0.052) and visuospatial/constructional capacity (p\u200a=\u200a0.063). schizophrenia with diabetes performed worse than diabetes only in immediate memory (p<0.001) an

In [92]:
# Repeat, on a single article, the substring test that assemble_diabetes_corpus applies.
abstract = get_article_abstract('10.1371/journal.pone.0066299')
'diabet' in abstract.lower()

True

In [117]:
# NOTE(review): neither get_article_body_word_count nor article_list is imported or
# defined in any visible cell; this only ran because of leftover kernel state.
get_article_body_word_count(article_list[0])

4804

In [129]:
# Copy every metadata row without its last field.
diabetes_meta_truncated = [row[:-1] for row in diabetes_metadata]

In [133]:
# Number of truncated rows.
len(diabetes_meta_truncated)

6196