In [2]:
from plos_corpus import *
from samples.corpus_analysis import *
# Relative path to the corpus XML directory; used below as the default `corpusdir`
# of get_article_collab() and for the PLOS Biology file listing.
# NOTE(review): resolved against the notebook's working directory — confirm before running elsewhere.
corpusdir_prod = '../../allofplos/allofplos/allofplos_xml/'

# Q: Are annotation DOIs resolving correctly?

In [None]:
def make_annotation_dict(save_output=True):
    """
    Check whether every annotation DOI in the corpus resolves.

    Each file in `corpusdir` is converted to its DOI with `file_to_doi`; the DOIs that
    start with '10.1371/annotation' are then passed to `check_if_doi_resolves`.
    :param save_output: if True, also write the results to 'annotations.csv' in the
        current working directory (columns: DOI, Resolution)
    :return: dictionary where each key is an annotation DOI and each value is whatever
        `check_if_doi_resolves` returned for that DOI
    """
    dois = [file_to_doi(file) for file in listdir_nohidden(corpusdir)]
    annotation_list = [doi for doi in dois if doi.startswith('10.1371/annotation')]
    anno_dict = {doi: check_if_doi_resolves(doi) for doi in annotation_list}

    if save_output:
        # newline='' lets the csv module control line endings itself; without it,
        # platforms that translate '\n' end up with an extra blank line after every row.
        with open('annotations.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['DOI', 'Resolution'])
            writer.writerows(anno_dict.items())

    return anno_dict

In [None]:
# Run the check over the whole corpus. With the default save_output=True this also
# writes annotations.csv to the current working directory.
make_annotation_dict()

# Q: Which `<contrib>` elements follow a certain pattern?

In [None]:
def get_tina_test_set():
    """
    Build a small list of article files for exercising find_contrib_pattern().

    The list combines ten randomly drawn DOIs (minus any whose DOI contains
    'annotation') with three fixed DOIs, all converted to file paths and de-duplicated.
    :return: list of article file paths, in no particular order
    """
    known_hit_dois = ('10.1371/journal.pmed.1002035', '10.1371/journal.pone.0047559', '10.1371/journal.pone.0047944')

    sampled_files = []
    for sampled_doi in get_random_list_of_dois(count=10):
        if 'annotation' in sampled_doi:
            continue
        sampled_files.append(doi_to_file(sampled_doi))

    known_hit_files = [doi_to_file(known_doi) for known_doi in known_hit_dois]

    unique_files = set(sampled_files + known_hit_files)
    return list(unique_files)

def find_contrib_pattern(article_list=None, csv=True):
    """
    Run three related searches over the <contrib> elements of each article.

    Search #1: Find all articles where a <contrib> element contains an <on-behalf-of> element.
       Example: pmed.1002035, pone.0047559, and pone.0047944 should all be found by this search.
    Search #2: Find all articles where a <contrib> element that contains an <on-behalf-of> element is
    immediately followed by an element that contains a <collab> element.
       Example: pone.0047559 and pone.0047944 should both be found by this search, but not pmed.1002035.
    Search #3: Find all articles where a <contrib> element that contains an <on-behalf-of> element is
    immediately followed by an element that contains a <collab> element that contains a <contrib-group>.
       Example: pone.0047944 should be found by this search, but not pmed.1002035 or pone.0047559.
    To test this function, use get_tina_test_set() to run on a subset of articles.

    :param article_list: iterable of article files to search; defaults to every file in `corpusdir`
    :param csv: if True, also write the results to 'search_results.csv' in the current
        working directory
    :return: list of (doi, search_1, search_2, search_3) tuples sorted by DOI, one per article
        matched by at least one search; each search value is 'yes' or 'no'
    """
    # The `csv` flag shadows the csv module inside this function, so the module is
    # imported locally under another name (calling csv.writer on the flag would fail).
    import csv as csv_module

    if article_list is None:
        article_list = listdir_nohidden(corpusdir)

    # Sets from the start: an article with several matching contributors is recorded once.
    search_1_results = set()
    search_2_results = set()
    search_3_results = set()

    tag_path_elements = ('/',
                         'article',
                         'front',
                         'article-meta')

    for article_file in article_list:
        article_xml = get_articleXML_content(article_file, tag_path_elements=tag_path_elements)
        if len(article_xml) == 0:
            # No article-meta element was returned for this file; nothing to search.
            continue
        # Iterating an element yields its children (replaces the deprecated getchildren()).
        contrib_groups = [category for category in article_xml[0] if category.tag == 'contrib-group']
        for contrib_group in contrib_groups:
            for contributor in contrib_group:
                if not any(element.tag == 'on-behalf-of' for element in contributor):
                    continue
                doi = file_to_doi(article_file)
                search_1_results.add(doi)
                # Searches 2 and 3 inspect the sibling right after the matching contributor.
                next_element = contributor.getnext()
                if next_element is None:
                    continue
                for elem in next_element:
                    if elem.tag != 'collab':
                        continue
                    search_2_results.add(doi)
                    if any(subelem.tag == 'contrib-group' for subelem in elem):
                        search_3_results.add(doi)

    # Set union; the `+` operator is not defined for sets.
    search_results = search_1_results | search_2_results | search_3_results
    doi_results = [(doi,
                    'yes' if doi in search_1_results else 'no',
                    'yes' if doi in search_2_results else 'no',
                    'yes' if doi in search_3_results else 'no')
                   for doi in sorted(search_results)]

    if csv:
        # newline='' lets the csv module control line endings itself.
        with open('search_results.csv', 'w', newline='') as f:
            writer = csv_module.writer(f)
            writer.writerow(['DOI', 'Search 1', 'Search 2', 'Search 3'])
            writer.writerows(doi_results)
    return doi_results

In [None]:
# test this function on the small sample built by get_tina_test_set();
# csv=False skips writing search_results.csv
test_list = get_tina_test_set()
doi_results = find_contrib_pattern(article_list=test_list, csv=False)

In [None]:
# Show the (doi, search 1, search 2, search 3) tuples returned by find_contrib_pattern
print(doi_results)

In [None]:
# run this function for real: with no arguments it searches every file in corpusdir
# and, because csv defaults to True, writes search_results.csv
doi_results = find_contrib_pattern()

# Q: Which articles after 2015 have 2 or more corrections attached?

In [None]:
from collections import Counter

corrections_article_list, corrected_article_list = get_corrected_article_list()
# Count every entry once up front; calling list.count() per element rescans the whole
# list each time, which is quadratic in the number of entries.
correction_counts = Counter(corrected_article_list)
# Keep the entries that appear more than once in corrected_article_list.
multiple_corrections = {article for article, times_listed in correction_counts.items()
                        if times_listed > 1}

In [None]:
# NOTE(review): '10.1371/journal.' looks like a truncated DOI that can end up in the
# corrected-article list — confirm where it comes from. discard() drops it when present
# and is a no-op otherwise, so re-running this cell no longer raises KeyError.
multiple_corrections.discard('10.1371/journal.')
# NOTE(review): `>= 2015` includes articles published during 2015, while the section
# title says "after 2015" — confirm which cut-off is intended.
multiple_corrections_post_2015 = [article for article in multiple_corrections
                                  if get_article_pubdate(doi_to_file(article)).year >= 2015]

In [None]:
# newline='' lets the csv module control line endings itself.
with open('2_or_more_corrections.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['DOI'])
    # Sorted so the file contents are stable between runs.
    for item in sorted(multiple_corrections_post_2015):
        # writerow() expects a sequence of fields; passing the DOI string directly
        # would write every character into its own column.
        writer.writerow([item])

# Last expression of the cell, so the list is actually displayed.
multiple_corrections_post_2015

# Q: Which articles have a series of table-wrap elements?

In [190]:
# Fixture files for trying out find_table_wraps(): two corpus articles looked up by DOI
# and two hand-made XML files under xml_testing/.
example_doi = '10.1371/journal.pone.0068090'
search_1_file = 'xml_testing/Search-1_TRUE.xml'
search_2_file = 'xml_testing/Search-2_TRUE.xml'
intro_file = doi_to_file(example_doi)
# NOTE(review): presumably an article that should match none of the conditions — confirm.
fail_file = doi_to_file('10.1371/journal.pone.0182980')
test_list = [intro_file, search_1_file, search_2_file, fail_file]

# NOTE(review): these three lists are never appended to in the visible cells; the same
# names are rebound to booleans when the results of find_table_wraps() are unpacked.
intro_condition = []
search_1 = []
search_2 = []

def find_table_wraps(article):
    """
    Classify one article by the structure of its `table-wrap` elements.

    A table-wrap qualifies when none of its immediate children has 'alternatives' or
    'graphic' in its tag (a substring test, so an immediate `inline-graphic` child also
    disqualifies it). Each table-wrap is judged on its own; only qualifying table-wraps
    are searched for nested graphics.
    :param article: article XML source, passed straight to `et.parse`
    :return: tuple (intro_condition, search_1, search_2) of booleans
        intro_condition: at least one table-wrap qualifies
        search_1: a qualifying table-wrap has a `graphic` descendant
        search_2: a qualifying table-wrap has an `inline-graphic` descendant
    """
    intro_condition = False
    search_1 = False
    search_2 = False

    article_tree = et.parse(article)
    for table_wrap in article_tree.findall('.//table-wrap'):
        # Children whose tag is not a string (e.g. comment nodes in some parsers) used
        # to make the substring test raise TypeError; they are not elements, so skip them.
        child_tags = [table_part.tag for table_part in table_wrap
                      if isinstance(table_part.tag, str)]
        if any('alternatives' in tag or 'graphic' in tag for tag in child_tags):
            # Not a qualifying table-wrap; it must not contribute to either search.
            continue

        intro_condition = True
        if table_wrap.findall('.//graphic'):
            search_1 = True
        if table_wrap.findall('.//inline-graphic'):
            search_2 = True

    return intro_condition, search_1, search_2


In [196]:
# NOTE(review): table_results is initialised here but not filled in this cell.
table_results = []
# Print the three flags returned by find_table_wraps() for every fixture file.
for article_file in test_list:
    intro_condition, search_1, search_2 = find_table_wraps(article_file)
    print(article_file, intro_condition, search_1, search_2)

allofplos_xml/journal.pone.0068090.xml True False False
xml_testing/Search-1_TRUE.xml True True False
xml_testing/Search-2_TRUE.xml True True True
allofplos_xml/journal.pone.0182980.xml False False False


In [197]:
# Run find_table_wraps() over every file in corpusdir and keep the articles for which
# the first flag (intro_condition) is True.
table_results = []
file_list = listdir_nohidden(corpusdir)
for article_file in file_list:
    intro_condition, search_1, search_2 = find_table_wraps(article_file)
    if intro_condition:
        result = [file_to_doi(article_file), search_1, search_2]
        table_results.append(result)

# newline='' lets the csv module control line endings itself; without it, platforms
# that translate '\n' end up with an extra blank line after every row.
with open('table_search_results_revised.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['DOI', 'Search 1', 'Search 2'])
    writer.writerows(sorted(table_results))

In [None]:
# Spot-check ten files from the corpus listing. The 180000:180010 slice is a hard-coded
# position; if the listing has fewer than 180000 entries the slice is empty and nothing prints.
for article_file in listdir_nohidden(corpusdir)[180000:180010]:
    print(find_table_wraps(article_file))

# Q: Which Aperta articles have a group collaboration contributor element?

## Example: 10.1371/journal.pmed.1002170
```xml
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<collab>International Ebola Response Team</collab>
<xref ref-type="fn" rid="fn001">
<sup>¶</sup>
```

```xml
<fn fn-type="other" id="fn001">
<p>
¶ The International Ebola Response Team comprises the authors listed in this article in alphabetical order
</p>
</fn>
```

In [18]:
 def get_article_collab(doi, corpusdir=corpusdir_prod):
    """
    For a given PLOS article, see if there is a collaborator group in the authors list.
    Print data if so.

    Only `<contrib contrib-type="author">` elements are inspected. The `rid` is taken
    from an `<xref>` inside the same `<contrib>` as the `<collab>`, so a reference that
    belongs to a different author is never paired with the group name. The search stops
    at the first author that has both a `<collab>` and an `<xref>` carrying a `rid`.
    :param doi: DOI of the article to inspect
    :param corpusdir: directory handed to `doi_to_file` to locate the article file
    :return: tuple of (doi, collaborators, rid) when a collab with a rid is found;
        '' when there is no collab or the collab has no rid;
        False when no article-meta element is returned for the article
    """
    article_file = doi_to_file(doi, directory=corpusdir)

    tag_path_elements = ('/',
                         'article',
                         'front',
                         'article-meta')
    article_xml = get_article_xml(article_file, tag_path_elements=tag_path_elements)
    if len(article_xml) == 0:
        # Indexing the empty result used to raise IndexError before the old handler
        # could catch it; report the problem the way that handler intended.
        print('No authors found for {}'.format(doi))
        return False

    collab = False
    collaborators = None
    rid = ''
    collab_tuple = ''

    # Iterating an element yields its children (replaces the deprecated getchildren()).
    contrib_groups = [category for category in article_xml[0] if category.tag == 'contrib-group']
    for contrib_group in contrib_groups:
        for contrib in contrib_group:
            # .get() tolerates <contrib> elements without a contrib-type attribute.
            if contrib.get('contrib-type') != 'author':
                continue
            collab_children = [child for child in contrib if child.tag == 'collab']
            if not collab_children:
                continue
            collab = True
            collaborators = collab_children[0].text
            # First xref of this same contributor that actually carries a rid.
            rid = next((child.get('rid') for child in contrib
                        if child.tag == 'xref' and child.get('rid')), '')
            if rid:
                break
        if collab and rid:
            # Leave the outer loop too, so later authors cannot overwrite the match.
            break

    if collab and rid:
        tag_path_elements = ('/',
                             'article',
                             'front',
                             'article-meta',
                             'author-notes')
        notes_xml = get_article_xml(article_file, tag_path_elements=tag_path_elements)
        # An article without author-notes simply has no footnote to find.
        notes = notes_xml[0] if len(notes_xml) else []
        footnote = any(note.tag == 'fn' and rid in note.attrib.values() for note in notes)
        if not footnote:
            print('footnote not found for {}'.format(doi))

        collab_tuple = (doi, collaborators, rid)

    elif collab:
        print('rid not found for {}'.format(doi))

    if collab_tuple:
        print(collab_tuple)

    return collab_tuple

In [21]:
# Restrict to PLOS Biology Aperta articles
# NOTE(review): the filter is a plain substring test for 'pbio.2' on each listed file
# name — presumably that prefix identifies the Aperta-era articles; confirm.
article_list = [article for article in listdir_nohidden(corpusdir_prod) if 'pbio.2' in article] 
doi_list = [file_to_doi(article) for article in article_list]
# Also include the PLOS Medicine article used as the worked example for this question.
doi_list.append('10.1371/journal.pmed.1002170')

In [22]:
# get_article_collab() prints its own findings, so the return value is not collected here.
for doi in doi_list:
    get_article_collab(doi)

('10.1371/journal.pbio.2001069', 'CycliX consortium', 'fn001')
('10.1371/journal.pbio.2001855', 'BEEHIVE collaboration', 'fn001')
('10.1371/journal.pmed.1002170', 'International Ebola Response Team', 'fn001')
