In [1]:
from plos_corpus import *
from samples.corpus_analysis import *
# Relative path to the corpus XML directory; later cells pass it to the helpers
# as `directory=` / `corpusdir=` or list its files directly.
corpusdir_prod = '../../allofplos/allofplos/allofplos_xml/'

# Q: Are annotation DOIs resolving correctly?

In [None]:
def make_annotation_dict(save_output=True):
    """
    Check whether the DOI of every annotation article in the corpus resolves.

    An annotation article is one whose DOI starts with '10.1371/annotation'.
    :param save_output: if True, also write the results to 'annotations.csv' in the
        current working directory, under a 'DOI', 'Resolution' header row
    :return: dictionary where each key is an annotation DOI and each value is what
        check_if_doi_resolves() returned for that DOI
    """
    dois = (file_to_doi(file) for file in listdir_nohidden(corpusdir))
    anno_dict = {doi: check_if_doi_resolves(doi)
                 for doi in dois if doi.startswith('10.1371/annotation')}

    if save_output:
        # newline='' is required by the csv module; without it, platforms that
        # translate '\n' end up with a blank line after every row.
        with open('annotations.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['DOI', 'Resolution'])
            writer.writerows(anno_dict.items())

    return anno_dict

In [None]:
# Run the check over the corpus; with the default save_output=True this also writes annotations.csv
make_annotation_dict()

# Q: Which `<contrib>` elements follow a certain pattern?

In [3]:
def get_tina_test_set():
    """
    Build a small list of article paths for trying out find_contrib_pattern().

    The list combines a random sample of articles (requested with count=10, annotation
    DOIs dropped) with the three articles named as examples for Search #1; duplicates
    are removed.
    NOTE(review): the recorded run of this notebook stopped with a NameError on
    get_random_list_of_dois -- confirm that helper is provided by the imports.
    :return: list of article paths as produced by doi_to_path(), in no particular order
    """
    search_1_dois = (
        '10.1371/journal.pmed.1002035',
        '10.1371/journal.pone.0047559',
        '10.1371/journal.pone.0047944',
    )
    random_list_of_articles = []
    for sampled_doi in get_random_list_of_dois(count=10):
        if 'annotation' in sampled_doi:
            continue
        random_list_of_articles.append(doi_to_path(sampled_doi))
    search_1_articles = list(map(doi_to_path, search_1_dois))
    # Going through a set removes any article that was both sampled and listed above.
    return list(set(random_list_of_articles + search_1_articles))

def find_contrib_pattern(article_list=None, csv=True):
    """
    Run three related searches over the <contrib> elements of each article:
    Search #1: Find all articles where a <contrib> element contains an <on-behalf-of> element.
       Example: pmed.1002035, pone.0047559, and pone.0047944 should all be found by this search.
    Search #2: Find all articles where a <contrib> element that contains an <on-behalf-of> element is
    immediately followed by <contrib> element that contains a <collab> element.
       Example: pone.0047559 and pone.0047944 should both be found by this search, but not pmed.1002035.
    Search #3: Find all articles where a <contrib> element that contains an <on-behalf-of> element is
    immediately followed by <contrib> element that contains a <collab> element that contains a <contrib-group>.
       Example: pone.0047944 should be found by this search, but not pmed.1002035 or pone.0047559.)
    To test this function, use get_tina_test_set() to run on a subset of articles

    :param article_list: article files to search; defaults to every file in corpusdir
    :param csv: if True, also write the results to 'search_results.csv' in the
        current working directory
    :return: list of (doi, search 1, search 2, search 3) tuples sorted by DOI, one per
        article matched by at least one search, each flag being 'yes' or 'no'
    """
    # The boolean `csv` parameter hides the csv module inside this function, so the
    # module is bound under a different name for writing the output file.
    import csv as csv_module

    if article_list is None:
        article_list = listdir_nohidden(corpusdir)

    search_1_results = set()
    search_2_results = set()
    search_3_results = set()

    tag_path_elements = ('/',
                         'article',
                         'front',
                         'article-meta')

    for article_file in article_list:
        article_xml = get_articleXML_content(article_file, tag_path_elements=tag_path_elements)
        contrib_groups = [category for category in article_xml[0] if category.tag == 'contrib-group']
        for contrib_group in contrib_groups:
            for contributor in contrib_group:
                if not any(element.tag == 'on-behalf-of' for element in contributor):
                    continue
                doi = filename_to_doi(article_file)
                search_1_results.add(doi)
                # Searches 2 and 3 look only at the sibling directly after this contributor.
                next_element = contributor.getnext()
                if next_element is None:
                    continue
                for elem in next_element:
                    if elem.tag != 'collab':
                        continue
                    search_2_results.add(doi)
                    if any(subelem.tag == 'contrib-group' for subelem in elem):
                        search_3_results.add(doi)

    # Sets are combined with | (sets do not support +).
    search_results = search_1_results | search_2_results | search_3_results
    doi_results = sorted(
        (doi,
         'yes' if doi in search_1_results else 'no',
         'yes' if doi in search_2_results else 'no',
         'yes' if doi in search_3_results else 'no')
        for doi in search_results
    )

    if csv:
        # newline='' is required by the csv module to avoid blank lines between rows.
        with open('search_results.csv', 'w', newline='') as f:
            writer = csv_module.writer(f)
            writer.writerow(['DOI', 'Search 1', 'Search 2', 'Search 3'])
            writer.writerows(doi_results)
    return doi_results

In [4]:
# Try the function on the small test set; csv=False skips writing search_results.csv
test_list = get_tina_test_set()
doi_results = find_contrib_pattern(article_list=test_list, csv=False)

NameError: name 'get_random_list_of_dois' is not defined

In [None]:
# Show the (doi, search 1, search 2, search 3) tuples returned by find_contrib_pattern()
print(doi_results)

In [None]:
# Run this function for real: with no article_list it scans every file in corpusdir,
# and with the default csv=True it is meant to write search_results.csv
doi_results = find_contrib_pattern()

# Q: Which articles after 2015 have 2 or more corrections attached?

In [None]:
# Only the second value returned by get_corrected_article_list() is used below.
corrections_article_list, corrected_article_list = get_corrected_article_list()
# Keep each DOI that occurs more than once in corrected_article_list
# (presumably one occurrence per attached correction -- confirm against the helper).
multiple_corrections = {
    article
    for article in corrected_article_list
    if corrected_article_list.count(article) > 1
}

In [None]:
# Drop the truncated placeholder DOI if it is present. discard() does not raise when
# the entry is missing, so this cell can be re-run without a KeyError.
multiple_corrections.discard('10.1371/journal.')
# NOTE(review): the section heading says "after 2015" but this filter keeps 2015 itself -- confirm intent.
# sorted() gives a stable order, since iterating a set is arbitrary.
multiple_corrections_post_2015 = sorted(
    article for article in multiple_corrections
    if get_article_pubdate(doi_to_file(article)).year >= 2015
)

In [None]:
# newline='' is required by the csv module to avoid blank lines between rows.
with open('2_or_more_corrections.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['DOI'])
    for item in multiple_corrections_post_2015:
        # Wrap the DOI in a list: writerow() treats a bare string as a sequence of
        # characters and would write one character per column.
        writer.writerow([item])

# Last expression of the cell, so the notebook displays the list.
multiple_corrections_post_2015

# Q: Which articles have a series of table-wrap graphic elements?

In [5]:
# Inputs for trying out find_table_wraps(): two corpus articles looked up by DOI
# plus two local XML files under xml_testing/.
example_doi = '10.1371/journal.pone.0068090'
search_1_file = 'xml_testing/Search-1_TRUE.xml'
search_2_file = 'xml_testing/Search-2_TRUE.xml'
intro_file = doi_to_path(example_doi, directory=corpusdir_prod)
search_1_fail_list = []  # NOTE(review): not referenced by the cells visible here -- confirm before removing
fail_file = doi_to_path('10.1371/journal.pone.0183466', directory=corpusdir_prod)
test_list = [fail_file, intro_file, search_1_file, search_2_file]

def find_table_wraps(article):
    """
    Look for graphics nested inside the <table-wrap> elements of one article.

    A table-wrap meets the "intro condition" when none of its direct children is a
    graphic (tag containing 'graphic') and none of its direct <alternatives> children
    holds a graphic. The condition is evaluated separately for every table-wrap.
    For each table-wrap that meets it, two searches are run:
      Search 1: ids of <graphic> elements anywhere below the table-wrap
      Search 2: ids of <inline-graphic> elements anywhere below the table-wrap
    Results are accumulated over all matching table-wraps of the article. XML comments
    are dropped while parsing, so commented-out tables are not considered.

    :param article: path of the article XML file to parse
    :return: tuple (intro_condition_overall, search1_ids, search2_ids).
        intro_condition_overall is True when at least one table-wrap met the intro
        condition. Each ids value is False when nothing was found, the single value
        when exactly one was found, otherwise a list. When an inline graphic of a
        table-wrap has no id, the attribute mappings of that table-wrap's inline
        graphics are recorded instead of their ids.
    :raises KeyError: if a <graphic> matched by Search 1 has no id attribute
    """
    def _collapse(ids):
        # Callers expect False for no hits, the bare value for one hit, else the list.
        if not ids:
            return False
        if len(ids) == 1:
            return ids[0]
        return ids

    intro_condition_overall = False
    search1_ids = []
    search2_ids = []
    alternative_graphic_ids = []

    article_tree = et.parse(article, parser=et.XMLParser(remove_comments=True))  # exclude commented-out tables
    for table_wrap in article_tree.findall('.//table-wrap'):
        table_parts = list(table_wrap)

        # intro condition 1: table-wrap element does not include a direct child of <alternatives><graphic>
        # (False as soon as ANY <alternatives> child holds a graphic, not just the last one)
        intro_condition_1 = True
        for table_part in table_parts:
            if 'alternatives' not in table_part.tag:
                continue
            alternative_graphics = [table_subpart for table_subpart in table_part
                                    if 'graphic' in table_subpart.tag]
            if alternative_graphics:
                intro_condition_1 = False
                # These ids are only used to filter Search 1, so graphics without an id are skipped.
                alternative_graphic_ids.extend(
                    graphic.get('id') for graphic in alternative_graphics
                    if graphic.get('id') is not None)

        # intro condition 2: table-wrap element does not include a direct child of <graphic>
        intro_condition_2 = all('graphic' not in table_part.tag for table_part in table_parts)

        # Decide per table-wrap: a match on an earlier table must not cause later,
        # non-matching tables to be searched as well.
        if not (intro_condition_1 and intro_condition_2):
            continue
        # keep track of articles that have any table match intro condition
        intro_condition_overall = True

        search1_ids.extend([graphic.attrib['id'] for graphic in table_wrap.findall('.//graphic')])

        inline_graphics = table_wrap.findall('.//inline-graphic')
        if inline_graphics:
            try:
                new_search2_ids = [inline.attrib['id'] for inline in inline_graphics]
            except KeyError:
                print('{} has search 2 results but no ids: {}'.format(article, inline_graphics))
                new_search2_ids = [inline.attrib for inline in inline_graphics]
            # extend rather than assign, so hits from earlier table-wraps are kept
            search2_ids.extend(new_search2_ids)

    if search1_ids and alternative_graphic_ids:
        # exclude graphics elements that are already accounted for under an <alternatives> tag
        search1_ids = [did for did in search1_ids if did not in alternative_graphic_ids]

    return intro_condition_overall, _collapse(search1_ids), _collapse(search2_ids)


In [6]:
# testing the code: print the file plus the three values find_table_wraps() returns
# (whether any table met the intro condition, Search 1 ids, Search 2 ids)
for article_file in test_list:
    intro_condition, search1_ids, search2_ids = find_table_wraps(article_file)
    print(article_file, intro_condition, search1_ids, search2_ids)

../../allofplos/allofplos/allofplos_xml/journal.pone.0183466.xml False False False
../../allofplos/allofplos/allofplos_xml/journal.pone.0068090.xml True False False
xml_testing/Search-1_TRUE.xml True pmed.1002397.e001g False
xml_testing/Search-2_TRUE.xml True False pmed.1002397.e001g


In [13]:
# running over entire corpus, randomized, with a progressbar
import progressbar
from random import shuffle

table_results = []
file_list = listdir_nohidden(corpusdir_prod)
shuffle(file_list)

bar = progressbar.ProgressBar(redirect_stdout=True, max_value=len(file_list))
for i, article_file in enumerate(file_list, start=1):
    intro_condition, search1_ids, search2_ids = find_table_wraps(article_file)
    # only articles with at least one table meeting the intro condition are reported
    if intro_condition:
        table_results.append([filename_to_doi(article_file), search1_ids, search2_ids])
    bar.update(i)
bar.finish()

# newline='' is required by the csv module to avoid blank lines between rows.
with open('table_graphics_search_results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['DOI', 'Search 1', 'Search 2'])
    # Sort on the DOI only: the id columns mix False, strings and lists, which
    # cannot be compared with each other if the sort ever had to break a tie.
    writer.writerows(sorted(table_results, key=lambda result: result[0]))

221852
['../../allofplos/allofplos/allofplos_xml/journal.ppat.1000896.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0065590.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0036030.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0026652.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0029438.xml', '../../allofplos/allofplos/allofplos_xml/journal.pgen.1000989.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0089988.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0015594.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0149634.xml', '../../allofplos/allofplos/allofplos_xml/journal.pone.0000707.xml']


 12% ( 27042 of 221852) |#############                                                                                              | Elapsed Time: 0:04:33 ETA: 0:57:29

../../allofplos/allofplos/allofplos_xml/journal.pone.0002468.xml has search 2 results but no ids: [<Element inline-graphic at 0x115a65248>, <Element inline-graphic at 0x115a65d88>, <Element inline-graphic at 0x115a654c8>, <Element inline-graphic at 0x115a65208>, <Element inline-graphic at 0x115a65108>]
../../allofplos/allofplos/allofplos_xml/journal.pone.0002468.xml has search 2 results but no ids: [<Element inline-graphic at 0x115a65dc8>, <Element inline-graphic at 0x115a65308>, <Element inline-graphic at 0x115a65cc8>, <Element inline-graphic at 0x115a65f48>, <Element inline-graphic at 0x115a653c8>]


 52% (116850 of 221852) |########################################################                                                   | Elapsed Time: 0:19:57 ETA: 0:18:13

../../allofplos/allofplos/allofplos_xml/journal.pone.0075851.xml has search 2 results but no ids: [<Element inline-graphic at 0x11559cc88>]


100% (221852 of 221852) |##########################################################################################################| Elapsed Time: 0:38:14 Time: 0:38:14


# Q: Which Aperta articles have a group collaboration contributor element?

## Example: 10.1371/journal.pmed.1002170
```xml
<contrib-group>
<contrib contrib-type="author" xlink:type="simple">
<collab>International Ebola Response Team</collab>
<xref ref-type="fn" rid="fn001">
<sup>¶</sup>

<fn fn-type="other" id="fn001">
<p>
¶ The International Ebola Response Team comprises the authors listed in this article in alphabetical order
</p>
</fn>
```

In [18]:
 def get_article_collab(doi, corpusdir=corpusdir_prod):
    """
    For a given PLOS article, see if there is a collaborator group in the authors list. Print data if so

    Only the first author <contrib> with a <collab> child is reported, and the footnote
    reference is taken from the first <xref> child of that same <contrib> that has a
    rid attribute, so the group name and the rid always belong together.
    :param doi: DOI of the article to inspect
    :param corpusdir: directory handed to doi_to_file() to locate the article XML
    :return: tuple of doi, collaborators, and the footnote rid when a group with an xref
        is found; '' when no group is found or the group has no xref; False when the
        article-meta lookup comes back empty
    """
    tag_path_elements = ('/',
                         'article',
                         'front',
                         'article-meta')
    article_file = doi_to_file(doi, directory=corpusdir)
    article_xml = get_article_xml(article_file, tag_path_elements=tag_path_elements)
    try:
        meta_categories = list(article_xml[0])
    except IndexError:
        print('No authors found for {}'.format(doi))
        return False

    contrib_groups = [category for category in meta_categories if category.tag == 'contrib-group']
    # .get() instead of attrib[...]: a <contrib> without a contrib-type attribute is
    # skipped rather than raising KeyError.
    collab_contrib = next(
        (contrib
         for contrib_group in contrib_groups
         for contrib in contrib_group
         if contrib.get('contrib-type') == 'author' and contrib.find('collab') is not None),
        None)

    collab_tuple = ''
    if collab_contrib is None:
        return collab_tuple

    collaborators = collab_contrib.find('collab').text
    rid = next((xref.get('rid') for xref in collab_contrib.findall('xref') if xref.get('rid')), '')
    if not rid:
        print('rid not found for {}'.format(doi))
        return collab_tuple

    notes_xml = get_article_xml(article_file, tag_path_elements=tag_path_elements + ('author-notes',))
    try:
        notes = list(notes_xml[0])
    except IndexError:
        # no author-notes element: reported below as a missing footnote
        notes = []
    footnote = any(note.tag == 'fn' and rid in note.attrib.values() for note in notes)
    if not footnote:
        print('footnote not found for {}'.format(doi))

    collab_tuple = (doi, collaborators, rid)
    print(collab_tuple)
    return collab_tuple

In [21]:
# Restrict to PLOS Biology Aperta articles (file paths containing 'pbio.2'),
# then add the PLOS Medicine example DOI to the list of DOIs to check
article_list = list(filter(lambda article: 'pbio.2' in article, listdir_nohidden(corpusdir_prod)))
doi_list = list(map(file_to_doi, article_list)) + ['10.1371/journal.pmed.1002170']

In [22]:
# get_article_collab() prints a (doi, collaborators, rid) tuple for each article where
# it finds a collaborator group with an xref; the return values are not kept here
for doi in doi_list:
    get_article_collab(doi)

('10.1371/journal.pbio.2001069', 'CycliX consortium', 'fn001')
('10.1371/journal.pbio.2001855', 'BEEHIVE collaboration', 'fn001')
('10.1371/journal.pmed.1002170', 'International Ebola Response Team', 'fn001')
