In [None]:
import csv
from collections import Counter

from plos_corpus import *
from samples.corpus_analysis import *
# Directory of article XML files; the corpus-wide functions below list it via listdir_nohidden().
corpusdir = 'allofplos_xml'

# Q: Are annotation DOIs resolving correctly?

In [None]:
def make_annotation_dict(save_output=True):
    """
    Check whether the DOI of every annotation article in the corpus resolves.

    Every file in `corpusdir` is mapped to its DOI; only DOIs that start with
    '10.1371/annotation' are checked.
    :param save_output: if truthy, also write the results to 'annotations.csv' in the
        current working directory (columns: DOI, Resolution)
    :return: dictionary where each key is an annotation DOI and each value is whatever
        check_if_doi_resolves() reports for that DOI
    """
    all_dois = (file_to_doi(article_file) for article_file in listdir_nohidden(corpusdir))
    annotation_dois = [doi for doi in all_dois if doi.startswith('10.1371/annotation')]
    anno_dict = {doi: check_if_doi_resolves(doi) for doi in annotation_dois}

    if save_output:
        # newline='' hands line-ending control to the csv module; without it,
        # platforms that translate '\n' end up with blank rows between records.
        with open('annotations.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['DOI', 'Resolution'])
            writer.writerows(anno_dict.items())

    return anno_dict

In [None]:
# Check every annotation DOI in the corpus; save_output=True also writes annotations.csv
make_annotation_dict(save_output=True)

# Q: Which `<contrib>` elements follow a certain pattern?

In [None]:
def get_tina_test_set():
    """
    Build a small set of article files for trying out find_contrib_pattern().

    Combines 10 randomly chosen DOIs (annotation DOIs are dropped) with three
    DOIs that the contrib-pattern searches are expected to find.
    :return: de-duplicated list of article files, in no particular order
    """
    known_hit_dois = ('10.1371/journal.pmed.1002035', '10.1371/journal.pone.0047559', '10.1371/journal.pone.0047944')

    unique_articles = {
        doi_to_file(doi)
        for doi in get_random_list_of_dois(count=10)
        if 'annotation' not in doi
    }
    unique_articles.update(doi_to_file(doi) for doi in known_hit_dois)
    return list(unique_articles)

def _contrib_pattern_matches(article_meta):
    """
    Report which of the three <contrib> searches match one <article-meta> element.

    :param article_meta: the article's <article-meta> element (must be iterable over its
        children and its descendants must support getnext(), as lxml elements do)
    :return: tuple of three booleans (search 1, search 2, search 3)
    """
    found_1 = found_2 = found_3 = False
    for contrib_group in article_meta:
        if contrib_group.tag != 'contrib-group':
            continue
        for contributor in contrib_group:
            if not any(child.tag == 'on-behalf-of' for child in contributor):
                continue
            found_1 = True
            # Searches 2 and 3 only look at the sibling directly after this contributor.
            next_element = contributor.getnext()
            if next_element is None:
                continue
            for collab in next_element:
                if collab.tag != 'collab':
                    continue
                found_2 = True
                if any(sub.tag == 'contrib-group' for sub in collab):
                    found_3 = True
    return found_1, found_2, found_3


def find_contrib_pattern(article_list=None, csv=True):
    """
    Find articles whose <contrib> elements match up to three nested patterns.

    Search #1: Find all articles where a <contrib> element contains an <on-behalf-of> element.
       Example: pmed.1002035, pone.0047559, and pone.0047944 should all be found by this search.
    Search #2: Find all articles where a <contrib> element that contains an <on-behalf-of> element is
    immediately followed by <contrib> element that contains a <collab> element.
       Example: pone.0047559 and pone.0047944 should both be found by this search, but not pmed.1002035.
    Search #3: Find all articles where a <contrib> element that contains an <on-behalf-of> element is
    immediately followed by <contrib> element that contains a <collab> element that contains a <contrib-group>.
       Example: pone.0047944 should be found by this search, but not pmed.1002035 or pone.0047559.
    To test this function, use get_tina_test_set() to run on a subset of articles.

    :param article_list: article XML files to search; defaults to every file in corpusdir
    :param csv: if truthy, also write the results to 'search_results.csv' in the current
        working directory
    :return: sorted list of (doi, search 1, search 2, search 3) tuples where each search
        flag is 'yes' or 'no'; articles matching none of the searches are omitted
    """
    # The `csv` parameter shadows the csv module inside this function, so the module
    # is bound to a different local name.
    import csv as csv_module

    if article_list is None:
        article_list = listdir_nohidden(corpusdir)

    tag_path_elements = ('/',
                         'article',
                         'front',
                         'article-meta')
    search_1_results = set()
    search_2_results = set()
    search_3_results = set()

    for article_file in article_list:
        article_xml = get_articleXML_content(article_file, tag_path_elements=tag_path_elements)
        found_1, found_2, found_3 = _contrib_pattern_matches(article_xml[0])
        if not found_1:
            # Searches 2 and 3 are refinements of search 1, so nothing else can match.
            continue
        doi = file_to_doi(article_file)
        search_1_results.add(doi)
        if found_2:
            search_2_results.add(doi)
        if found_3:
            search_3_results.add(doi)

    # Sets are combined with union (`|`); `+` is not defined for sets.
    matched_dois = search_1_results | search_2_results | search_3_results
    doi_results = sorted(
        (doi,
         'yes' if doi in search_1_results else 'no',
         'yes' if doi in search_2_results else 'no',
         'yes' if doi in search_3_results else 'no')
        for doi in matched_dois
    )

    if csv:
        # newline='' hands line-ending control to the csv module.
        with open('search_results.csv', 'w', newline='') as f:
            writer = csv_module.writer(f)
            writer.writerow(['DOI', 'Search 1', 'Search 2', 'Search 3'])
            writer.writerows(doi_results)
    return doi_results

In [None]:
# Smoke-test on a small sample of articles (csv=False: no file is written)
test_list = get_tina_test_set()
doi_results = find_contrib_pattern(
    article_list=test_list,
    csv=False,
)

In [None]:
# Inspect the (doi, search 1, search 2, search 3) tuples from the most recent find_contrib_pattern() call
print(doi_results)

In [None]:
# Full run: article_list=None searches every file in corpusdir; csv=True writes search_results.csv
doi_results = find_contrib_pattern(article_list=None, csv=True)

# Q: Which articles after 2015 have 2 or more corrections attached?

In [None]:
corrections_article_list, corrected_article_list = get_corrected_article_list()

# Count each corrected article once up front instead of calling list.count() per
# element, which rescans the whole list for every article.
correction_counts = Counter(corrected_article_list)
multiple_corrections = {article for article, n_corrections in correction_counts.items()
                        if n_corrections > 1}

In [None]:
# Drop the '10.1371/journal.' entry (presumably a truncated DOI - TODO confirm), which
# cannot be mapped to an article file. discard() does not raise if the entry is absent,
# so this cell can safely be re-run.
multiple_corrections.discard('10.1371/journal.')

# NOTE(review): the filter is inclusive of 2015 (year >= 2015); confirm that matches the
# intended meaning of "after 2015".
multiple_corrections_post_2015 = [article for article in multiple_corrections
                                  if get_article_pubdate(doi_to_file(article)).year >= 2015]

In [None]:
# Write one DOI per row. writerow() expects a sequence of fields, so each DOI is wrapped
# in a list; passing the bare string would emit one column per character.
# newline='' hands line-ending control to the csv module.
with open('2_or_more_corrections.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['DOI'])
    for item in sorted(multiple_corrections_post_2015):
        writer.writerow([item])

# Last expression in the cell, so the notebook displays the list.
multiple_corrections_post_2015