# Required functions

In [2]:
from collections import Counter

from samples.corpus_analysis import *

# PLOS/NLM article type mapping

In [None]:
# Map each JATS article type onto its PLOS article type, while taking the NLM DTD into account.
# NOTE(review): get_article_types_map() comes from the star import; its return value is
# assumed to be an iterable of hashable entries (it is passed straight to Counter) - confirm.
article_types_map = get_article_types_map()
# Tally identical mapping entries and order them from most to least frequent.
# collections.Counter is used explicitly instead of a lowercase `counter` name that this
# notebook never defines or imports by name.
PLOS_article_types_structured = Counter(article_types_map).most_common()
print(PLOS_article_types_structured)

In [None]:
# Create a .csv file mapping JATS to PLOS article types.
# Requires `article_types_map` from the previous cell.
# NOTE(review): the output filename/location is decided inside article_types_map_to_csv
# and is not visible from this notebook - confirm before relying on it.
article_types_map_to_csv(article_types_map)

# Retracted and corrected articles

## Get list of retracted articles

In [2]:
# get_retracted_article_list() returns a pair; keep both halves and display the second.
# NOTE(review): judging by the names, the first list holds the retraction notices and the
# second the articles they retract - confirm against samples.corpus_analysis.
(
    retractions_article_list,
    retracted_article_list,
) = get_retracted_article_list()
print(retracted_article_list)

79 retracted articles found.
['journal.pbio.0030123', 'journal.pbio.0050005', 'journal.pbio.0050146', 'journal.pbio.1001212', 'journal.pcbi.1002308', 'journal.pgen.1003361', 'journal.pgen.1003791', 'journal.pgen.1005586', 'journal.pgen.1000424', 'journal.pmed.1001214', 'journal.pone.0072333', 'journal.pone.0084127', 'journal.pone.0027571', 'journal.pone.0046410', 'journal.pone.0080145', 'journal.pone.0019652', 'journal.pone.0075928', 'journal.pone.0075046', 'journal.pone.0062178', 'journal.pone.0051549', 'journal.pone.0093095', 'journal.pone.0069669', 'journal.pone.0133525', 'journal.pone.0115980', 'journal.pone.0115741', 'journal.pone.0139044', 'journal.pone.0146193', 'journal.pone.0045667', 'journal.pone.0040789', 'journal.pone.0094830', 'journal.pone.0031943', 'journal.pone.0097700', 'journal.pone.0047218', 'journal.pone.0090951', 'journal.pone.0014232', 'journal.pone.0090318', 'journal.pone.0072895', 'journal.pone.0065651', 'journal.pone.0059556', 'journal.pone.0076809', 'journal.p

## Get list of corrected articles

In [18]:
# get_corrected_article_list() returns a pair; both halves are kept for later use.
# NOTE(review): judging by the names, the first list holds the correction notices and the
# second the articles they correct - confirm against samples.corpus_analysis.
(
    corrections_article_list,
    corrected_article_list,
) = get_corrected_article_list()

journal.pcbi.1003582.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003490
journal.pcbi.1003732.xml has incorrect linked DOI: journal.10.1371/journal.pcbi.1003159
journal.pone.0101541.xml has incorrect linked DOI: journal.PONE-D-13-26510
journal.pone.0104353.xml has incorrect linked DOI: journal.
journal.pone.0104472.xml has incorrect linked DOI: journal.
journal.pone.0104581.xml has incorrect linked DOI: journal.
journal.pone.0104601.xml has incorrect linked DOI: journal.
journal.pone.0105485.xml has incorrect linked DOI: journal.
journal.pone.0105486.xml has incorrect linked DOI: journal.
journal.pone.0105490.xml has incorrect linked DOI: journal.
journal.pone.0105658.xml has incorrect linked DOI: journal.
journal.pone.0105668.xml has incorrect linked DOI: journal.
journal.pone.0105669.xml has incorrect linked DOI: journal.
9127 corrected articles found.


# Check raw XML for article updates

In [2]:
# Check local article XML files for updates.
# By default, checks only the 30,000 most recent articles
# In the recorded run this call logged a "downloaded new version of ..." line per updated
# file, so it presumably re-downloads changed articles as a side effect - confirm.
articles_different_list = revisiondate_sanity_check()
# Show the filenames that were reported as updated.
print(articles_different_list)

downloaded new version of journal.pone.0182022.xml
downloaded new version of journal.pone.0175323.xml
downloaded new version of journal.pone.0171255.xml
downloaded new version of journal.pone.0158499.xml
30000 article checked for updates.
4 articles have updates.
['journal.pone.0182022.xml', 'journal.pone.0175323.xml', 'journal.pone.0171255.xml', 'journal.pone.0158499.xml']


# DOI and filename sanity check

In [2]:
# Verify that article filenames match their full DOIs and that the DOI fields are valid.
# First pass: the helper's defaults (the recorded output reports on the PLOS file names).
messed_up_plos_list = article_doi_sanity_check()
# Second pass: the PMC copy, using `pmcdir` as provided by the star import.
messed_up_pmc_list = article_doi_sanity_check(
    directory=pmcdir,
    article_list=None,
    source='PMC',
)

All article file names match DOIs.
PMC2687079.nxml has invalid DOI field: '10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f '


# PubMed Corpus

## Get all local, solr, and PMC DOIs

In [5]:
# Articles known on the PLOS side, as returned by the local-vs-Solr comparison.
plos_articles = compare_local_and_solr()
# Lookup built from the DOI field, with check_new disabled.
# NOTE(review): presumably maps DOI -> PMC identifier, going by the name - confirm.
doi_to_pmc = get_articles_by_doi_field(check_new=False)
# Only the keys of that lookup are needed downstream.
pmc_articles = [doi for doi in doi_to_pmc.keys()]

[1mArticles that needs to be re-indexed on Solr:
[0m10.1371/journal.pone.0076809


## Compare PLOS's copy to PMC

Given the PMC and PLOS copies of the PLOS corpus, determine which articles are missing from PLOS's version of the corpus by:
* removing Currents articles
* checking if articles are live on journals.plos.org
* checking that the DOIs resolve

In [5]:
# Uses `pmc_articles` and `plos_articles` from the "Get all local, solr, and PMC DOIs" cell.
missing_plos_articles = process_missing_plos_articles(
    pmc_articles=pmc_articles,
    plos_articles=plos_articles,
)

[1mPMC DOI fields with spaces in them:
[0m"10.1371/annotation/1cdc7975-50d7-40a5-99ca-83580df2982f " 

[1mWorking articles that need to be re-indexed on Solr:
[0m10.1371/annotation/1391941e-93d3-48d3-8c9a-b7c6d98f9527
10.1371/annotation/a81b1fab-890c-447b-a308-5bc8ca3eb21d
10.1371/annotation/df340d50-1f94-4d8b-a252-1a82a7fa5cc7 

[1mArticles on PMC but not on solr or journals:
[0m10.1371/journal.pone.0002957
10.1371/annotation/b83e925b-2f2a-47b9-b939-0a1eeab18324
10.1371/journal.pbio.0020201
10.1371/annotation/011969ee-3f4b-4260-8d95-1b9a4ca39008
10.1371/annotation/8f2ddf91-3499-4627-9a91-449b78465f9d
10.1371/annotation/33d82b59-59a3-4412-9853-e78e49af76b9 

[1mMissing PLOS articles where DOI resolves to different DOI:
[0m 10.1371/annotation/5e4082fd-6d86-441f-b946-a6e87a22ea57 resolves to: 10.1371/annotation/d9496d01-8c5d-4d24-8287-94449ada5064
[0m 10.1371/annotation/b8b66a84-4919-4a3e-ba3e-bb11f3853755 resolves to: 10.1371/annotation/5fbbf39a-fb47-4ce1-8069-acd830b3d41f

 [

## Compare PMC's copy to PLOS

Given the PMC and PLOS copies of the PLOS corpus, determine which articles are missing from PMC's version of the corpus by:
* updating the PMCID:DOI mapping document
* removing articles too recent to be indexed (pubdate less than 3 weeks ago)
* excluding uncorrected proofs
* excluding PLOS Medicine quizzes

In [6]:
# Uses `pmc_articles` and `plos_articles` from the "Get all local, solr, and PMC DOIs" cell.
missing_pmc_articles = process_missing_pmc_articles(
    pmc_articles=pmc_articles,
    plos_articles=plos_articles,
)

[1mArticles missing from PMC:
[0m10.1371/annotation/08286cd8-527f-4f14-856f-57267107efa8
10.1371/annotation/0bbea8d3-1f94-48af-915c-aec02da2f5c3
10.1371/annotation/0c5390b8-72b0-4b7e-85a3-b8c0fd9f62bf
10.1371/annotation/0ccac188-950f-4908-b232-35fb44ba7847
10.1371/annotation/0cfd3d5f-c1d0-48f8-ad69-34a95e31a8d2
10.1371/annotation/0e045706-ea24-41db-be90-27d1cbcd35b1
10.1371/annotation/17310bbb-e5bf-4901-8b6e-529577a280db
10.1371/annotation/1c419628-f1b5-45de-9f8a-43f834309ebb
10.1371/annotation/1dc00176-e096-4621-9494-2d848dac8262
10.1371/annotation/1e464689-3c86-4399-b229-1e00d65593a5
10.1371/annotation/1f110857-27d7-4e83-9eb3-4e5f51950a26
10.1371/annotation/21379809-1376-4250-b4c2-bf51eac58a98
10.1371/annotation/221e5f19-370e-4a52-add8-f882437bc85d
10.1371/annotation/230cca90-58e9-4aa1-b6b2-a1d744524fbd
10.1371/annotation/23bca9d0-f934-400e-8bb9-f5ff07f9e625
10.1371/annotation/270b432d-50ec-41f1-ad4d-ddd9f51f62a5
10.1371/annotation/2b218d50-a9d5-45b2-80d0-0e806e530749
10.1371/annot

## Save lists of missing articles to text files if needed

In [None]:
# Save the de-duplicated, sorted entries of `missing_plos_articles`, one per line.
# The encoding is given explicitly so the file does not depend on the platform's locale
# default, and str.format is used so an unexpected tuple item is written as text
# instead of being misinterpreted by %-formatting.
with open('missing_plos_articles.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(
        "{}\n".format(item) for item in sorted(set(missing_plos_articles))
    )

In [None]:
# Save the de-duplicated, sorted entries of `missing_pmc_articles`, one per line.
# The encoding is given explicitly so the file does not depend on the platform's locale
# default, and str.format is used so an unexpected tuple item is written as text
# instead of being misinterpreted by %-formatting.
with open('missing_pmc_articles.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(
        "{}\n".format(item) for item in sorted(set(missing_pmc_articles))
    )