# Get Code References from papers

This notebook:

 * Uses getpapers externally to download fulltext of all papers in EuPMC which contain github URLs
 * Text-mines the full text of each paper and extracts occurrences of GitHub URLs
 * Outputs a data structure of the form: {paper_DOI: {"pub_date": "...", "github": ["http://github.com/blah/blah", ...]}, ...}

In [1]:
import json
from lxml import etree
import re
import process_eupmc

## Use getpapers to download fulltext of papers

We currently do this outside of the notebook, and assume that the files are available locally.

The command we are using is:

>getpapers --query 'github' -x --limit 100 -o data

which queries EuPMC for all papers containing the term 'github' and downloads the full-text XML of the first 100 matching papers into the directory 'data'.

## Textmine each paper

### File locations

In [2]:
# Root directory that holds the downloaded data
data_dir = '../data'

# Path to the file that lists the matching papers
matching_papers = '/'.join((data_dir, 'eupmc_fulltext_html_urls.txt'))

# Content Mine results file found in every paper subdirectory
contentmine_results = 'eupmc_result.json'

# Content Mine full-text XML dump found in every paper subdirectory
fulltext_xml = 'fulltext.xml'

In [3]:
# Accumulator for the JSON output file: a dictionary of papers, each mapped
# to its publication date and the repository URLs found in it.
dict_of_papers = dict()

In [4]:
# Get the list of subdirectories dumped by ContentMine, one per paper.
# The helper is given the path of the matching-papers file (`matching_papers`).
# NOTE(review): the list shown under this cell is presumably printed by the
# helper itself, since the cell has no trailing expression — confirm in
# process_eupmc.
papers = process_eupmc.get_paper_subdirectories(matching_papers)

['PMC5802054', 'PMC5838108', 'PMC5833151', 'PMC5634325', 'PMC5832410', 'PMC5764482', 'PMC5819480', 'PMC5627421', 'PMC5753347', 'PMC5736641']


In [None]:
# For each paper directory: read the DOI and publication date from the
# ContentMine metadata, mine the full-text XML for GitHub URLs, and record
# the result in `dict_of_papers` keyed by DOI.

# A URL is only harvested when it is immediately followed by a double quote.
# Excluding quotes, angle brackets and whitespace from the URL body keeps a
# match from running past its closing quote into adjacent markup (which the
# greedy `\S+` form could do when no whitespace intervened).
url_pattern = re.compile(r'https?://[^\s"<>]+(?=")')

# The dot is escaped and the host must end at a URL delimiter, so look-alike
# hosts such as "github.community" or "githubXcom" are not accepted.
github_pattern = re.compile(r'https?://github\.com(?:[/?#:]|$)')

for paper_dir in papers:

    # Read in the JSON file and get the DOI and publication date

    filename = data_dir + '/' + paper_dir + '/' + contentmine_results

    try:
        with open(filename, 'r') as f:
            paper_json = json.load(f)
    except IOError:
        print("Error: File does not appear to exist.")
        # Without metadata there is no DOI to key this paper on. Skip it
        # rather than fall through with undefined values (first iteration)
        # or with the previous paper's DOI and date (later iterations),
        # which would overwrite that paper's entry with the wrong URLs.
        continue

    # NOTE(review): a record lacking any of these keys raises
    # KeyError/IndexError here; decide whether such papers should be
    # skipped or recorded with a missing date.
    paper_doi = paper_json['doi'][0]
    pub_date = paper_json['journalInfo'][0]['printPublicationDate'][0]

    # Read in the XML full text and mine for the github URLs

    fulltext_file = data_dir + '/' + paper_dir + '/' + fulltext_xml

    # A paper whose full text is missing is still recorded, with no URLs.
    gh_urls = []

    try:
        with open(fulltext_file, 'r') as f:
            data = f.read()
    except IOError:
        print("Error: File does not appear to exist.")
    else:
        gh_urls = [
            url for url in url_pattern.findall(data)
            if github_pattern.match(url)
        ]

    dict_of_papers[str(paper_doi)] = {
        'pub_date': pub_date,
        'github': gh_urls,
    }
        

## Output data structure

In [None]:
# Persist the collected results to disk as compact JSON
with open('dict_of_papers.json', 'w') as json_out:
    json.dump(dict_of_papers, json_out)

# Echo a human-readable version (sorted keys, 4-space indent) to the notebook
print(json.dumps(dict_of_papers, indent=4, sort_keys=True))