# Is Software Updated?

This notebook looks at references to GitHub URLs in papers available in the OA corpus from EuroPMC,
and identifies:

  * How many times the GitHub repositories have been updated since the paper referencing them was released
  
Note that, at present, we do not distinguish between URLs referencing software *created* by the paper authors
and software merely *used* by them, nor do we identify which software was created as a result of the work in the paper.

In [29]:
import json
from datetime import datetime, timedelta

import pandas
from github import Github, UnknownObjectException

import process_eupmc
import process_urls

## File locations

In [2]:
# Root directory that holds the data files
data_dir = '../data'

# Input: file listing the papers that matched the query
matching_papers = '/'.join((data_dir, 'eupmc_fulltext_html_urls.txt'))

# Output: JSON file for the results
output_jsonfile = '/'.join((data_dir, 'dict_of_papers.json'))

# Path of the file holding the GitHub token (kept outside the notebook)
gh_token = '../secrets/github_token'

In [35]:
# Read the GitHub token from its file, dropping trailing whitespace/newline,
# so that the secret itself never appears in the notebook.
with open(gh_token, 'r') as token_file:
    github_token = token_file.read().rstrip()

## Use getpapers to download fulltext of papers

We currently do this outside of the notebook, and assume that the files are available locally.

The command we are using is:

>getpapers --query 'github' -x --limit 100 -o data

which queries EuPMC for all papers containing the term 'github' and downloads the full text of the first 100 matching papers into the directory 'data'.

## Textmine each paper

In [3]:
# Get the list of subdirectories dumped by ContentMine, using the file of
# matching papers as input.
# NOTE(review): presumably one identifier (PMCID) per paper -- confirm against
# process_eupmc.get_pmcids.
paper_ids = process_eupmc.get_pmcids(matching_papers)

In [4]:
# Process the papers and extract all the references to GitHub and Zenodo URLs.
# The resulting entries are read later in this notebook through their
# .pmcid, .doi, .pub_date and .references['github'] attributes.
papers_info = process_eupmc.process_papers(paper_ids, data_dir)

## Create data structure

In [5]:
# Per-paper records, filled in below and keyed by the paper's DOI (as a string)
dict_of_papers = {}

In [6]:
# Store one record per paper under its DOI (converted to a string):
# the PMC identifier, the publication date and the GitHub references.
for paper in papers_info:
    dict_of_papers[str(paper.doi)] = {
        'pmcid': paper.pmcid,
        'pub_date': paper.pub_date,
        'github': paper.references['github'],
    }
     

## Analyse GitHub repos to see frequency of commits

In [None]:
def _repo_full_names(urls):
    """Return the unique 'owner/name' repository identifiers found in *urls*.

    Each URL is split on '/' and components 3 and 4 are joined as owner and
    repository name (this assumes the 'scheme://host/owner/name/...' layout).
    URLs with too few components, or with an empty owner or name (e.g. a link
    to a user page with a trailing slash), are ignored.  First-seen order is
    preserved.
    """
    names = []
    for url in urls:
        parts = url.split('/')
        if len(parts) > 4 and parts[3] and parts[4]:
            full_name = parts[3] + '/' + parts[4]
            if full_name not in names:
                names.append(full_name)
    return names


def _as_naive_utc(moment):
    """Return *moment* as a naive datetime expressed in UTC.

    Naive datetimes are returned unchanged.  Timezone-aware ones are shifted by
    their UTC offset and stripped of tzinfo, so that they can be compared with
    the naive publication date without raising TypeError.
    """
    offset = moment.utcoffset()
    if offset is None:
        return moment
    return (moment - offset).replace(tzinfo=None)


def _count_commits_newer_than(commits, since):
    """Count the leading entries of *commits* authored strictly after *since*.

    The scan stops at the first commit whose author date is not newer than
    *since*, so older history is never requested.  Running out of commits just
    ends the count; indexing past the end used to raise IndexError when every
    commit in a repository was newer than the publication date.
    """
    count = 0
    for commit in commits:
        if _as_naive_utc(commit.commit.author.date) <= since:
            break
        count += 1
    return count


g = Github(github_token)

# Maps 'owner/name' -> number of commits made after the referencing paper's
# publication date.
# NOTE(review): a repository referenced by several papers keeps only the count
# for the last paper processed.
number_of_updates = {}

for p in papers_info:
    repos = _repo_full_names(p.references['github'])
    if not repos:
        continue

    # Limit to commits since the publication date; it is the same for every
    # repository referenced by this paper, so parse it once.
    since = datetime.strptime(p.pub_date, '%Y-%m-%d')

    for repo in repos:
        print ("Processing: ", repo)
        try:
            code = g.get_repo(repo)
        except UnknownObjectException:
            # Text-mined URLs may point at repositories that no longer exist,
            # were renamed or are private; skip them instead of aborting the
            # whole crawl.  Other API errors (e.g. rate limiting) still raise.
            print("Repository not found, skipping: ", repo)
            continue
        num_commits = _count_commits_newer_than(code.get_commits(), since)
        print("Number of commits since publication: ", num_commits)
        number_of_updates[repo] = num_commits

In [60]:
# Display the number of commits since publication recorded for each repository
number_of_updates

{'AllonKleinLab/SPRING': 5,
 'BIC-MNI/minc-toolkit': 0,
 'ChimeRScope/ChimeRScope': 0,
 'ChrisMaherLab/INTEGRATE-Vis': 2,
 'DmitryUlyanov/Multicore-TSNE': 1,
 'GGiecold/ECLAIR': 0,
 'Huiyang520/DMk-BKmeans': 0,
 'MaayanLab/CCLE_Clustergrammer': 0,
 'MaayanLab/CST_Lung_Cancer_Viz': 0,
 'MaayanLab/Cytof_Plasma_PMA': 0,
 'MaayanLab/clustergrammer': 12,
 'MaayanLab/clustergrammer-docs': 6,
 'MaayanLab/clustergrammer-py': 0,
 'MaayanLab/clustergrammer-web': 4,
 'MaayanLab/clustergrammer-widget': 5,
 'PMBio/scLVM': 0,
 'Planteome/amigo': 0,
 'Planteome/common-files-for-ref-ontologies': 0,
 'Planteome/plant-experimental-conditions-ontology': 21,
 'Planteome/plant-ontology': 13,
 'Planteome/plant-trait-ontology': 89,
 'Planteome/planteome-ncbi-taxonomy': 0,
 'SheffieldML/GPy': 48,
 'WGLab/lncScore': 0,
 'YeatmanLab/AFQ-Browser': 0,
 'YeatmanLab/AFQ-Browser_data': 0,
 'YosefLab/FastProject': 0,
 'asncd/MIMOSCA': 0,
 'aziele/alfpy': 2,
 'bioinfo-ut/GenomeTester4': 13,
 'biolink/biolink-api': 20,