# Notebook to process output from thecontentmine

This notebook illustrates the basic process of processing each resource in each paper and allocates a score.

In [1]:
import datetime

import pandas

import process_urls

In [2]:
dict_of_papers = {'10.1016/j.cageo.2012.05.031': {'github': ['https://github.com/andreww/MSAT',
                                                             'https://github.com/url_that_does_not_resolve.com'],
                                                  'zenodo': ['https://zenodo.org/record/1196821#.WrtnltPwYkg']
                                                 }
                 }

In [3]:
def process_github_url(url, doi, verbose=False):
    """
    Given a github URL, calculate a 'code score' and report attributes
    
    A string, 'url', found in a paper, represented by 'doi', is processed
    to establish if the URL resolves, and if it does check for attributes of
    the repository that may be an indicator of code quality.
    """
    url_dict = {'doi': doi,
                'url': url,
                'resourcetype': 'github',
                'timestamp': datetime.datetime.now().isoformat(),
                'resolves': None,
                'score': 0}

    if not process_urls.is_url_valid(url):
        url_dict['resolves'] = False
        if verbose: print("URL {} did not resolve".format(url))
        return url_dict
    url_dict['resolves'] = True
    url_dict['score'] = url_dict['score'] + 1
    return url_dict

In [4]:
def process_zenodo_url(url, verbose=False):
    """
    Given a zenodo URL, calculate a 'code score' and report attributes
    
    A string, 'url', found in a paper, represented by 'doi', is processed
    to establish if the URL resolves, and if it does check for attributes of
    the repository that may be an indicator of code quality.
    """
    url_dict = {'doi': doi,
                'url': url,
                'resourcetype': 'zenodo',
                'timestamp': datetime.datetime.now().isoformat(),
                'resolves': None,
                'score': 0}

    if not process_urls.is_url_valid(url):
        url_dict['resolves'] = False
        if verbose: print("URL {} did not resolve".format(url))
        return url_dict
    url_dict['resolves'] = True
    url_dict['score'] = url_dict['score'] + 1
    return url_dict

In [5]:
def process_papers_dict(dict_of_papers, verbose=False):
    """
    For a list of papers (represented by dois) and URLs check each one
    
    """
    resources_list = []
    papers_output_dict = {}

    for paper_doi in dict_of_papers:
        paper = dict_of_papers[paper_doi]
        paper_score = 0
        number_or_resources = 0
        
        if verbose: print("processing paper with doi {}".format(paper_doi))
        
        if 'github' in paper:
            for url in paper['github']:
                url_dict = process_github_url(url, paper_doi, verbose=verbose)
                resources_list.append(url_dict)
                paper_score = paper_score + url_dict['score']
                number_or_resources = number_or_resources + 1
            
        if 'zenodo' in paper:
            for url in paper['zenodo']:
                url_dict = process_github_url(url, paper_doi, verbose=verbose)
                resources_list.append(url_dict)
                paper_score = paper_score + url_dict['score']
                number_or_resources = number_or_resources + 1
            
        papers_output_dict[paper_doi] = {'score': paper_score, 
                                         'timestamp': datetime.datetime.now().isoformat()}
    
            
        if verbose: print("Paper with doi {} has score of {}".format(paper_doi, 
                                                     paper_score / number_or_resources))
            
        return resources_list

In [6]:
resources_list = process_papers_dict(dict_of_papers, verbose=True)



processing paper with doi 10.1016/j.cageo.2012.05.031
URL https://github.com/url_that_does_not_resolve.com did not resolve
Paper with doi 10.1016/j.cageo.2012.05.031 has score of 0.6666666666666666


In [7]:
print(resources_list)

[{'doi': '10.1016/j.cageo.2012.05.031', 'timestamp': '2018-03-28T13:29:08.977184', 'url': 'https://github.com/andreww/MSAT', 'resourcetype': 'github', 'resolves': True, 'score': 1}, {'doi': '10.1016/j.cageo.2012.05.031', 'timestamp': '2018-03-28T13:29:10.028420', 'url': 'https://github.com/url_that_does_not_resolve.com', 'resourcetype': 'github', 'resolves': False, 'score': 0}, {'doi': '10.1016/j.cageo.2012.05.031', 'timestamp': '2018-03-28T13:29:10.473266', 'url': 'https://zenodo.org/record/1196821#.WrtnltPwYkg', 'resourcetype': 'github', 'resolves': True, 'score': 1}]


## Analysis of the data

We can store the data in a flat SQL database, but here we stuff it into a Pandas dataframe to allow some analysis.

In [8]:
url_df = pandas.DataFrame.from_dict(resources_list)

In [9]:
url_df

Unnamed: 0,doi,resolves,resourcetype,score,timestamp,url
0,10.1016/j.cageo.2012.05.031,True,github,1,2018-03-28T13:29:08.977184,https://github.com/andreww/MSAT
1,10.1016/j.cageo.2012.05.031,False,github,0,2018-03-28T13:29:10.028420,https://github.com/url_that_does_not_resolve.com
2,10.1016/j.cageo.2012.05.031,True,github,1,2018-03-28T13:29:10.473266,https://zenodo.org/record/1196821#.WrtnltPwYkg
