# Notebook to process output from thecontentmine

This notebook illustrates the basic workflow: each resource (URL) found in each paper is checked and allocated a score, and the resource scores are then combined into a score for the paper.

In [1]:
import datetime

import pandas

import process_urls

In [2]:
# Example input: maps a paper's DOI to the resource URLs found in that paper,
# grouped by the service hosting them. One GitHub URL is expected not to
# resolve, so both outcomes of the URL check are exercised.
dict_of_papers = {
    '10.1016/j.cageo.2012.05.031': {
        'github': [
            'https://github.com/andreww/MSAT',
            'https://github.com/url_that_does_not_resolve.com',
        ],
        'zenodo': [
            'https://zenodo.org/record/1196821#.WrtnltPwYkg',
        ],
    },
}

In [3]:
def process_github_url(url, doi):
    """Check a single GitHub URL taken from a paper and score it.

    The only check performed is ``process_urls.is_url_valid(url)``; nothing
    GitHub-specific is inspected.

    Args:
        url: The resource URL to check.
        doi: DOI of the paper the URL was found in; copied into the record.

    Returns:
        A dict with the keys ``doi``, ``url``, ``timestamp`` (ISO-8601 string
        of the local time at which the check started), ``resolves`` (bool)
        and ``score`` (1 if the URL is valid, otherwise 0).
    """
    # Take the timestamp before the validity check so it marks the start of
    # the check rather than its end.
    checked_at = datetime.datetime.now().isoformat()
    resolves = bool(process_urls.is_url_valid(url))

    return {'doi': doi,
            'url': url,
            'timestamp': checked_at,
            'resolves': resolves,
            'score': 1 if resolves else 0}

In [4]:
def process_zenodo_url(url, doi=None):
    """Check a single Zenodo URL taken from a paper and score it.

    The only check performed is ``process_urls.is_url_valid(url)``; nothing
    Zenodo-specific is inspected.

    Args:
        url: The resource URL to check.
        doi: DOI of the paper the URL was found in; copied into the record.
            Optional so that existing single-argument calls keep working.
            Previously the body read an undefined ``doi`` name, which raised
            ``NameError`` unless a global of that name happened to exist.

    Returns:
        A dict with the keys ``doi``, ``url``, ``timestamp`` (ISO-8601 string
        of the local time at which the check started), ``resolves`` (bool)
        and ``score`` (1 if the URL is valid, otherwise 0).
    """
    url_dict = {'doi': doi,
                'url': url,
                'timestamp': datetime.datetime.now().isoformat(),
                'resolves': None,
                'score': 0}

    if not process_urls.is_url_valid(url):
        url_dict['resolves'] = False
        return url_dict

    # A valid URL earns one point.
    url_dict['resolves'] = True
    url_dict['score'] += 1
    return url_dict

In [5]:
# Per-resource records (one dict per URL checked) and per-paper summaries.
resources_list = []
papers_output_dict = {}

for paper_doi in dict_of_papers:
    paper = dict_of_papers[paper_doi]
    paper_score = 0
    number_or_resources = 0

    # Every resource type goes through the same loop. NOTE(review): zenodo
    # URLs are scored with process_github_url, exactly as before; switch to a
    # zenodo-specific scorer once one accepting (url, doi) is available.
    for resource_type in ('github', 'zenodo'):
        for url in paper.get(resource_type, []):
            url_dict = process_github_url(url, paper_doi)
            resources_list.append(url_dict)
            paper_score += url_dict['score']
            number_or_resources += 1

    # The stored score is the raw total; the key is 'timestamp' (it used to be
    # misspelt 'timestame'), matching the per-resource records.
    papers_output_dict[paper_doi] = {'score': paper_score,
                                     'timestamp': datetime.datetime.now().isoformat()}

    # Report the mean score per resource, guarding against papers that list
    # no resources at all (this used to raise ZeroDivisionError).
    if number_or_resources:
        print("Paper with doi {} has score of {}".format(paper_doi,
                                                         paper_score / number_or_resources))
    else:
        print("Paper with doi {} has no resources to score".format(paper_doi))

Paper with doi 10.1016/j.cageo.2012.05.031 has score of 0.6666666666666666


In [6]:
# Dump the raw per-resource records (one dict per URL checked) as plain text.
print(resources_list)

[{'url': 'https://github.com/andreww/MSAT', 'doi': '10.1016/j.cageo.2012.05.031', 'score': 1, 'timestamp': '2018-03-28T12:03:38.860530', 'resolves': True}, {'url': 'https://github.com/url_that_does_not_resolve.com', 'doi': '10.1016/j.cageo.2012.05.031', 'score': 0, 'timestamp': '2018-03-28T12:03:40.135038', 'resolves': False}, {'url': 'https://zenodo.org/record/1196821#.WrtnltPwYkg', 'doi': '10.1016/j.cageo.2012.05.031', 'score': 1, 'timestamp': '2018-03-28T12:03:40.496848', 'resolves': True}]


## Analysis of the data

We could store the data in a flat SQL database, but here we load it into a pandas DataFrame to allow some analysis.

In [7]:
# One row per checked resource. resources_list is a list of dicts, which the
# DataFrame constructor accepts directly; DataFrame.from_dict is documented
# for dict input only.
url_df = pandas.DataFrame(resources_list)

In [8]:
# Bare expression so the notebook renders the frame as a table.
url_df

Unnamed: 0,doi,resolves,score,timestamp,url
0,10.1016/j.cageo.2012.05.031,True,1,2018-03-28T12:03:38.860530,https://github.com/andreww/MSAT
1,10.1016/j.cageo.2012.05.031,False,0,2018-03-28T12:03:40.135038,https://github.com/url_that_does_not_resolve.com
2,10.1016/j.cageo.2012.05.031,True,1,2018-03-28T12:03:40.496848,https://zenodo.org/record/1196821#.WrtnltPwYkg
