In [1]:
import os
import sys

# Put the parent of the current working directory on the import path so that
# packages living next to this notebook's folder can be imported.
parent_dir = os.path.dirname(os.path.realpath("."))
# Guard the append: re-running this cell must not pile up duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)


## Breakdown of pipeline
#### This document walks through example executions of the pipeline and shows how to assess a paper's (PDF's) bidirectionality and unidirectionality

In [2]:
# Let's start by obtaining the metadata for the DOI.
import sys

# NOTE(review): presumably needed so modules inside the package can import
# their siblings by bare name -- confirm. Guarded so that re-running the cell
# does not add duplicate entries to sys.path.
if '../object_creator' not in sys.path:
    sys.path.append('../object_creator')

doi = "10.1016/j.compbiomed.2019.05.002"

# This DOI can be used to create a (fetched) metadata object.

from object_creator.doi_to_metadata import *

meta = doi_to_metadataObj(doi=doi)

# doi_to_metadataObj takes a DOI, queries OpenAlex and creates a Metadata object
# holding the title, the DOI and the arXiv id.
print("here is the metadata that it extracts:")
# f-strings instead of `+` concatenation: a missing (None) field is printed as
# "None" instead of raising a TypeError; string fields print exactly as before.
print(f"-Title: {meta.title}")
print(f"-Doi: {meta.doi}")
print(f"-Arxiv: {meta.arxiv}")

here is the metadata that it extracts:
-Title: Association of genomic subtypes of lower-grade gliomas with shape features automatically extracted by a deep learning algorithm
-Doi: 10.1016/j.compbiomed.2019.05.002
-Arxiv: 1906.03720


In [3]:
# Every pipeline object can be converted to a plain dictionary:
print("Example of object to dict function")
print(meta.to_dict())

# Each object type also has a helper that builds a dictionary ready for JSON export:
#   key   -> the DOI
#   value -> the object's to_dict() representation
print("\n")
print("Here is an example:")
# Named `meta_dict` rather than `dict` so the built-in `dict` type is not
# shadowed for the rest of the kernel session.
meta_dict = metadataObj_to_metadataDict(meta)
print(meta_dict)


Example of object to dict function
{'title': 'Association of genomic subtypes of lower-grade gliomas with shape features automatically extracted by a deep learning algorithm', 'doi': '10.1016/j.compbiomed.2019.05.002', 'arxiv': '1906.03720'}


Here is an example:
{'10.1016/j.compbiomed.2019.05.002': {'title': 'Association of genomic subtypes of lower-grade gliomas with shape features automatically extracted by a deep learning algorithm', 'doi': '10.1016/j.compbiomed.2019.05.002', 'arxiv': '1906.03720'}}


<h4>Once the metadata has been obtained, you will need to download the PDF to which the DOI pertains.

We store this information as a DownloadedObj.</h4>

In [4]:
from object_creator.create_downloadedObj import *

# The metadataObj created earlier is all that is needed to build the DownloadedObj.
# The PDF is downloaded into a PDFs directory (created if it does not exist).
dwnldd = meta_to_dwnldd(metadataObj=meta, output_dir=".")

# The DownloadedObj carries the same metadata as the metadataObj, plus file_path
# and file_name; the file name is a modified form of the DOI.
print("File path to the pdf:", dwnldd.file_path, sep="\n")
print("File name of the pdf:", dwnldd.file_name, sep="\n")


# Like the metadataObj, this object has a .to_dict() method and a companion
# function that builds a dictionary for JSON export.



File path to the pdf:
./PDFs/10-DOT-1016_j-DOT-compbiomed-DOT-2019-DOT-05-DOT-002.pdf
File name of the pdf:
10-DOT-1016_j-DOT-compbiomed-DOT-2019-DOT-05-DOT-002.pdf


<h4> Now that the PDF has been downloaded, we can extract the URLs it contains </h4>

In [5]:
from object_creator.downloaded_to_paperObj import *

# Building the processed paperObj opens the PDF and scans it for URLs using tika.
processed = downloaded_to_paperObj(downloadedObj=dwnldd)

# One print call, newline-separated: explanation, the URL/count pairs, ordering note.
print(
    "The code_urls are a list of pairs, each pair being a url and the number of times it has been mentioned",
    processed.code_urls,
    "They are ordered by frequency",
    sep="\n",
)

The code_urls are a list of pairs, each pair being a url and the number of times it has been mentioned
[('https://github.com/mateuszbuda/brain-segmentation', 2)]
They are ordered by frequency


In [7]:
# Like the other pipeline objects, a paperObj can be serialized to JSON, as shown below.

# Step 1: build the dictionary form of the processed paperObj.
pp_dic = paperObj_ppDict(processed)

# Step 2: write that dictionary out as JSON, using "../example" as the output directory.
# NOTE(review): `papers_json` is what the bidirectionality step consumes; presumably
# it is the path of the written JSON file -- confirm against pp_dic_to_json.
papers_json = pp_dic_to_json(pp_dic=pp_dic,output_dir="../example")



<h4>Creating these JSON files allows you to split the pipeline into separate stages to your liking.
</h4>
<h4>
Now that we have a paperObj (Processed), we can assess its bidirectionality or unidirectionality
</h4>

In [10]:
from object_creator.pipeline import * 

# With the papers JSON produced in the previous step, we can assess bidirectionality.
# NOTE(review): output_dir presumably receives per-repository JSON files (the captured
# run reports a file under "../example/JSONs/") -- confirm.

bidir = from_papers_json_to_bidir(papers_json, output_dir="../example")

# A dictionary entry is created per bidirectional DOI/ID:
#   key   -> the DOI of the paper
#   value -> all of that paper's code_urls that are bidirectional

Already created a file: ../example/JSONs/mateuszbuda_brain-segmentation.json
{'10.1016/j.compbiomed.2019.05.002': ['https://github.com/mateuszbuda/brain-segmentation']}
