In [60]:
import json
import urllib
import warnings

class SemanticScholarMetaDataExtractor():
    """Download paper metadata as JSON from the Semantic Scholar v1 paper endpoint.

    Papers are addressed by arXiv identifier: the identifier is appended
    verbatim to ``API_ID`` to build the request URL.
    """

    def __init__(self):
        # URL prefix of the endpoint; the arXiv id is concatenated directly after it.
        self.API_ID = 'https://api.semanticscholar.org/v1/paper/arXiv:'

    def get_response(self, paper_id):
        """Open the metadata URL for ``paper_id`` and return the response object.

        Args:
            paper_id: arXiv identifier (str) appended to ``API_ID``.

        Returns:
            The open response returned by ``urllib.request.urlopen``. The
            caller is responsible for closing it.

        Raises:
            Whatever ``urllib.request.urlopen`` raises for the URL
            (e.g. ``urllib.error.URLError``).
        """
        # A bare ``import urllib`` does not load the ``request`` submodule, so
        # import it explicitly here instead of relying on some other module
        # having imported it already.
        import urllib.request

        paper_url = self.API_ID + paper_id
        return urllib.request.urlopen(paper_url)

    def get_data_json(self, paper_id):
        """Fetch the metadata for ``paper_id`` and return it decoded from JSON.

        Args:
            paper_id: arXiv identifier (str) appended to ``API_ID``.

        Returns:
            The decoded JSON document (whatever ``json.loads`` produces for
            the response body).
        """
        # Close the connection as soon as the body has been read.
        with self.get_response(paper_id) as response:
            return json.loads(response.read())

class ArXivPaper():
    """Semantic Scholar metadata for one paper, reduced to influential arXiv links.

    The constructor accepts either an already-downloaded metadata dict or an
    arXiv id string (in which case the metadata is downloaded). After
    construction, ``paper['citations']`` and ``paper['references']`` contain
    only entries whose ``isInfluential`` is ``True`` and whose ``arxivId`` is
    not ``None``; ``numCitations`` / ``numReferences`` hold the sizes of those
    filtered lists.

    Items of the underlying metadata dict are readable via ``obj[key]``.
    """

    # Fields copied out of each citation / reference entry by the
    # ``get_top_k_*_information`` methods.
    _LINK_INFO_KEYS = ('arxivId', 'authors', 'title', 'url', 'venue', 'year')

    def __init__(self, paper):
        """Validate ``paper`` and filter its citation / reference lists.

        Args:
            paper: metadata dict, or an arXiv id (str) to download.

        Raises:
            TypeError: if ``paper`` is neither a str nor a dict.
            KeyError: if any essential metadata key is missing.
        """
        self.paper = paper

        self.essential_metadata_keys = {'abstract', 'arxivId', 'authors', 'citations', 'influentialCitationCount',
                                        'doi', 'fieldsOfStudy', 'paperId', 'references',
                                        'title', 'topics', 'url', 'venue', 'year'}

        # Keys rendered by __repr__, in display order.
        self.representational_info_keys = ['abstract', 'authors', 'url', 'year',
                                           'fieldsOfStudy', 'numCitations', 'venue', 'numReferences']

        self.check_paper()
        self.check_relevant_keys()
        self.discard_non_influential_citations()
        self.discard_non_influential_references()
        self.discard_none_arxiv_references()
        self.discard_none_arxiv_citations()
        self.set_num_references()
        self.set_num_citations()

    def check_paper(self):
        """Normalise ``self.paper`` to a private dict, downloading it if given an id.

        Raises:
            TypeError: if ``self.paper`` is neither a str nor a dict.
        """
        if isinstance(self.paper, str):
            warnings.warn("Paper not present in memory. Extracting Paper MetaData from Semantic Scholar!")
            metadata_extractor = SemanticScholarMetaDataExtractor()
            self.paper = metadata_extractor.get_data_json(self.paper)
        elif isinstance(self.paper, dict):
            # Shallow copy: the filtering steps rebind 'citations' / 'references'
            # and add count keys, which must not alter the caller's dict.
            self.paper = self.paper.copy()
        else:
            raise TypeError("Paper must be a Dict or an Arxiv Id")

    def check_relevant_keys(self):
        """Raise ``KeyError`` naming every essential key absent from the metadata."""
        missing_keys = self.essential_metadata_keys.difference(self.paper.keys())
        if missing_keys:
            # Sorted so the message is deterministic (set order is not).
            error_message = "The following essential keys are missing from the paper: " + \
                            ", ".join(sorted(missing_keys))
            raise KeyError(error_message)

    def discard_non_influential_citations(self):
        """Keep only citations whose ``isInfluential`` is exactly ``True``."""
        self.paper['citations'] = [entry for entry in self.paper['citations']
                                   if entry['isInfluential'] is True]

    def discard_none_arxiv_references(self):
        """Keep only references that carry an ``arxivId``."""
        self.paper['references'] = [entry for entry in self.paper['references']
                                    if entry['arxivId'] is not None]

    def discard_none_arxiv_citations(self):
        """Keep only citations that carry an ``arxivId``."""
        self.paper['citations'] = [entry for entry in self.paper['citations']
                                   if entry['arxivId'] is not None]

    def discard_non_influential_references(self):
        """Keep only references whose ``isInfluential`` is exactly ``True``."""
        self.paper['references'] = [entry for entry in self.paper['references']
                                    if entry['isInfluential'] is True]

    def set_num_references(self):
        """Store the current number of references under ``numReferences``."""
        self.paper['numReferences'] = len(self.paper['references'])

    def set_num_citations(self):
        """Store the current number of citations under ``numCitations``."""
        self.paper['numCitations'] = len(self.paper['citations'])

    def __getitem__(self, key):
        """Return ``self.paper[key]``."""
        return self.paper[key]

    def __repr__(self):
        """Return a multi-line, human-readable summary of the paper."""
        parts = [f"Paper Title: {self['title']} \n\n"]
        for position, key in enumerate(self.representational_info_keys, start=1):
            if key == 'abstract':
                parts.append(f"{position}) {'Abstract'}: \n{self[key]} \n\n")
            elif key == 'authors':
                parts.append(f"{position}) {'Authors'}:\n")
                for rank, author in enumerate(self[key], start=1):
                    # .get(): an author entry without 'name' or 'url' prints
                    # None instead of making repr() raise.
                    parts.append(f"\t{rank}) {'Name'}: {author.get('name')}\n")
                    parts.append(f"\t{'URL'}: {author.get('url')}\n")
                    parts.append("\n")
            else:
                parts.append(f"{position}) {key}: {self[key]} \n\n")
        return "".join(parts)

    def _top_k_information(self, links_key, count_key, label, k):
        """Return trimmed copies of the first ``k`` entries of ``self[links_key]``.

        ``k`` is clamped (with a warning) to ``self[count_key]``; a negative
        ``k`` yields an empty list. ``label`` is the plural noun used in the
        warning text.
        """
        total = self[count_key]
        if k > total:
            warnings.warn(f"Total {label} are {total}. Retrieving all {label}")
            k = total

        # max(k, 0): a negative slice bound would count from the end of the list.
        return [
            {key: val for key, val in link.items() if key in self._LINK_INFO_KEYS}
            for link in self[links_key][:max(k, 0)]
        ]

    def get_top_k_citations_information(self, k:int):
        """Return summary dicts for the first ``k`` retained citations.

        Each dict holds only the arxivId, authors, title, url, venue and year
        fields of the entry. If ``k`` exceeds ``numCitations`` a warning is
        issued and all citations are returned.
        """
        return self._top_k_information('citations', 'numCitations', 'citations', k)

    def get_top_k_references_information(self, k:int):
        """Return summary dicts for the first ``k`` retained references.

        Each dict holds only the arxivId, authors, title, url, venue and year
        fields of the entry. If ``k`` exceeds ``numReferences`` a warning is
        issued and all references are returned.
        """
        return self._top_k_information('references', 'numReferences', 'references', k)

    def get_top_k_references_metadata(self, k:int):
        """Return an ``ArXivPaper`` for each of the first ``k`` references.

        Every paper is built from its arXiv id, so each one triggers its own
        metadata download.
        """
        return [ArXivPaper(reference['arxivId'])
                for reference in self.get_top_k_references_information(k)]

    def get_top_k_citations_metadata(self, k:int):
        """Return an ``ArXivPaper`` for each of the first ``k`` citations.

        Every paper is built from its arXiv id, so each one triggers its own
        metadata download.
        """
        return [ArXivPaper(citation['arxivId'])
                for citation in self.get_top_k_citations_information(k)]
    

class GraphNode():
    """One paper in a citation/reference graph plus lazily fetched neighbour lists.

    ``citation_childen`` and ``reference_childen`` start as ``None`` and are
    filled in by ``get_citation_children`` / ``get_reference_children``.
    """

    def __init__(self, paper:ArXivPaper, num_citations:int=1, num_references:int=1):
        """Store the paper and how many citations / references to request for it."""
        # Neighbour lists are not populated until explicitly requested.
        self.citation_childen = None
        self.reference_childen = None
        self.num_references = num_references
        self.num_citations = num_citations
        self.paper = paper

    def is_reference_leaf(self):
        """Return True when the paper's ``numReferences`` is zero."""
        reference_count = self.paper['numReferences']
        return reference_count == 0

    def is_citation_leaf(self):
        """Return True when the paper's ``numCitations`` is zero."""
        citation_count = self.paper['numCitations']
        return citation_count == 0

    def get_citation_children(self):
        """Populate ``citation_childen`` with up to ``num_citations`` citation summaries.

        Does nothing for a citation leaf. Always returns ``None``; the result
        is stored on the instance.
        """
        if self.is_citation_leaf():
            return
        wanted = self.num_citations
        self.citation_childen = self.paper.get_top_k_citations_information(wanted)

    def get_reference_children(self):
        """Populate ``reference_childen`` with up to ``num_references`` reference summaries.

        Does nothing for a reference leaf. Always returns ``None``; the result
        is stored on the instance.
        """
        if self.is_reference_leaf():
            return
        wanted = self.num_references
        self.reference_childen = self.paper.get_top_k_references_information(wanted)
           

In [57]:
# Build the paper from an arXiv id: because a str (not a dict) is passed,
# ArXivPaper downloads the metadata from Semantic Scholar and emits a warning.
paper = ArXivPaper("1806.07366")

In [51]:
# Summary dicts (arxivId, authors, title, url, venue, year) for at most the
# first 3 references that survived the filtering done in ArXivPaper.__init__.
references = paper.get_top_k_references_information(3)

In [72]:
# Wrap the paper in a graph node that will request up to 20 citations and
# up to 10 references when its children are fetched.
node = GraphNode(paper=paper, num_citations=20, num_references=10)

In [73]:
# Fills node.citation_childen in place (the call itself returns None).
node.get_citation_children()

In [74]:
# Fills node.reference_childen in place (the call itself returns None).
node.get_reference_children()

In [75]:
node.citation_childen

[{'arxivId': '2006.03364',
  'authors': [{'authorId': '2791391', 'name': 'E. Celledoni'},
   {'authorId': '48125860', 'name': 'Matthias Joachim Ehrhardt'},
   {'authorId': '11371884', 'name': 'Christian Etmann'},
   {'authorId': '47283982', 'name': 'R. McLachlan'},
   {'authorId': '1868160', 'name': 'B. Owren'},
   {'authorId': '1711104', 'name': 'C. Schönlieb'},
   {'authorId': '52585009', 'name': 'F. Sherry'}],
  'title': 'Structure preserving deep learning',
  'url': 'https://www.semanticscholar.org/paper/96efa5af47c75fe90909cbceafe7524714c9e5b9',
  'venue': 'ArXiv',
  'year': 2020},
 {'arxivId': '2006.00104',
  'authors': [{'authorId': '31509385', 'name': 'Derek Onken'},
   {'authorId': '143745940', 'name': 'Samy Wu Fung'},
   {'authorId': '7824051', 'name': 'Xingjian Li'},
   {'authorId': '49418655', 'name': 'Lars Ruthotto'}],
  'title': 'OT-Flow: Fast and Accurate Continuous Normalizing Flows via Optimal Transport',
  'url': 'https://www.semanticscholar.org/paper/c837226031bde30a

In [76]:
node.reference_childen

[{'arxivId': '1708.00065',
  'authors': [{'authorId': None, 'name': 'Yang Li'},
   {'authorId': '145585757', 'name': 'Nan Du'},
   {'authorId': '1751569', 'name': 'S. Bengio'}],
  'title': 'Time-Dependent Representation for Neural Event Sequence Prediction',
  'url': 'https://www.semanticscholar.org/paper/ec7bab52b2220a6cad410dd82b3fbe140d2196f0',
  'venue': 'ICLR',
  'year': 2018},
 {'arxivId': '1606.04130',
  'authors': [{'authorId': '32219137', 'name': 'Zachary Chase Lipton'},
   {'authorId': '2107807', 'name': 'David C. Kale'},
   {'authorId': '144616817', 'name': 'R. Wetzel'}],
  'title': 'Directly Modeling Missing Data in Sequences with RNNs: Improved Classification of Clinical Time Series',
  'url': 'https://www.semanticscholar.org/paper/562f33611cdc0d8ed6609aa09f153e6238d5409e',
  'venue': 'MLHC',
  'year': 2016},
 {'arxivId': '1505.05770',
  'authors': [{'authorId': '1748523', 'name': 'Danilo Jimenez Rezende'},
   {'authorId': '14594344', 'name': 'S. Mohamed'}],
  'title': 'Va

In [52]:
references

[{'arxivId': '1708.00065',
  'authors': [{'authorId': None, 'name': 'Yang Li'},
   {'authorId': '145585757', 'name': 'Nan Du'},
   {'authorId': '1751569', 'name': 'S. Bengio'}],
  'title': 'Time-Dependent Representation for Neural Event Sequence Prediction',
  'url': 'https://www.semanticscholar.org/paper/ec7bab52b2220a6cad410dd82b3fbe140d2196f0',
  'venue': 'ICLR',
  'year': 2018},
 {'arxivId': '1606.04130',
  'authors': [{'authorId': '32219137', 'name': 'Zachary Chase Lipton'},
   {'authorId': '2107807', 'name': 'David C. Kale'},
   {'authorId': '144616817', 'name': 'R. Wetzel'}],
  'title': 'Directly Modeling Missing Data in Sequences with RNNs: Improved Classification of Clinical Time Series',
  'url': 'https://www.semanticscholar.org/paper/562f33611cdc0d8ed6609aa09f153e6238d5409e',
  'venue': 'MLHC',
  'year': 2016},
 {'arxivId': '1505.05770',
  'authors': [{'authorId': '1748523', 'name': 'Danilo Jimenez Rezende'},
   {'authorId': '14594344', 'name': 'S. Mohamed'}],
  'title': 'Va

In [53]:
# Build a full ArXivPaper for each of the first 3 references. Each one is
# constructed from its arXiv id, so this performs one download per reference.
references_papers = paper.get_top_k_references_metadata(3)

In [54]:
references_papers

[Paper Title: Time-Dependent Representation for Neural Event Sequence Prediction 
 
 1) Abstract: 
 Existing sequence prediction methods are mostly concerned with time-independent sequences, in which the actual time span between events is irrelevant and the distance between events is simply the difference between their order positions in the sequence. While this time-independent view of sequences is applicable for data such as natural languages, e.g., dealing with words in a sentence, it is inappropriate and inefficient for many real world events that are observed and collected at unequally spaced points of time as they naturally arise, e.g., when a person goes to a grocery store or makes a phone call. The time span between events can carry important information about the sequence dependence of human behaviors. In this work, we propose a set of methods for using time in sequence prediction. Because neural sequence models such as RNN are more amenable for handling token-like input, we p

In [13]:
#

In [21]:
# Tally how many of the paper's citations are flagged as influential.
c = 0
for i in paper['citations']:
    c += 1 if i['isInfluential'] == True else 0

In [22]:
c

186

In [35]:
# Inspect the first citation entry of the underlying metadata dict.
# NOTE(review): the recorded output shows arxivId None / isInfluential False,
# which the filters in ArXivPaper.__init__ as defined in this file would have
# removed - presumably this cell ran against an earlier version of the class;
# confirm by re-running.
paper.paper['citations'][0]

{'arxivId': None,
 'authors': [{'authorId': '104314859', 'name': 'Fred Daum'},
  {'authorId': '50535618', 'name': 'J. Huang'},
  {'authorId': '9130376', 'name': 'A. Noushin'}],
 'doi': '10.1117/12.2517980',
 'intent': ['background'],
 'isInfluential': False,
 'paperId': 'd13739de9b7e22eea9ff03c23d322817c14bdfd8',
 'title': "Extremely deep Bayesian learning with Gromov's method",
 'url': 'https://www.semanticscholar.org/paper/d13739de9b7e22eea9ff03c23d322817c14bdfd8',
 'venue': 'Defense + Commercial Sensing',
 'year': 2019}