In [40]:
import json
import urllib
import urllib.request
import warnings

class SemanticScholarMetaDataExtractor():
    """Downloads paper metadata from the Semantic Scholar v1 API by arXiv ID."""

    def __init__(self):
        # Base endpoint; the arXiv ID is appended verbatim to build the request URL.
        self.API_ID = 'https://api.semanticscholar.org/v1/paper/arXiv:'

    def get_response(self, paper_id):
        """Open the API URL for ``paper_id`` and return the response object.

        The caller owns the returned response and is responsible for closing it.
        Any error raised by ``urllib.request.urlopen`` propagates unchanged.
        """
        paper_url = self.API_ID + paper_id
        return urllib.request.urlopen(paper_url)

    def get_data_json(self, paper_id):
        """Fetch the metadata for ``paper_id`` and return it decoded from JSON.

        The HTTP response is closed before returning, including when reading
        or JSON decoding fails.
        """
        # The object returned by urlopen works as a context manager; using it
        # here guarantees the underlying connection is released.
        with self.get_response(paper_id) as response:
            return json.loads(response.read())

class ArXivPaper():
    """Wrapper around one paper's Semantic Scholar metadata record.

    ``paper`` is either a metadata dict or an arXiv ID string. A string is
    resolved by downloading the record through
    ``SemanticScholarMetaDataExtractor``. After construction:

    * ``self.paper`` holds the record (a shallow copy when a dict was passed,
      so the caller's dict is never modified);
    * ``numCitations`` / ``numReferences`` hold the number of entries present
      *before* filtering;
    * ``citations`` / ``references`` keep only the entries whose
      ``isInfluential`` flag is ``True``.

    Raises:
        TypeError: if ``paper`` is neither a ``str`` nor a ``dict``.
        KeyError: if the record lacks any of ``essential_metadata_keys``.
    """

    # Fields copied from each citation/reference entry by the top-k helpers.
    _INFO_KEYS = ('arxivId', 'authors', 'title', 'url', 'venue', 'year')

    def __init__(self, paper):
        self.paper = paper

        self.essential_metadata_keys = {'abstract', 'arxivId', 'authors', 'citations', 'influentialCitationCount',
                                        'doi', 'fieldsOfStudy', 'paperId', 'references',
                                        'title', 'topics', 'url', 'venue', 'year'}

        self.representational_info_keys = ['abstract', 'authors', 'url', 'year',
                                           'fieldsOfStudy', 'numCitations', 'venue', 'numReferences']

        self.check_paper()
        self.check_relevant_keys()
        self.discard_non_influential_citations()
        self.discard_non_influential_references()

    def check_paper(self):
        """Normalise ``self.paper`` to a dict, downloading it if an ID was given."""
        if isinstance(self.paper, str):
            warnings.warn("Paper not present in memory. Extracting Paper MetaData from Semantic Scholar!")
            metadata_extractor = SemanticScholarMetaDataExtractor()
            self.paper = metadata_extractor.get_data_json(self.paper)
        elif isinstance(self.paper, dict):
            # Work on a shallow copy: the constructor adds keys and replaces
            # the citation/reference lists, which must not leak into the
            # caller's dict (wrapping the same dict twice would otherwise
            # recompute the counts from already-filtered lists).
            self.paper = dict(self.paper)
        else:
            raise TypeError("Paper must be a Dict or an Arxiv Id")

    def check_relevant_keys(self):
        """Validate required keys and record the unfiltered entry counts."""
        missing_keys = self.essential_metadata_keys.difference(self.paper.keys())
        if missing_keys:
            # Sorted so the message is deterministic across runs.
            error_message = "The following essential keys are missing from the paper: " + \
                            ", ".join(sorted(missing_keys))
            raise KeyError(error_message)

        self.paper['numCitations'] = len(self.paper['citations'])
        self.paper['numReferences'] = len(self.paper['references'])

    def discard_non_influential_citations(self):
        """Keep only citations whose ``isInfluential`` flag is exactly ``True``."""
        self.paper['citations'] = [entry for entry in self.paper['citations']
                                   if entry['isInfluential'] is True]

    def discard_non_influential_references(self):
        """Keep only references whose ``isInfluential`` flag is exactly ``True``."""
        self.paper['references'] = [entry for entry in self.paper['references']
                                    if entry['isInfluential'] is True]

    def __getitem__(self, key):
        """Return ``self.paper[key]``."""
        return self.paper[key]

    def __repr__(self):
        """Return a multi-line, human-readable summary of the paper."""
        parts = [f"Paper Title: {self['title']} \n\n"]
        for number, key in enumerate(self.representational_info_keys, start=1):
            if key == 'abstract':
                parts.append(f"{number}) Abstract: \n{self[key]} \n\n")
            elif key == 'authors':
                parts.append(f"{number}) Authors:\n")
                for author_number, author in enumerate(self[key], start=1):
                    # .get(): an author entry without a 'url' must not make
                    # repr() raise; it is rendered as None instead.
                    parts.append(f"\t{author_number}) Name: {author.get('name')}\n")
                    parts.append(f"\tURL: {author.get('url')}\n")
                    parts.append("\n")
            else:
                parts.append(f"{number}) {key}: {self[key]} \n\n")
        return "".join(parts)

    def _get_top_k_information(self, k, entries_key, count_key, singular, plural):
        """Shared implementation of the two ``get_top_k_*_information`` methods.

        Walks ``self[entries_key]`` in order and collects the first ``k``
        entries that have an arXiv ID, warning about (and skipping) those that
        do not. The walk stops at the end of the list, so fewer than ``k``
        entries are returned when not enough have an arXiv ID.

        Returns:
            dict mapping the entry's 1-based position in the list to a dict
            restricted to ``_INFO_KEYS``.
        """
        total = self[count_key]
        if k > total:
            warnings.warn(f"Total {plural} are {total}. Retrieving all {plural}")
            k = total

        selected = {}
        for position, entry in enumerate(self[entries_key], start=1):
            if len(selected) >= k:
                break
            if entry.get('arxivId') is None:
                warnings.warn(f"The {singular} at index {position} has no Arxiv ID. Skipping this {singular}.")
                continue
            selected[position] = {key: val for key, val in entry.items() if key in self._INFO_KEYS}
        return selected

    def get_top_k_citations_information(self, k:int):
        """Return summary info for the first ``k`` citations that have an arXiv ID.

        Keys of the returned dict are 1-based positions in ``self['citations']``
        and are therefore not contiguous when entries were skipped.
        """
        return self._get_top_k_information(k, 'citations', 'numCitations', 'citation', 'citations')

    def get_top_k_references_information(self, k:int):
        """Return summary info for the first ``k`` references that have an arXiv ID.

        Keys of the returned dict are 1-based positions in ``self['references']``
        and are therefore not contiguous when entries were skipped.
        """
        return self._get_top_k_information(k, 'references', 'numReferences', 'reference', 'references')

    def get_top_k_references_metadata(self, k:int):
        """Build an ``ArXivPaper`` for each of the top ``k`` references.

        Each paper is constructed from its arXiv ID, which triggers one
        metadata download per reference. Keys match those returned by
        ``get_top_k_references_information``.
        """
        references = self.get_top_k_references_information(k)
        # Iterate over the actual keys: positions are not contiguous when
        # entries without an arXiv ID were skipped.
        return {position: ArXivPaper(info['arxivId']) for position, info in references.items()}

    def get_top_k_citations_metadata(self, k:int):
        """Build an ``ArXivPaper`` for each of the top ``k`` citations.

        Each paper is constructed from its arXiv ID, which triggers one
        metadata download per citation. Keys match those returned by
        ``get_top_k_citations_information``.
        """
        citations = self.get_top_k_citations_information(k)
        return {position: ArXivPaper(info['arxivId']) for position, info in citations.items()}
    



In [41]:
# A string argument is treated as an arXiv ID: ArXivPaper emits a warning and
# downloads the metadata from Semantic Scholar (network call).
paper = ArXivPaper("1806.07366")

In [42]:
paper.paper.keys()

dict_keys(['abstract', 'arxivId', 'authors', 'citationVelocity', 'citations', 'corpusId', 'doi', 'fieldsOfStudy', 'influentialCitationCount', 'is_open_access', 'is_publisher_licensed', 'paperId', 'references', 'title', 'topics', 'url', 'venue', 'year', 'numCitations', 'numReferences'])

In [43]:
len(paper['citations'])

186

In [44]:
len(paper['references'])

5

In [32]:
d = ['citationVelocity', 'influentialCitationCount', 'numCitations', 'numReferences']

In [33]:
paper.paper[d[1]]

186

In [18]:
# Up to 3 influential citations that have an arXiv ID; entries without one are
# skipped with a warning. Result keys are 1-based positions in paper['citations'].
citations = paper.get_top_k_citations_information(3)

In [19]:
citations

{3: {'arxivId': '1906.00586',
  'authors': [{'authorId': '52193502', 'name': 'Mitchell Wortsman'},
   {'authorId': '143787583', 'name': 'Ali Farhadi'},
   {'authorId': '143887493', 'name': 'M. Rastegari'}],
  'title': 'Discovering Neural Wirings',
  'url': 'https://www.semanticscholar.org/paper/9c48f787f9590fcbad78707419ddfad269102cd3',
  'venue': 'NeurIPS',
  'year': 2019},
 4: {'arxivId': '1905.11602',
  'authors': [{'authorId': '2316494', 'name': 'Peter Karkus'},
   {'authorId': '145572784', 'name': 'Xiao Ma'},
   {'authorId': '1384318941', 'name': 'David Hsu'},
   {'authorId': '1709512', 'name': 'L. Kaelbling'},
   {'authorId': '1740222', 'name': 'Wee Sun Lee'},
   {'authorId': '1388700951', 'name': 'Tomas Lozano-Perez'}],
  'title': 'Differentiable Algorithm Networks for Composable Robot Learning',
  'url': 'https://www.semanticscholar.org/paper/1e6fbab02acf8baf93e8991a35849cd5b3cbac94',
  'venue': 'Robotics: Science and Systems',
  'year': 2019},
 5: {'arxivId': '1909.13334',
  '

In [11]:
# citation_papers = paper.get_top_k_citations_metadata(3)

In [12]:
# citation_papers

In [13]:
#

In [21]:
# Tally how many citation entries are flagged as influential.
c = 0
for i in paper['citations']:
    if not i['isInfluential'] == True:
        continue
    c += 1

In [22]:
c

186

In [35]:
paper.paper['citations'][0]

{'arxivId': None,
 'authors': [{'authorId': '104314859', 'name': 'Fred Daum'},
  {'authorId': '50535618', 'name': 'J. Huang'},
  {'authorId': '9130376', 'name': 'A. Noushin'}],
 'doi': '10.1117/12.2517980',
 'intent': ['background'],
 'isInfluential': False,
 'paperId': 'd13739de9b7e22eea9ff03c23d322817c14bdfd8',
 'title': "Extremely deep Bayesian learning with Gromov's method",
 'url': 'https://www.semanticscholar.org/paper/d13739de9b7e22eea9ff03c23d322817c14bdfd8',
 'venue': 'Defense + Commercial Sensing',
 'year': 2019}