In [1]:
import sys
sys.path.append('../../')
# Import directly from config
from src.config import SEMANTIC_SCHOLAR_API_KEY
import requests
import time
from typing import Dict, List, Optional
import pandas as pd


[REDACTED: an API key was printed in this saved cell output. Clear the output before sharing and rotate the key.]


In [7]:
# First endpoint exercise: connection settings shared by every request below.
BASE_URL = "https://api.semanticscholar.org/graph/v1"  # root URL that endpoint paths are appended to
headers = {"x-api-key": SEMANTIC_SCHOLAR_API_KEY}  # sent with each requests.get call

# ALL AVAILABLE FIELDS FOR SEMANTIC SCHOLAR API

# PAPER FIELDS (for /paper endpoints)
# Reference catalogue of field names, grouped by topic. A dotted name such as
# "authors.name" names one attribute of the parent object.
# NOTE(review): confirm against the Graph API docs which dotted sub-fields the
# `fields` query parameter actually accepts before requesting them.
ALL_PAPER_FIELDS = [
    # Basic paper info: unique ID, corpus ID, title, abstract, Semantic Scholar URL
    "paperId", "corpusId", "title", "abstract", "url",
    # Publication venue (plain and detailed), year, full date, publication types
    "venue", "publicationVenue", "year", "publicationDate", "publicationTypes",

    # Authors: list of author objects, then ID / name / profile URL / affiliations
    "authors",
    "authors.authorId", "authors.name", "authors.url", "authors.affiliations",

    # Citation metrics: citations, references, highly influential citations
    "citationCount", "referenceCount", "influentialCitationCount",

    # Classifications: fields-of-study categories and Semantic Scholar fields
    "fieldsOfStudy", "s2FieldsOfStudy",

    # External IDs: the whole dict, then DOI, ArXiv, Microsoft Academic Graph,
    # DBLP, PubMed and PubMed Central (PMC) identifiers
    "externalIds",
    "externalIds.DOI", "externalIds.ArXiv", "externalIds.MAG",
    "externalIds.DBLP", "externalIds.PubMed", "externalIds.PubMedCentral",

    # Journal info: journal details, then name, page numbers, volume number
    "journal",
    "journal.name", "journal.pages", "journal.volume",

    # Embeddings: paper embedding vector (SPECTER)
    "embedding",

    # TL;DR: AI-generated summary
    "tldr",

    # Open access: flag, PDF info, then PDF URL and PDF status
    "isOpenAccess",
    "openAccessPdf", "openAccessPdf.url", "openAccessPdf.status",
]

# AUTHOR FIELDS (for /author endpoints)
ALL_AUTHOR_FIELDS = [
    # Identity: unique author ID, author name, alternative names
    "authorId", "name", "aliases",
    # Profile: list of affiliations, homepage URL, Semantic Scholar profile URL
    "affiliations", "homepage", "url",
    # Metrics: number of papers, total citations, h-index
    "paperCount", "citationCount", "hIndex",
]

# CITATION/REFERENCE FIELDS (for citations/references endpoints)
CITATION_REFERENCE_FIELDS = [
    # Citation contexts (snippets), citation intents (background, methodology,
    # etc.), and whether this is an influential citation
    "contexts", "intents", "isInfluential",
    # The citing paper / the cited paper (each with paper fields)
    "citingPaper", "citedPaper",
]

# Example: Request specific nested fields
# NOTE(review): the dotted names below (journal.*, externalIds.*,
# openAccessPdf.url, tldr.text, embedding.vector) have not been confirmed
# against the API from this notebook's saved output. Verify that the `fields`
# parameter accepts each of them.
DETAILED_PAPER_FIELDS = [
    # Core metadata
    "paperId", "title", "abstract", "year",
    # Citation metrics
    "citationCount", "influentialCitationCount",
    # Classification
    "fieldsOfStudy", "s2FieldsOfStudy",
    # Publication info
    "publicationVenue", "publicationTypes", "publicationDate",
    # Authors and selected author attributes
    "authors", "authors.authorId", "authors.name", "authors.affiliations",
    # Journal attributes
    "journal.name", "journal.volume", "journal.pages",
    # External identifiers
    "externalIds.DOI", "externalIds.ArXiv", "externalIds.PubMed",
    # Open access
    "isOpenAccess", "openAccessPdf.url",
    # Summary text and embedding
    "tldr.text",
    "embedding.vector",  # Note: This returns a 768-dimensional vector
]

# Define fields you want to retrieve
# This list is comma-joined into the `fields` query parameter by the request
# helpers in this notebook.
PAPER_FIELDS = [
    "paperId", "title", "abstract", "year",
    "authors",
    "citationCount", "referenceCount",
    "publicationDate", "journal", "fieldsOfStudy",
    "url", "externalIds",
]

In [8]:
# 1. SEARCH FOR PAPERS BY QUERY
def search_papers(query: str, limit: int = 10, offset: int = 0,
                  timeout: float = 30.0) -> Optional[Dict]:
    """
    Search for papers using a query string.

    Args:
        query: Search query (e.g., "machine learning")
        limit: Number of results to return (max 100)
        offset: Pagination offset
        timeout: Seconds to wait for the server before giving up

    Returns:
        Dictionary parsed from the JSON response body, or None when the
        request fails (network error, timeout, or a non-200 status code).
        Failures are reported with print() rather than raised.
    """
    url = f"{BASE_URL}/paper/search"
    params = {
        "query": query,
        "limit": limit,
        "offset": offset,
        "fields": ",".join(PAPER_FIELDS)
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    # Connection problems are reported the same way as HTTP errors so callers
    # only have to check for None.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return None

    return response.json()

# Example: Search for papers about "artificial intelligence in healthcare"
results = search_papers("artificial intelligence in healthcare", limit=5)

# search_papers returns None on failure, so only report when a payload came back.
if results:
    print(f"Total results: {results.get('total', 0)}")
    print(f"\nFirst {len(results.get('data', []))} papers:\n")

    # One numbered entry per paper, followed by a blank separator line.
    for position, paper in enumerate(results.get('data', []), start=1):
        print(
            f"{position}. {paper.get('title')}",
            f"   Year: {paper.get('year')}",
            f"   Citations: {paper.get('citationCount')}",
            f"   Paper ID: {paper.get('paperId')}",
            "",
            sep="\n",
        )


Total results: 31628

First 5 papers:

1. Artificial intelligence in healthcare: past, present and future
   Year: 2017
   Citations: 3335
   Paper ID: 10f919b1a5161b560504c225cfb2d1b3a4768f80

2. The potential for artificial intelligence in healthcare
   Year: 2019
   Citations: 2559
   Paper ID: ddf4172cad889f178c2db9b1b6302b3c7d5c0147

3. Explainability for artificial intelligence in healthcare: a multidisciplinary perspective
   Year: 2020
   Citations: 1300
   Paper ID: d7f1a885e32faa2194ccd5f85da4c4fb5d788392

4. Evolution of artificial intelligence in healthcare: a 30-year bibliometric study
   Year: 2025
   Citations: 36
   Paper ID: c198ce9c77b5b1afcbcf31fc6748114d332cf924

5. Artificial intelligence in healthcare: transforming the practice of medicine
   Year: 2021
   Citations: 1053
   Paper ID: 2b6d375d8abea91d46894ebfa7051077253834d5



In [None]:
# 2. GET PAPER DETAILS BY ID
def get_paper_details(paper_id: str, timeout: float = 30.0) -> Optional[Dict]:
    """
    Get detailed information about a specific paper.

    Args:
        paper_id: Semantic Scholar paper ID or DOI/ArXiv ID
        timeout: Seconds to wait for the server before giving up

    Returns:
        Dictionary with paper details (the fields listed in PAPER_FIELDS are
        requested), or None when the request fails (network error, timeout,
        or a non-200 status code). Failures are reported with print().
    """
    url = f"{BASE_URL}/paper/{paper_id}"
    params = {
        "fields": ",".join(PAPER_FIELDS)
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        # Show the response body as search_papers does; the status code alone
        # rarely explains what was wrong with the request.
        print(response.text)
        return None

    return response.json()

# Example: Get details for a specific paper
paper_id = "649def34f8be52c8b66281af98ae884c09aef38b"  # Example ID
paper = get_paper_details(paper_id)

if paper:
    print(f"Title: {paper.get('title')}")
    # dict.get's default only applies when the key is missing. If the key is
    # present with a None value, slicing it would raise TypeError, so fall
    # back with `or`.
    print(f"Abstract: {(paper.get('abstract') or 'N/A')[:200]}...")
    print(f"Year: {paper.get('year')}")
    print(f"Citations: {paper.get('citationCount')}")

# Example: Get paper with all important fields
def get_paper_full_details(paper_id: str, timeout: float = 30.0) -> Optional[Dict]:
    """
    Get comprehensive paper details.

    Same request as a plain paper lookup, but asks for the larger
    DETAILED_PAPER_FIELDS selection.

    Args:
        paper_id: Paper identifier placed in the /paper/{paper_id} URL path
        timeout: Seconds to wait for the server before giving up

    Returns:
        Dictionary parsed from the JSON response body, or None when the
        request fails (network error, timeout, or a non-200 status code).
        Failures are reported with print().
    """
    url = f"{BASE_URL}/paper/{paper_id}"
    params = {
        "fields": ",".join(DETAILED_PAPER_FIELDS)
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        # The response body is printed too, so a rejected request (for
        # example an unsupported field name) can be diagnosed from the output.
        print(response.text)
        return None

    return response.json()

# Test with a paper
paper = get_paper_full_details("649def34f8be52c8b66281af98ae884c09aef38b")

if paper:
    # dict.get(key, default) only uses the default when the key is missing.
    # A key that is present with a None value would make the chained
    # .get()/slice calls below raise, so nested lookups fall back with `or`.
    print(f"Title: {paper.get('title')}")
    print(f"Year: {paper.get('year')}")
    print(f"Citations: {paper.get('citationCount')}")
    print(f"Influential Citations: {paper.get('influentialCitationCount')}")
    print(f"\nFields of Study: {paper.get('fieldsOfStudy')}")
    print("\nAuthors:")
    # Only the first three authors are shown to keep the output short.
    for author in (paper.get('authors') or [])[:3]:
        print(f"  - {author.get('name')} (ID: {author.get('authorId')})")
        print(f"    Affiliations: {author.get('affiliations', [])}")

    print(f"\nJournal: {(paper.get('journal') or {}).get('name', 'N/A')}")
    print(f"DOI: {(paper.get('externalIds') or {}).get('DOI', 'N/A')}")
    print(f"ArXiv: {(paper.get('externalIds') or {}).get('ArXiv', 'N/A')}")

    if paper.get('isOpenAccess'):
        print(f"\nOpen Access PDF: {(paper.get('openAccessPdf') or {}).get('url', 'N/A')}")

    if paper.get('tldr'):
        print(f"\nTL;DR: {paper.get('tldr', {}).get('text', 'N/A')}")


Title: Construction of the Literature Graph in Semantic Scholar
Abstract: We describe a deployed scalable system for organizing published scientific literature into a heterogeneous graph to facilitate algorithmic manipulation and discovery. The resulting literature graph co...
Year: 2018
Citations: 423


In [5]:
# 3. GET PAPER CITATIONS
def get_paper_citations(paper_id: str, limit: int = 100,
                        timeout: float = 30.0) -> List[Dict]:
    """
    Get papers that cite a specific paper.

    Args:
        paper_id: Semantic Scholar paper ID
        limit: Number of citations to retrieve
        timeout: Seconds to wait for the server before giving up

    Returns:
        The list stored under the response's 'data' key (empty if that key
        is absent). An empty list is also returned when the request fails
        (network error, timeout, or a non-200 status code); failures are
        reported with print().
    """
    url = f"{BASE_URL}/paper/{paper_id}/citations"
    params = {
        "limit": limit,
        "fields": ",".join(PAPER_FIELDS)
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    # Connection problems follow the same "print and return []" path as HTTP
    # errors so callers can always iterate over the result.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return []

    return response.json().get('data', [])

# Example: Get citations for a paper
# Relies on `paper_id` having been assigned by an earlier cell.
citations = get_paper_citations(paper_id, limit=10)
print(f"Found {len(citations)} citations")

# Show the first five entries; each one wraps the citing paper under 'citingPaper'.
for position, citation in enumerate(citations[:5], start=1):
    citing_paper = citation.get('citingPaper', {})
    print(
        f"{position}. {citing_paper.get('title')}",
        f"   Year: {citing_paper.get('year')}",
        "",
        sep="\n",
    )


Found 10 citations
1. Generating Literature-Driven Scientific Theories at Scale
   Year: 2026

2. SciLaD: A Large-Scale, Transparent, Reproducible Dataset for Natural Scientific Language Processing
   Year: 2025

3. A dataset of Curie and NÃ©el temperatures auto-generated with ChemDataExtractor and the Snowball algorithm
   Year: 2025

4. Encoder Fine-tuning with Stochastic Sampling Outperforms Open-weight GPT in Astronomy Knowledge Extraction
   Year: 2025

5. FusionTRIZ Framework for Cross-Disciplinary Innovation: A Neuromorphic Imaging Application
   Year: 2025



In [9]:
# 4. GET PAPER REFERENCES
def get_paper_references(paper_id: str, limit: int = 100,
                         timeout: float = 30.0) -> List[Dict]:
    """
    Get papers referenced by a specific paper.

    Args:
        paper_id: Semantic Scholar paper ID
        limit: Number of references to retrieve
        timeout: Seconds to wait for the server before giving up

    Returns:
        The list stored under the response's 'data' key (empty if that key
        is absent). An empty list is also returned when the request fails
        (network error, timeout, or a non-200 status code); failures are
        reported with print().
    """
    url = f"{BASE_URL}/paper/{paper_id}/references"
    params = {
        "limit": limit,
        "fields": ",".join(PAPER_FIELDS)
    }

    # Without a timeout requests waits indefinitely on a stalled connection.
    # Connection problems follow the same "print and return []" path as HTTP
    # errors so callers can always iterate over the result.
    try:
        response = requests.get(url, headers=headers, params=params, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: request failed ({exc})")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return []

    return response.json().get('data', [])

# Example: Get references
# Relies on `paper_id` having been assigned by an earlier cell.
references = get_paper_references(paper_id, limit=10)
print(f"Found {len(references)} references")

# Show the first five entries; each one wraps the referenced paper under 'citedPaper'.
for position, ref in enumerate(references[:5], start=1):
    cited_paper = ref.get('citedPaper', {})
    print(
        f"{position}. {cited_paper.get('title')}",
        f"   Year: {cited_paper.get('year')}",
        "",
        sep="\n",
    )


Found 10 references
1. Extracting Scientific Figures with Distantly Supervised Neural Networks
   Year: 2018

2. Content-Based Citation Recommendation
   Year: 2018

3. The AI2 system at SemEval-2017 Task 10 (ScienceIE): semi-supervised end-to-end entity and relation extraction
   Year: 2017

4. Learning to Predict Citation-Based Impact Measures
   Year: 2017

5. Learning a Neural Semantic Parser from User Feedback
   Year: 2017

