In [7]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def init_webdriver():
    """Initialize Selenium WebDriver"""
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")  # Run in headless mode for automation
    options.add_argument("--disable-blink-features=AutomationControlled")  # Prevent bot detection
    return webdriver.Chrome(options=options)

def get_paper_url(doi):
    """Convert DOI to its direct publisher link"""
    return f"https://doi.org/{doi}"

def navigate_to_paper(driver, doi):
    """Navigate to the paper's webpage using DOI"""
    paper_url = get_paper_url(doi)
    driver.get(paper_url)
    time.sleep(3)  # Let the redirect complete
    
    # Get the final redirected URL (actual publisher page)
    final_url = driver.current_url
    print(f"Redirected to: {final_url}")
    
    return final_url

def extract_metadata(driver):
    """Extract metadata like title, authors, and abstract from a paper page"""
    metadata = {"Title": None, "Authors": None, "Abstract": None}

    try:
        title_elem = driver.find_element(By.TAG_NAME, "h1")  # Many publishers use h1 for titles
        metadata["Title"] = title_elem.text
    except:
        print("Title not found")

    try:
        authors = driver.find_elements(By.CSS_SELECTOR, "span[class*='author']")
        metadata["Authors"] = ", ".join([a.text for a in authors])
    except:
        print("Authors not found")

    try:
        abstract_elem = driver.find_element(By.CSS_SELECTOR, "div[class*='abstract']")
        metadata["Abstract"] = abstract_elem.text
    except:
        print("Abstract not found")

    return metadata

def extract_references(driver):
    """Extract references from the paper page"""
    references = []
    
    try:
        # Adjust selector to match publisher's website structure
        reference_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='doi']")
        
        for ref in reference_links:
            doi_link = ref.get_attribute("href")
            if "doi.org" in doi_link:
                references.append(doi_link.split("doi.org/")[-1])  # Extract DOI
    except Exception as e:
        print(f"Error extracting references: {e}")
    
    return references

def get_paper_neighborhood(driver, start_doi, depth=1):
    """Build an n-neighborhood of references recursively"""
    visited = set()
    queue = [(start_doi, 0, "Source")]
    results = []

    while queue:
        paper_doi, current_depth, relation = queue.pop(0)
        
        if paper_doi in visited:
            continue
        visited.add(paper_doi)

        print(f"\n🔍 Processing Paper: {paper_doi} | Depth: {current_depth}")

        # Navigate to paper page
        navigate_to_paper(driver, paper_doi)

        # Extract metadata
        metadata = extract_metadata(driver)
        metadata["DOI"] = paper_doi
        metadata["Relation"] = relation
        results.append(metadata)

        # Extract references and visit them if depth allows
        if current_depth < depth:
            references = extract_references(driver)
            for ref_doi in references:
                if ref_doi not in visited:
                    queue.append((ref_doi, current_depth + 1, "Reference"))

    return pd.DataFrame(results)

if __name__ == "__main__":
    driver = init_webdriver()
    try:
        test_doi = "10.1038/nphys1170"
        neighborhood_df = get_paper_neighborhood(driver, test_doi, depth=1)

        print("\n📄 Extracted Data:")
        print(neighborhood_df)

        # Save results
        neighborhood_df.to_csv("paper_neighborhood.csv", index=False)
        print("\n Data saved as 'paper_neighborhood.csv'")

    finally:
        driver.quit()



🔍 Processing Paper: 10.1038/nphys1170 | Depth: 0
Redirected to: https://www.nature.com/articles/nphys1170
Abstract not found

🔍 Processing Paper: 10.1038%2Fnphys1133 | Depth: 1
Redirected to: https://www.nature.com/articles/nphys1133
Abstract not found

🔍 Processing Paper: 10.1103%2FPhysRevA.64.052312 | Depth: 1
Redirected to: https://journals.aps.org/pra/abstract/10.1103/PhysRevA.64.052312
Abstract not found

🔍 Processing Paper: 10.1103%2FPhysRevA.55.R1561 | Depth: 1
Redirected to: https://journals.aps.org/pra/abstract/10.1103/PhysRevA.55.R1561
Abstract not found

🔍 Processing Paper: 10.1103%2FPhysRevA.61.010304 | Depth: 1
Redirected to: https://journals.aps.org/pra/abstract/10.1103/PhysRevA.61.010304
Abstract not found

🔍 Processing Paper: 10.1080%2F09500349708231894 | Depth: 1
Redirected to: https://www.tandfonline.com/doi/abs/10.1080/09500349708231894
Abstract not found

🔍 Processing Paper: 10.1103%2FPhysRevLett.78.390 | Depth: 1
Redirected to: https://journals.aps.org/prl/abstrac