<a href="https://colab.research.google.com/github/RushiBShinde/ThinkForge-IE-643-project/blob/main/Research_Paper_Scraper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import csv

def scrape_paper(url):
    """
    Scrapes a research paper page for its title, authors, abstract, and full text content.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup's
    'html.parser'. Every field is located with best-effort heuristics; a field
    that cannot be located is filled with a "... not found" placeholder string
    instead of failing the whole scrape.

    Args:
        url: The URL of the research paper.

    Returns:
        A dictionary with the keys 'title', 'authors', 'abstract', 'full_text'
        and 'url', or None if the page could not be fetched or parsed.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Without a timeout requests waits indefinitely, so a single
        # unresponsive server would stall the whole batch of links.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # --- Clean the soup by removing irrelevant tags ---
        # Navigation, page chrome, scripts and styles are not part of the paper
        # and would otherwise pollute the text extracted further down.
        for tag in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
            tag.decompose()

        # --- Find Paper Title ---
        # Prefer a visible h1 with class "title"; otherwise use the og:title meta tag.
        title_tag = soup.find('h1', {'class': 'title'}) or soup.find('meta', property='og:title')
        if title_tag is None:
            title = "Title not found"
        elif title_tag.has_attr('content'):
            # Meta tags carry their value in the "content" attribute.
            title = title_tag['content']
        else:
            title = title_tag.get_text(strip=True)

        # --- Find Authors ---
        # Pattern 1: citation_author meta tags. Tags without a "content"
        # attribute are skipped; indexing them would raise KeyError and the
        # broad handler below would then discard the entire paper.
        authors_list = [
            tag['content']
            for tag in soup.find_all('meta', {'name': 'citation_author'})
            if tag.has_attr('content')
        ]
        if not authors_list:
            # Pattern 2: links inside a div with class "authors" (site specific, may need adjustment)
            author_div = soup.find('div', class_='authors')
            if author_div:
                authors_list = [a.get_text(strip=True) for a in author_div.find_all('a')]

        authors = ", ".join(authors_list) if authors_list else "Authors not found"

        # --- Find Abstract ---
        # Look for an h2/h3 heading whose text contains "abstract", then take the
        # following abstract-content div or, failing that, the next paragraph.
        abstract_heading = soup.find(['h2', 'h3'], string=lambda t: t and 'abstract' in t.lower())
        abstract = "Abstract not found"
        if abstract_heading:
            abstract_tag = abstract_heading.find_next('div', {'class': 'abstract-content'}) or abstract_heading.find_next('p')
            if abstract_tag:
                abstract = abstract_tag.get_text(strip=True)

        # --- Find Full Text ---
        # Generic approach: use the first container that looks like the main
        # content. For more precision the container must be identified per site.
        full_text = "Full text not found"
        content_area = soup.find('article') or soup.find('div', id='content') or soup.find('div', class_='content') or soup.find('div', role='main')
        if content_area:
            full_text = content_area.get_text(separator='\n', strip=True)
        else:
            # Fallback to all text in the body, which is cleaner after the decomposition above
            body_tag = soup.find('body')
            if body_tag:
                full_text = body_tag.get_text(separator='\n', strip=True)

        return {
            'title': title,
            'authors': authors,
            'abstract': abstract,
            'full_text': full_text,
            'url': url
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        # Parsing is best effort: report the problem and let the caller skip this URL.
        print(f"An error occurred while scraping {url}: {e}")
        return None

def process_links_from_file(filepath):
    """
    Reads a file of URLs, scrapes each one, and returns the data.

    A file whose name ends in ".csv" (case-insensitive) is read with the csv
    module and the first column of every non-empty row is used as the URL.
    Any other file is read as plain text with one URL per line. Blank entries
    are skipped and surrounding whitespace is removed from each URL.

    Args:
        filepath: The path to the text or csv file with URLs.

    Returns:
        A list of dictionaries, where each dictionary is a scraped paper.
        URLs for which scrape_paper returns a falsy value are left out.
    """
    scraped_data = []
    with open(filepath, 'r') as f:
        if str(filepath).lower().endswith('.csv'):
            # csv.reader yields an empty list for a blank line; indexing it
            # with row[0] would raise IndexError, so such rows are skipped.
            urls = [row[0] for row in csv.reader(f) if row]
        else:
            urls = f.read().splitlines()

    for raw_url in urls:
        # Strip before use so stray spaces or a trailing "\r" never reach the scraper.
        url = raw_url.strip()
        if not url:
            continue  # Ensure the line is not empty
        print(f"Scraping: {url}")
        data = scrape_paper(url)
        if data:
            scraped_data.append(data)
    return scraped_data

if __name__ == '__main__':
    # Write a small sample links file so the cell can be run end to end.
    # Only the last entry has no trailing newline, matching a hand-written file.
    sample_links = (
        "https://arxiv.org/abs/1706.03762\n",  # Attention is All You Need
        "https://arxiv.org/abs/2106.07682\n",  # second arXiv abstract page
        "https://www.ijpsjournal.com/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation",  # journal article page
    )
    with open("paper_links.txt", "w") as links_file:
        links_file.writelines(sample_links)

    # 1. Scrape every link listed in the file
    papers = process_links_from_file('paper_links.txt')

    if papers:
        # 2. Persist the scraped records, one CSV row per paper
        output_filename = 'scraped_papers.csv'
        with open(output_filename, 'w', newline='', encoding='utf-8') as csv_out:
            csv_writer = csv.DictWriter(
                csv_out,
                fieldnames=['title', 'authors', 'abstract', 'full_text', 'url'],
            )
            csv_writer.writeheader()
            csv_writer.writerows(papers)

        print(f"\nScraping complete. Data saved to {output_filename}")
        print(f"Successfully scraped {len(papers)} papers.")




Scraping: https://arxiv.org/abs/1706.03762
Scraping: https://arxiv.org/abs/2106.07682
Scraping: https://www.ijpsjournal.com/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation

Scraping complete. Data saved to scraped_papers.csv
Successfully scraped 3 papers.


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import fitz  # PyMuPDF - install with 'pip install PyMuPDF'
from urllib.parse import urljoin

def extract_text_from_pdf(pdf_content):
    """
    Returns the concatenated text of every page of an in-memory PDF.

    Args:
        pdf_content: The byte content of the PDF file.

    Returns:
        The text of all pages joined in page order, or an empty string if the
        document cannot be opened or read (the error is printed).
    """
    try:
        # PyMuPDF can open a document directly from a byte buffer.
        with fitz.open(stream=pdf_content, filetype="pdf") as document:
            return "".join(page.get_text() for page in document)
    except Exception as error:
        print(f"  - Could not extract text from PDF: {error}")
        return ""

def scrape_paper(url):
    """
    Scrapes a research paper page for its title, authors, abstract, and full text content.
    It prioritizes finding and extracting text from a linked PDF.

    The page is downloaded with ``requests`` and parsed with BeautifulSoup's
    'html.parser'. Links that look like PDF downloads are tried in document
    order; the first one that yields text becomes the full text. If none
    does, the text of the page's main content area is used instead. Fields
    that cannot be located get a "... not found" placeholder string.

    Args:
        url: The URL of the research paper.

    Returns:
        A dictionary with the keys 'title', 'authors', 'abstract', 'full_text'
        and 'url', or None if the page could not be fetched or parsed.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # Without a timeout requests waits indefinitely, so a single
        # unresponsive server would stall the whole batch of links.
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes

        soup = BeautifulSoup(response.content, 'html.parser')

        # --- Clean the soup by removing irrelevant tags ---
        # Navigation, page chrome, scripts and styles are not part of the paper.
        for tag in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
            tag.decompose()

        # --- Find Paper Title ---
        # Prefer a visible h1 with class "title"; otherwise use the og:title meta tag.
        title_tag = soup.find('h1', {'class': 'title'}) or soup.find('meta', property='og:title')
        if title_tag is None:
            title = "Title not found"
        elif title_tag.has_attr('content'):
            # Meta tags carry their value in the "content" attribute.
            title = title_tag['content']
        else:
            title = title_tag.get_text(strip=True)

        # --- Find Authors ---
        # Pattern 1: citation_author meta tags. Tags without a "content"
        # attribute are skipped; indexing them would raise KeyError and the
        # broad handler below would then discard the entire paper.
        authors_list = [
            tag['content']
            for tag in soup.find_all('meta', {'name': 'citation_author'})
            if tag.has_attr('content')
        ]
        if not authors_list:
            # Pattern 2: links inside a div with class "authors" (site specific, may need adjustment)
            author_div = soup.find('div', class_='authors')
            if author_div:
                authors_list = [a.get_text(strip=True) for a in author_div.find_all('a')]

        authors = ", ".join(authors_list) if authors_list else "Authors not found"

        # --- Find Abstract ---
        # Look for an h2/h3 heading whose text contains "abstract", then take the
        # following abstract-content div or, failing that, the next paragraph.
        abstract_heading = soup.find(['h2', 'h3'], string=lambda t: t and 'abstract' in t.lower())
        abstract = "Abstract not found"
        if abstract_heading:
            abstract_tag = abstract_heading.find_next('div', {'class': 'abstract-content'}) or abstract_heading.find_next('p')
            if abstract_tag:
                abstract = abstract_tag.get_text(strip=True)

        full_text = "Full text not found"

        # --- Find and Process PDF ---
        # Try every anchor whose href ends in ".pdf" or whose text mentions "download pdf".
        pdf_links = soup.find_all('a', href=True)
        for link in pdf_links:
            href = link['href']
            if href.lower().endswith('.pdf') or 'download pdf' in link.get_text(strip=True).lower():
                # hrefs may be relative, so resolve them against the page URL
                pdf_url = urljoin(url, href)
                print(f"  - Found potential PDF link: {pdf_url}")
                try:
                    # Download the PDF
                    pdf_response = requests.get(pdf_url, headers=headers, timeout=10)
                    pdf_response.raise_for_status()

                    # Extract text from the downloaded PDF content
                    extracted_text = extract_text_from_pdf(pdf_response.content)
                    if extracted_text:
                        print("  - Successfully extracted text from PDF.")
                        full_text = extracted_text
                        break  # Stop after the first successfully processed PDF
                except requests.exceptions.RequestException as e:
                    print(f"  - Failed to download PDF from {pdf_url}: {e}")
                    continue  # Try the next link

        # --- Fallback to Full Text from HTML ---
        # This will only run if no PDF was found or processed successfully.
        if full_text == "Full text not found":
            print("  - No usable PDF found. Falling back to scraping HTML body.")
            content_area = soup.find('article') or soup.find('div', id='content') or soup.find('div', class_='content') or soup.find('div', role='main')
            if content_area:
                full_text = content_area.get_text(separator='\n', strip=True)
            else:
                # Fallback to all text in the body, which is cleaner after the decomposition above
                body_tag = soup.find('body')
                if body_tag:
                    full_text = body_tag.get_text(separator='\n', strip=True)

        return {
            'title': title,
            'authors': authors,
            'abstract': abstract,
            'full_text': full_text,
            'url': url
        }

    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL {url}: {e}")
        return None
    except Exception as e:
        # Parsing is best effort: report the problem and let the caller skip this URL.
        print(f"An error occurred while scraping {url}: {e}")
        return None

def process_links_from_file(filepath):
    """
    Reads a file of URLs, scrapes each one, and returns the data.

    A file whose name ends in ".csv" (case-insensitive) is read with the csv
    module and the first column of every non-empty row is used as the URL.
    Any other file is read as plain text with one URL per line. Blank entries
    are skipped and surrounding whitespace is removed from each URL.

    Args:
        filepath: The path to the text or csv file with URLs.

    Returns:
        A list of dictionaries, where each dictionary is a scraped paper.
        URLs for which scrape_paper returns a falsy value are left out.
    """
    scraped_data = []
    with open(filepath, 'r') as f:
        if str(filepath).lower().endswith('.csv'):
            # csv.reader yields an empty list for a blank line; indexing it
            # with row[0] would raise IndexError, so such rows are skipped.
            urls = [row[0] for row in csv.reader(f) if row]
        else:
            urls = f.read().splitlines()

    for raw_url in urls:
        # Strip before use so stray spaces or a trailing "\r" never reach the scraper.
        url = raw_url.strip()
        if not url:
            continue  # Ensure the line is not empty
        print(f"Scraping: {url}")
        data = scrape_paper(url)
        if data:
            scraped_data.append(data)
    return scraped_data

if __name__ == '__main__':
    # Write a small sample links file so the cell can be run end to end.
    # Only the last entry has no trailing newline, matching a hand-written file.
    sample_links = (
        "https://arxiv.org/abs/1706.03762\n",  # Attention is All You Need
        "https://arxiv.org/abs/2106.07682\n",  # second arXiv abstract page
        "https://www.ijpsjournal.com/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation",  # journal article page
    )
    with open("paper_links.txt", "w") as links_file:
        links_file.writelines(sample_links)

    # 1. Scrape every link listed in the file
    papers = process_links_from_file('paper_links.txt')

    if papers:
        # 2. Persist the scraped records, one CSV row per paper
        output_filename = 'scraped_papers.csv'
        with open(output_filename, 'w', newline='', encoding='utf-8') as csv_out:
            csv_writer = csv.DictWriter(
                csv_out,
                fieldnames=['title', 'authors', 'abstract', 'full_text', 'url'],
            )
            csv_writer.writeheader()
            csv_writer.writerows(papers)

        print(f"\nScraping complete. Data saved to {output_filename}")
        print(f"Successfully scraped {len(papers)} papers.")



Scraping: https://arxiv.org/abs/1706.03762
  - No usable PDF found. Falling back to scraping HTML body.
Scraping: https://arxiv.org/abs/2106.07682
  - No usable PDF found. Falling back to scraping HTML body.
Scraping: https://www.ijpsjournal.com/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation
  - Found potential PDF link: https://www.ijpsjournal.com/assetsbackoffice/uploads/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation.pdf
  - Successfully extracted text from PDF.

Scraping complete. Data saved to scraped_papers.csv
Successfully scraped 3 papers.


The code above scrapes PDFs from unprotected sites, but some sites employ anti-scraping measures.

In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import fitz  # PyMuPDF - install with 'pip install PyMuPDF'
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
import time

def get_page_source_with_selenium(url):
    """
    Fetches the page source of a URL using a headless Selenium browser
    to bypass anti-scraping measures. CONFIGURED FOR GOOGLE COLAB.

    A fresh Chrome instance is started for every call and is always shut
    down again before returning, on success as well as on failure.

    Args:
        url: The URL to fetch.

    Returns:
        The page's HTML source as a string, or None if it fails.
    """
    chrome_options = Options()
    # Add options to run in a Colab environment
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")

    # Initialised up front so the cleanup below can tell whether a browser was started.
    driver = None
    try:
        # The service object is not needed when running in Colab this way
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)
        # Wait for dynamically loaded content
        time.sleep(3)
        return driver.page_source
    except WebDriverException as e:
        print(f"  - Selenium error: {e}")
        return None
    except Exception as e:
        print(f"  - An unexpected error occurred with Selenium: {e}")
        return None
    finally:
        # Quit on every path, including WebDriverException, so failed
        # fetches do not leave orphaned Chrome processes behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"  - Could not shut down the Selenium driver cleanly: {e}")


def extract_text_from_pdf(pdf_content):
    """
    Returns the concatenated text of every page of an in-memory PDF.

    Args:
        pdf_content: The byte content of the PDF file.

    Returns:
        The text of all pages joined in page order, or an empty string if the
        document cannot be opened or read (the error is printed).
    """
    try:
        # PyMuPDF can open a document directly from a byte buffer.
        with fitz.open(stream=pdf_content, filetype="pdf") as document:
            return "".join(page.get_text() for page in document)
    except Exception as error:
        print(f"  - Could not extract text from PDF: {error}")
        return ""

def scrape_paper(url):
    """
    Scrapes a research paper page for its title, authors, abstract, and full text content.
    It prioritizes finding and extracting text from a linked PDF.

    The page HTML comes from get_page_source_with_selenium and is parsed with
    BeautifulSoup's 'html.parser'. The full text is taken from the first
    source that yields text: the citation_pdf_url meta tag, then PDF-looking
    links on the page, then the page's main content area. Fields that cannot
    be located get a "... not found" placeholder string.

    Args:
        url: The URL of the research paper.

    Returns:
        A dictionary with the keys 'title', 'authors', 'abstract', 'full_text'
        and 'url', or None if the page could not be retrieved or parsed.
    """
    # Headers for the direct PDF downloads made with requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }

    page_source = get_page_source_with_selenium(url)
    if not page_source:
        print(f"Error: Could not retrieve page source for {url}")
        return None

    try:
        soup = BeautifulSoup(page_source, 'html.parser')

        # --- Clean the soup by removing irrelevant tags ---
        for tag in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
            tag.decompose()

        # --- Find Paper Title ---
        # Prefer a visible h1 with class "title"; otherwise use the og:title meta tag.
        title_tag = soup.find('h1', {'class': 'title'}) or soup.find('meta', property='og:title')
        if title_tag is None:
            title = "Title not found"
        elif title_tag.has_attr('content'):
            # Meta tags carry their value in the "content" attribute.
            title = title_tag['content']
        else:
            title = title_tag.get_text(strip=True)

        # --- Find Authors ---
        # citation_author meta tags first. Tags without a "content" attribute
        # are skipped; indexing them would raise KeyError and the broad
        # handler below would then discard the entire paper.
        authors_list = [
            tag['content']
            for tag in soup.find_all('meta', {'name': 'citation_author'})
            if tag.has_attr('content')
        ]
        if not authors_list:
            author_div = soup.find('div', class_='authors')
            if author_div:
                authors_list = [a.get_text(strip=True) for a in author_div.find_all('a')]

        authors = ", ".join(authors_list) if authors_list else "Authors not found"

        # --- Find Abstract ---
        abstract_heading = soup.find(['h2', 'h3'], string=lambda t: t and 'abstract' in t.lower())
        abstract = "Abstract not found"
        if abstract_heading:
            abstract_tag = abstract_heading.find_next('div', {'class': 'abstract-content'}) or abstract_heading.find_next('p')
            if abstract_tag:
                abstract = abstract_tag.get_text(strip=True)

        full_text = "Full text not found"

        # --- Find and Process PDF ---
        pdf_meta_tag = soup.find('meta', {'name': 'citation_pdf_url'})
        if pdf_meta_tag and pdf_meta_tag.get('content'):
            # urljoin leaves absolute URLs unchanged and resolves relative
            # ones against the page URL, which requests could not fetch as is.
            pdf_url_from_meta = urljoin(url, pdf_meta_tag['content'])
            print(f"  - Found PDF meta tag: {pdf_url_from_meta}")
            try:
                pdf_response = requests.get(pdf_url_from_meta, headers=headers, timeout=15)
                pdf_response.raise_for_status()
                extracted_text = extract_text_from_pdf(pdf_response.content)
                if extracted_text:
                    print("  - Successfully extracted text from PDF via meta tag.")
                    full_text = extracted_text
            except requests.exceptions.RequestException as e:
                print(f"  - Failed to download PDF from meta tag URL {pdf_url_from_meta}: {e}")

        if full_text == "Full text not found":
            print("  - No usable PDF found via meta tag. Searching for links on page.")
            pdf_links = soup.find_all('a', href=True)
            for link in pdf_links:
                href = link['href']
                if href.lower().endswith('.pdf') or 'download pdf' in link.get_text(strip=True).lower():
                    pdf_url = urljoin(url, href)
                    print(f"  - Found potential PDF link: {pdf_url}")
                    try:
                        pdf_response = requests.get(pdf_url, headers=headers, timeout=10)
                        pdf_response.raise_for_status()
                        extracted_text = extract_text_from_pdf(pdf_response.content)
                        if extracted_text:
                            print("  - Successfully extracted text from PDF.")
                            full_text = extracted_text
                            break  # Stop after the first successfully processed PDF
                    except requests.exceptions.RequestException as e:
                        print(f"  - Failed to download PDF from {pdf_url}: {e}")
                        continue

        # Last resort: text of the main content container, or of the whole body.
        if full_text == "Full text not found":
            print("  - No usable PDF found. Falling back to scraping HTML body.")
            content_area = soup.find('article') or soup.find('div', id='content') or soup.find('div', class_='content') or soup.find('div', role='main')
            if content_area:
                full_text = content_area.get_text(separator='\n', strip=True)
            else:
                body_tag = soup.find('body')
                if body_tag:
                    full_text = body_tag.get_text(separator='\n', strip=True)

        return {
            'title': title,
            'authors': authors,
            'abstract': abstract,
            'full_text': full_text,
            'url': url
        }

    except Exception as e:
        # Parsing is best effort: report the problem and let the caller skip this URL.
        print(f"An error occurred while parsing {url}: {e}")
        return None

def process_links_from_file(filepath):
    """
    Reads a file of URLs, scrapes each one, and returns the data.

    A file whose name ends in ".csv" (case-insensitive) is read with the csv
    module and the first column of every non-empty row is used as the URL.
    Any other file is read as plain text with one URL per line. Blank entries
    are skipped and surrounding whitespace is removed from each URL.

    Args:
        filepath: The path to the text or csv file with URLs. Both strings
            and path objects such as pathlib.Path are accepted.

    Returns:
        A list of dictionaries, where each dictionary is a scraped paper.
        URLs for which scrape_paper returns a falsy value are left out.
        An empty list is returned if the file does not exist.
    """
    scraped_data = []
    # Test the suffix only: a substring check would also treat names such as
    # "links.csv.txt" or a ".csv" directory as CSV. str() keeps path objects working.
    is_csv = str(filepath).lower().endswith('.csv')
    try:
        with open(filepath, 'r') as f:
            # Read either as plain text or from the first column of a CSV
            if is_csv:
                urls = [row[0] for row in csv.reader(f) if row]
            else:
                urls = f.read().splitlines()
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return []

    for raw_url in urls:
        # Strip before use so stray spaces or a trailing "\r" never reach the scraper.
        url = raw_url.strip()
        if not url:
            continue  # Ensure the line is not empty
        print(f"Scraping: {url}")
        data = scrape_paper(url)
        if data:
            scraped_data.append(data)
    return scraped_data

if __name__ == '__main__':
    # Write a small sample links file so the cell can be run end to end.
    # Only the last entry has no trailing newline, matching a hand-written file.
    sample_links = (
        "https://arxiv.org/abs/1706.03762\n",  # Attention is All You Need
        "https://www.sciencedirect.com/science/article/pii/S2213846323001116\n",  # ScienceDirect Paper
        "https://www.ijpsjournal.com/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation",  # journal article page
    )
    with open("paper_links.txt", "w") as links_file:
        links_file.writelines(sample_links)

    # 1. Scrape every link listed in the file
    papers = process_links_from_file('paper_links.txt')

    if papers:
        # 2. Persist the scraped records, one CSV row per paper
        output_filename = 'scraped_papers.csv'
        with open(output_filename, 'w', newline='', encoding='utf-8') as csv_out:
            csv_writer = csv.DictWriter(
                csv_out,
                fieldnames=['title', 'authors', 'abstract', 'full_text', 'url'],
            )
            csv_writer.writeheader()
            csv_writer.writerows(papers)

        print(f"\nScraping complete. Data saved to {output_filename}")
        print(f"Successfully scraped {len(papers)} papers.")


Scraping: https://arxiv.org/abs/1706.03762
  - Found PDF meta tag: http://arxiv.org/pdf/1706.03762
  - Successfully extracted text from PDF via meta tag.
Scraping: https://www.sciencedirect.com/science/article/pii/S2213846323001116
  - No usable PDF found via meta tag. Searching for links on page.
  - No usable PDF found. Falling back to scraping HTML body.
Scraping: https://www.ijpsjournal.com/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation
  - Found PDF meta tag: https://www.ijpsjournal.com/assetsbackoffice/uploads/article/Polypharmacy+A+Review+of+Adverse+Drug+Reaction+Interaction+and+Mitigation.pdf
  - Successfully extracted text from PDF via meta tag.

Scraping complete. Data saved to scraped_papers.csv
Successfully scraped 3 papers.


Try


In [None]:
import requests
from bs4 import BeautifulSoup
import csv
import fitz  # PyMuPDF - install with 'pip install PyMuPDF'
from urllib.parse import urljoin
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import WebDriverException
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def get_page_source_with_selenium(url):
    """
    Fetches the page source of a URL using a headless Selenium browser
    to bypass anti-scraping measures. CONFIGURED FOR GOOGLE COLAB.

    A fresh Chrome instance is started for every call and is always shut
    down again before returning, on success as well as on failure.

    Args:
        url: The URL to fetch.

    Returns:
        The page's HTML source as a string, or None if it fails.
    """
    chrome_options = Options()
    # Add options to run in a Colab environment
    chrome_options.add_argument('--headless')
    chrome_options.add_argument('--no-sandbox')
    chrome_options.add_argument('--disable-dev-shm-usage')
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36")

    driver = None
    try:
        # The service object is not needed when running in Colab this way
        driver = webdriver.Chrome(options=chrome_options)
        driver.get(url)

        # Use an explicit wait instead of a fixed sleep: wait up to 20 seconds
        # for the document's body element. The locator must be the TAG name;
        # By.ID "body" looks for an element with id="body", which most pages
        # do not have, so the wait would time out and every fetch would fail.
        # NOTE: content injected later by JavaScript may need a site-specific locator.
        wait = WebDriverWait(driver, 20)
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))

        return driver.page_source
    except WebDriverException as e:
        print(f"  - Selenium error: {e}")
        return None
    except Exception as e:
        print(f"  - An unexpected error occurred with Selenium: {e}")
        return None
    finally:
        # Single cleanup point for every path, so no Chrome process is left behind.
        if driver is not None:
            try:
                driver.quit()
            except Exception as e:
                print(f"  - Could not shut down the Selenium driver cleanly: {e}")


def extract_text_from_pdf(pdf_content):
    """
    Returns the concatenated text of every page of an in-memory PDF.

    Args:
        pdf_content: The byte content of the PDF file.

    Returns:
        The text of all pages joined in page order, or an empty string if the
        document cannot be opened or read (the error is printed).
    """
    try:
        # PyMuPDF can open a document directly from a byte buffer.
        with fitz.open(stream=pdf_content, filetype="pdf") as document:
            return "".join(page.get_text() for page in document)
    except Exception as error:
        print(f"  - Could not extract text from PDF: {error}")
        return ""

def scrape_paper(url):
    """
    Scrapes a research paper page for its title, authors, abstract, and full text content.
    It prioritizes finding and extracting text from a linked PDF.

    The page HTML comes from get_page_source_with_selenium and is parsed with
    BeautifulSoup's 'html.parser'. The full text is taken from the first
    source that yields text: the citation_pdf_url meta tag, an anchor with
    id "pdfLink", any PDF/download-looking link on the page, and finally the
    page's main content area. Fields that cannot be located get a
    "... not found" placeholder string.

    Args:
        url: The URL of the research paper.

    Returns:
        A dictionary with the keys 'title', 'authors', 'abstract', 'full_text'
        and 'url', or None if the page could not be retrieved or parsed.
    """
    # Headers for the direct PDF downloads made with requests.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    }

    page_source = get_page_source_with_selenium(url)
    if not page_source:
        print(f"Error: Could not retrieve page source for {url}")
        return None

    try:
        soup = BeautifulSoup(page_source, 'html.parser')

        # --- Clean the soup by removing irrelevant tags ---
        for tag in soup(['nav', 'header', 'footer', 'aside', 'script', 'style']):
            tag.decompose()

        # --- Find Paper Title ---
        # Prefer a visible h1 with class "title"; otherwise use the og:title meta tag.
        title_tag = soup.find('h1', {'class': 'title'}) or soup.find('meta', property='og:title')
        if title_tag is None:
            title = "Title not found"
        elif title_tag.has_attr('content'):
            # Meta tags carry their value in the "content" attribute.
            title = title_tag['content']
        else:
            title = title_tag.get_text(strip=True)

        # --- Find Authors ---
        # citation_author meta tags first. Tags without a "content" attribute
        # are skipped; indexing them would raise KeyError and the broad
        # handler below would then discard the entire paper.
        authors_list = [
            tag['content']
            for tag in soup.find_all('meta', {'name': 'citation_author'})
            if tag.has_attr('content')
        ]
        if not authors_list:
            author_div = soup.find('div', class_='authors')
            if author_div:
                authors_list = [a.get_text(strip=True) for a in author_div.find_all('a')]

        authors = ", ".join(authors_list) if authors_list else "Authors not found"

        # --- Find Abstract ---
        abstract_heading = soup.find(['h2', 'h3'], string=lambda t: t and 'abstract' in t.lower())
        abstract = "Abstract not found"
        if abstract_heading:
            abstract_tag = abstract_heading.find_next('div', {'class': 'abstract-content'}) or abstract_heading.find_next('p')
            if abstract_tag:
                abstract = abstract_tag.get_text(strip=True)

        full_text = "Full text not found"

        # --- Find and Process PDF ---
        # Method 1: Look for the citation_pdf_url meta tag (often the best source)
        pdf_meta_tag = soup.find('meta', {'name': 'citation_pdf_url'})
        if pdf_meta_tag and pdf_meta_tag.get('content'):
            # urljoin leaves absolute URLs unchanged and resolves relative
            # ones against the page URL, which requests could not fetch as is.
            pdf_url_from_meta = urljoin(url, pdf_meta_tag['content'])
            print(f"  - Found PDF meta tag: {pdf_url_from_meta}")
            try:
                pdf_response = requests.get(pdf_url_from_meta, headers=headers, timeout=15)
                pdf_response.raise_for_status()
                extracted_text = extract_text_from_pdf(pdf_response.content)
                if extracted_text:
                    print("  - Successfully extracted text from PDF via meta tag.")
                    full_text = extracted_text
            except requests.exceptions.RequestException as e:
                print(f"  - Failed to download PDF from meta tag URL {pdf_url_from_meta}: {e}")

        # Method 2: Specifically look for ScienceDirect's PDF button if the first method fails
        if full_text == "Full text not found":
            pdf_button = soup.find('a', id='pdfLink')
            if pdf_button and pdf_button.has_attr('href'):
                pdf_url = urljoin(url, pdf_button['href'])
                print(f"  - Found specific PDF button (e.g., ScienceDirect): {pdf_url}")
                try:
                    pdf_response = requests.get(pdf_url, headers=headers, timeout=15)
                    pdf_response.raise_for_status()
                    # Only parse the response if the server says it is a PDF.
                    if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                        extracted_text = extract_text_from_pdf(pdf_response.content)
                        if extracted_text:
                            print("  - Successfully extracted text from PDF button link.")
                            full_text = extracted_text
                except requests.exceptions.RequestException as e:
                    print(f"  - Failed to download PDF from button link {pdf_url}: {e}")

        # Method 3: Fallback to searching all links on the page
        if full_text == "Full text not found":
            print("  - No usable PDF found yet. Searching all links on page as a fallback.")
            pdf_links = soup.find_all('a', href=True)
            # Candidate URLs already requested, so repeated links are fetched only once.
            processed_urls = set()

            for link in pdf_links:
                href = link['href']
                pdf_url = urljoin(url, href)
                if pdf_url in processed_urls:
                    continue

                link_text = link.get_text(strip=True).lower()

                is_pdf_link = (
                    href.lower().endswith('.pdf') or
                    'download' in link_text or
                    'view pdf' in link_text
                )

                if is_pdf_link:
                    processed_urls.add(pdf_url)
                    print(f"  - Found potential PDF link: {pdf_url}")
                    try:
                        pdf_response = requests.get(pdf_url, headers=headers, timeout=10)
                        pdf_response.raise_for_status()
                        # The text match above is loose, so confirm the type before parsing.
                        if 'application/pdf' in pdf_response.headers.get('Content-Type', ''):
                            extracted_text = extract_text_from_pdf(pdf_response.content)
                            if extracted_text:
                                print("  - Successfully extracted text from PDF.")
                                full_text = extracted_text
                                break
                        else:
                            print(f"  - Link did not lead to a PDF. Content-Type: {pdf_response.headers.get('Content-Type')}")

                    except requests.exceptions.RequestException as e:
                        print(f"  - Failed to download PDF from {pdf_url}: {e}")
                        continue

        # Method 4: Final fallback to scraping HTML body text
        if full_text == "Full text not found":
            print("  - No usable PDF found. Falling back to scraping HTML body.")
            content_area = soup.find('article') or soup.find('div', id='content') or soup.find('div', class_='content') or soup.find('div', role='main')
            if content_area:
                full_text = content_area.get_text(separator='\n', strip=True)
            else:
                body_tag = soup.find('body')
                if body_tag:
                    full_text = body_tag.get_text(separator='\n', strip=True)

        return {
            'title': title,
            'authors': authors,
            'abstract': abstract,
            'full_text': full_text,
            'url': url
        }

    except Exception as e:
        # Parsing is best effort: report the problem and let the caller skip this URL.
        print(f"An error occurred while parsing {url}: {e}")
        return None

def process_links_from_file(filepath):
    """
    Reads a file of URLs, scrapes each one, and returns the data.

    A path ending in ".csv" (case-insensitive) is parsed with the csv module
    and the first column of every non-empty row is taken as the URL. Any
    other file is treated as plain text with one URL per line. Surrounding
    whitespace is stripped from each entry and blank entries are skipped.

    Args:
        filepath: The path to the text or csv file with URLs.

    Returns:
        A list of dictionaries, where each dictionary is a scraped paper.
        URLs for which scrape_paper returns a falsy value are omitted.
        An empty list is returned if the file does not exist.
    """
    # Decide by the actual extension: a substring test would also match names
    # such as "links.csv.bak" or a directory called "data.csv_dir".
    is_csv = filepath.lower().endswith('.csv')

    try:
        # utf-8-sig drops a leading BOM that would otherwise be glued onto the
        # first URL; newline='' lets the csv module handle line endings itself
        # (str.splitlines copes with "\r\n" in the plain-text branch).
        with open(filepath, 'r', encoding='utf-8-sig', newline='') as f:
            if is_csv:
                candidates = [row[0] for row in csv.reader(f) if row]
            else:
                candidates = f.read().splitlines()
    except FileNotFoundError:
        print(f"Error: The file '{filepath}' was not found.")
        return []

    scraped_data = []
    for raw_url in candidates:
        # Scrape the stripped value, not the raw line, so stray spaces or tabs
        # around a URL do not end up in the request.
        url = raw_url.strip()
        if not url:
            continue
        print(f"Scraping: {url}")
        data = scrape_paper(url)
        if data:
            scraped_data.append(data)
    return scraped_data

if __name__ == '__main__':
    # Demo driver: write a small sample link file, scrape every link in it,
    # and export whatever was scraped to a CSV file.
    links_filename = 'paper_links.txt'
    sample_links = [
        "https://arxiv.org/abs/1706.03762",  # Attention is All You Need
        "https://www.sciencedirect.com/science/article/pii/S2213846323001116",  # ScienceDirect Paper
        "https://invalid-url-example.com",  # Example of a failed URL
    ]
    # NOTE: this overwrites any existing paper_links.txt in the working directory.
    with open(links_filename, "w", encoding='utf-8') as f:
        f.writelines(f"{link}\n" for link in sample_links)

    papers = process_links_from_file(links_filename)

    if papers:
        output_filename = 'scraped_papers.csv'
        # newline='' is required so the csv module controls row terminators.
        with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
            fieldnames = ['title', 'authors', 'abstract', 'full_text', 'url']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(papers)

        print(f"\nScraping complete. Data saved to {output_filename}")
        print(f"Successfully scraped {len(papers)} papers.")
    else:
        # Make a run in which every URL failed explicit instead of exiting
        # without any summary.
        print("\nScraping complete. No papers were scraped; no output file was written.")



Scraping: https://arxiv.org/abs/1706.03762
  - Selenium error: Message: 
Stacktrace:
#0 0x5a0745f7794a <unknown>
#1 0x5a07459ec8a0 <unknown>
#2 0x5a0745a3e540 <unknown>
#3 0x5a0745a3e731 <unknown>
#4 0x5a0745a8c824 <unknown>
#5 0x5a0745a6405d <unknown>
#6 0x5a0745a89c23 <unknown>
#7 0x5a0745a63e03 <unknown>
#8 0x5a0745a30968 <unknown>
#9 0x5a0745a315e1 <unknown>
#10 0x5a0745f3b548 <unknown>
#11 0x5a0745f3f272 <unknown>
#12 0x5a0745f22313 <unknown>
#13 0x5a0745f3fdc5 <unknown>
#14 0x5a0745f0749f <unknown>
#15 0x5a0745f64158 <unknown>
#16 0x5a0745f64332 <unknown>
#17 0x5a0745f76a53 <unknown>
#18 0x7980dd67eac3 <unknown>

Error: Could not retrieve page source for https://arxiv.org/abs/1706.03762
Scraping: https://www.sciencedirect.com/science/article/pii/S2213846323001116
  - Selenium error: Message: 
Stacktrace:
#0 0x58c3e675994a <unknown>
#1 0x58c3e61ce8a0 <unknown>
#2 0x58c3e6220540 <unknown>
#3 0x58c3e6220731 <unknown>
#4 0x58c3e626e824 <unknown>
#5 0x58c3e624605d <unknown>
#6 0x58c3e

In [None]:
# Environment setup cell: installs system and Python packages via apt and pip.
# Refresh the apt package index before installing.
!apt-get update
# Chromium WebDriver package; presumably the driver the Selenium-based page
# fetch relies on — confirm against the driver setup code.
!apt-get install -y chromium-chromedriver
# Selenium Python bindings.
!pip install selenium
# PyMuPDF; presumably the backend of extract_text_from_pdf — confirm.
!pip install PyMuPDF

0% [Working]            Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [2,005 kB]
Hit:10 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Get:11 https://r2u.stat.illinois.edu/ubuntu jammy/main amd64 Packages [2,789 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [3,623 kB]
Get:13 https://ppa.launchpadcontent.net/g

