In [4]:
import os
import threading
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote, urljoin

# Interactive configuration: the start URL plus an optional inclusive year range.
# BASE_URL is both the first page crawled and the prefix that every followed
# link must start with (see crawl_website).
BASE_URL = input("Enter the URL: ").strip()

start_year_input = input("Enter the start year (leave empty to start from the earliest available): ").strip()
end_year_input = input("Enter the end year (leave empty to download up to the latest available): ").strip()

# Convert inputs to integers if provided, otherwise None.
# NOTE: any non-numeric entry (not only an empty one) fails isdigit() and is
# therefore silently treated as "no bound".
START_YEAR = int(start_year_input) if start_year_input.isdigit() else None
END_YEAR = int(end_year_input) if end_year_input.isdigit() else None

# Crawl state shared by the functions below.
urls_to_visit = [BASE_URL]  # pending pages; crawl_website pops from the end (LIFO)
visited_urls = set()  # pages already fetched, so each one is processed only once
pdf_batch = []  # (pdf_url, paper_year) tuples waiting to be downloaded
BATCH_SIZE = 35  # Process 35 PDFs at a time (download_pdfs starts one thread per PDF)


def find_pdf_links(url):
    """Find and return all PDF links on a webpage.

    Args:
        url: Address of the page to scan.

    Returns:
        A list of absolute URLs, one per ``<a href>`` on the page whose URL
        path ends in ``.pdf`` (case-insensitive). Returns an empty list when
        the page cannot be fetched; the error is printed, not raised.
    """
    try:
        # A timeout keeps one unresponsive server from stalling the crawl.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    pdf_links = []
    for link in soup.find_all('a', href=True):
        try:
            # Relative hrefs are relative to the page they appear on, so
            # resolve them against `url`, not the crawl's start URL.
            absolute_url = urljoin(url, link['href'])
            path = urlparse(absolute_url).path
        except ValueError:
            # Malformed href (e.g. a broken IPv6 literal); ignore this link.
            continue

        # Test the path only, so "x.PDF" and "x.pdf?download=1" both match.
        if path.lower().endswith('.pdf'):
            pdf_links.append(absolute_url)

    return pdf_links


def download_pdfs(pdf_list):
    """Download every PDF of a batch concurrently, one thread per file.

    `pdf_list` is an iterable of (pdf_url, paper_year) pairs. All worker
    threads are created first, then started, and the call only returns once
    every one of them has finished.
    """
    workers = [
        threading.Thread(target=download_pdf, args=(link, year))
        for link, year in pdf_list
    ]

    for worker in workers:
        worker.start()

    # Block until the whole batch is done before handing control back.
    for worker in workers:
        worker.join()


# New downloader: skips the download if the file already exists.
def download_pdf(pdf_url, paper_year):
    """Download a PDF file and save it under `paper/<year>/` directory if not already downloaded.

    The body is streamed into a temporary ``<name>.part`` file and renamed to
    its final name only after the transfer completes. An interrupted download
    therefore never leaves a truncated file that a later run would mistake
    for a finished one and skip.

    Args:
        pdf_url: Absolute URL of the PDF.
        paper_year: Year used as the name of the sub-directory of ``paper/``.

    Returns:
        None. Network and filesystem errors are printed, not raised.
    """
    save_directory = os.path.join("paper", str(paper_year))
    pdf_path = os.path.join(save_directory, extract_pdf_name(pdf_url))
    partial_path = pdf_path + ".part"

    try:
        # Create directory for the paper year
        os.makedirs(save_directory, exist_ok=True)

        # Only fully downloaded files ever carry the final name.
        if os.path.exists(pdf_path):
            print(f"Skipping: {pdf_path} (Already downloaded)")
            return

        # The `with` block releases the streamed connection even on failure;
        # the timeout prevents a stalled server from hanging this thread.
        with requests.get(
            pdf_url,
            headers={'User-Agent': 'Mozilla/5.0'},
            stream=True,
            timeout=60,
        ) as response:
            response.raise_for_status()

            with open(partial_path, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)

        # Publish the finished file under its real name.
        os.replace(partial_path, pdf_path)
        print(f"Downloaded: {pdf_path}")
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {pdf_url}: {e}")
        # Best-effort cleanup of the incomplete file; it may not exist at all.
        try:
            os.remove(partial_path)
        except OSError:
            pass


# Old downloader (kept for reference): re-downloads the file even if it already exists.
# def download_pdf(pdf_url, paper_year):
#     """Download a PDF file and save it under `paper/<year>/` directory."""
#     try:
#         response = requests.get(pdf_url, headers={'User-Agent': 'Mozilla/5.0'}, stream=True)
#         response.raise_for_status()
#
#         save_directory = os.path.join("paper", str(paper_year))
#         os.makedirs(save_directory, exist_ok=True)
#
#         pdf_path = os.path.join(save_directory, extract_pdf_name(pdf_url))
#
#         with open(pdf_path, "wb") as pdf_file:
#             for chunk in response.iter_content(chunk_size=8192):
#                 pdf_file.write(chunk)
#
#         print(f"Downloaded: {pdf_path}")
#     except requests.RequestException as e:
#         print(f"Error downloading {pdf_url}: {e}")


def extract_pdf_name(pdf_url):
    """Extracts the filename from a PDF URL.

    Takes the last segment of the URL path (query string and fragment are
    ignored) and percent-decodes it. Decoding can reintroduce path separators
    (``%2F`` -> ``/``, ``%5C`` -> ``\\``), so only the part after the last
    separator is kept; a crafted link can then not make the caller write
    outside its target directory.

    Args:
        pdf_url: URL of the PDF.

    Returns:
        The decoded file name, or an empty string if the path ends in ``/``.
    """
    decoded_segment = unquote(urlparse(pdf_url).path.split('/')[-1])
    # Treat both separator styles alike, then keep only the final component.
    return decoded_segment.replace('\\', '/').rsplit('/', 1)[-1]


def extract_paper_year(pdf_url):
    """Return the year encoded in a PDF URL, or None when there is none.

    The year is the path segment that directly follows the first segment
    named exactly "paper"; it must parse as an integer.
    """
    segments = urlparse(pdf_url).path.split('/')

    # No 'paper' segment at all -> no year.
    if "paper" not in segments:
        return None

    # 'paper' is the last segment -> nothing follows it.
    position = segments.index("paper") + 1
    if position >= len(segments):
        return None

    try:
        return int(segments[position])
    except ValueError:
        # The segment after 'paper' is not a number.
        return None


def crawl_website():
    """Crawl the website for PDF links and download them in batches.

    Pages are taken from the module-level ``urls_to_visit`` stack (LIFO) and
    recorded in ``visited_urls``. For every page:

    * each PDF link whose URL carries a year is queued in ``pdf_batch``,
      unless the year is before ``START_YEAR``;
    * the first PDF whose year exceeds ``END_YEAR`` stops the whole crawl;
    * every ``li a[href]`` link that stays under ``BASE_URL`` is pushed onto
      ``urls_to_visit``.

    The batch is downloaded whenever it reaches ``BATCH_SIZE`` entries, and
    whatever is still queued is downloaded before the function returns,
    including on the early ``END_YEAR`` stop.
    """

    def flush_batch():
        # Download everything queued so far and empty the queue.
        if pdf_batch:
            download_pdfs(pdf_batch)
            pdf_batch.clear()

    while urls_to_visit:
        current_url = urls_to_visit.pop()
        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)

        for pdf in find_pdf_links(current_url):
            paper_year = extract_paper_year(pdf)

            # PDFs without a recognizable year are ignored.
            if not paper_year:
                continue

            if START_YEAR and paper_year < START_YEAR:
                print(f"Skipping {pdf} (Year {paper_year} is before {START_YEAR})")
                continue
            if END_YEAR and paper_year > END_YEAR:
                print(f"Stopping: Encountered year {paper_year}, which exceeds END_YEAR {END_YEAR}")
                # Stop crawling immediately, but do not lose the in-range
                # PDFs that were already queued.
                flush_batch()
                return

            pdf_batch.append((pdf, paper_year))

            # Process PDFs in batches of BATCH_SIZE
            if len(pdf_batch) >= BATCH_SIZE:
                flush_batch()

        # Find new internal links to visit.
        # NOTE: this fetches the page a second time; find_pdf_links already
        # requested it but only hands back the PDF links.
        try:
            response = requests.get(current_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching links from {current_url}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        for link in soup.select("li a[href]"):
            try:
                # Relative links are relative to the page being parsed.
                new_url = urljoin(current_url, link['href'])
            except ValueError:
                # Malformed href; ignore this link.
                continue
            if new_url.startswith(BASE_URL) and new_url not in visited_urls:
                urls_to_visit.append(new_url)

    # Download any remaining PDFs in the batch
    flush_batch()


# Run the crawler as soon as this cell/script executes. The call returns when
# the URL queue is empty or when crawl_website() stops early on END_YEAR.
crawl_website()


Skipping: paper\1987\fe9fc289c3ff0af142b6d3bead98a923-Paper.pdf (Already downloaded)
Skipping: paper\1987\fc490ca45c00b1249bbe3554a4fdf6fb-Paper.pdf (Already downloaded)
Skipping: paper\1987\fbd7939d674997cdb4692d34de8633c4-Paper.pdf (Already downloaded)
Skipping: paper\1987\f457c545a9ded88f18ecee47145a72c0-Paper.pdf (Already downloaded)
Skipping: paper\1987\f7177163c833dff4b38fc8d2872f1ec6-Paper.pdf (Already downloaded)
Skipping: paper\1987\f033ab37c30201f73f142449d037028d-Paper.pdf (Already downloaded)
Skipping: paper\1987\eccbc87e4b5ce2fe28308fd9f2a7baf3-Paper.pdf (Already downloaded)
Skipping: paper\1987\ea5d2f1c4608232e07d3aa3d998e5135-Paper.pdf (Already downloaded)
Skipping: paper\1987\e369853df766fa44e1ed0ff613f563bd-Paper.pdf (Already downloaded)
Skipping: paper\1987\e4da3b7fbbce2345d7772b0674a318d5-Paper.pdf (Already downloaded)
Skipping: paper\1987\e2c420d928d4bf8ce0ff2ec19b371514-Paper.pdf (Already downloaded)
Skipping: paper\1987\d9d4f495e875a2e075a1a4a6e1b9770f-Paper.pdf (