In [None]:
import os
import threading
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, unquote, urljoin

# Ask the user where to crawl and which range of years to keep
BASE_URL = input("Enter the URL: ").strip()

start_year_input = input("Enter the start year (leave empty to start from the earliest available): ").strip()
end_year_input = input("Enter the end year (leave empty to download up to the latest available): ").strip()

# A bound is only set when the reply consists purely of digits;
# an empty or non-numeric reply leaves that side of the range open (None)
if start_year_input.isdigit():
    START_YEAR = int(start_year_input)
else:
    START_YEAR = None

if end_year_input.isdigit():
    END_YEAR = int(end_year_input)
else:
    END_YEAR = None

# Crawl state: pages already fetched, pages still pending, latest year seen so far
visited_urls = set()
urls_to_visit = [BASE_URL]
max_year_encountered = None

# PDFs are collected as (url, year) pairs and downloaded BATCH_SIZE at a time
BATCH_SIZE = 10
pdf_batch = []


def find_pdf_links(url):
    """Return absolute URLs of every PDF linked from the page at *url*.

    Args:
        url: Address of the HTML page to scan.

    Returns:
        A list of absolute PDF URLs in document order. The list is empty when
        the page cannot be fetched (the error is printed, not raised).
    """
    try:
        # Without a timeout a stalled server would block the crawl forever.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching {url}: {e}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')

    pdf_links = []
    for link in soup.find_all('a', href=True):
        href = link['href'].strip()
        # Judge the extension on the path only, ignoring "?query" / "#fragment"
        # and letter case, so "Paper.PDF" and "paper.pdf?download=1" count too.
        path_part = href.split('#', 1)[0].split('?', 1)[0]
        if path_part.lower().endswith('.pdf'):
            # Relative links are relative to the page they appear on (after any
            # redirect), not to the crawl's starting URL.
            pdf_links.append(urljoin(response.url, href))
    return pdf_links


def download_pdfs(pdf_list):
    """Fetch every (url, year) pair in *pdf_list* concurrently, one thread each.

    Returns only after all of the spawned threads have finished.
    """
    workers = [
        threading.Thread(target=download_pdf, args=(url, year))
        for url, year in pdf_list
    ]

    # Launch everything first, then wait, so the downloads overlap
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


def download_pdf(pdf_url, paper_year):
    """Download one PDF and save it under the `paper/<year>/` directory.

    Args:
        pdf_url: Absolute URL of the PDF.
        paper_year: Year used as the sub-directory name (converted with str()).

    Network and filesystem failures are printed rather than raised, so a
    single bad file does not take down the thread it runs in.
    """
    save_directory = os.path.join("paper", str(paper_year))
    pdf_path = os.path.join(save_directory, extract_pdf_name(pdf_url))

    try:
        # The context manager releases the streamed connection even when the
        # transfer fails part-way; the timeout stops a stalled server from
        # hanging the thread indefinitely.
        with requests.get(
            pdf_url,
            headers={'User-Agent': 'Mozilla/5.0'},
            stream=True,
            timeout=30,
        ) as response:
            response.raise_for_status()

            # Create directory for the paper year
            os.makedirs(save_directory, exist_ok=True)

            with open(pdf_path, "wb") as pdf_file:
                for chunk in response.iter_content(chunk_size=8192):
                    pdf_file.write(chunk)

        print(f"Downloaded: {pdf_path}")
    except requests.RequestException as e:
        print(f"Error downloading {pdf_url}: {e}")
    except OSError as e:
        # Directory creation or file writing failed (permissions, disk full,
        # invalid file name, ...).
        print(f"Error saving {pdf_url} to {pdf_path}: {e}")


def extract_pdf_name(pdf_url):
    """Extract the percent-decoded filename from a PDF URL.

    Args:
        pdf_url: URL whose last path segment names the file.

    Returns:
        The decoded last path segment, reduced to a bare file name. Query
        string and fragment are ignored.
    """
    last_segment = urlparse(pdf_url).path.split('/')[-1]
    decoded = unquote(last_segment)
    # Decoding can reveal separators that were hidden as %2F / %5C. Keep only
    # the final component so the result can never point outside the directory
    # it is later joined to.
    return decoded.replace('\\', '/').split('/')[-1]


def extract_paper_year(pdf_url):
    """Return the year that follows the 'paper' segment of the URL path.

    Gives None when there is no 'paper' segment, when nothing follows it,
    or when the following segment is not an integer.
    """
    segments = urlparse(pdf_url).path.split('/')

    if "paper" not in segments:
        return None

    # The year is expected immediately after the first 'paper' segment
    position = segments.index("paper") + 1
    if position >= len(segments):
        return None

    try:
        return int(segments[position])
    except ValueError:
        return None


def crawl_website():
    """Crawl pages under BASE_URL and download the PDFs they link to in batches.

    Pages are taken from the end of `urls_to_visit`, and each page is processed
    at most once (tracked in `visited_urls`). For every PDF link found:

    * links whose URL yields no year are skipped;
    * links outside the START_YEAR / END_YEAR range are skipped with a message;
    * the rest are queued in `pdf_batch`, which is downloaded and cleared every
      time it reaches BATCH_SIZE entries.

    `max_year_encountered` is updated to the highest year that was accepted.
    New pages to visit are taken from `li a[href]` links that stay under
    BASE_URL. Whatever is left in `pdf_batch` is downloaded at the end.
    """
    global max_year_encountered
    # Imported here so this function does not depend on the file-level import list.
    from urllib.parse import urldefrag

    while urls_to_visit:
        current_url = urls_to_visit.pop()
        if current_url in visited_urls:
            continue

        visited_urls.add(current_url)

        for pdf in find_pdf_links(current_url):
            paper_year = extract_paper_year(pdf)

            # PDFs without a recognisable year are not downloaded
            if not paper_year:
                continue

            if START_YEAR and paper_year < START_YEAR:
                print(f"Skipping {pdf} (Year {paper_year} is before {START_YEAR})")
                continue
            if END_YEAR and paper_year > END_YEAR:
                print(f"Skipping {pdf} (Year {paper_year} is after {END_YEAR})")
                continue

            # Track the latest accepted year. Years above END_YEAR were already
            # skipped, so this maximum can never exceed END_YEAR and no
            # separate "stop" check is needed.
            if max_year_encountered is None or paper_year > max_year_encountered:
                max_year_encountered = paper_year

            pdf_batch.append((pdf, paper_year))

            # Process PDFs in batches of BATCH_SIZE
            if len(pdf_batch) >= BATCH_SIZE:
                download_pdfs(pdf_batch)
                pdf_batch.clear()

        # Find new internal links to visit
        try:
            response = requests.get(current_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Error fetching links from {current_url}: {e}")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        for link in soup.select("li a[href]"):
            # Resolve relative to the page the link is on, and drop "#fragment"
            # so in-page anchors are not treated as separate pages.
            new_url = urldefrag(urljoin(response.url, link['href'])).url
            if new_url.startswith(BASE_URL) and new_url not in visited_urls:
                urls_to_visit.append(new_url)

    # Download any remaining PDFs in the batch
    if pdf_batch:
        download_pdfs(pdf_batch)
        pdf_batch.clear()


# Run the crawler now that every helper above is defined. The call returns
# once the URL queue is empty and any leftover batch has been downloaded.
crawl_website()


In [None]:
import os
import fitz #pymupdf
import pandas as pd
import ollama

# Root folder that is walked (including sub-folders) for PDF files
pdf_folder = "paper"
# Path of the Excel workbook the collected metadata is written to
output_file = "pdf_metadata.xlsx"

# Function to extract metadata using an Ollama model (Phi-3 Mini by default)
def extract_metadata(text, model="phi3:mini", max_chars=3000):
    """Ask an Ollama chat model for bibliographic metadata about a paper.

    Args:
        text: Plain text of the paper.
        model: Ollama model tag to query.
        max_chars: Only the first `max_chars` characters of `text` are sent,
            to avoid long processing times.

    Returns:
        The content of the model's reply message. The prompt asks for six
        "Label: value" lines (Title, Authors, Year, Country/Countries,
        University, Labels), but the reply is returned exactly as received.
    """
    excerpt = text[:max_chars]

    # Built line by line so that nothing but the intended wording reaches the
    # model: no source indentation and no stray code comments.
    prompt = "\n".join([
        "Extract the following information from this research paper:",
        "- Title",
        "- Authors",
        "- Year",
        "- Country/Countries",
        "- University",
        "- 3 best-suited topic labels",
        "",
        "If any information is missing, return 'Unknown'.",
        "",
        "Answer with exactly these six lines, in this order, and nothing else:",
        "Title: <title>",
        "Authors: <authors>",
        "Year: <year>",
        "Country/Countries: <countries>",
        "University: <university>",
        "Labels: <label 1>, <label 2>, <label 3>",
        "",
        "Text:",
        excerpt,
    ])

    response = ollama.chat(model=model, messages=[{"role": "user", "content": prompt}])
    return response['message']['content']

# Extract PDFs and get metadata

# Accepted (case-folded) spellings of each field label, in output-column order:
# Title, Authors, Year, Countries, University, Labels.
_FIELD_ALIASES = (
    ("title",),
    ("authors", "author"),
    ("year",),
    ("country/countries", "countries", "country"),
    ("university", "universities"),
    ("labels", "topic labels", "3 best-suited topic labels"),
)


def _parse_metadata(reply):
    """Pick the six metadata fields out of the model's reply by their labels.

    Fields are matched by the text before the first colon on each line rather
    than by line position, so blank lines, an introductory sentence, list
    bullets or bold markers do not shift values into the wrong column.

    Args:
        reply: Raw text returned by the model.

    Returns:
        A list of six strings in column order; a field that is absent or has
        an empty value is "Unknown". The first occurrence of a label wins.
    """
    values = ["Unknown"] * len(_FIELD_ALIASES)
    filled = set()

    for line in reply.splitlines():
        # Drop markdown bold markers and leading list bullets
        cleaned = line.replace("**", "").strip().lstrip("-*• ").strip()
        label, separator, value = cleaned.partition(":")
        value = value.strip()
        if not separator or not value:
            continue

        label = label.strip().casefold()
        for index, aliases in enumerate(_FIELD_ALIASES):
            if index not in filled and label in aliases:
                values[index] = value
                filled.add(index)
                break

    return values


data = []
for root, _, files in os.walk(pdf_folder):
    for file in files:
        # Case-insensitive so "REPORT.PDF" is picked up as well
        if not file.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(root, file)

        # Deliberately broad: one unreadable PDF or failed model call is
        # reported and skipped so the remaining files are still processed.
        try:
            # Extract text from PDF; the context manager closes the document
            with fitz.open(pdf_path) as doc:
                text = "\n".join(page.get_text("text") for page in doc)

            # Get metadata from Ollama
            metadata = extract_metadata(text)

            # Store data
            data.append([pdf_path, *_parse_metadata(metadata)])

        except Exception as e:
            print(f"Error processing {file}: {e}")

# Save data to XLSX
df = pd.DataFrame(data, columns=["PDF Path", "Title", "Authors", "Year", "Countries", "University", "Labels"])
df.to_excel(output_file, index=False)

print(f"Metadata saved to {output_file}")
