In [1]:
import os
import pdfplumber
import json
from PyPDF2 import PdfReader

In [None]:
def parse_robots_txt(folder_path):
    """
    Parses the robots.txt file in the given folder, if it exists,
    to extract the disallowed PDF file names.

    Only lines that start with "Disallow:" (after trimming whitespace) are
    considered. The entry is whatever follows the colon, with surrounding
    whitespace removed. A "Disallow:" line with no value is skipped rather
    than treated as an entry.

    Args:
    - folder_path (str): The path to the folder.

    Returns:
    - set: A set of disallowed file names. Empty when the folder has no
      robots.txt file or the file lists no entries.
    """
    prefix = "Disallow:"
    disallowed_files = set()
    robots_path = os.path.join(folder_path, "robots.txt")

    # isfile (not exists) so a directory that happens to be named
    # "robots.txt" is ignored instead of failing inside open().
    if not os.path.isfile(robots_path):
        return disallowed_files

    # Undecodable bytes are replaced so one odd character cannot abort
    # parsing of the remaining rules.
    with open(robots_path, "r", encoding="utf-8", errors="replace") as file:
        for line in file:
            line = line.strip()
            if not line.startswith(prefix):
                continue
            # Slice after the colon instead of splitting on "Disallow: ":
            # the space is optional, and a bare "Disallow:" would otherwise
            # raise IndexError.
            disallowed_file = line[len(prefix):].strip()
            if disallowed_file:
                disallowed_files.add(disallowed_file)
    return disallowed_files

In [3]:
def extract_pdf_content(file_path):
    """
    Extracts text content from a PDF file.

    The text of every page is concatenated in page order with no separator.
    This is best-effort: any failure while opening or reading the file is
    reported on stdout and an empty string is returned instead of raising.

    Args:
    - file_path (str): Path to the PDF file.

    Returns:
    - str: The extracted text content, or "" if the file could not be read.
    """
    try:
        reader = PdfReader(file_path)
        # join() avoids rebuilding the string once per page. The `or ""`
        # is a defensive guard: if a page yields None instead of a string,
        # it contributes nothing rather than failing the whole document.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Deliberately broad: a single unreadable PDF should not propagate
        # an exception to the caller.
        print(f"Error reading {file_path}: {e}")
        return ""

In [None]:
def crawl_pdfs_with_content(folder_path):
    """
    Crawls through the folder to find all PDF files, respecting robots.txt,
    and extracts their content.

    Every directory under folder_path is visited. A robots.txt found in a
    directory applies only to the files directly inside that directory.
    PDF files are recognised by their ".pdf" extension, case-insensitively.

    Args:
    - folder_path (str): The path to the folder containing the PDF documents.

    Returns:
    - list of dict: A list where each dictionary contains the document name
      ("document_name"), file path ("file_path"), and content ("content").
    """
    pdf_files = []
    for root, _dirs, files in os.walk(folder_path):
        # Check robots.txt in the current directory. Entries written in the
        # usual robots.txt path form ("/name.pdf") are compared without the
        # leading slash, because the names from os.walk never contain one.
        disallowed_files = {
            entry.lstrip("/") for entry in parse_robots_txt(root)
        }

        for file in files:
            # Case-insensitive so "REPORT.PDF" is crawled like "report.pdf".
            if not file.lower().endswith(".pdf"):
                continue
            if file in disallowed_files:
                continue
            file_path = os.path.join(root, file)
            pdf_files.append({
                "document_name": file,
                "file_path": file_path,
                "content": extract_pdf_content(file_path),
            })
    return pdf_files

In [5]:
def save_crawled_data_to_json(data, output_file):
    """
    Writes the crawled PDF records to disk as a JSON document.

    The file is UTF-8 encoded, indented by four spaces, and keeps non-ASCII
    characters as they are instead of escaping them. A success or failure
    message is printed; errors are reported on stdout rather than raised.

    Args:
    - data (list of dict): The crawled PDF data to save.
    - output_file (str): The path to the output JSON file.
    """
    dump_options = {"indent": 4, "ensure_ascii": False}
    try:
        with open(output_file, mode="w", encoding="utf-8") as sink:
            json.dump(data, sink, **dump_options)
        print(f"Data successfully saved to {output_file}")
    except Exception as err:
        print(f"An error occurred while saving to JSON: {err}")