In [4]:
import os
import pdfplumber

In [5]:
def parse_robots_txt(file_path):
    """Parse a robots.txt file and collect the Disallow entries for FileCrawler.

    Only rule groups whose ``User-agent`` is ``FileCrawler`` are considered.
    Field names and the agent token are compared case-insensitively. A blank
    line ends the current rule group.

    Args:
        file_path: Path of the robots.txt file to read.

    Returns:
        A set with the Disallow values exactly as written in the file (for
        example ``"/report.pdf"``). The set is empty when the file cannot be
        read or contains no matching rules.
    """
    disallowed_files = set()
    try:
        # utf-8-sig drops a leading BOM; errors='replace' keeps one stray byte
        # from discarding every rule in the file.
        with open(file_path, 'r', encoding='utf-8-sig', errors='replace') as f:
            lines = f.readlines()
    except (OSError, ValueError) as e:
        print(f"Error reading robots.txt at {file_path}: {e}")
        return disallowed_files

    applies_to_crawler = False
    previous_was_user_agent = False
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            # A blank line closes the current rule group.
            applies_to_crawler = False
            previous_was_user_agent = False
            continue
        if line.startswith("#"):
            continue

        # Split on the FIRST colon only so values that contain ':' survive.
        field, separator, value = line.partition(":")
        if not separator:
            continue
        field = field.strip().lower()
        value = value.strip()

        if field == "user-agent":
            matches = value.lower() == "filecrawler"
            # Consecutive User-agent lines share one rule group, so a match on
            # an earlier line of the same run must not be lost.
            applies_to_crawler = matches or (
                previous_was_user_agent and applies_to_crawler
            )
            previous_was_user_agent = True
            continue

        previous_was_user_agent = False
        # An empty Disallow value means "nothing is disallowed"; skip it.
        if field == "disallow" and applies_to_crawler and value:
            disallowed_files.add(value)
    return disallowed_files

In [6]:
def extract_text_from_pdf(file_path):
    """Extract the text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to open with pdfplumber.

    Returns:
        The text of all pages concatenated in page order with no separator,
        or ``None`` when the file cannot be opened or parsed (the error is
        printed rather than raised).
    """
    try:
        with pdfplumber.open(file_path) as pdf:
            # A page without extractable text may yield None (presumably
            # depends on the pdfplumber version - confirm); treat it as empty
            # so one such page does not discard the text of the whole file.
            return ''.join(page.extract_text() or '' for page in pdf.pages)
    except Exception as e:
        # Deliberately broad: this is the best-effort boundary around a
        # third-party parser, and the caller skips files that return None.
        print(f"Error reading {file_path}: {e}")
        return None

In [7]:
def crawl_and_extract(data_folder):
    """Walk a folder tree and extract the text of every permitted PDF file.

    Each directory may contain its own ``robots.txt``; its FileCrawler
    Disallow entries are applied to the PDF files of that same directory only.
    Entries are matched against the bare file name, with or without a leading
    ``/``. Path prefixes and wildcards are not interpreted.

    Args:
        data_folder: Root directory to crawl.

    Returns:
        A dict mapping each processed PDF path to its extracted text. Files
        that are disallowed, fail to parse, or yield no text are omitted.
    """
    pdf_contents = {}
    for root, _, files in os.walk(data_folder):
        # Check for a robots.txt file in the directory being visited.
        robots_txt_path = os.path.join(root, "robots.txt")
        if os.path.isfile(robots_txt_path):
            # Normalise "/name.pdf" and "name.pdf" to the same bare name.
            disallowed_names = {
                entry.lstrip("/") for entry in parse_robots_txt(robots_txt_path)
            }
        else:
            disallowed_names = set()

        for file in files:
            # Accept ".pdf" in any letter case (e.g. "SCAN.PDF").
            if not file.lower().endswith('.pdf'):
                continue
            if file in disallowed_names:
                print(f"Skipping disallowed file: {file}")
                continue

            file_path = os.path.join(root, file)
            print(f"Processing: {file_path}")
            text = extract_text_from_pdf(file_path)
            if text:
                pdf_contents[file_path] = text
    return pdf_contents