In [4]:
import json
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Configuration
metadata_file = 'metadata.json'
pdf_output_dir = 'scraped_pdfs_2'
pdf_metadata_file = 'pdf_metadata.json'

# Ensure the PDF output directory exists
os.makedirs(pdf_output_dir, exist_ok=True)

# Initialize an empty dictionary for the new PDF metadata
pdf_metadata = {}

def sanitize_filename(name):
    """Sanitize the filename."""
    return "".join([c for c in name if c.isalpha() or c.isdigit() or c in ' ._-']).rstrip()

def download_pdf(url, title):
    """Download a PDF file and update the PDF metadata."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error on bad status
        pdf_filename = sanitize_filename(title) + '.pdf'
        pdf_path = os.path.join(pdf_output_dir, pdf_filename)
        with open(pdf_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded PDF: {pdf_path}")
        # Update the PDF metadata
        pdf_metadata[url] = {
            'url': url,
            'title': title,
            'filename': pdf_filename
        }
    except Exception as e:
        print(f"Error downloading {url}: {e}")

def scrape_pdfs(url, title):
    """Scrape webpage for PDF links and download them."""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an error on bad status

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all PDF links
        for link in soup.find_all('a', href=True):
            href = link['href']
            if href.endswith('.pdf'):
                # Reconstruct full URL if necessary
                if href.startswith('/'):
                    href = 'https://www.electoralcommission.org.uk' + href
                else:
                    href = urljoin(url, href)
                download_pdf(href, title)

    except Exception as e:
        print(f"Error scraping {url}: {e}")

def main():
    # Load metadata
    with open(metadata_file, 'r', encoding='utf-8') as f:
        metadata = json.load(f)

    # Iterate over each URL in the metadata
    for entry in metadata.values():
        url = entry['url']
        title = entry['title']
        scrape_pdfs(url, title)

    # Save the new PDF metadata
    with open(pdf_metadata_file, 'w', encoding='utf-8') as f:
        json.dump(pdf_metadata, f, indent=4)

if __name__ == '__main__':
    main()

Downloaded PDF: scraped_pdfs_2/Report on the May 2023 local elections in England.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Freedom of Information.pdf
Downloaded PDF: scraped_pdfs_2/Interim corporate plan 202021 - 202425.pdf
Downloaded PDF: scraped_pdfs_2/Register to vote anonymously.pdf
Downloaded PDF: scraped_pdfs_2/Register to vote anonymously.pdf
Downloaded PDF: scraped_pdfs_2/Register to vote anonymously.pdf
Downloaded PDF: scraped_pdfs_2/Register to vote anonymously.pdf
Downloaded PDF: scraped_pdfs_2/Register to vote anonymously.pdf

: 

In [5]:
import os
import fitz  # PyMuPDF
import traceback

def delete_corrupted_pdfs(directory):
    """
    Deletes corrupted or invalid PDF files from the specified directory.

    Args:
    - directory (str): The path to the directory containing PDF files.
    """
    for filename in os.listdir(directory):
        if not filename.lower().endswith('.pdf'):
            continue  # Skip non-PDF files

        filepath = os.path.join(directory, filename)
        try:
            # Attempt to open the PDF file
            with fitz.open(filepath) as doc:
                # You can perform additional checks here (e.g., number of pages)
                pass
        except Exception as e:
            # If an error occurs, the PDF is likely corrupted or invalid
            print(f"Deleting corrupted or invalid PDF: {filename}")
            os.remove(filepath)
            # Optionally, log the error
            print(f"Error details: {traceback.format_exc()}")

if __name__ == "__main__":
    directory_path = './data/scraped_pdfs'
    delete_corrupted_pdfs(directory_path)
    print("Finished checking and deleting corrupted PDF files.")

Deleting corrupted or invalid PDF: Spending for EU referendum campaigners  Electoral Commission.pdf
Error details: Traceback (most recent call last):
  File "/var/folders/9d/jc2663851_lgqff4sx4jlfz40000gn/T/xpython_52412/664033441.py", line 19, in delete_corrupted_pdfs
    with fitz.open(filepath) as doc:
  File "/Users/cardigan/anaconda3/lib/python3.10/site-packages/fitz/fitz.py", line 4122, in __init__
    _fitz.Document_swiginit(self, _fitz.new_Document(filename, stream, filetype, rect, width, height, fontsize))
fitz.fitz.FileDataError: cannot open broken document

Deleting corrupted or invalid PDF: Observers at UK elections booklet  Electoral Commission.pdf
Error details: Traceback (most recent call last):
  File "/var/folders/9d/jc2663851_lgqff4sx4jlfz40000gn/T/xpython_52412/664033441.py", line 19, in delete_corrupted_pdfs
    with fitz.open(filepath) as doc:
  File "/Users/cardigan/anaconda3/lib/python3.10/site-packages/fitz/fitz.py", line 4122, in __init__
    _fitz.Document_swi

Deleting corrupted or invalid PDF: Explanatory note for RD1C NI  Electoral Commission.pdf
Error details: Traceback (most recent call last):
  File "/var/folders/9d/jc2663851_lgqff4sx4jlfz40000gn/T/xpython_52412/664033441.py", line 19, in delete_corrupted_pdfs
    with fitz.open(filepath) as doc:
  File "/Users/cardigan/anaconda3/lib/python3.10/site-packages/fitz/fitz.py", line 4122, in __init__
    _fitz.Document_swiginit(self, _fitz.new_Document(filename, stream, filetype, rect, width, height, fontsize))
fitz.fitz.FileDataError: cannot open broken document

Deleting corrupted or invalid PDF: form-r1-ro.pdf
Error details: Traceback (most recent call last):
  File "/var/folders/9d/jc2663851_lgqff4sx4jlfz40000gn/T/xpython_52412/664033441.py", line 19, in delete_corrupted_pdfs
    with fitz.open(filepath) as doc:
  File "/Users/cardigan/anaconda3/lib/python3.10/site-packages/fitz/fitz.py", line 4122, in __init__
    _fitz.Document_swiginit(self, _fitz.new_Document(filename, stream, filety