In [5]:
from io import BytesIO, StringIO
from pypdf import PdfReader
from urllib.request import Request, urlopen
import requests
import pandas as pd
import time
import logging


In [6]:
# Configure logging
logging.basicConfig(level=logging.ERROR, format='%(asctime)s - %(levelname)s - %(message)s')

def download_and_process_pdf(url):
    """
    Downloads a PDF from the given URL, extracts text data, and returns a DataFrame.

    Args:
        url: The URL of the PDF file.

    Returns:
        A pandas DataFrame containing the extracted data or None if failed
    """
    try:
        remoteFile = urlopen(Request(url)).read()
        memoryFile = BytesIO(remoteFile)
        reader = PdfReader(memoryFile)
        reader.get_fields()
        filtered_text_data = {k: v["/V"] for k, v in reader.get_fields().items()
                              if isinstance(v, dict) and "/V" in v.keys()}

        return pd.DataFrame([filtered_text_data])
    except Exception as e:
        logging.error(f"Failed to download or process PDF from {url}: {e}")
        return None

# Define a list of URLs for your PDF files
urls = [
    "https://www.dmv.ca.gov/portal/file/waymo_12272024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_122524-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_12202024b-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_12202024a-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_12202024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_121824-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_12172024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_121624-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_121424-pdf/",
    "https://www.dmv.ca.gov/portal/file/pony-ai_12122024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo-12112024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox-12082024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waym0_120220242-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_120220241-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_11302024-pdf/",
    "https://www.dmv.ca.gov/portal/file/weride_11272024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_11272024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_11262024b-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_11262024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_110824-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_110524-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_110424-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_110324_2-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_110324_1-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_102424-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_102024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_101924-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_101624-pdf/",
    "https://www.dmv.ca.gov/portal/file/woven_101624/",
    "https://www.dmv.ca.gov/portal/file/waymo_101424-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_101024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_100624-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_10032024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_09292024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_09292024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_092724-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_092624-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_09182024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_09132024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_09032024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_09032024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_083020242-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_083020241-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_08292024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_08202024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_08092024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_08022024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_08012024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_07272024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_07202024a-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_07202024b-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_07202024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_07192024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_07182024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_07162024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_071124-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_070624_a-pdf/",
    #"https://www.dmv.ca.gov/portal/file/waymo_070624_b-pdf/", # defective link
    "https://www.dmv.ca.gov/portal/file/waymo_07022024-pdf/",
    "https://www.dmv.ca.gov/portal/file/weride_06212024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_06112024-pdf/",
    "https://www.dmv.ca.gov/portal/file/weride_06102024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_06052024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_06042024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_06032024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_05302024-pdf/",
    "https://www.dmv.ca.gov/portal/file/nuro_052224-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_052024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_051724-pdf/",
    "https://www.dmv.ca.gov/portal/file/ponyai05152024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_05132024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_05082024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_042824-pdf/",
    "https://www.dmv.ca.gov/portal/file/pony-ai_04232024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_04172024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_04132024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_04112024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_040224-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_033024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_032824-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_032224_amended_redacted-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_031924-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_031824-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_03142024-pdf/",
    "https://www.dmv.ca.gov/portal/file/nuro_031224-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_03082024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_03032024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_03022024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_022220241-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_022220242-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_021824-pdf/",
    "https://www.dmv.ca.gov/portal/file/apple_021524-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_02052024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_013124-pdf/",
    "https://www.dmv.ca.gov/portal/file/nuro_013124-pdf/",
    "https://www.dmv.ca.gov/portal/file/apple_012624-pdf/",
    "https://www.dmv.ca.gov/portal/file/weride01172024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_01152024-pdf/",
    "https://www.dmv.ca.gov/portal/file/nuro_0111122024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_011120241-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_011120242-pdf/",
    "https://www.dmv.ca.gov/portal/file/ghostautonomy01112024-pdf/",
    "https://www.dmv.ca.gov/portal/file/waymo_01082024-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_01012024a-pdf/",
    "https://www.dmv.ca.gov/portal/file/zoox_01012024b-pdf/",
]

# Create an empty list to store DataFrames from each URL
data_frames = []

In [None]:

# Loop through each URL and append the resulting DataFrame to the list
for url in urls:
    df = download_and_process_pdf(url)
    if df is not None:
        data_frames.append(df)

# Combine all DataFrames into a single DataFrame
if data_frames:
    combined_df = pd.concat(data_frames, ignore_index=True)
    print(combined_df)
else:
    print("No data was extracted from the PDFs.")

2025-02-28 09:10:57,399 - ERROR - Failed to download or process PDF from https://www.dmv.ca.gov/portal/file/waymo_12272024-pdf/: HTTP Error 504: Gateway Time-out
2025-02-28 09:12:27,790 - ERROR - Failed to download or process PDF from https://www.dmv.ca.gov/portal/file/waymo_122524-pdf/: HTTP Error 504: Gateway Time-out
