In [1]:
import httpx
import pandas as pd

import os
from io import BytesIO
from zipfile import ZipFile
from datetime import datetime, date

In [2]:
# NHTSA datasets-and-APIs page.
URL = "https://www.nhtsa.gov/nhtsa-datasets-and-apis"

# Column names, in file order, for the header-less tab-separated complaints
# file; they are handed to pandas via `names=columns` when the file is read.
# NOTE(review): spellings such as OCCURENCES, ORIG_OWER_YN and TRASN_TYPE are
# reproduced exactly as found. They end up as CSV headers, so confirm against
# the NHTSA field specification before renaming any of them.
columns = (
    "CMPLID ODINO MFR_NAME MAKETXT MODELTXT YEARTXT CRASH "
    "FAILDATE FIRE INJURED DEATHS COMPDESC CITY STATE "
    "VIN DATEA LDATE MILES OCCURENCES CDESCR CMPL_TYPE "
    "POLICE_RPT_YN PURCH_DT ORIG_OWER_YN ANTI_BRAKES_YN CRUISE_CONT_YN "
    "NUM_CYLS DRIVE_TRAIN FUEL_SYS FUEL_TYPE TRASN_TYPE VEH_SPEED DOT "
    "TIRE_SIZE LOC_OF_TIRE TIRE_FAIL_TYPE ORIG_EQUIP_YN MANUF_DT "
    "SEAT_TYPE RESTRAINT_TYPE DEALER_NAME DEALER_TEL DEALER_CITY "
    "DEALER_STATE DEALER_ZIP PROD_TYPE REPAIRED_YN MEDICAL_ATTN "
    "VEHICLES_TOWED_YN"
).split()

In [6]:
def create_client() -> httpx.Client:
    """
    Creates a common client for future http requests.

    The proxy URL is read from the ``FORD_PROXY`` environment variable and is
    mounted for both ``http://`` and ``https://`` URLs. The client is built
    with ``httpx.Timeout(10.0, connect=5.0)``; individual requests may still
    pass their own ``timeout``.

    Returns:
        httpx.Client: client whose http/https traffic is routed through the
            proxy configured in ``FORD_PROXY``

    Raises:
        ValueError: if ``FORD_PROXY`` is unset or blank
    """
    ford_proxy = os.getenv("FORD_PROXY", "").strip()
    if not ford_proxy:
        # Wrapping a missing variable in str() would hand the literal text
        # "None" to httpx as the proxy URL; fail early and name the variable.
        raise ValueError(
            "FORD_PROXY environment variable is not set; "
            "cannot configure the proxy for the HTTP client"
        )

    timeout_config = httpx.Timeout(10.0, connect=5.0)
    # One transport per URL scheme, both pointing at the same proxy.
    proxy_mounts = {
        scheme: httpx.HTTPTransport(proxy=httpx.Proxy(ford_proxy))
        for scheme in ("http://", "https://")
    }
    # NOTE(review): verify=False is passed to the Client only; the mounted
    # transports above are constructed without a verify argument. Confirm
    # which TLS verification setting is actually intended for proxied traffic.
    return httpx.Client(
        timeout=timeout_config,
        mounts=proxy_mounts,
        verify=False,
    )

In [4]:
def mount_dataset_from_content(
    url: str = "https://static.nhtsa.gov/odi/ffdd/cmpl/COMPLAINTS_RECEIVED_2020-2024.zip",
    member: str = "COMPLAINTS_RECEIVED_2020-2024.txt",
) -> pd.DataFrame:
    """
    Downloads a zipped complaints file and loads it into a DataFrame.

    The archive is fetched through the client returned by ``create_client``,
    held in memory, and the requested member is parsed as a header-less,
    tab-separated file whose columns are named from the module-level
    ``columns`` list.

    Args:
        url: address of the zip archive to download.
        member: name of the text file inside the archive to parse.

    Returns:
        pd.DataFrame: one row per line of the member file.

    Raises:
        httpx.HTTPStatusError: if the server answers with an error status.
        zipfile.BadZipFile: if the downloaded payload is not a zip archive.
        KeyError: if ``member`` is not present in the archive.
    """
    with create_client() as client:
        # The per-request timeout overrides the client's default for this
        # download.
        response = client.get(url, timeout=160)
        # Surface HTTP errors here; otherwise an error page would be passed
        # on to ZipFile and fail there as an opaque BadZipFile.
        response.raise_for_status()
        archive_bytes = response.content

    with ZipFile(BytesIO(archive_bytes)) as archive, archive.open(member) as file:
        dataset = pd.read_csv(file, sep="\t", header=None, names=columns)

    return dataset

In [None]:
# Download and parse the complaints file into `df`, then preview its first rows.
df = mount_dataset_from_content()
df.head()  # out of VPN: 23.7s

In [9]:
# Only rows whose ODINO is strictly greater than this value are kept.
# NOTE(review): the origin of this cutoff is not recorded in the notebook;
# confirm why this particular ODINO was chosen.
ODINO_CUTOFF = 11572825

# Keep the first row for each ODINO. The result is rebound to `df` rather than
# mutating the loaded frame with inplace=True, so the cell reads as a plain
# assignment while `df` still refers to the de-duplicated rows afterwards.
df = df.drop_duplicates(subset=["ODINO"])
data = df[df["ODINO"] > ODINO_CUTOFF]
data.shape

(1293, 49)

In [None]:
# Write the filtered rows to mock_dataset.csv (relative to the working
# directory), omitting the DataFrame index from the file.
data.to_csv("mock_dataset.csv", index=False)