In [8]:
import httpx
import pandas as pd

from io import BytesIO
from zipfile import ZipFile
from datetime import datetime

import bs4

In [6]:
# Page that is fetched and scraped for the "#nhtsa_s3_listing" file tables.
URL = "https://www.nhtsa.gov/nhtsa-datasets-and-apis"

# Column labels applied, in order, to the header-less tab-separated complaints
# file (passed as `names=` to `pd.read_csv`). The spellings are kept exactly as
# they are because they end up as the header row of the exported CSV.
columns = """
    CMPLID ODINO MFR_NAME MAKETXT MODELTXT YEARTXT CRASH
    FAILDATE FIRE INJURED DEATHS COMPDESC CITY STATE
    VIN DATEA LDATE MILES OCCURENCES CDESCR CMPL_TYPE
    POLICE_RPT_YN PURCH_DT ORIG_OWER_YN ANTI_BRAKES_YN CRUISE_CONT_YN
    NUM_CYLS DRIVE_TRAIN FUEL_SYS FUEL_TYPE TRASN_TYPE VEH_SPEED
    DOT TIRE_SIZE LOC_OF_TIRE TIRE_FAIL_TYPE ORIG_EQUIP_YN
    MANUF_DT SEAT_TYPE RESTRAINT_TYPE
    DEALER_NAME DEALER_TEL DEALER_CITY DEALER_STATE DEALER_ZIP
    PROD_TYPE REPAIRED_YN MEDICAL_ATTN VEHICLES_TOWED_YN
""".split()

In [9]:
# Scrape the file listing from the NHTSA page into `data_list`: one dict per
# listing row with the keys "url", "size" and "updated_date".

# Position of the complaints listing among the "#nhtsa_s3_listing > tbody"
# matches. NOTE(review): this is positional, so it silently points at another
# listing if the page layout changes -- confirm when the scrape misbehaves.
COMPLAINTS_TABLE_INDEX = 3
# The flat list of <td> cells is consumed in groups of: link, size, updated.
CELLS_PER_ROW = 3
UPDATED_FORMAT = "%m/%d/%Y %I:%M:%S %p"
UPDATED_SUFFIX = " ET"

page = httpx.get(URL)
# Fail here on an HTTP error instead of parsing an error page as the listing.
page.raise_for_status()

soup = bs4.BeautifulSoup(page.text, "html.parser")
tables = soup.select("#nhtsa_s3_listing > tbody")
if len(tables) <= COMPLAINTS_TABLE_INDEX:
    raise ValueError(
        f"Expected at least {COMPLAINTS_TABLE_INDEX + 1} listing tables at {URL}, "
        f"found {len(tables)}."
    )

complaints = tables[COMPLAINTS_TABLE_INDEX]
elements = list(complaints.find_all("td"))

# Previously this case only printed a message and left `data_list` empty, which
# made the following cell fail on `data_list[0]` with an unrelated IndexError.
if len(elements) % CELLS_PER_ROW != 0:
    raise ValueError(
        "The list of elements does not contain complete data for each row."
    )

data_list = []
for i in range(0, len(elements), CELLS_PER_ROW):
    url_elem = elements[i].find("a")  # None when the first cell has no link
    size_elem = elements[i + 1]
    date_elem = elements[i + 2]

    # Remove the trailing zone marker as a suffix. `str.strip(" ET")` treats its
    # argument as a set of characters and does not remove newlines or tabs.
    updated_text = date_elem.text.strip()
    if updated_text.endswith(UPDATED_SUFFIX):
        updated_text = updated_text[: -len(UPDATED_SUFFIX)].rstrip()

    data_list.append(
        {
            "url": url_elem.get("href") if url_elem is not None else None,
            "size": size_elem.text.strip(),
            # Naive datetime: the " ET" zone information is discarded.
            "updated_date": datetime.strptime(updated_text, UPDATED_FORMAT),
        }
    )

In [None]:
# Download the first listed archive and load its complaints file into `dataset`.
# `dataset` stays an empty DataFrame when the guard below is false.
dataset = pd.DataFrame()

# NOTE(review): "updated_date" is a naive timestamp parsed from the listing and
# is compared with local `datetime.now()`; a publication time is normally in the
# past, so this guard looks like it is effectively always true -- confirm the
# intended condition (e.g. "newer than the last download").
if data_list[0]["updated_date"] < datetime.now():
    archive_response = httpx.get(data_list[0]["url"], timeout=160)
    # Surface HTTP errors directly; otherwise an error body is handed to
    # ZipFile and fails later with a misleading BadZipFile.
    archive_response.raise_for_status()
    resp = archive_response.content

    # Open the member in a `with` as well, so its handle is closed once pandas
    # has finished reading it.
    with ZipFile(BytesIO(resp)) as myzip, myzip.open(
        "COMPLAINTS_RECEIVED_2020-2024.txt"
    ) as complaints_file:
        dataset = pd.read_csv(
            complaints_file,
            sep="\t",
            header=None,  # the file has no header row; labels come from `columns`
            names=columns,
        )

dataset.shape

In [None]:
# Export the Ford complaints with DATEA above 20230215 to a dated CSV file.
# NOTE(review): the numeric comparison assumes DATEA was parsed as a YYYYMMDD
# number -- confirm against the loaded dtypes.
today = f"{datetime.today():%Y-%m-%d}"

is_ford = dataset["MFR_NAME"].isin(["Ford Motor Company"])
after_cutoff = dataset["DATEA"].gt(20230215)
df = dataset[is_ford & after_cutoff]

df.to_csv(
    f"../data/raw/NHTSA_COMPLAINTS_{today}.csv",
    index=False,
    encoding="utf-8",
    sep=";",
)