In [1]:
import httpx
import pandas as pd

import os
from io import BytesIO
from zipfile import ZipFile
from datetime import datetime, date

In [2]:
URL = "https://www.nhtsa.gov/nhtsa-datasets-and-apis"

columns = [
    "CMPLID",
    "ODINO",
    "MFR_NAME",
    "MAKETXT",
    "MODELTXT",
    "YEARTXT",
    "CRASH",
    "FAILDATE",
    "FIRE",
    "INJURED",
    "DEATHS",
    "COMPDESC",
    "CITY",
    "STATE",
    "VIN",
    "DATEA",
    "LDATE",
    "MILES",
    "OCCURENCES",
    "CDESCR",
    "CMPL_TYPE",
    "POLICE_RPT_YN",
    "PURCH_DT",
    "ORIG_OWER_YN",
    "ANTI_BRAKES_YN",
    "CRUISE_CONT_YN",
    "NUM_CYLS",
    "DRIVE_TRAIN",
    "FUEL_SYS",
    "FUEL_TYPE",
    "TRASN_TYPE",
    "VEH_SPEED",
    "DOT",
    "TIRE_SIZE",
    "LOC_OF_TIRE",
    "TIRE_FAIL_TYPE",
    "ORIG_EQUIP_YN",
    "MANUF_DT",
    "SEAT_TYPE",
    "RESTRAINT_TYPE",
    "DEALER_NAME",
    "DEALER_TEL",
    "DEALER_CITY",
    "DEALER_STATE",
    "DEALER_ZIP",
    "PROD_TYPE",
    "REPAIRED_YN",
    "MEDICAL_ATTN",
    "VEHICLES_TOWED_YN",
]

In [11]:
def create_client() -> httpx.Client:
    """
    Creates a common client for future http requests

    Returns:
        httpx.Client: client with ford proxies
    """
    ford_proxy = str(os.getenv("FORD_PROXY"))
    timeout_config = httpx.Timeout(10.0, connect=5.0)
    proxy_mounts = {
        "http://": httpx.HTTPTransport(proxy=httpx.Proxy(ford_proxy)),
        "https://": httpx.HTTPTransport(proxy=httpx.Proxy(ford_proxy)),
    }
    return httpx.Client(
        timeout=timeout_config,
        # mounts=proxy_mounts,
        verify=False,
    )

In [9]:
def mount_dataset_from_content():
    with create_client() as client:
        resp = client.get(
            "https://static.nhtsa.gov/odi/ffdd/cmpl/COMPLAINTS_RECEIVED_2020-2024.zip",
            timeout=160,
        ).content

    with ZipFile(BytesIO(resp)) as myzip:
        with myzip.open("COMPLAINTS_RECEIVED_2020-2024.txt") as file:
            dataset = pd.read_csv(file, sep="\t", header=None, names=columns)

    return dataset

In [12]:
df = mount_dataset_from_content()
df.head()  # out of VPN: 23.7s

  dataset = pd.read_csv(file, sep="\t", header=None, names=columns)


Unnamed: 0,CMPLID,ODINO,MFR_NAME,MAKETXT,MODELTXT,YEARTXT,CRASH,FAILDATE,FIRE,INJURED,...,RESTRAINT_TYPE,DEALER_NAME,DEALER_TEL,DEALER_CITY,DEALER_STATE,DEALER_ZIP,PROD_TYPE,REPAIRED_YN,MEDICAL_ATTN,VEHICLES_TOWED_YN
0,1633294,11292384,Honda (American Honda Motor Co.),HONDA,ACCORD,2018.0,N,20191221,N,0,...,,,,,,,V,,N,N
1,1633295,11292384,Honda (American Honda Motor Co.),HONDA,ACCORD,2018.0,N,20191221,N,0,...,,,,,,,V,,N,N
2,1633296,11292384,Honda (American Honda Motor Co.),HONDA,ACCORD,2018.0,N,20191221,N,0,...,,,,,,,V,,N,N
3,1633297,11292385,Ford Motor Company,FORD,EXPLORER,2020.0,N,20191226,N,0,...,,,,,,,V,,N,N
4,1633298,11292386,"General Motors, LLC",CHEVROLET,VOLT,2017.0,N,20190712,N,0,...,,,,,,,V,,N,N


In [13]:
df.drop_duplicates(subset=["ODINO"], inplace=True)
data = df[
    (df["MFR_NAME"] == "Ford Motor Company")
    & (df["ODINO"] > int(str(os.getenv("LAST_ODINO_CAPTURED"))))
]
data.head()

Unnamed: 0,CMPLID,ODINO,MFR_NAME,MAKETXT,MODELTXT,YEARTXT,CRASH,FAILDATE,FIRE,INJURED,...,RESTRAINT_TYPE,DEALER_NAME,DEALER_TEL,DEALER_CITY,DEALER_STATE,DEALER_ZIP,PROD_TYPE,REPAIRED_YN,MEDICAL_ATTN,VEHICLES_TOWED_YN
332878,1966250,11571618,Ford Motor Company,LINCOLN,CORSAIR,2021.0,N,20240213,N,0,...,,,,,,,V,,N,N
332883,1966255,11571621,Ford Motor Company,FORD,EXPEDITION,2020.0,N,20240213,N,0,...,,,,,,,V,,N,N
332886,1966258,11571624,Ford Motor Company,FORD,FUSION,2018.0,N,20231116,N,0,...,,,,,,,V,,N,N
332890,1966262,11571627,Ford Motor Company,FORD,F-350,2020.0,N,20240212,N,0,...,,,,,,,V,,N,N
332893,1966265,11571629,Ford Motor Company,FORD,FOCUS,2017.0,N,20240202,N,0,...,,,,,,,V,,N,N


In [15]:
data.to_csv("mock_dataset.csv", index=False)