In [1]:
import os
import asyncio
from io import BytesIO
from typing import Dict
from zipfile import ZipFile

import httpx
import pandas as pd

In [2]:
columns = [
    "CMPLID",
    "ODINO",
    "MFR_NAME",
    "MAKETXT",
    "MODELTXT",
    "YEARTXT",
    "CRASH",
    "FAILDATE",
    "FIRE",
    "INJURED",
    "DEATHS",
    "COMPDESC",
    "CITY",
    "STATE",
    "VIN",
    "DATEA",
    "LDATE",
    "MILES",
    "OCCURENCES",
    "CDESCR",
    "CMPL_TYPE",
    "POLICE_RPT_YN",
    "PURCH_DT",
    "ORIG_OWER_YN",
    "ANTI_BRAKES_YN",
    "CRUISE_CONT_YN",
    "NUM_CYLS",
    "DRIVE_TRAIN",
    "FUEL_SYS",
    "FUEL_TYPE",
    "TRASN_TYPE",
    "VEH_SPEED",
    "DOT",
    "TIRE_SIZE",
    "LOC_OF_TIRE",
    "TIRE_FAIL_TYPE",
    "ORIG_EQUIP_YN",
    "MANUF_DT",
    "SEAT_TYPE",
    "RESTRAINT_TYPE",
    "DEALER_NAME",
    "DEALER_TEL",
    "DEALER_CITY",
    "DEALER_STATE",
    "DEALER_ZIP",
    "PROD_TYPE",
    "REPAIRED_YN",
    "MEDICAL_ATTN",
    "VEHICLES_TOWED_YN",
]

In [6]:
async def download_zip_file_in_batches(url):
    async with httpx.AsyncClient() as client:
        # Preliminary request to get the file size
        head = await client.head(url)
        file_size = int(head.headers["Content-Length"])

        print(f"Total file size: {file_size} bytes")

        batch_size = 1024 * 1024 * 10  # 10 MB per batch
        file_bytes = bytearray()

        for start in range(0, file_size, batch_size):
            end = min(start + batch_size - 1, file_size - 1)
            print(f"Requesting bactch [{start}:{end}] of {file_size}")
            headers = {"Range": f"bytes={start}-{end}"}
            response = await client.get(url, headers=headers)
            file_bytes.extend(response.content)

    return bytes(file_bytes)


async def extract_zip_to_dataframe(zip_bytes):
    with BytesIO(zip_bytes) as bytes_io:
        with ZipFile(bytes_io, "r") as zip_file:
            file_name = zip_file.namelist()[0]
            with zip_file.open(file_name) as file:
                df = pd.read_csv(file)
                return df


async def download_and_extract_zip(url):
    zip_bytes = await download_zip_file_in_batches(url)
    df = await extract_zip_to_dataframe(zip_bytes)
    return df

In [7]:
await download_and_extract_zip(
    "https://static.nhtsa.gov/odi/ffdd/cmpl/COMPLAINTS_RECEIVED_2020-2024.zip"
)

Total file size: 61599925 bytes
Requesting bactch [0:10485759] of 61599925
Requesting bactch [10485760:20971519] of 61599925


DecodingError: Error -3 while decompressing data: incorrect header check