In [38]:
import os
import httpx
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

In [39]:
# URL of the webpage containing the file links
url = "https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page"

# Folder to save downloaded files.
# exist_ok=True keeps the cell safe to re-run and removes the window between
# "check that it exists" and "create it" that a separate exists() test leaves open.
download_folder = "rides"
os.makedirs(download_folder, exist_ok=True)

# Function to download a file with session reuse and timeout
def download_file(file_url, folder_path, session):
    """Download one file into ``folder_path`` unless it is already present.

    The response body is streamed chunk by chunk into a temporary
    ``<name>.part`` file, which is renamed to its final name only after the
    whole body has been written. An interrupted or failed transfer therefore
    never leaves a truncated file under the final name, where the
    "already exists" check would skip it on every later run. Streaming also
    avoids holding an entire file in memory at once.

    Args:
        file_url: URL of the file. The text after the last "/" becomes the
            local file name.
        folder_path: Directory the file is saved into (must already exist).
        session: HTTP client exposing an ``httpx.Client``-style
            ``stream(method, url, timeout=...)`` context manager; it is
            reused across calls.

    Returns:
        None. Progress and failures are reported with ``print``.

    Raises:
        Exceptions other than ``httpx.RequestError`` (for example ``OSError``
        while writing to disk) propagate to the caller.
    """
    file_name = file_url.split("/")[-1]
    file_path = os.path.join(folder_path, file_name)

    # Check if file already exists
    if os.path.exists(file_path):
        print(f"{file_name} already exists. Skipping download.")
        return

    # Partial data lives here until the transfer has finished.
    temp_path = file_path + ".part"

    try:
        with session.stream("GET", file_url, timeout=10.0) as response:  # Timeout set to 10 seconds
            if response.status_code != 200:
                print(f"Failed to download {file_name}: {response.status_code}")
                return

            try:
                with open(temp_path, "wb") as f:
                    for chunk in response.iter_bytes():
                        f.write(chunk)
                # Publish under the final name only once the body is complete.
                os.replace(temp_path, file_path)
            finally:
                # After a successful replace the temp file is gone; after a
                # failure this removes the leftover partial data.
                if os.path.exists(temp_path):
                    os.remove(temp_path)

        print(f"Downloaded {file_name}")
    except httpx.RequestError as e:
        print(f"An error occurred while requesting {file_url}: {e}")

In [40]:
# Scratch check: show the path os.path.join builds for a file in the download folder.
print(
    os.path.join(
        "rides",
        "fhv_tripdata_2015-01.parquet",
    )
)

rides\fhv_tripdata_2015-01.parquet


In [41]:
# Fetch the page HTML and collect the parquet download links it contains
with httpx.Client(timeout=10.0) as session:  # Timeout for page request
    response = session.get(url)
    # Stop here on an HTTP error; otherwise an error page would be parsed
    # and silently produce an empty list of links.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all links that match the download pattern
    file_links = []
    seen_links = set()
    for link in soup.find_all("a", href=True):
        # Strip surrounding whitespace so a padded href still passes the
        # endswith() check below.
        href = link["href"].strip()
        if "d37ci6vzurychx.cloudfront.net/trip-data/" in href and href.endswith(".parquet"):
            # Construct the full URL if the href is relative
            full_url = urljoin(url, href)
            # Keep the first occurrence only: a repeated link would make two
            # workers download into the same local file.
            if full_url not in seen_links:
                seen_links.add(full_url)
                file_links.append(full_url)

In [42]:
# Download files concurrently using ThreadPoolExecutor
failures = []
with httpx.Client() as session:  # Reuse session across threads
    with ThreadPoolExecutor(max_workers=5) as executor:  # Use 5 threads for parallel downloads
        futures = [executor.submit(download_file, file_url, download_folder, session) for file_url in file_links]
        # Remember which URL each future belongs to so failures can be attributed.
        url_by_future = dict(zip(futures, file_links))

        # Inspect every future: a single failing task must not hide the
        # outcome of the remaining ones.
        for future in as_completed(futures):
            try:
                future.result()
            except Exception as exc:
                failed_url = url_by_future[future]
                failures.append((failed_url, exc))
                print(f"Download task for {failed_url} raised {exc!r}")

# Still fail the cell when any task raised, but only after all of them have
# been reported; the first underlying exception is chained for the traceback.
if failures:
    raise RuntimeError(
        f"{len(failures)} of {len(futures)} download tasks raised an exception"
    ) from failures[0][1]

green_tripdata_2024-02.parquet already exists. Skipping download.fhv_tripdata_2024-03.parquet already exists. Skipping download.

yellow_tripdata_2024-04.parquet already exists. Skipping download.
green_tripdata_2024-04.parquet already exists. Skipping download.
fhv_tripdata_2024-04.parquet already exists. Skipping download.
fhvhv_tripdata_2024-04.parquet already exists. Skipping download.
green_tripdata_2024-05.parquet already exists. Skipping download.
fhvhv_tripdata_2024-05.parquet already exists. Skipping download.
fhv_tripdata_2024-05.parquet already exists. Skipping download.
yellow_tripdata_2024-06.parquet already exists. Skipping download.
yellow_tripdata_2024-05.parquet already exists. Skipping download.
green_tripdata_2024-06.parquet already exists. Skipping download.
fhvhv_tripdata_2024-06.parquet already exists. Skipping download.
green_tripdata_2024-07.parquet already exists. Skipping download.
fhvhv_tripdata_2024-07.parquet already exists. Skipping download.
fhv_tripdata_