In [5]:
import os
import requests
from nyc_taxi_eta.configs.settings import ROOT_DIR
from tqdm.notebook import tqdm

# Base URL that each monthly parquet file name is appended to.
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/"

# All datasets land in sibling folders under one shared parent directory.
_TRIPS_DATA_DIR = ROOT_DIR / "data" / "landing" / "trips_data"
YELLOW_TAXI_DIR = _TRIPS_DATA_DIR / "yellow_taxi"
GREEN_TAXI_DIR = _TRIPS_DATA_DIR / "green_taxi"
FHV_DIR = _TRIPS_DATA_DIR / "fhv"
HVFHV_DIR = _TRIPS_DATA_DIR / "hvfhv"

# Default download window: every calendar month of 2009 through 2025
# (range() excludes its upper bound).
years = range(2009, 2026)
months = range(1, 13)

def _download_file(file_url, save_path, file_name, chunk_size=8192, timeout=(10, 60)):
    """Stream one remote file to ``save_path`` and report whether it succeeded.

    The payload is first written to ``<save_path>.part`` and only renamed to
    ``save_path`` once the whole body has been received. The callers skip any
    file whose final path already exists, so a truncated download must never
    be left under the final name.

    Args:
        file_url: Full URL of the file to fetch.
        save_path: Final destination path of the file.
        file_name: Bare file name, used in progress and log messages.
        chunk_size: Number of bytes requested per streamed chunk.
        timeout: ``(connect, read)`` timeout in seconds passed to ``requests``
            so a stalled connection cannot hang the notebook forever.

    Returns:
        True if the file was fully downloaded and moved into place,
        False if the server did not answer 200 or an error occurred.
    """
    tmp_path = f"{save_path}.part"
    try:
        # The context manager releases the connection even on early return.
        with requests.get(file_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Skipping {file_name}: HTTP {response.status_code}")
                return False

            # Content-Length is optional; without it the bar has no total.
            content_length = response.headers.get("Content-Length", "")
            total_bytes = int(content_length) if content_length.isdigit() else None

            with open(tmp_path, "wb") as f, tqdm(
                total=total_bytes,
                desc=f"Downloading {file_name}",
                unit="B",
                unit_scale=True,
                leave=False,
                position=2,
            ) as progress:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    f.write(chunk)
                    progress.update(len(chunk))

        # Publish the file only after it has been completely written and closed.
        os.replace(tmp_path, save_path)
        return True
    except Exception as e:
        # Best effort: one failed file must not abort the whole batch.
        print(f"Error downloading {file_name}: {e}")
        return False
    finally:
        # Nothing to do after a successful rename; otherwise drop the partial file.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _download_dataset(label, prefix, target_dir, dataset_years, first_month=1):
    """Download every monthly ``<prefix>_tripdata_YYYY-MM.parquet`` file.

    Files that already exist in ``target_dir`` are skipped, so the cell can be
    re-run to resume an interrupted download.

    Args:
        label: Human-readable dataset name shown on the yearly progress bar.
        prefix: File-name prefix of the dataset (e.g. ``"yellow"``).
        target_dir: Directory the files are saved into; created if missing.
        dataset_years: Sequence of years to download, in ascending order.
        first_month: First month to request in the first year of
            ``dataset_years``; later years always start from the first entry
            of ``months``.
    """
    os.makedirs(target_dir, exist_ok=True)
    first_year = dataset_years[0] if len(dataset_years) else None

    for year in tqdm(dataset_years, desc=f"{label} Years", unit="year", position=0):
        start_month = first_month if year == first_year else 1
        year_months = [m for m in months if m >= start_month]
        for month in tqdm(year_months, desc="Months", unit="month", leave=False, position=1):
            file_name = f"{prefix}_tripdata_{year}-{month:02}.parquet"
            save_path = os.path.join(target_dir, file_name)

            if os.path.exists(save_path):
                continue

            _download_file(BASE_URL + file_name, save_path, file_name)


# Download Yellow Taxi data
_download_dataset("Yellow Taxi", "yellow", YELLOW_TAXI_DIR, years)

# Download Green Taxi data (starts from August 2013)
green_years = range(2013, 2026)
_download_dataset("Green Taxi", "green", GREEN_TAXI_DIR, green_years, first_month=8)

# Download FHV data (starts from January 2015)
fhv_years = range(2015, 2026)
_download_dataset("FHV", "fhv", FHV_DIR, fhv_years)

# Download HVFHV data (starts from February 2019)
hvfhv_years = range(2019, 2026)
_download_dataset("HVFHV", "fhvhv", HVFHV_DIR, hvfhv_years, first_month=2)


Yellow Taxi Years:   0%|          | 0/17 [00:00<?, ?year/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Green Taxi Years:   0%|          | 0/13 [00:00<?, ?year/s]

Months:   0%|          | 0/5 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

FHV Years:   0%|          | 0/11 [00:00<?, ?year/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

HVFHV Years:   0%|          | 0/7 [00:00<?, ?year/s]

Months:   0%|          | 0/11 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

Months:   0%|          | 0/12 [00:00<?, ?month/s]

In [None]:
import openmeteo_requests

import pandas as pd
import requests_cache
from retry_requests import retry

# Open-Meteo client backed by an on-disk HTTP cache that never expires
# (expire_after=-1) and up to 5 retries with backoff on failed requests.
openmeteo = openmeteo_requests.Client(
    session=retry(
        requests_cache.CachedSession(".cache", expire_after=-1),
        retries=5,
        backoff_factor=0.2,
    )
)

# Hourly variables to request. The response exposes them by position only,
# so this list is the single source of truth for both the request and the
# column names assigned further down.
hourly_variables = [
    "rain",
    "snowfall",
    "snow_depth",
    "precipitation",
    "apparent_temperature",
    "relative_humidity_2m",
    "temperature_2m",
    "dew_point_2m",
]

url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    # NOTE(review): these coordinates look closer to Newark, NJ than to
    # New York City - confirm this is the intended location.
    "latitude": 40.7357,
    "longitude": -74.1724,
    "start_date": "2009-04-27",
    "end_date": "2025-05-11",
    "hourly": hourly_variables,
    "timezone": "America/New_York",
}
responses = openmeteo.weather_api(url, params=params)

# Only one location is requested, so only the first response is processed.
response = responses[0]
for summary_line in (
    f"Coordinates {response.Latitude()}°N {response.Longitude()}°E",
    f"Elevation {response.Elevation()} m asl",
    f"Timezone {response.Timezone()}{response.TimezoneAbbreviation()}",
    f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s",
):
    print(summary_line)

# Pull each requested variable out of the response by its position in
# hourly_variables and key it by name.
hourly = response.Hourly()
hourly_columns = {
    variable_name: hourly.Variables(position).ValuesAsNumpy()
    for position, variable_name in enumerate(hourly_variables)
}

# Individual arrays kept under their original names in case later cells
# reference them directly.
hourly_rain = hourly_columns["rain"]
hourly_snowfall = hourly_columns["snowfall"]
hourly_snow_depth = hourly_columns["snow_depth"]
hourly_precipitation = hourly_columns["precipitation"]
hourly_apparent_temperature = hourly_columns["apparent_temperature"]
hourly_relative_humidity_2m = hourly_columns["relative_humidity_2m"]
hourly_temperature_2m = hourly_columns["temperature_2m"]
hourly_dew_point_2m = hourly_columns["dew_point_2m"]

# Rebuild the time axis from the response's start, end and step. The end
# bound is excluded (inclusive="left").
# NOTE(review): the timestamps are built as UTC even though the request asks
# for America/New_York - confirm downstream joins expect UTC.
range_start = pd.to_datetime(hourly.Time(), unit="s", utc=True)
range_end = pd.to_datetime(hourly.TimeEnd(), unit="s", utc=True)
step = pd.Timedelta(seconds=hourly.Interval())

# "date" comes first, followed by the variables in request order.
hourly_data = {
    "date": pd.date_range(start=range_start, end=range_end, freq=step, inclusive="left"),
    **hourly_columns,
}

hourly_dataframe = pd.DataFrame(data=hourly_data)
print(hourly_dataframe)