In [None]:
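# Dataset page (Transportation Network Providers - Trips, 2023-2024):
#   https://data.cityofchicago.org/Transportation/Transportation-Network-Providers-Trips-2023-2024-/n26f-ihde/about_data
# Socrata API documentation for that dataset:
#   https://dev.socrata.com/foundry/data.cityofchicago.org/n26f-ihde
# JSON endpoint for 2023-2024 trips: https://data.cityofchicago.org/resource/n26f-ihde.json
# JSON endpoint for 2018-2022 trips: https://data.cityofchicago.org/resource/m6dm-c72p.json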

import requests
import pandas as pd
import json
import time
import os
from datetime import datetime, timedelta

# Root folder for raw downloads and the subfolder the CSV is written into.
SAVE_DIR = "../../data/raw data"
SUB_FOLDER = "2020"
# The CSV is saved under SAVE_DIR/SUB_FOLDER, so create the whole path;
# creating only SAVE_DIR would leave the final write failing on a missing
# subfolder.
os.makedirs(os.path.join(SAVE_DIR, SUB_FOLDER), exist_ok=True)

# Socrata API endpoint
BASE_URL = "https://data.cityofchicago.org/resource/m6dm-c72p.json"

# SODA 2.1 API Parameters (Using SQL-like query)
LIMIT = 1_000_000  # Maximum records per request
WINDOW_SIZE = 1  # Number of days the download loop advances per iteration
DATA = []  # Accumulates every fetched record before conversion to a DataFrame

def fetch_data(start_date, end_date, timeout=(10, 300)):
    """Download every trip whose start timestamp falls within a date range.

    Pages through the Socrata endpoint at ``BASE_URL`` using a SoQL
    ``$query``, requesting up to ``LIMIT`` rows per page, and accumulates the
    decoded JSON rows.

    Args:
        start_date: First day of the range as a date string such as
            ``"2019-11-01"``; the query starts at ``T00:00:00`` of that day.
        end_date: Last day of the range in the same format; the query runs
            through ``T23:59:59`` of that day.
        timeout: ``(connect, read)`` timeout in seconds passed to
            ``requests.get`` so a stalled connection cannot hang forever.

    Returns:
        A list of the records returned by the API. If a request comes back
        with a non-200 status, the error is printed and whatever was
        collected so far is returned, so the result may be incomplete.

    Raises:
        requests.RequestException: Network failures and timeouts are not
            caught here and propagate to the caller.
    """
    offset = 0
    temp_data = []

    start_time = f"{start_date}T00:00:00"
    end_time = f"{end_date}T23:59:59"

    while True:
        # Many trips share the same start timestamp, so `:id` is added as a
        # tie-breaker to keep LIMIT/OFFSET pages stable between requests.
        query = f"""
            SELECT *
            WHERE trip_start_timestamp BETWEEN '{start_time}' AND '{end_time}'
            ORDER BY trip_start_timestamp ASC, :id ASC
            LIMIT {LIMIT} OFFSET {offset}
        """

        params = {"$query": query}  # Using SODA 2.1 SQL query
        response = requests.get(BASE_URL, params=params, timeout=timeout)

        if response.status_code != 200:
            print(f"❌ Error: {response.status_code} - {response.text}")
            break

        records = response.json()

        if records:
            temp_data.extend(records)
            print(f"✅ Fetched {len(records)} records, total: {len(temp_data)} records.")

        # A page shorter than LIMIT (including an empty one) means the range
        # is exhausted, so stop here instead of issuing one more expensive
        # request just to receive an empty page.
        if len(records) < LIMIT:
            print("✅ No more data available. Fetching completed.")
            break

        offset += LIMIT  # Increase offset for pagination
        time.sleep(1)  # Prevent rate limiting

    return temp_data

start_date = datetime(2019, 11, 1)
final_date = datetime(2020, 2, 21)

# Resolve the output path and create its folder before downloading, so a bad
# path fails immediately rather than after the whole download has finished.
csv_file = os.path.join(SAVE_DIR, SUB_FOLDER, "chicago_tnp_trips.csv")
os.makedirs(os.path.dirname(csv_file), exist_ok=True)

# Fetch data one window of WINDOW_SIZE days at a time
while start_date <= final_date:
    # Each request covers the full window (inclusive), clamped to final_date.
    # Fetching only the first day while stepping WINDOW_SIZE days would skip
    # the days in between whenever WINDOW_SIZE > 1.
    window_end = min(start_date + timedelta(days=WINDOW_SIZE - 1), final_date)
    target_date = start_date.strftime('%Y-%m-%d')
    window_end_date = window_end.strftime('%Y-%m-%d')

    if window_end_date == target_date:
        print(f"\n🔄 Fetching data for {target_date}...")
    else:
        print(f"\n🔄 Fetching data for {target_date} to {window_end_date}...")

    # Extending with an empty list is a no-op, so no emptiness check is needed.
    DATA.extend(fetch_data(target_date, window_end_date))

    start_date += timedelta(days=WINDOW_SIZE)

# **Convert and save as CSV**
df = pd.DataFrame(DATA)
df.to_csv(csv_file, index=False)
print(f"📁 Data saved to {csv_file}")


🔄 Fetching data for 2019-11-01...
✅ Fetched 387122 records, total: 387122 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2019-11-02...
✅ Fetched 392185 records, total: 392185 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2019-11-03...
✅ Fetched 302478 records, total: 302478 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2019-11-04...
✅ Fetched 248522 records, total: 248522 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2019-11-05...
✅ Fetched 260606 records, total: 260606 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2019-11-06...
✅ Fetched 277635 records, total: 277635 records.


KeyboardInterrupt: 


🔄 Fetching data for 2019-11-01...
