In [5]:
# https://data.cityofchicago.org/Transportation/Taxi-Trips-2013-2023-/wrvz-psew/about_data

import requests
import pandas as pd
import json
import time
import os
from datetime import datetime, timedelta

SAVE_DIR = "../../data/raw data"
SUB_FOLDER = "2019"
os.makedirs(SAVE_DIR, exist_ok=True)

# Socrata API endpoint
BASE_URL = "https://data.cityofchicago.org/resource/wrvz-psew.json"

# SODA 2.1 API Parameters (Using SQL-like query)
LIMIT = 100000  # Maximum records per request
WINDOW_SIZE = 1
DATA = []

def fetch_data(start_date, end_date):
    OFFSET = 0
    temp_data = []

    start_time = f"{start_date}T00:00:00"
    end_time = f"{end_date}T23:59:59"

    while True:
        query = f"""
            SELECT * 
            WHERE trip_start_timestamp BETWEEN '{start_time}' AND '{end_time}'
            ORDER BY trip_start_timestamp ASC
            LIMIT {LIMIT} OFFSET {OFFSET}
        """

        params = {"$query": query}  # Using SODA 2.1 SQL query
        response = requests.get(BASE_URL, params=params)

        if response.status_code != 200:
            print(f"❌ Error: {response.status_code} - {response.text}")
            break

        records = response.json()

        if not records:
            print("✅ No more data available. Fetching completed.")
            break

        temp_data.extend(records)
        print(f"✅ Fetched {len(records)} records, total: {len(temp_data)} records.")

        OFFSET += LIMIT  # Increase offset for pagination
        time.sleep(1)  # Prevent rate limiting

    return temp_data

start_date = datetime(2018, 11, 1)
final_date = datetime(2019, 2, 21)

# Fetch data
while start_date <= final_date:
    target_date = start_date.strftime('%Y-%m-%d')
    print(f"\n🔄 Fetching data for {target_date}...")

    daily_data = fetch_data(target_date, target_date)

    if daily_data:
        DATA.extend(daily_data)

    start_date += timedelta(days=WINDOW_SIZE)

# **Convert and save as CSV**
df = pd.DataFrame(DATA)
csv_file = os.path.join(SAVE_DIR, SUB_FOLDER, "chicago_taxi_trips.csv")
df.to_csv(csv_file, index=False)
print(f"📁 Data saved to {csv_file}")



🔄 Fetching data for 2018-11-01...
✅ Fetched 62836 records, total: 62836 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2018-11-02...
✅ Fetched 62349 records, total: 62349 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2018-11-03...
✅ Fetched 45540 records, total: 45540 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2018-11-04...
✅ Fetched 40978 records, total: 40978 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2018-11-05...
✅ Fetched 53156 records, total: 53156 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2018-11-06...
✅ Fetched 58419 records, total: 58419 records.
✅ No more data available. Fetching completed.

🔄 Fetching data for 2018-11-07...
✅ Fetched 63993 records, total: 63993 records.
❌ Error: 500 - {"message":"Internal error: please include code 3a7c776e-e68f-418a-b11f-3f58020e6884 if you report the error","errorCode":"internal-err