# 01 - Data Ingestion

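This notebook fetches air quality data from the NYC Open Data API (dataset `c3uy-2p5r`) using the Socrata client library `sodapy`, then saves the raw records as a timestamped JSON file, plus a 1,000-row sample CSV, under `data/raw/`.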


In [22]:
import os
import json
from pathlib import Path
from datetime import datetime
import pandas as pd
from sodapy import Socrata
from dotenv import load_dotenv

# Load environment variables (e.g. SOCRATA_APP_TOKEN) from a .env file, if present
load_dotenv()

# Which Socrata dataset to pull, and the portal domain that hosts it
DOMAIN = "data.cityofnewyork.us"
DATASET_ID = "c3uy-2p5r"

# Output location: <project root>/data/raw.
# The project root is taken to be the parent of the current working directory,
# so this assumes the kernel was started one level below the repository root.
PROJECT_ROOT = Path().resolve().parent
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_RAW.mkdir(parents=True, exist_ok=True)  # create on first run; no-op afterwards

print(f"Project root: {PROJECT_ROOT}")
print(f"Raw data directory: {DATA_RAW}")


Project root: /Users/shohruz/Air-Quality-Analysis
Raw data directory: /Users/shohruz/Air-Quality-Analysis/data/raw


In [23]:
# Build the Socrata client. The app token is optional: without one the client
# is still created, but a warning is printed because requests will be throttled.
app_token = os.environ.get("SOCRATA_APP_TOKEN")
if not app_token:
    print("Warning: SOCRATA_APP_TOKEN not found. Requests will be throttled.")
    print("Set it in .env file: SOCRATA_APP_TOKEN=your_token_here")

# Generous timeout (seconds) because a single page can hold tens of thousands of rows
client = Socrata(DOMAIN, app_token, timeout=100)

# NOTE(review): this only reports that the client object was created; no request
# has been issued at this point.
print(f"Connected to {DOMAIN}")




Set it in .env file: SOCRATA_APP_TOKEN=your_token_here
Connected to data.cityofnewyork.us


In [24]:
# Fetch data with pagination
# Socrata has a limit of 50,000 records per request
# We'll fetch all available data, one page of `limit` rows at a time.

print("Fetching data from NYC Open Data API...")
all_results = []
offset = 0
limit = 50000

while True:
    try:
        results = client.get(
            DATASET_ID,
            limit=limit,
            offset=offset,
            # Many rows share the same start_date, so sorting on it alone leaves
            # the order of tied rows undefined between requests; offset paging
            # could then skip or duplicate records. The system row id (:id) is
            # unique and makes the ordering total.
            order="start_date ASC, :id ASC",
        )
    except Exception as e:
        # Report where the fetch stopped, then re-raise: continuing would let the
        # following cells save a truncated dataset as if it were complete.
        # `all_results` keeps whatever was fetched so far for inspection.
        print(f"Error at offset {offset}: {e}")
        raise

    # An empty page means the previous page was the last one
    if not results:
        break

    all_results.extend(results)
    offset += len(results)
    print(f"Fetched {len(results)} records (total: {len(all_results)})")

    # If we got fewer than limit, we've reached the end
    if len(results) < limit:
        break

print(f"\nTotal records fetched: {len(all_results)}")


Fetching data from NYC Open Data API...
Fetched 18862 records (total: 18862)

Total records fetched: 18862


In [25]:
# Persist the API payload unchanged as a timestamped JSON file.
# `timestamp` is reused by the sample-CSV cell so both files share one suffix.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
json_filename = f"air_quality_raw_{timestamp}.json"
json_path = DATA_RAW.joinpath(json_filename)

# Serialise first, then write the whole document in one call
serialized_records = json.dumps(all_results, indent=2)
json_path.write_text(serialized_records)

size_mb = json_path.stat().st_size / 1024 / 1024  # bytes -> MiB
print(f"Saved raw JSON to: {json_path}")
print(f"File size: {size_mb:.2f} MB")


Saved raw JSON to: /Users/shohruz/Air-Quality-Analysis/data/raw/air_quality_raw_20251205_152721.json
File size: 6.70 MB


In [26]:
# Load the fetched records into a DataFrame and preview its structure
df = pd.DataFrame(all_results)

print(f"DataFrame shape: {df.shape}")
print(f"\nColumns: {df.columns.tolist()}")

# Last expression of the cell: rendered as a rich table of the first rows
df.head()


DataFrame shape: (18862, 11)

Columns: ['unique_id', 'indicator_id', 'name', 'measure', 'measure_info', 'geo_type_name', 'geo_join_id', 'geo_place_name', 'time_period', 'start_date', 'data_value']


Unnamed: 0,unique_id,indicator_id,name,measure,measure_info,geo_type_name,geo_join_id,geo_place_name,time_period,start_date,data_value
0,154596,643,Annual vehicle miles traveled,Million miles,per square mile,CD,313,Coney Island (CD13),2005,2005-01-01T00:00:00.000,31.85136
1,131251,657,Asthma emergency department visits due to PM2.5,Estimated annual rate (age 18+),"per 100,000 adults",UHF42,405,Ridgewood - Forest Hills,2005-2007,2005-01-01T00:00:00.000,19.1
2,151656,643,Annual vehicle miles traveled,Million miles,per square mile,UHF42,406,Fresh Meadows,2005,2005-01-01T00:00:00.000,61.967759
3,131253,657,Asthma emergency department visits due to PM2.5,Estimated annual rate (age 18+),"per 100,000 adults",UHF42,407,Southwest Queens,2005-2007,2005-01-01T00:00:00.000,30.6
4,130915,650,Respiratory hospitalizations due to PM2.5 (age...,Estimated annual rate,"per 100,000 adults",UHF42,405,Ridgewood - Forest Hills,2005-2007,2005-01-01T00:00:00.000,18.3


In [27]:
# Write a small CSV extract (first 1000 rows) for quick inspection.
# Uses `timestamp` from the raw-JSON cell so the sample pairs with that file.
csv_filename = f"air_quality_sample_{timestamp}.csv"
csv_path = DATA_RAW.joinpath(csv_filename)

sample_rows = df.iloc[:1000]
sample_rows.to_csv(csv_path, index=False)  # index is positional only, so omit it

print(f"Saved sample CSV to: {csv_path}")


Saved sample CSV to: /Users/shohruz/Air-Quality-Analysis/data/raw/air_quality_sample_20251205_152721.csv


In [28]:
# Data overview
# Labels follow what the columns actually contain in this dataset:
#   - `name` is the indicator name. It covers pollutant concentrations but also
#     health-outcome and traffic indicators, so it is not a list of pollutants.
#   - `geo_join_id` identifies a geographic area (geo_type_name is e.g. CD or
#     UHF42), not a monitoring station.
print("Data Overview:")
print(f"Total records: {len(df)}")
print(f"\nUnique indicators: {df['name'].nunique()}")
print(df['name'].value_counts())
print(f"\nUnique geographic areas: {df['geo_join_id'].nunique()}")
# start_date arrives as ISO-8601 text, so string min/max is also chronological
print(f"\nDate range: {df['start_date'].min()} to {df['start_date'].max()}")
print(f"\nUnits: {df['measure_info'].unique()}")


Data Overview:
Total records: 18862

Unique pollutants: 18
name
Fine particles (PM 2.5)                                   6345
Nitrogen dioxide (NO2)                                    6345
Ozone (O3)                                                2115
Asthma hospitalizations due to Ozone                       480
Asthma emergency departments visits due to Ozone           480
Asthma emergency department visits due to PM2.5            480
Annual vehicle miles traveled                              321
Annual vehicle miles traveled (cars)                       321
Annual vehicle miles traveled (trucks)                     321
Cardiovascular hospitalizations due to PM2.5 (age 40+)     240
Deaths due to PM2.5                                        240
Cardiac and respiratory deaths due to Ozone                240
Respiratory hospitalizations due to PM2.5 (age 20+)        240
Outdoor Air Toxics - Benzene                               203
Outdoor Air Toxics - Formaldehyde                     