In [None]:
# Mount Google Drive at /content/drive; the config file and data folders used
# below are all addressed through this mount point (Colab-only module).
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import json
from pathlib import Path

# Load the project configuration JSON stored on the mounted Drive.
cfg_path = Path("/content/drive/MyDrive/UrbanSimAI_Chicago/urbansim_config.json")
cfg = json.loads(cfg_path.read_text())

# Convert the four configured directory strings to Path objects in one pass
# (a missing key raises KeyError, exactly as individual lookups would).
PROJECT_ROOT, RAW_DIR, PROC_DIR, REPORT_DIR = (
    Path(cfg[key]) for key in ("PROJECT_ROOT", "RAW_DIR", "PROC_DIR", "REPORT_DIR")
)

print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR:", RAW_DIR)

PROJECT_ROOT: /content/drive/MyDrive/UrbanSimAI_Chicago
RAW_DIR: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw


In [None]:
#if runtime reset
!pip -q install requests tqdm pandas pyarrow fastparquet geopandas shapely osmnx

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m7.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.8/1.8 MB[0m [31m27.1 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/101.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.5/101.5 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, time, math, shutil
import requests
import pandas as pd
from tqdm import tqdm
from pathlib import Path
from datetime import datetime, timezone, timedelta
from dateutil import parser as dtparser

In [None]:
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Retry policy for the shared session: up to 10 attempts with backoff, applied
# to GET requests that fail with throttling (429) or 5xx gateway/server errors.
retries = Retry(
    total=10,
    backoff_factor=1.5,
    status_forcelist=[429, 500, 502, 503, 504],
    allowed_methods=["GET"],
)

# One session reused by every download helper below; only https:// URLs get
# the retrying adapter.
session = requests.Session()
session.mount("https://", HTTPAdapter(max_retries=retries))

def stamp():
    """Return the current UTC time as a 'YYYY-MM-DD HH:MM:SS UTC' string."""
    now_utc = datetime.now(timezone.utc)
    return f"{now_utc:%Y-%m-%d %H:%M:%S} UTC"

def download_file(url: str, out_path: Path, overwrite: bool = False):
    """
    Stream `url` to `out_path` through the shared retrying `session`.

    Parameters
    ----------
    url : address fetched with a streaming GET.
    out_path : destination file; its parent directory is created if needed.
    overwrite : when False (default) an existing `out_path` is left alone and
        the download is skipped; when True the file is replaced.

    Returns None. Raises whatever the HTTP request or the file write raises
    (e.g. `requests.HTTPError` from `raise_for_status`).

    The body is written to a sibling ``<name>.part`` file and moved over
    `out_path` only after the transfer completed, so an existing file survives
    a failed overwrite and no partial file is ever visible under the final name.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    if out_path.exists() and not overwrite:
        print(f"⏭️ Skip (exists): {out_path.name}")
        return

    print(f"⬇️ Downloading: {url}")
    tmp_path = out_path.with_suffix(out_path.suffix + ".part")
    try:
        # The context manager releases the connection even when the transfer fails.
        with session.get(url, stream=True, timeout=(30, 1200)) as r:
            r.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=1024 * 1024):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # replace() overwrites an existing target, so the old file is only
        # dropped once the new one is complete.
        tmp_path.replace(out_path)
    except BaseException:
        # Also covers a notebook interrupt: never leave a stale .part behind.
        if tmp_path.exists():
            tmp_path.unlink()
        raise

    print(f"✅ Saved: {out_path} ({out_path.stat().st_size/1e6:.2f} MB)")

In [None]:
# One sub-folder of RAW_DIR per data source.
traffic_dir = RAW_DIR / "chicago_traffic"
cta_dir = RAW_DIR / "cta_gtfs"
zillow_dir = RAW_DIR / "zillow"
osm_dir = RAW_DIR / "osm"

# Create any folder that does not exist yet (no-op for existing ones).
for raw_subdir in (traffic_dir, cta_dir, zillow_dir, osm_dir):
    raw_subdir.mkdir(parents=True, exist_ok=True)

print("traffic_dir:", traffic_dir)
print("cta_dir:", cta_dir)
print("zillow_dir:", zillow_dir)
print("osm_dir:", osm_dir)

traffic_dir: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/chicago_traffic
cta_dir: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/cta_gtfs
zillow_dir: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/zillow
osm_dir: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/osm


In [None]:
# Settings for the traffic downloads performed in the download cell below.

# When True, an already-present 4g9f-3jbs CSV is reported and reused.
KEEP_EXISTING_4G9F_CSV = True

# Which historical datasets to fetch as parquet parts.
DOWNLOAD_SXS8 = True     # sxs8-h27x, 2018–2023
DOWNLOAD_77HQ = False    # 77hq-huss, 2011–2018

# Rows requested per page (sent as $limit; also the offset step between parts).
LIMIT = 20000
# Pause in seconds between consecutive page requests.
SLEEP_S = 0.05

# Row caps: paging stops once the offset reaches these values.
SXS8_MAX_ROWS = 2_000_000     # ~2 million rows only
HQ77_MAX_ROWS = 1_000_000     # ~1 million rows only

# Size in days of the time window that ends at each dataset's newest timestamp.
SXS8_DAYS_BACK = 120
HQ77_DAYS_BACK = 120

# Echo the effective settings so they are recorded in the cell output.
print("KEEP_EXISTING_4G9F_CSV:", KEEP_EXISTING_4G9F_CSV)
print("DOWNLOAD_SXS8:", DOWNLOAD_SXS8, "| SXS8_MAX_ROWS:", SXS8_MAX_ROWS)
print("DOWNLOAD_77HQ:", DOWNLOAD_77HQ, "| HQ77_MAX_ROWS:", HQ77_MAX_ROWS)
print("LIMIT:", LIMIT, "SLEEP_S:", SLEEP_S)

KEEP_EXISTING_4G9F_CSV: True
DOWNLOAD_SXS8: True | SXS8_MAX_ROWS: 2000000
DOWNLOAD_77HQ: False | HQ77_MAX_ROWS: 1000000
LIMIT: 20000 SLEEP_S: 0.05


In [None]:
import pyarrow as pa
import pyarrow.parquet as pq
from dateutil import parser as dtparser

def folder_size_gb(folder: Path) -> float:
    """
    Return the combined size of every regular file below `folder` (recursive),
    expressed in GiB (bytes / 1024**3). A missing folder counts as 0.0.
    """
    if not folder.exists():
        return 0.0
    total_bytes = sum(
        entry.stat().st_size for entry in folder.rglob("*") if entry.is_file()
    )
    return total_bytes / (1024**3)

def parts_status(parts_dir: Path, limit: int):
    """
    Summarise the parquet parts already present in `parts_dir`.

    Returns a dict with:
      parts       - number of ``*.parquet`` files found
      rows_approx - parts * limit (an estimate; a short final part is not accounted for)
      gb          - folder size as reported by `folder_size_gb`
      max_offset  - largest offset encoded in a file name

    A missing or empty folder yields all-zero values. A parquet file whose
    stem is not an integer raises ValueError.
    """
    if parts_dir.exists():
        part_files = sorted(parts_dir.glob("*.parquet"))
    else:
        part_files = []

    if len(part_files) == 0:
        return {"parts": 0, "rows_approx": 0, "gb": 0.0, "max_offset": 0}

    # File stems are zero-padded row offsets, e.g. 015340000.parquet.
    highest_offset = max(int(part.stem) for part in part_files)
    part_count = len(part_files)
    return {
        "parts": part_count,
        "rows_approx": part_count * limit,
        "gb": folder_size_gb(parts_dir),
        "max_offset": highest_offset,
    }

def get_timestamp_field(view_id: str) -> str:
    """
    Pick the most likely timestamp column of a Chicago Data Portal view.

    Fetches the view metadata and considers every column whose data type is
    calendar_date, floating_timestamp or fixed_timestamp. Each candidate is
    scored by keywords in its field name ("time" +3, "date" +2,
    "measurement" +2) and the best one is returned.

    Returns the field name, or None when the view has no timestamp-typed
    column (despite the `str` annotation). Raises on HTTP errors.
    """
    meta_url = f"https://data.cityofchicago.org/api/views/{view_id}.json"
    resp = session.get(meta_url, timeout=(30, 1200))
    resp.raise_for_status()
    columns = resp.json().get("columns", [])

    timestamp_types = ("calendar_date", "floating_timestamp", "fixed_timestamp")
    keyword_weights = (("time", 3), ("date", 2), ("measurement", 2))

    scored = []
    for column in columns:
        field = column.get("fieldName", "")
        if column.get("dataTypeName", "") not in timestamp_types:
            continue
        lowered = field.lower()
        points = sum(weight for keyword, weight in keyword_weights if keyword in lowered)
        scored.append((points, field))

    # Tuples compare by score first and field name second, so the maximum is
    # the same element a descending sort would put first.
    return max(scored)[1] if scored else None

def get_max_timestamp(view_id: str, ts_field: str):
    """
    Return the newest value of `ts_field` in the dataset `view_id`.

    Runs a ``max(ts_field)`` aggregate against the resource endpoint and
    parses the ISO-8601 answer.

    Returns a timezone-aware datetime labelled UTC, or None when the dataset
    returns no rows or a null maximum. Raises on HTTP errors.

    A value that carries a UTC offset is converted to UTC. A value without an
    offset keeps its wall-clock reading and is only *labelled* UTC: calling
    ``astimezone`` on a naive datetime would interpret it in the machine's
    local zone and shift it on any host that is not running in UTC, which in
    turn would shift the time window built from it.
    """
    url = f"https://data.cityofchicago.org/resource/{view_id}.json"
    params = {"$select": f"max({ts_field}) as max_ts"}
    r = session.get(url, params=params, timeout=(30, 1200))
    r.raise_for_status()
    js = r.json()
    if not js or js[0].get("max_ts") is None:
        return None

    parsed = dtparser.isoparse(js[0]["max_ts"])
    if parsed.tzinfo is None:
        # Offset-less timestamp: preserve the clock reading as-is.
        return parsed.replace(tzinfo=timezone.utc)
    return parsed.astimezone(timezone.utc)

def socrata_download_parts_maxrows(view_id: str,
                                  out_parts_dir: Path,
                                  max_rows: int,
                                  days_back: int,
                                  limit: int = 20000,
                                  sleep_s: float = 0.05,
                                  max_retries: int = 10):
    """
    Download a capped, most-recent slice of a Chicago Data Portal dataset as
    resumable parquet parts.

    Each page of `limit` rows is written to ``<offset:09d>.parquet`` inside
    `out_parts_dir` (via a ``.parquet.part`` temp file that is renamed when
    complete). When a timestamp column is found, rows are restricted to the
    `days_back` days ending at the dataset's newest timestamp and fetched
    newest first.

    Parameters
    ----------
    view_id : dataset identifier, e.g. "sxs8-h27x".
    out_parts_dir : folder receiving the parts; created if missing.
    max_rows : paging stops once the offset reaches this value.
    days_back : length in days of the time window.
    limit : page size; also the offset step between parts. Must be > 0.
    sleep_s : pause between successful page requests, in seconds.
    max_retries : consecutive failed attempts tolerated for one page before
        giving up (the session-level retries apply inside each attempt).

    Returns None.

    Raises
    ------
    ValueError : if `limit` is not positive.
    RuntimeError : if one page keeps failing after `max_retries` retries.

    Resuming: if parts already exist, paging continues after the largest
    existing offset; nothing is downloaded when that is already >= `max_rows`.
    Parts written by an earlier run are not re-validated against the current
    window or ordering.
    """
    if limit <= 0:
        # A non-positive step would never advance the offset.
        raise ValueError(f"limit must be a positive integer, got {limit}")

    out_parts_dir.mkdir(parents=True, exist_ok=True)
    base = f"https://data.cityofchicago.org/resource/{view_id}.json"

    ts_field = get_timestamp_field(view_id)
    where = None
    # Offset paging is only reliable with a total order. ":id" (the portal's
    # internal row identifier) provides one on its own and breaks ties between
    # rows that share a timestamp.
    order = ":id"
    if ts_field:
        max_ts = get_max_timestamp(view_id, ts_field)
        if max_ts:
            start_ts = max_ts - timedelta(days=days_back)
            start_s = start_ts.strftime("%Y-%m-%dT%H:%M:%S.000")
            end_s = max_ts.strftime("%Y-%m-%dT%H:%M:%S.000")
            where = f"{ts_field} between '{start_s}' and '{end_s}'"
            order = f"{ts_field} DESC, :id"

    st = parts_status(out_parts_dir, limit)
    print("\n==============================")
    print("FAST MINI (MAX ROWS) DOWNLOAD")
    print("view_id:", view_id)
    print("parts_dir:", out_parts_dir)
    print("timestamp_field:", ts_field)
    print("where:", where)
    print("max_rows:", max_rows, "| limit:", limit)
    print("CURRENT:", st)
    print("==============================\n")

    # Start from the next missing offset (resumable)
    offset = 0
    if st["parts"] > 0:
        # existing parts mean we continue after the largest existing offset
        offset = st["max_offset"] + limit

    # If already exceeded target, stop immediately
    if offset >= max_rows:
        print(f"✅ Already have enough. offset={offset:,} >= max_rows={max_rows:,}. STOP.")
        return

    failures = 0  # consecutive failed attempts for the current page
    while offset < max_rows:
        part_path = out_parts_dir / f"{offset:09d}.parquet"
        if part_path.exists():
            offset += limit
            continue

        params = {"$limit": limit, "$offset": offset, "$order": order}
        if where: params["$where"] = where

        try:
            r = session.get(base, params=params, timeout=(30, 2400))
            r.raise_for_status()
            data = r.json()
        except Exception as e:
            failures += 1
            print(f"⚠️ Error at offset={offset}: {e}")
            if failures > max_retries:
                # A permanent error (bad query, removed dataset, ...) must not
                # keep the cell spinning forever.
                raise RuntimeError(
                    f"Giving up on view {view_id} at offset={offset} "
                    f"after {failures} failed attempts"
                ) from e
            print("⏳ wait 20s then retry...")
            time.sleep(20)
            continue
        failures = 0

        if not data:
            print("✅ No more rows returned. Done.")
            break

        df = pd.DataFrame(data)
        rows = len(df)

        # Write under a temp name first so an interrupted write never looks
        # like a finished part to the resume logic.
        tmp_path = part_path.with_suffix(".parquet.part")
        table = pa.Table.from_pandas(df, preserve_index=False)
        pq.write_table(table, tmp_path, compression="snappy")
        tmp_path.rename(part_path)

        # Progress line for every 10th part only.
        if offset % (limit * 10) == 0:
            print(f"✅ Saved {part_path.name} rows={rows} | folder={folder_size_gb(out_parts_dir):.3f} GB")

        if rows < limit:
            print("✅ Last page reached (rows < limit). Done.")
            break

        offset += limit
        time.sleep(sleep_s)
    else:
        # Runs only when the loop ended because offset reached max_rows,
        # not when it stopped early on the end of the data.
        print("✅ Reached max_rows cap. STOP.")

In [None]:
# Manifest of everything this notebook downloads; later cells add their own
# entries and a final cell writes it to disk.
manifest = {"downloaded_at": stamp(), "traffic": {}}

traffic_dir = RAW_DIR / "chicago_traffic"
traffic_dir.mkdir(parents=True, exist_ok=True)

# A) 2024-current (4g9f-3jbs) — keep existing CSV
traffic_4g9f = traffic_dir / "traffic_hist_2024_current_4g9f_3jbs.csv"
if KEEP_EXISTING_4G9F_CSV and traffic_4g9f.exists():
    print("✅ Using existing:", traffic_4g9f.name, "| GB:", traffic_4g9f.stat().st_size/1024**3)
else:
    url = "https://data.cityofchicago.org/api/views/4g9f-3jbs/rows.csv?accessType=DOWNLOAD"
    # download_file skips existing files unless told to overwrite, so the flag
    # must be forwarded; otherwise KEEP_EXISTING_4G9F_CSV = False would
    # silently keep the old CSV.
    download_file(url, traffic_4g9f, overwrite=not KEEP_EXISTING_4G9F_CSV)

manifest["traffic"]["4g9f_3jbs_csv"] = {"view_id": "4g9f-3jbs", "file": str(traffic_4g9f)}

# B) 2018–2023 (sxs8-h27x) — FAST MINI (max rows)
if DOWNLOAD_SXS8:
    sxs8_parts = traffic_dir / "sxs8_h27x_mini_parts"
    socrata_download_parts_maxrows(
        view_id="sxs8-h27x",
        out_parts_dir=sxs8_parts,
        max_rows=SXS8_MAX_ROWS,
        days_back=SXS8_DAYS_BACK,
        limit=LIMIT,
        sleep_s=SLEEP_S
    )
    manifest["traffic"]["sxs8_h27x_parts"] = {
        "view_id": "sxs8-h27x", "parts_dir": str(sxs8_parts),
        "max_rows": SXS8_MAX_ROWS, "days_back": SXS8_DAYS_BACK, "limit": LIMIT
    }

# C) 2011–2018 (77hq-huss) — FAST MINI (optional)
if DOWNLOAD_77HQ:
    hq77_parts = traffic_dir / "77hq_huss_mini_parts"
    socrata_download_parts_maxrows(
        view_id="77hq-huss",
        out_parts_dir=hq77_parts,
        max_rows=HQ77_MAX_ROWS,
        days_back=HQ77_DAYS_BACK,
        limit=LIMIT,
        sleep_s=SLEEP_S
    )
    manifest["traffic"]["77hq_huss_parts"] = {
        "view_id": "77hq-huss", "parts_dir": str(hq77_parts),
        "max_rows": HQ77_MAX_ROWS, "days_back": HQ77_DAYS_BACK, "limit": LIMIT
    }

# D) Latest snapshot (small)
# NOTE(review): this is skipped when the file already exists, so a re-run keeps
# the earlier snapshot — confirm whether a refresh is wanted here.
latest_url = "https://data.cityofchicago.org/api/views/n4j6-wkkf/rows.csv?accessType=DOWNLOAD"
latest_csv = traffic_dir / "traffic_latest_n4j6_wkkf.csv"
download_file(latest_url, latest_csv)
manifest["traffic"]["n4j6_wkkf_latest"] = {"view_id": "n4j6-wkkf", "file": str(latest_csv)}

✅ Using existing: traffic_hist_2024_current_4g9f_3jbs.csv | GB: 16.976862384937704

FAST MINI (MAX ROWS) DOWNLOAD
view_id: sxs8-h27x
parts_dir: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/chicago_traffic/sxs8_h27x_mini_parts
timestamp_field: time
where: time between '2023-05-11T12:51:17.000' and '2023-09-08T12:51:17.000'
max_rows: 2000000 | limit: 20000
CURRENT: {'parts': 804, 'rows_approx': 16080000, 'gb': 0.43933881539851427, 'max_offset': 16060000}

✅ Already have enough. offset=16,080,000 >= max_rows=2,000,000. STOP.
⬇️ Downloading: https://data.cityofchicago.org/api/views/n4j6-wkkf/rows.csv?accessType=DOWNLOAD
✅ Saved: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/chicago_traffic/traffic_latest_n4j6_wkkf.csv (0.16 MB)


In [None]:
from pathlib import Path

# Trim the sxs8 parts folder down to the configured row cap (destructive:
# surplus part files are deleted).
parts_dir = RAW_DIR / "chicago_traffic" / "sxs8_h27x_mini_parts"

# Parts needed to cover SXS8_MAX_ROWS at LIMIT rows per part, rounded up
# (2,000,000 / 20,000 = 100 with the current settings). Derived from the
# download settings so the two cannot drift apart.
KEEP_PARTS = (SXS8_MAX_ROWS + LIMIT - 1) // LIMIT

# Names are zero-padded offsets, so the sorted order is also the offset order
# and the slice below removes the highest offsets.
parts = sorted(parts_dir.glob("*.parquet"))
print("Total parts before:", len(parts))

for p in parts[KEEP_PARTS:]:
    p.unlink()

parts_after = sorted(parts_dir.glob("*.parquet"))
print("✅ Total parts after:", len(parts_after))

Total parts before: 804
✅ Total parts after: 100


In [None]:
from pathlib import Path

# Quick inventory of the traffic downloads: the two CSVs and the sxs8 parts.
traffic_dir = RAW_DIR / "chicago_traffic"

files = [
    traffic_dir / name
    for name in (
        "traffic_hist_2024_current_4g9f_3jbs.csv",
        "traffic_latest_n4j6_wkkf.csv",
    )
]

print("=== Files ===")
for f in files:
    if f.exists():
        print(f.name, "exists:", True, "| GB:", round(f.stat().st_size/1024**3, 3))
    else:
        print(f.name, "exists:", False, "| GB:", None)

parts_dir = traffic_dir / "sxs8_h27x_mini_parts"
if parts_dir.exists():
    parts = sorted(parts_dir.glob("*.parquet"))
else:
    parts = []

print("\n=== sxs8 parts ===")
print("parts:", len(parts))
# Estimate only: assumes every part holds a full page of LIMIT rows.
print("approx rows:", len(parts) * LIMIT)

=== Files ===
traffic_hist_2024_current_4g9f_3jbs.csv exists: True | GB: 16.977
traffic_latest_n4j6_wkkf.csv exists: True | GB: 0.0

=== sxs8 parts ===
parts: 100
approx rows: 2000000


In [None]:
# CTA GTFS schedule feed: fetch the zip (skipped if already present) and
# register it in the manifest.
cta_url = "https://www.transitchicago.com/downloads/sch_data/google_transit.zip"

cta_dir = RAW_DIR / "cta_gtfs"
cta_dir.mkdir(parents=True, exist_ok=True)
cta_zip = cta_dir / "google_transit.zip"

download_file(cta_url, cta_zip)

manifest["cta_gtfs"] = {
    "url": cta_url,
    "file": str(cta_zip),
    "downloaded_at": stamp(),
}

⬇️ Downloading: https://www.transitchicago.com/downloads/sch_data/google_transit.zip
✅ Saved: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/cta_gtfs/google_transit.zip (69.05 MB)


In [None]:
import zipfile

# Unpack the GTFS archive next to the zip and list the extracted .txt tables.
extract_dir = cta_dir / "extracted"
extract_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(cta_zip, "r") as z:
    z.extractall(extract_dir)

extracted_names = sorted(p.name for p in extract_dir.glob("*.txt"))

print("✅ Extracted to:", extract_dir)
print("Files:", extracted_names[:15])

✅ Extracted to: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/cta_gtfs/extracted
Files: ['agency.txt', 'calendar.txt', 'calendar_dates.txt', 'frequencies.txt', 'routes.txt', 'shapes.txt', 'stop_times.txt', 'stops.txt', 'transfers.txt', 'trips.txt']


In [None]:
import zipfile

# Zillow ZHVI (ZIP-code level) CSV: download it and register it in the manifest.
# The folder variable is assigned here so the cell no longer depends on a
# `zillow_dir` left over from an earlier cell.
zillow_dir = RAW_DIR / "zillow"
zillow_dir.mkdir(parents=True, exist_ok=True)

zhvi_url = "https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv"
zhvi_csv = zillow_dir / "Zip_ZHVI.csv"
download_file(zhvi_url, zhvi_csv)

manifest["zillow_zhvi_zip"] = {"url": zhvi_url, "file": str(zhvi_csv), "downloaded_at": stamp()}

# The CTA GTFS archive is unpacked again here, repeating the extraction cell.
# It is kept so the cell's behaviour and output are unchanged; it overwrites
# the same files and is safe to drop.
extract_dir = cta_dir / "extracted"
extract_dir.mkdir(parents=True, exist_ok=True)

with zipfile.ZipFile(cta_zip, "r") as z:
    z.extractall(extract_dir)

print("✅ Extracted to:", extract_dir)
print("Files:", sorted([p.name for p in extract_dir.glob("*.txt")])[:10], "...")

⬇️ Downloading: https://files.zillowstatic.com/research/public_csvs/zhvi/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv
✅ Saved: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/zillow/Zip_ZHVI.csv (118.75 MB)
✅ Extracted to: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/cta_gtfs/extracted
Files: ['agency.txt', 'calendar.txt', 'calendar_dates.txt', 'frequencies.txt', 'routes.txt', 'shapes.txt', 'stops.txt', 'transfers.txt', 'trips.txt'] ...


In [None]:
import osmnx as ox

# Drivable road network for Chicago, fetched through osmnx and cached as GeoJSON.
place = "Chicago, Illinois, USA"

roads_dir = RAW_DIR / "osm"
roads_dir.mkdir(parents=True, exist_ok=True)
roads_geojson = roads_dir / "chicago_roads_drive.geojson"

if not roads_geojson.exists():
    G = ox.graph_from_place(place, network_type="drive", simplify=True)
    edges = ox.graph_to_gdfs(G, nodes=False, edges=True)

    # Keep only the attributes of interest, in the order the frame has them;
    # columns the graph did not provide are simply absent.
    wanted = ("highway", "name", "length", "lanes", "maxspeed", "geometry")
    keep = [c for c in edges.columns if c in wanted]
    edges = edges[keep].copy()

    edges.to_file(roads_geojson, driver="GeoJSON")
    print("✅ Saved roads:", roads_geojson, "| rows:", len(edges))
else:
    # Cached result from an earlier run: nothing to download.
    print("⏭️ Skip roads (exists):", roads_geojson)

manifest["osm_roads"] = {"place": place, "file": str(roads_geojson), "downloaded_at": stamp()}

✅ Saved roads: /content/drive/MyDrive/UrbanSimAI_Chicago/data_raw/osm/chicago_roads_drive.geojson | rows: 77244


In [None]:
import json

# Write the manifest assembled by the download cells as pretty-printed JSON.
manifest_path = REPORT_DIR / "download_manifest.json"
# No cell visible in this notebook creates REPORT_DIR, so make sure it exists
# before writing (write_text raises FileNotFoundError otherwise).
manifest_path.parent.mkdir(parents=True, exist_ok=True)
manifest_path.write_text(json.dumps(manifest, indent=2))
print("✅ Saved manifest:", manifest_path)

✅ Saved manifest: /content/drive/MyDrive/UrbanSimAI_Chicago/reports/download_manifest.json


In [None]:
# Final sanity check: large raw files, traffic parts status and two previews.
print("\n--- RAW_DIR size check (top files) ---")
for p in sorted(RAW_DIR.rglob("*")):
    if not p.is_file():
        continue
    if p.suffix.lower() not in (".csv", ".zip", ".geojson", ".parquet"):
        continue
    mb = p.stat().st_size / 1e6
    if mb < 1:
        # Only files of at least 1 MB are listed.
        continue
    print(f"{p.relative_to(PROJECT_ROOT)} | {mb:.2f} MB")

print("\n--- Traffic mini status ---")
sxs8_parts_dir = RAW_DIR / "chicago_traffic" / "sxs8_h27x_mini_parts"
print(parts_status(sxs8_parts_dir, LIMIT))

import pandas as pd
print("\n--- Latest traffic preview ---")
latest_preview_path = RAW_DIR / "chicago_traffic" / "traffic_latest_n4j6_wkkf.csv"
display(pd.read_csv(latest_preview_path, nrows=5))

print("\n--- Zillow preview (first 3 rows, first 10 columns) ---")
# Only three rows are read, so the large CSV is never loaded in full.
z = pd.read_csv(RAW_DIR / "zillow" / "Zip_ZHVI.csv", nrows=3)
display(z.iloc[:, :10])


--- RAW_DIR size check (top files) ---
data_raw/chicago_traffic/traffic_hist_2024_current_4g9f_3jbs.csv | 18228.77 MB
data_raw/cta_gtfs/google_transit.zip | 69.05 MB
data_raw/osm/chicago_roads_drive.geojson | 31.37 MB
data_raw/zillow/Zip_ZHVI.csv | 118.75 MB

--- Traffic mini status ---
{'parts': 100, 'rows_approx': 2000000, 'gb': 0.054417804814875126, 'max_offset': 1980000}

--- Latest traffic preview ---


Unnamed: 0,SEGMENTID,STREET,DIRECTION,FROM_STREET,TO_STREET,LENGTH,STREET_HEADING,COMMENTS,START_LONGITUDE,START_LATITUDE,END_LONGITUDE,END_LATITUDE,CURRENT_SPEED,LAST_UPDATED
0,1284,Chicago,WB,Lake Shore Dr,Michigan,0.37,E,,-87.617048,41.896936,-87.624241,41.896835,-1,2011-08-10 00:00:00.0
1,951,Washington,WB,Kedzie,Schraeder,0.28,W,,-87.706169,41.882932,-87.711747,41.882818,-1,2010-07-21 14:51:00.0
2,750,Elston,SE,Milwaukee,Austin,0.33,N,,-87.783224,41.992665,-87.778073,41.989905,-1,2010-07-21 14:51:10.0
3,1164,Harlem,SB,Ogden,Pershing,0.173023,S,Outside City Limits,-87.802922,41.823751,-87.803025,41.821245,-1,2010-07-21 14:51:15.0
4,1122,127th,EB,Western,I-57 Expy,0.907892,W,Outside City Limits,-87.680077,41.662512,-87.66254,41.66285,-1,2010-07-21 14:51:06.0



--- Zillow preview (first 3 rows, first 10 columns) ---


Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,207990.862896
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,115428.100095
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,103617.439109


In [None]:
#END