### Extract BID (Business Improvement District) Data

In [None]:
import json
from datetime import datetime

import pandas as pd
import requests
from dotenv import dotenv_values
from sqlalchemy import create_engine, types
from sqlalchemy import text
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.engine import URL



# 1) EXTRACT
#
# Download every row of the BID dataset from the NYC Open Data (Socrata)
# JSON endpoint, paging with $limit / $offset until the data is exhausted.

dataset_id = "7jdm-inj8"
base_url = f"https://data.cityofnewyork.us/resource/{dataset_id}.json"
headers = {}

# collect ALL rows
limit = 50000
offset = 0
all_rows = []

while True:
    # $order pins a deterministic row order. Without it the server does not
    # guarantee that consecutive pages are consistent, so rows could be
    # skipped or returned twice across pages.
    params = {"$limit": limit, "$offset": offset, "$order": ":id"}
    resp = requests.get(base_url, headers=headers, params=params, timeout=60)
    resp.raise_for_status()
    batch = resp.json()

    all_rows.extend(batch)

    # A short (or empty) page means there is nothing left to fetch, so the
    # extra round-trip that would only return an empty list is avoided.
    if len(batch) < limit:
        break

    offset += limit

print(f"Downloaded rows: {len(all_rows)}")

Downloaded rows: 76


## RAW Table

In [2]:
# Single-row frame: the complete API payload plus the metadata of this extract
# (timestamp, endpoint URL, number of downloaded records).
bids_dict = {
    "extracted_at": [datetime.now()],
    "source": [base_url],
    "record_count": [len(all_rows)],
    "extracted_data": [all_rows],
}

bids_raw_df = pd.DataFrame(bids_dict)
bids_raw_df

Unnamed: 0,extracted_at,source,record_count,extracted_data
0,2026-02-13 10:44:13.823411,https://data.cityofnewyork.us/resource/7jdm-in...,76,"[{'the_geom': {'type': 'MultiPolygon', 'coordi..."


## Flatten Table

In [3]:
# Flatten the nested JSON records into one row per BID, normalise the column
# names to lower case and tag every row with the extract metadata.
bids_flat_df = (
    pd.json_normalize(all_rows)
      .rename(columns=str.lower)
      .assign(extracted_at=datetime.now(), source=base_url)
)

bids_flat_df.head()

Unnamed: 0,id,objectid_1,objectid_2,shape_leng,f_all_bids,f_all_bi_1,f_all_bi_2,f_all_bi_3,f_all_bi_4,f_all_bi_6,f_all_bi_7,shape_le_1,shape_ar_1,year_found,shape_area,the_geom.type,the_geom.coordinates,extracted_at,source
0,0,0,0,0.0,0.0,Queens,Long Island City Partnership,0.0,https://www.longislandcityqueens.com/,0.0,0.0,0.0,0.0,2005,0.0,MultiPolygon,"[[[[-73.94296015325146, 40.739491660153085], [...",2026-02-13 10:44:19.132981,https://data.cityofnewyork.us/resource/7jdm-in...
1,0,0,0,0.0,0.0,Brooklyn,Cypress Hills Fulton,0.0,https://cypresshillsfultonbid.org/,0.0,0.0,0.0,0.0,2023,0.0,MultiPolygon,"[[[[-73.89166191549769, 40.677599305700014], [...",2026-02-13 10:44:19.132981,https://data.cityofnewyork.us/resource/7jdm-in...
2,0,0,1,0.0,0.0,Manhattan,Union Square Partnership,892.0,https://www.unionsquarenyc.org/,3400000.0,3950623.0,25744.4695903,1275270.53685,1984,206555.808776,MultiPolygon,"[[[[-73.98282323348212, 40.73130538559934], [-...",2026-02-13 10:44:19.132981,https://data.cityofnewyork.us/resource/7jdm-in...
3,0,0,66,0.0,64.0,Bronx,Third Avenue,137.0,https://www.thirdavenuebid.org/,450927.0,630909.0,9019.03876308,333226.139247,1988,54105.6117762,MultiPolygon,"[[[[-73.91761807625159, 40.81590309522811], [-...",2026-02-13 10:44:19.132981,https://data.cityofnewyork.us/resource/7jdm-in...
4,0,0,67,0.0,65.0,Bronx,Throggs Neck BID,264.0,http://www.throggsneckbid.com/,341150.0,355208.0,26621.0244218,1046012.25664,2019,169887.730313,MultiPolygon,"[[[[-73.8171550825718, 40.81859656642491], [-7...",2026-02-13 10:44:19.132981,https://data.cityofnewyork.us/resource/7jdm-in...


## Load to Postgres DB

In [None]:
# Connection settings are read from the local .env file.
config = dotenv_values()

pg_user = config["POSTGRES_USER"]
pg_host = config["POSTGRES_HOST"]
pg_port = config["POSTGRES_PORT"]
pg_db   = config["POSTGRES_DB"]
pg_schema = config["POSTGRES_SCHEMA"]
pg_pass = config["POSTGRES_PASS"]

# Build the URL from its components instead of string interpolation:
# URL.create escapes characters such as "@", ":" or "/" in the user name or
# password, which would otherwise corrupt the connection string.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=pg_user,
        password=pg_pass,
        host=pg_host,
        port=int(pg_port),
        database=pg_db,
    ),
    echo=False,
)

# RAW: JSONB payload
dtype_raw = {
    "extracted_at": types.DateTime(),
    "source": types.String(),
    "record_count": types.Integer(),
    "extracted_data": JSONB(),
}

bids_raw_df.to_sql(
    name="bids_raw",
    con=engine,
    schema=pg_schema,
    if_exists="replace",   # use "append" if you want to keep history of extracts
    dtype=dtype_raw,
    index=False,
)

# FLAT: standard columns (lookup-like)
# Geometry / nested columns hold Python lists, so they are stored as JSONB to
# avoid ARRAY type errors. Only the columns actually present are mapped.
dtype_dim = {
    col: JSONB()
    for col in ("the_geom", "the_geom.coordinates")
    if col in bids_flat_df.columns
}

bids_flat_df.to_sql(
    name="dim_bid",
    con=engine,
    schema=pg_schema,
    if_exists="replace",
    index=False,
    dtype=dtype_dim,
)

print("✅ Loaded: bids_raw (JSONB) and dim_bid (flattened)")

✅ Loaded: bids_raw (JSONB) and dim_bid (flattened)


In [5]:
import geopandas as gpd

# Location of the taxi-zone shapefile. It can be overridden per machine with a
# TAXI_ZONES_PATH entry in the .env file; the original local path remains the
# fallback so existing setups keep working unchanged.
DEFAULT_TAXI_ZONES_PATH = r"C:\Users\patri\Documents\Data Analytics - neuefische\Capstone-project-NYCTaxi\data\taxi_zones_shp\taxi_zones.shp"
taxi_zones_path = dotenv_values().get("TAXI_ZONES_PATH") or DEFAULT_TAXI_ZONES_PATH

tz = gpd.read_file(taxi_zones_path)

# Quick sanity summary: dimensions, column names and CRS.
print(tz.shape)
print(tz.columns)
print(tz.crs)
tz.head(3)




(263, 7)
Index(['OBJECTID', 'Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough',
       'geometry'],
      dtype='object')
EPSG:2263


Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,geometry
0,1,0.116357,0.000782,Newark Airport,1,EWR,"POLYGON ((933100.918 192536.086, 933091.011 19..."
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,"MULTIPOLYGON (((1033269.244 172126.008, 103343..."
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,"POLYGON ((1026308.77 256767.698, 1026495.593 2..."


In [6]:
# Read the BID polygons from the GeoJSON flavour of the same dataset so that
# geopandas parses the geometries directly.
bids_url = "https://data.cityofnewyork.us/resource/7jdm-inj8.geojson"
bids = gpd.read_file(bids_url)

# Quick sanity summary: dimensions, column names and CRS.
for summary in (bids.shape, bids.columns, bids.crs):
    print(summary)

bids.head(3)


(76, 16)
Index(['shape_area', 'objectid_1', 'f_all_bi_1', 'f_all_bi_2', 'f_all_bi_6',
       'shape_leng', 'id', 'year_found', 'f_all_bi_3', 'f_all_bi_7',
       'objectid_2', 'f_all_bi_4', 'shape_ar_1', 'shape_le_1', 'f_all_bids',
       'geometry'],
      dtype='object')
EPSG:4326


Unnamed: 0,shape_area,objectid_1,f_all_bi_1,f_all_bi_2,f_all_bi_6,shape_leng,id,year_found,f_all_bi_3,f_all_bi_7,objectid_2,f_all_bi_4,shape_ar_1,shape_le_1,f_all_bids,geometry
0,0.0,0,Queens,Long Island City Partnership,0.0,0.0,0,2005,0.0,0.0,0,https://www.longislandcityqueens.com/,0.0,0.0,0.0,"MULTIPOLYGON (((-73.94296 40.73949, -73.94304 ..."
1,0.0,0,Brooklyn,Cypress Hills Fulton,0.0,0.0,0,2023,0.0,0.0,0,https://cypresshillsfultonbid.org/,0.0,0.0,0.0,"MULTIPOLYGON (((-73.89166 40.6776, -73.89203 4..."
2,206555.808776,0,Manhattan,Union Square Partnership,3400000.0,0.0,0,1984,892.0,3950623.0,1,https://www.unionsquarenyc.org/,1275270.53685,25744.4695903,0.0,"MULTIPOLYGON (((-73.98282 40.73131, -73.9829 4..."


In [None]:
# Both layers must share one Coordinate Reference System (CRS) before they can
# be joined spatially; the BIDs are reprojected to the taxi-zone CRS if needed.
crs_differs = tz.crs != bids.crs
if crs_differs:
    bids = bids.to_crs(tz.crs)

for label, layer in (("TZ CRS:", tz), ("BIDs CRS:", bids)):
    print(label, layer.crs)


TZ CRS: EPSG:2263
BIDs CRS: EPSG:2263


In [None]:
# Spatial join: which BID polygons intersect which taxi zones.
# how="left" keeps every taxi zone; zones without a BID get NaN in index_right.
j = gpd.sjoin(tz, bids, how="left", predicate="intersects")

# Column names of the taxi-zone layer (adjust here if your file differs).
# The zone id falls back to the lower-case name so the cell still runs after
# tz has been renamed further down in the notebook.
zone_id_col = "LocationID" if "LocationID" in tz.columns else "locationid"
zone_name_col = "zone"
borough_col = "borough"

# index_right is only populated when a BID matched, so count() (which ignores
# NaN) gives the number of matching BIDs and 0 for zones without any match.
bid_hit_col = "index_right"

# Computed in a single groupby: the keys and the counts come from the same
# grouping (dropna=False keeps zones with missing key values), so they cannot
# get out of step the way a positional assignment from a second groupby could.
out = (
    j.groupby([zone_id_col, zone_name_col, borough_col], dropna=False)[bid_hit_col]
     .count()
     .reset_index(name="bid_count")
)

out["has_bid"] = (out["bid_count"] > 0).astype(int)

out.head(10)


Unnamed: 0,LocationID,zone,borough,bid_count,has_bid
0,1,Newark Airport,EWR,0,0
1,2,Jamaica Bay,Queens,0,0
2,3,Allerton/Pelham Gardens,Bronx,0,0
3,4,Alphabet City,Manhattan,0,0
4,5,Arden Heights,Staten Island,0,0
5,6,Arrochar/Fort Wadsworth,Staten Island,0,0
6,7,Astoria,Queens,1,1
7,8,Astoria Park,Queens,0,0
8,9,Auburndale,Queens,0,0
9,10,Baisley Park,Queens,0,0


In [9]:
from dotenv import dotenv_values
from sqlalchemy import create_engine

# Connection settings are read from the local .env file.
config = dotenv_values()

pg_user   = config["POSTGRES_USER"]
pg_host   = config["POSTGRES_HOST"]
pg_port   = config["POSTGRES_PORT"]
pg_db     = config["POSTGRES_DB"]
pg_schema = config["POSTGRES_SCHEMA"]
pg_pass   = config["POSTGRES_PASS"]

# URL.create escapes special characters in the credentials, which plain string
# interpolation into the connection URL does not.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=pg_user,
        password=pg_pass,
        host=pg_host,
        port=int(pg_port),
        database=pg_db,
    ),
    echo=False,
)

# if_exists="replace" drops the table first, and Postgres refuses to drop a
# table that a view still depends on. vw_tableau_tip_bid (created at the end
# of this notebook) selects from dim_zone_bid, so it is dropped first; both
# steps share one transaction so a failed write rolls the drop back as well.
with engine.begin() as conn:
    conn.execute(text(f"DROP VIEW IF EXISTS {pg_schema}.vw_tableau_tip_bid"))
    out.to_sql(
        name="dim_zone_bid",
        con=conn,
        schema=pg_schema,
        if_exists="replace",
        index=False,
    )

print(f"✅ Saved: {pg_schema}.dim_zone_bid")


✅ Saved: s_patrickpaubandt.dim_zone_bid


In [10]:
# List the BID layer's columns to pick the join / dedupe keys used below.
print(bids.columns)

Index(['shape_area', 'objectid_1', 'f_all_bi_1', 'f_all_bi_2', 'f_all_bi_6',
       'shape_leng', 'id', 'year_found', 'f_all_bi_3', 'f_all_bi_7',
       'objectid_2', 'f_all_bi_4', 'shape_ar_1', 'shape_le_1', 'f_all_bids',
       'geometry'],
      dtype='object')


In [11]:
# List the taxi-zone layer's columns to confirm the zone id / name / borough fields.
print(tz.columns)

Index(['OBJECTID', 'Shape_Leng', 'Shape_Area', 'zone', 'LocationID', 'borough',
       'geometry'],
      dtype='object')


### Spatial Join + dedupe (bid_count + has_bid)

In [None]:
import geopandas as gpd
import pandas as pd

# 0) Use a lower-case zone id so it lines up with the database column naming
has_upper_id = "LocationID" in tz.columns
has_lower_id = "locationid" in tz.columns
if has_upper_id and not has_lower_id:
    tz = tz.rename(columns={"LocationID": "locationid"})

# 1) Pair every taxi zone with each BID it intersects (zones without a BID are kept)
j = gpd.sjoin(tz, bids, how="left", predicate="intersects")

# 2) + 3) Real matches only (index_right is set only when a BID matched),
#         then at most one row per zone / BID name (f_all_bi_2)
matched = j["index_right"].notna()
hits = j.loc[matched].drop_duplicates(subset=["locationid", "f_all_bi_2"])

# 4) Number of distinct BIDs per zone
bid_counts = (
    hits.groupby("locationid")["f_all_bi_2"]
        .nunique()
        .rename("bid_count")
        .reset_index()
)

# 5) Re-attach the counts to the full zone list so zones without a BID
#    survive with a count of 0, then derive the 0/1 flag
zone_lookup = tz[["locationid", "zone", "borough"]].drop_duplicates()
out_step1 = zone_lookup.merge(bid_counts, on="locationid", how="left")
out_step1 = out_step1.assign(
    bid_count=lambda d: d["bid_count"].fillna(0).astype(int),
    has_bid=lambda d: (d["bid_count"] > 0).astype(int),
)

out_step1.sort_values("bid_count", ascending=False).head(10)


Unnamed: 0,locationid,zone,borough,bid_count,has_bid
63,65,Downtown Brooklyn/MetroTech,Brooklyn,5,1
160,164,Midtown South,Manhattan,5,1
157,161,Midtown Center,Manhattan,5,1
32,33,Brooklyn Heights,Brooklyn,4,1
66,68,East Chelsea,Manhattan,4,1
95,97,Fort Greene,Brooklyn,4,1
98,100,Garment District,Manhattan,4,1
245,249,West Village,Manhattan,4,1
24,25,Boerum Hill,Brooklyn,4,1
47,48,Clinton East,Manhattan,3,1


In [13]:
# Only the last expression of a cell is rendered, so the has_bid distribution
# has to be printed explicitly or it is computed and silently thrown away.
print(out_step1["has_bid"].value_counts())

out_step1["bid_count"].describe()

count    260.000000
mean       0.715385
std        1.037778
min        0.000000
25%        0.000000
50%        0.000000
75%        1.000000
max        5.000000
Name: bid_count, dtype: float64

### Create overlap_share
Calculating the proportion of the zone area covered by BIDs for each zone.

In [None]:
# Total area per zone. EPSG:2263 is measured in US survey feet, so the areas
# are in square feet (not m²); the share computed below is unit-free.
# Some locationids consist of several polygons (263 rows vs. 260 distinct
# ids), so the polygon areas are summed per locationid rather than taking only
# the first polygon of each id.
zone_area = (
    pd.DataFrame(
        {
            "locationid": tz["locationid"].to_numpy(),
            "zone_area": tz.geometry.area.to_numpy(),
        }
    )
    .groupby("locationid", as_index=False)["zone_area"]
    .sum()
)

# Every intersecting (zone polygon, BID) pair. The join is recomputed here
# instead of reusing the deduplicated matches, because deduplicating on
# (locationid, BID name) discards the additional polygons of a multi-polygon
# zone that touch the same BID.
zone_bid_pairs = gpd.sjoin(
    tz[["locationid", "geometry"]],
    bids[["geometry"]],
    how="inner",
    predicate="intersects",
)

# BID geometry for each pair, positionally matched to the pair rows
# (align=False below: the pair index contains repeated zone labels).
pair_bid_geom = gpd.GeoSeries(
    bids.geometry.loc[zone_bid_pairs["index_right"]].values,
    index=zone_bid_pairs.index,
    crs=bids.crs,
)

# Intersection area per zone-polygon / BID pair
zone_bid_pairs["overlap_area"] = (
    zone_bid_pairs.geometry.intersection(pair_bid_geom, align=False).area.to_numpy()
)

# Sum up per zone.
# NOTE(review): if two BIDs overlap each other inside a zone, the shared area
# is counted twice here - confirm BID polygons are disjoint.
overlap_sum = (
    zone_bid_pairs.groupby("locationid")["overlap_area"]
                  .sum()
                  .reset_index()
)

# Merge onto the per-zone BID counts; zones without any BID get 0 overlap
out_step2 = (out_step1
    .merge(zone_area, on="locationid", how="left")
    .merge(overlap_sum, on="locationid", how="left")
)

out_step2["overlap_area"] = out_step2["overlap_area"].fillna(0.0)
out_step2["overlap_share"] = (out_step2["overlap_area"] / out_step2["zone_area"]).fillna(0.0)

# optional overlap in percent
out_step2["overlap_share_pct"] = out_step2["overlap_share"] * 100

out_step2.sort_values("overlap_share", ascending=False).head(10)


Unnamed: 0,locationid,zone,borough,bid_count,has_bid,zone_area,overlap_area,overlap_share,overlap_share_pct
11,12,Battery Park,Manhattan,1,1,1089567.0,905384.8,0.830958,83.095829
257,261,World Trade Center,Manhattan,1,1,3468847.0,2244834.0,0.647141,64.714122
98,100,Garment District,Manhattan,4,1,3782272.0,2400998.0,0.634803,63.480317
158,162,Midtown East,Manhattan,2,1,4834224.0,2877539.0,0.595243,59.524313
160,164,Midtown South,Manhattan,5,1,5615964.0,3294717.0,0.58667,58.666992
85,87,Financial District North,Manhattan,1,1,4182717.0,2388303.0,0.570993,57.099315
86,88,Financial District South,Manhattan,1,1,2621586.0,1418518.0,0.541092,54.109164
226,230,Times Sq/Theatre District,Manhattan,2,1,5661414.0,2826180.0,0.4992,49.920033
121,125,Hudson Sq,Manhattan,1,1,4105927.0,1931451.0,0.470406,47.040559
230,234,Union Sq,Manhattan,2,1,7380801.0,3355221.0,0.454588,45.458765


In [15]:
# Distribution of the BID overlap share across all taxi zones.
out_step2["overlap_share"].describe()

count    260.000000
mean       0.056241
std        0.131854
min        0.000000
25%        0.000000
50%        0.000000
75%        0.035162
max        0.830958
Name: overlap_share, dtype: float64

In [None]:
# The ten zones with the largest BID coverage, reduced to the reporting columns
# (zone / borough included for readability)
report_cols = [
    "locationid",
    "borough",
    "zone",
    "bid_count",
    "has_bid",
    "overlap_share",
    "overlap_share_pct",
]
ranked_zones = out_step2.sort_values("overlap_share", ascending=False)
ranked_zones[report_cols].head(10)


Unnamed: 0,locationid,borough,zone,bid_count,has_bid,overlap_share,overlap_share_pct
11,12,Manhattan,Battery Park,1,1,0.830958,83.095829
257,261,Manhattan,World Trade Center,1,1,0.647141,64.714122
98,100,Manhattan,Garment District,4,1,0.634803,63.480317
158,162,Manhattan,Midtown East,2,1,0.595243,59.524313
160,164,Manhattan,Midtown South,5,1,0.58667,58.666992
85,87,Manhattan,Financial District North,1,1,0.570993,57.099315
86,88,Manhattan,Financial District South,1,1,0.541092,54.109164
226,230,Manhattan,Times Sq/Theatre District,2,1,0.4992,49.920033
121,125,Manhattan,Hudson Sq,1,1,0.470406,47.040559
230,234,Manhattan,Union Sq,2,1,0.454588,45.458765


### Save out_step2 in DB as dim_zone_bid

In [None]:
# out_step2 -> save in postgres
# out_step2 -> save in postgres
# if_exists="replace" drops the table first, and Postgres refuses to drop a
# table that a view still depends on. vw_tableau_tip_bid (created in the next
# section) selects from dim_zone_bid, so on a re-run it has to be dropped
# before the table can be replaced. Both steps share one transaction, so a
# failed write rolls the drop back as well.
with engine.begin() as conn:
    conn.execute(text(f"DROP VIEW IF EXISTS {pg_schema}.vw_tableau_tip_bid"))
    out_step2.to_sql(
        name="dim_zone_bid",
        con=conn,
        schema=pg_schema,
        if_exists="replace",
        index=False,
    )

print(f"✅ Saved: {pg_schema}.dim_zone_bid")


✅ Saved: s_patrickpaubandt.dim_zone_bid


### Create new View: vw_tableau_tip + dim_zone_bid

Extend the existing Tableau view with the new information from the BID table, so that the new view (and Tableau) also contains the Business Improvement District attributes for the pickup and drop-off zones:

* has_bid = 0/1
* bid_count = Count of BIDs in Taxi Zone
* overlap_share = share of the taxi zone's area covered by BIDs (fraction, 0–1); overlap_share_pct = the same value in percent

In [None]:
from sqlalchemy import text

# View definition: every column of vw_tableau_tip plus the BID attributes of
# the pickup zone (pu_* columns) and the drop-off zone (do_* columns).
# dim_zone_bid is joined twice, once per location id; LEFT JOINs keep rows
# whose zone has no entry in dim_zone_bid.
# The schema name is interpolated from the .env configuration.
VIEW_SQL = f"""
CREATE OR REPLACE VIEW {pg_schema}.vw_tableau_tip_bid AS
SELECT
    t.*,
    pu.bid_count          AS pu_bid_count,
    pu.has_bid            AS pu_has_bid,
    pu.overlap_share      AS pu_bid_overlap_share,
    pu.overlap_share_pct  AS pu_bid_overlap_share_pct,
    do_.bid_count         AS do_bid_count,
    do_.has_bid           AS do_has_bid,
    do_.overlap_share     AS do_bid_overlap_share,
    do_.overlap_share_pct AS do_bid_overlap_share_pct
FROM {pg_schema}.vw_tableau_tip t
LEFT JOIN {pg_schema}.dim_zone_bid pu
    ON pu.locationid = t.pulocationid
LEFT JOIN {pg_schema}.dim_zone_bid do_
    ON do_.locationid = t.dolocationid
"""

# engine.begin() opens a transaction that is committed when the block exits
# without an error.
with engine.begin() as conn:
    conn.execute(text(VIEW_SQL))

print(f"Created/updated {pg_schema}.vw_tableau_tip_bid")


Created/updated s_patrickpaubandt.vw_tableau_tip_bid
