In [1]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from sodapy import Socrata

In [2]:
# --- 1. SETUP & FILE PATHS ---
print("--- 1. Setting up file paths and API clients ---")
# Input from the Walk Score notebook
property_data_path = '../data/processed/manhattan_sales_walkscore.parquet'

# Path to the shapefile we downloaded and renamed
precinct_shapefile_path = '../data/raw/police_precincts/police_precincts.shp'

# The final, golden dataset!
output_path = '../data/processed/manhattan_sales_fully_enriched.parquet' 

# Socrata API setup for NYC Open Data
socrata_domain = "data.cityofnewyork.us"
nypd_dataset_identifier = "qgea-i56i" 
client = Socrata(socrata_domain, None)



--- 1. Setting up file paths and API clients ---


In [3]:
# --- 2. LOAD & PREPARE PROPERTY DATA AS A GEODATAFRAME ---
print("\n--- 2. Loading and preparing property data ---")
try:
    df_props = pd.read_parquet(property_data_path)
    # Convert the pandas DataFrame to a GeoDataFrame for spatial operations
    geometry = [Point(xy) for xy in zip(df_props['longitude'], df_props['latitude'])]
    gdf_props = gpd.GeoDataFrame(df_props, geometry=geometry, crs="EPSG:4326")
    print(f"Successfully converted {len(gdf_props)} properties to a GeoDataFrame.")
except FileNotFoundError:
    raise FileNotFoundError("Run the 05_walkscore notebook first to generate its output file.")


--- 2. Loading and preparing property data ---
Successfully converted 6505 properties to a GeoDataFrame.


In [4]:
# --- 3. ACQUIRE & PROCESS CRIME DATA VIA API ---
print("\n--- 3. Acquiring and processing crime data via API ---")
print("Querying NYC Open Data API for 2024 Manhattan crime data...")
try:
    # SoQL query to get all complaints in Manhattan for the full year of 2024
    results = client.get(
        nypd_dataset_identifier,
        where="cmplnt_fr_dt >= '2024-01-01T00:00:00' AND cmplnt_fr_dt < '2025-01-01T00:00:00' AND boro_nm = 'MANHATTAN'",
        limit=200000 # High limit to ensure we get all records
    )
    df_crime_manhattan = pd.DataFrame.from_records(results)
    if df_crime_manhattan.empty:
        raise ValueError("API returned no crime data. Check the query or API status.")
    print(f"Successfully downloaded {len(df_crime_manhattan)} crime records via API.")
except Exception as e:
    raise RuntimeError(f"Failed to download crime data from API. Error: {e}")

# Clean and calculate crime counts per precinct
df_crime_manhattan['addr_pct_cd'] = pd.to_numeric(df_crime_manhattan['addr_pct_cd'], errors='coerce')
crime_counts = df_crime_manhattan['addr_pct_cd'].value_counts().reset_index()
crime_counts.columns = ['precinct', 'crime_count_in_precinct_2024']
print(f"Calculated crime counts for {len(crime_counts)} precincts.")


--- 3. Acquiring and processing crime data via API ---
Querying NYC Open Data API for 2024 Manhattan crime data...
Successfully downloaded 135921 crime records via API.
Calculated crime counts for 38 precincts.


In [5]:
# --- 4. PERFORM GEOSPATIAL JOIN WITH PRECINCTS ---
print("\n--- 4. Performing geospatial join with precinct boundaries ---")
try:
    gdf_precincts = gpd.read_file(precinct_shapefile_path)
    # Ensure both GeoDataFrames use the same Coordinate Reference System (CRS)
    gdf_precincts = gdf_precincts.to_crs(gdf_props.crs)
    print("Loaded police precinct shapefile.")
except Exception as e:
    raise FileNotFoundError(f"Could not load the precinct shapefile. Make sure it's unzipped and renamed in data/raw/police_precincts/. Error: {e}")

# The SPATIAL JOIN: Find which precinct each property point falls inside
gdf_with_precincts = gpd.sjoin(gdf_props, gdf_precincts[['precinct', 'geometry']], how='left', predicate='within')

# Remove potential duplicates from the join
gdf_with_precincts = gdf_with_precincts[~gdf_with_precincts.index.duplicated(keep='first')]
print("Performed spatial join to assign a precinct to each property.")

# Merge the crime counts onto our properties using the precinct number as the key
gdf_with_precincts['precinct'] = gdf_with_precincts['precinct'].astype(int)
crime_counts['precinct'] = crime_counts['precinct'].astype(int)
df_final = gdf_with_precincts.merge(crime_counts, on='precinct', how='left')
print("Successfully merged crime counts onto properties.")


--- 4. Performing geospatial join with precinct boundaries ---
Loaded police precinct shapefile.
Performed spatial join to assign a precinct to each property.
Successfully merged crime counts onto properties.


In [6]:
# --- 5. FINALIZE AND SAVE THE GOLDEN DATASET ---
# Drop the extra columns created during the join process that we no longer need
cols_to_drop = ['index_right', 'geometry']
df_final = df_final.drop(columns=cols_to_drop)

# Impute any missing crime counts with the median (for properties on boundaries, etc.)
df_final['crime_count_in_precinct_2024'].fillna(df_final['crime_count_in_precinct_2024'].median(), inplace=True)

print(f"\nGeospatial analysis complete. Saving FINAL GOLDEN DATASET to {output_path}")
df_final.to_parquet(output_path)
print("Golden dataset created successfully! You are now ready for the final modeling notebook (07).")



Geospatial analysis complete. Saving FINAL GOLDEN DATASET to ../data/processed/manhattan_sales_fully_enriched.parquet
Golden dataset created successfully! You are now ready for the final modeling notebook (07).


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_final['crime_count_in_precinct_2024'].fillna(df_final['crime_count_in_precinct_2024'].median(), inplace=True)


In [8]:
# ======================================================================
# --- FINAL SANITY CHECK ---
# ======================================================================
print("\n\n--- Starting Final Sanity Check ---")

# 1. Check final shape and for missing values in the new crime column
print("\n--- 1. Checking Shape and Missing Values ---")
print(f"Final dataset shape: {df_final.shape}")
crime_nulls = df_final['crime_count_in_precinct_2024'].isnull().sum()
print(f"Missing values in 'crime_count_in_precinct_2024': {crime_nulls}")
if crime_nulls == 0:
    print("VERDICT: Excellent! All properties have a crime count.")
else:
    print("WARNING: Some properties are missing crime data.")

# 2. Check descriptive statistics for the new crime column
print("\n--- 2. Checking Descriptive Statistics ---")
display(df_final[['crime_count_in_precinct_2024']].describe())
print("VERDICT: Check if the min, max, and mean values seem plausible for annual complaints in a Manhattan precinct.")

# 3. Check a sample of the joined data
print("\n--- 3. Inspecting a Sample of the Final Data ---")
display(df_final[['address', 'precinct', 'crime_count_in_precinct_2024']].sample(10, random_state=42))
print("VERDICT: The 'precinct' and 'crime_count' columns should be populated with integers.")



--- Starting Final Sanity Check ---

--- 1. Checking Shape and Missing Values ---
Final dataset shape: (6505, 33)
Missing values in 'crime_count_in_precinct_2024': 0
VERDICT: Excellent! All properties have a crime count.

--- 2. Checking Descriptive Statistics ---


Unnamed: 0,crime_count_in_precinct_2024
count,6505.0
mean,6815.928517
std,2617.163045
min,2.0
25%,4346.0
50%,7062.0
75%,8460.0
max,15539.0


VERDICT: Check if the min, max, and mean values seem plausible for annual complaints in a Manhattan precinct.

--- 3. Inspecting a Sample of the Final Data ---


Unnamed: 0,address,precinct,crime_count_in_precinct_2024
5638,405 BLEECKER STREET,6,5962.0
3153,310 WEST 52ND STREET,18,10679.0
6270,216 EAST 18TH STREET,13,8460.0
217,383 EAST 10TH STREET,9,5336.0
2339,44-46 MARKET STREET,5,5672.0
465,101 WEST 24TH STREET,13,8460.0
3558,150 WOOSTER STREET,1,8781.0
4700,1 CENTRAL PARK WEST,20,3993.0
378,252 7 AVENUE,10,4307.0
1192,"212 FIFTH AVENUE, 12C",13,8460.0


VERDICT: The 'precinct' and 'crime_count' columns should be populated with integers.
