# Data Integration

**Goal:**  
Merge restaurant inspection data with crime and property data to create features for predicting inspection scores.

**Plan:**
1. Load all three cleaned datasets (DOHMH, NYPD, PLUTO)
2. For each restaurant, count the crimes within a ~500 m radius
3. For each restaurant, find the nearest property and get its characteristics
4. Create master dataset with all features combined
5. Save for modeling


In [None]:
# Import libraries
# cKDTree is used for fast spatial searches: the datasets are large, so checking
# every crime against every restaurant pairwise would be too slow.
import pandas as pd
import numpy as np
from scipy.spatial import cKDTree

print("Libraries loaded!")


Libraries loaded!


## Load Cleaned Datasets


In [3]:
# Read the three cleaned CSVs, announcing each load and reporting its row count
def _read_and_report(announcement, csv_path, label):
    """Print `announcement`, read `csv_path` into a DataFrame, print its row count, return it."""
    print(announcement)
    frame = pd.read_csv(csv_path)
    print(f"{label}: {len(frame):,} rows")
    return frame


restaurants = _read_and_report(
    "Loading DOHMH restaurant data...",
    '../data/processed/dohmh_restaurants_clean.csv',
    "Restaurants",
)

crimes = _read_and_report(
    "Loading NYPD crime data...",
    '../data/processed/nypd_complaints_clean.csv',
    "Crimes",
)

properties = _read_and_report(
    "Loading PLUTO property data...",
    '../data/processed/pluto_nyc_clean.csv',
    "Properties",
)


Loading DOHMH restaurant data...


FileNotFoundError: [Errno 2] No such file or directory: '../data/processed/dohmh_restaurants_clean.csv'

In [4]:
# Print the column names of each loaded dataset, one labelled list per dataset
for heading, frame in (
    ("Restaurant columns:", restaurants),
    ("\nCrime columns:", crimes),
    ("\nProperty columns:", properties),
):
    print(heading, frame.columns.tolist())


NameError: name 'restaurants' is not defined

## Count Crimes Near Each Restaurant

For each restaurant, we count how many crimes occurred within ~500 m (a search radius of 0.005 degrees in latitude/longitude space).

In [None]:
# Keep only the crimes that have both a latitude and a longitude
coord_cols = ['Latitude', 'Longitude']
has_both_coords = crimes[coord_cols].notna().all(axis=1)
crimes_clean = crimes.loc[has_both_coords].copy()
print(f"Crimes with valid coordinates: {len(crimes_clean):,}")

# Index the crime locations in a KD-Tree so radius lookups are fast
crime_coords = crimes_clean[coord_cols].to_numpy()
crime_tree = cKDTree(crime_coords)
print("Crime KD-Tree built!")


In [None]:
# Count crimes within a fixed radius of each restaurant.
# 0.005 degrees is approximately 500 meters.
# NOTE: the KD-Tree measures plain Euclidean distance on raw (lat, lon) degrees,
# so this is only a rough conversion -- away from the equator a degree of
# longitude covers less ground than a degree of latitude.
RADIUS = 0.005

print("Counting crimes near each restaurant (this may take a minute)...")

# One (lat, lon) row per restaurant record, in the same order as `restaurants`
restaurant_coords = restaurants[['Latitude', 'Longitude']].values

# Query all restaurants in a single batched call. return_length=True makes the
# tree return only the number of neighbours for each point, which avoids a
# Python-level loop and avoids materialising an index list per restaurant.
crime_counts = crime_tree.query_ball_point(
    restaurant_coords, RADIUS, return_length=True
)

restaurants['crimes_nearby'] = crime_counts
print(f"Done! Average crimes per restaurant: {np.mean(crime_counts):.1f}")

In [None]:
# ## Match Restaurants to Nearest Property
# Find the nearest PLUTO property for each restaurant and get its characteristics.
# (The description line above is kept as a comment: as bare prose inside a code
# cell it is a SyntaxError.)

# Prepare property coordinates: drop properties missing either coordinate
properties_clean = properties.dropna(subset=['latitude', 'longitude']).copy()
print(f"Properties with valid coordinates: {len(properties_clean):,}")

# Build KD-Tree for properties
property_coords = properties_clean[['latitude', 'longitude']].values
property_tree = cKDTree(property_coords)
print("Property KD-Tree built!")

# Find nearest property for each restaurant.
# `indices` are positions into `properties_clean`; `distances` are in raw
# degree units (Euclidean on lat/lon), not meters.
print("Finding nearest property for each restaurant...")
distances, indices = property_tree.query(restaurant_coords, k=1)

# Get property features: take the matched rows once, then copy each feature
# over positionally (.values drops the property index so rows align by order).
nearest_properties = properties_clean.iloc[indices]
for feature in ('yearbuilt', 'assesstot', 'landuse'):
    restaurants[f'property_{feature}'] = nearest_properties[feature].values

print("Done!")

In [None]:
## Review and Save Master Dataset

In [None]:
# Report the shape of the integrated dataset and list the features added to it
print(f"Master dataset shape: {restaurants.shape}")
print(f"\nNew columns added:")
for bullet in (
    "- crimes_nearby",
    "- property_yearbuilt",
    "- property_assesstot",
    "- property_landuse",
):
    print(bullet)

# Last expression of the cell: rendered as a preview of the first rows
restaurants.head()

In [None]:
# Write the integrated dataset to disk (without the index) and report its size
output_path = '../data/processed/master_restaurant_dataset.csv'
restaurants.to_csv(output_path, index=False)

n_rows, n_cols = restaurants.shape
print(f"Saved master dataset to: {output_path}")
print(f"Total rows: {n_rows:,}")
print(f"Total columns: {n_cols}")
print("\nData integration complete!")


In [None]:
# Min / max / mean of the crimes_nearby feature
nearby = restaurants['crimes_nearby']
print("Crime feature stats:")
print(f"  Min crimes nearby: {nearby.min()}")
print(f"  Max crimes nearby: {nearby.max()}")
print(f"  Mean crimes nearby: {nearby.mean():.1f}")
