In [85]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.spatial import cKDTree
import json

In [86]:
# Load the raw points-of-interest table.
df = pd.read_csv(
    'POI.csv',
    delimiter=',',
    skipinitialspace=True,
    keep_default_na=True,
)
# ZIP reference table; later cells read its LAT, LNG and ZIP columns.
zip_code_data = pd.read_csv('us_zip_codes')

In [87]:
# Summarize the raw POI table: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20595 entries, 0 to 20594
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   the_geom    20595 non-null  object 
 1   SEGMENTID   20595 non-null  int64  
 2   COMPLEXID   20595 non-null  int64  
 3   SAFTYPE     10393 non-null  object 
 4   SOS         20003 non-null  float64
 5   PLACEID     20595 non-null  int64  
 6   FACI_DOM    20595 non-null  int64  
 7   BIN         20595 non-null  int64  
 8   BOROUGH     20384 non-null  float64
 9   CREATED     20595 non-null  object 
 10  MODIFIED    19228 non-null  object 
 11  FACILITY_T  20595 non-null  int64  
 12  SOURCE      20595 non-null  object 
 13  B7SC        10389 non-null  float64
 14  PRI_ADD     20595 non-null  int64  
 15  NAME        20595 non-null  object 
dtypes: float64(3), int64(7), object(6)
memory usage: 2.5+ MB


In [88]:
# Columns needed downstream: facility type/domain codes, borough code,
# the point geometry string, and the facility name.
columns_to_keep = ['FACILITY_T', 'FACI_DOM', 'BOROUGH', 'the_geom', 'NAME']
# Take an explicit copy: later cells add and overwrite columns on
# `cleaned_data`, and doing that on a bare column selection of `df` raises
# pandas' SettingWithCopyWarning.
cleaned_data = df[columns_to_keep].copy()

In [89]:
# Get lat/lng from geom
# The geometry strings look like "POINT (<lng> <lat>)": the first captured
# number is the longitude, the second the latitude.
point_pattern = r'POINT \(([^ ]+) ([^ ]+)\)'
point_coords = cleaned_data['the_geom'].str.extract(point_pattern).astype(float)
cleaned_data[['LNG', 'LAT']] = point_coords

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data[['LNG', 'LAT']] = cleaned_data['the_geom'].str.extract(r'POINT \(([^ ]+) ([^ ]+)\)').astype(float)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_data[['LNG', 'LAT']] = cleaned_data['the_geom'].str.extract(r'POINT \(([^ ]+) ([^ ]+)\)').astype(float)


In [90]:
# Define and apply mappings
# FACILITY_T codes to keep and their display labels; rows carrying any other
# code are dropped by the filter below.
facility_type_mapping = {
    2: 'Education Facility', 3: 'Cultural Facility', 4: 'Recreational Facility',
    6: 'Transportation Facility', 9: 'Religious Institution', 10: 'Health Services', 11: 'Public Safety'
}
# Copy the filtered frame explicitly so the column assignments that follow
# write to an independent frame instead of triggering SettingWithCopyWarning.
# NOTE: this cell is not idempotent. Once FACILITY_T holds labels instead of
# codes, re-running the filter matches no rows.
cleaned_data = cleaned_data[cleaned_data['FACILITY_T'].isin(facility_type_mapping.keys())].copy()
cleaned_data['FACILITY_T'] = cleaned_data['FACILITY_T'].map(facility_type_mapping)

# Borough codes to names; codes absent from the mapping (including missing
# values) become NaN.
borough_mapping = {1: 'Manhattan', 2: 'Bronx', 3: 'Brooklyn', 4: 'Queens', 5: 'Staten Island'}
cleaned_data['BOROUGH'] = cleaned_data['BOROUGH'].map(borough_mapping)

cleaned_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13014 entries, 0 to 20594
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   FACILITY_T  13014 non-null  object 
 1   FACI_DOM    13014 non-null  int64  
 2   BOROUGH     12823 non-null  object 
 3   the_geom    13014 non-null  object 
 4   NAME        13014 non-null  object 
 5   LNG         13014 non-null  float64
 6   LAT         13014 non-null  float64
dtypes: float64(2), int64(1), object(4)
memory usage: 813.4+ KB


In [91]:
# Display the filtered, label-mapped POI table with its parsed LNG/LAT columns.
cleaned_data

Unnamed: 0,FACILITY_T,FACI_DOM,BOROUGH,the_geom,NAME,LNG,LAT
0,Transportation Facility,9,Manhattan,POINT (-74.00701717096757 40.724634757833414),HOLLAND,-74.007017,40.724635
1,Transportation Facility,8,Queens,POINT (-73.82661642130311 40.797182526598505),WHITESTONE,-73.826616,40.797183
2,Transportation Facility,8,Brooklyn,POINT (-73.99395441100663 40.70384707235758),BROOKLYN,-73.993954,40.703847
3,Transportation Facility,8,Manhattan,POINT (-73.9919414213091 40.70960010711745),MANHATTAN,-73.991941,40.709600
4,Transportation Facility,8,Brooklyn,POINT (-73.9526609766105 40.73906602249743),PULASKI,-73.952661,40.739066
...,...,...,...,...,...,...,...
20590,Recreational Facility,9,Brooklyn,POINT (-73.94931898144033 40.70268116179689),DE HOSTOS PLAYGROUND,-73.949319,40.702681
20591,Public Safety,1,Queens,POINT (-73.73345230015069 40.66635980429733),116 PRECINCT,-73.733452,40.666360
20592,Recreational Facility,12,Manhattan,POINT (-73.99939639223179 40.73450912110648),JEFFERSON MARKET GARDEN,-73.999396,40.734509
20593,Recreational Facility,9,Staten Island,POINT (-74.1881919612981 40.59019555904301),SCHMUL PARK PLAYGROUND,-74.188192,40.590196


In [92]:
# Map zip
# Index the ZIP reference points in a k-d tree, then label each facility with
# the ZIP of its nearest reference point (nearest in raw lat/lng degree space).
zip_code_tree = cKDTree(zip_code_data[['LAT', 'LNG']].values)
facility_points = cleaned_data[['LAT', 'LNG']].values
_, idx = zip_code_tree.query(facility_points)
cleaned_data['ZIP_CODE'] = zip_code_data['ZIP'].values[idx]

In [93]:
# Lookup table: facility type label -> {FACI_DOM code -> domain label}.
# The outer keys are the labels produced by facility_type_mapping; the inner
# keys are the integer FACI_DOM codes, whose meaning depends on the type.
facility_domain_mapping = {
    'Education Facility': {1: 'Public Elementary School', 2: 'Public Junior High-Intermediate-Middle', 3: 'Public High School',
                           4: 'Private/Parochial Elementary School', 5: 'Private/Parochial Junior/Middle School', 6: 'Private/Parochial High School',
                           7: 'Post Secondary Degree Granting Institution', 8: 'Other', 9: 'Public Early Childhood',
                           10: 'Public K-8', 11: 'Public K-12 all grades', 12: 'Public Secondary School', 13: 'Public School Building',
                           14: 'Public School Annex', 15: 'Private/Parochial Early Childhood', 16: 'Private/Parochial K-8',
                           17: 'Private/Parochial K-12 all grades', 18: 'Private/Parochial Secondary School'},
    'Cultural Facility': {1: 'Center', 2: 'Library', 3: 'Theater/Concert Hall', 4: 'Museum', 5: 'Other'},
    'Recreational Facility': {1: 'Park', 2: 'Amusement Park', 3: 'Golf Course', 4: 'Beach', 5: 'Botanical Garden', 6: 'Zoo',
                              7: 'Recreational Center', 8: 'Sports', 9: 'Playground', 10: 'Other', 11: 'Pool', 12: 'Garden'},
    'Transportation Facility': {1: 'Bus Terminal', 2: 'Ferry landing/terminal', 3: 'Transit/Maintenance Yard', 4: 'Airport', 5: 'Heliport',
                                6: 'Marina', 7: 'Pier', 8: 'Bridge', 9: 'Tunnel', 10: 'Exit/Entrance', 11: 'Water Navigation', 12: 'Other'},
    'Religious Institution': {1: 'Church', 2: 'Synagogue', 3: 'Temple', 4: 'Convent/Monastery', 5: 'Mosque', 6: 'Other'},
    'Health Services': {1: 'Hospital', 2: 'Inpatient care center', 3: 'Outpatient care center/Clinic', 4: 'Other'},
    'Public Safety': {1: 'NYPD Precinct', 2: 'NYPD Checkpoint', 3: 'FDNY Ladder Company', 4: 'FDNY Battalion', 5: 'Correctional Facility',
                      6: 'FDNY Engine Company', 7: 'FDNY Special Unit', 8: 'FDNY Division', 9: 'FDNY Squad', 10: 'NYPD Other',
                      11: 'Other', 12: 'FDNY Other'}
}

In [94]:
def map_facility_domain(row):
    """Return the domain label for one row.

    Looks up the row's FACILITY_T label in ``facility_domain_mapping`` and then
    its FACI_DOM code in that type's sub-table. Returns 'Unknown' when either
    the type or the code has no entry.
    """
    type_label = row['FACILITY_T']
    if type_label not in facility_domain_mapping:
        return 'Unknown'
    return facility_domain_mapping[type_label].get(row['FACI_DOM'], 'Unknown')

# Row-wise lookup: the domain label depends on both FACILITY_T and FACI_DOM.
cleaned_data['FACILITY_DOMAIN_NAME'] = cleaned_data.apply(map_facility_domain, axis=1)

In [95]:
# Collapse identical facility records and count how often each occurs.
# groupby excludes rows with a missing value in any key by default, so
# facilities whose BOROUGH is NaN do not appear in the result.
facility_key_columns = ['BOROUGH', 'NAME', 'FACILITY_T', 'FACILITY_DOMAIN_NAME', 'ZIP_CODE', 'LNG', 'LAT']
grouped_data = (
    cleaned_data
    .groupby(facility_key_columns)
    .size()
    .reset_index(name='COUNT')
)

In [96]:
# Define code range
# range() excludes its stop value, so the stop is 11698 in order to keep
# ZIP 11697 (Breezy Point, Queens); the previous stop of 11697 silently
# dropped it.
# NOTE(review): this is a coarse numeric window. Confirm whether non-NYC ZIPs
# that fall inside it need to be excluded separately.
nyc_zip_range = range(10001, 11698)

#filter
grouped_data = grouped_data[grouped_data['ZIP_CODE'].isin(nyc_zip_range)]

In [97]:
# Number of distinct boroughs seen for each ZIP code; a ZIP with more than one
# borough is ambiguous and is reported below.
zip_borough_counts = grouped_data.groupby('ZIP_CODE')['BOROUGH'].nunique()
spans_multiple_boroughs = zip_borough_counts.gt(1)
overlapping_zips = zip_borough_counts.loc[spans_multiple_boroughs].index

print("Overlapping ZIP codes:", overlapping_zips.tolist())
print("Number of overlapping ZIP codes:", len(overlapping_zips))

Overlapping ZIP codes: [10002, 10004, 10033, 10034, 10035, 10037, 10038, 10039, 10044, 10162, 10452, 10454, 10463, 11102, 11105, 11109, 11201, 11207, 11211, 11222, 11224, 11235, 11237, 11356, 11357, 11359, 11371, 11385, 11414, 11421, 11425, 11694]
Number of overlapping ZIP codes: 32


In [98]:
def assign_primary_borough(group):
    """Return a copy of ``group`` with every BOROUGH set to the most common one.

    ``group`` is a DataFrame with a BOROUGH column. The borough that occurs most
    often (first entry of ``value_counts()``) replaces the column in the
    returned copy; the input frame is left unchanged.
    """
    borough_counts = group['BOROUGH'].value_counts()
    relabeled = group.copy()
    relabeled['BOROUGH'] = borough_counts.index[0]
    return relabeled

# Relabel each ZIP group with its most common borough.
# The groups are iterated and concatenated instead of using
# `groupby(...).apply(...)`: apply operating on the grouping column is
# deprecated in pandas 2.2 and will exclude ZIP_CODE from each group in a
# future version. Iteration always passes the full group (ZIP_CODE included)
# and yields groups in sorted ZIP order, so the resulting row order matches.
relabeled_groups = [
    assign_primary_borough(group)
    for _, group in grouped_data.groupby('ZIP_CODE')
]
if relabeled_groups:
    grouped_data = pd.concat(relabeled_groups).reset_index(drop=True)
else:
    # pd.concat raises on an empty list; an empty frame has nothing to relabel.
    grouped_data = grouped_data.reset_index(drop=True)

  grouped_data = grouped_data.groupby('ZIP_CODE').apply(assign_primary_borough).reset_index(drop=True)


In [115]:
# NOTE(review): this cell was executed out of order (In [115], between In [98]
# and In [99]). Its saved output already contains DISTANCE_TO_FACILITY and no
# COUNT column, i.e. state produced by later cells; a fresh Restart & Run All
# will display different columns here.
grouped_data

Unnamed: 0,BOROUGH,NAME,FACILITY_T,FACILITY_DOMAIN_NAME,ZIP_CODE,LNG,LAT,DISTANCE_TO_FACILITY
0,Manhattan,AVENUES NEW YORK SCHOOL,Education Facility,Private/Parochial K-12 all grades,10001,-74.003266,40.749380,0.814942
1,Manhattan,CHELSEA PARK,Recreational Facility,Playground,10001,-74.000483,40.749894,0.448993
2,Manhattan,CHELSEA RECREATION CENTER,Recreational Facility,Recreational Center,10001,-74.002285,40.748338,0.821716
3,Manhattan,COOKE CENTER ACADEMY SKILLS,Education Facility,Private/Parochial High School,10001,-73.995409,40.748742,0.406175
4,Manhattan,FIT BUSINESS & LIBERAL ARTS CTR,Education Facility,Post Secondary Degree Granting Institution,10001,-73.995448,40.747591,0.529525
...,...,...,...,...,...,...,...,...
12760,Queens,THE RAUNT CHANNEL BUOY 6,Transportation Facility,Water Navigation,11694,-73.837107,40.601163,3.390793
12761,Queens,WATERSIDE CHILDRENS STUDIO SCHOOL,Education Facility,Public Elementary School,11694,-73.831557,40.581009,1.769739
12762,Queens,WATERSIDE SCHOOL FOR LEADERSHIP,Education Facility,Public Junior High-Intermediate-Middle,11694,-73.831462,40.580803,1.757436
12763,Queens,WEST END TEMPLE,Religious Institution,Temple,11694,-73.864816,40.573206,2.788108


## Calculate distances

In [99]:
def manhattan_distance(lat1, lon1, lat2, lon2):
    """Return the L1 (taxicab) distance ``|lat1 - lat2| + |lon1 - lon2|``.

    Arguments may be scalars or NumPy arrays; arrays are combined elementwise
    with the usual broadcasting. The result is in the same units as the inputs.
    """
    lat_gap = np.abs(lat1 - lat2)
    lon_gap = np.abs(lon1 - lon2)
    return lat_gap + lon_gap

# Rough conversion factor from degrees to kilometres.
# NOTE(review): ~111 km/degree holds for latitude; a degree of longitude is
# shorter away from the equator, so the east-west part is overestimated.
KM_PER_DEGREE = 111

facility_lats = grouped_data['LAT'].values
facility_lngs = grouped_data['LNG'].values
zipcode_lats = zip_code_data['LAT'].values
zipcode_lngs = zip_code_data['LNG'].values

# For each facility, find the smallest Manhattan (L1) distance in degrees to any
# ZIP reference point. A k-d tree query with p=1 returns exactly
# min(|dlat| + |dlng|) per facility, i.e. the minimum of manhattan_distance over
# all ZIP points, without a Python-level loop over the facilities.
facility_points = np.column_stack((facility_lats, facility_lngs))
zipcode_points = np.column_stack((zipcode_lats, zipcode_lngs))
distances = cKDTree(zipcode_points).query(facility_points, p=1)[0]

# Despite the column name, this is the distance from each facility to its
# nearest ZIP reference point, converted to approximate kilometres.
grouped_data['DISTANCE_TO_FACILITY'] = distances * KM_PER_DEGREE

In [100]:
# clean duplicated rows
# `repeat_mask` flags every repeat after the first occurrence;
# `all_copies` holds every row that has at least one identical twin.
repeat_mask = grouped_data.duplicated()
all_copies = grouped_data[grouped_data.duplicated(keep=False)]
print('Number of duplicate (excluding original) rows is:', repeat_mask.sum())
print('Number of duplicate rows (including first) in the table is:', all_copies.shape[0])
# Show duplicate row data that can be dropped
all_copies

Number of duplicate (excluding original) rows is: 0
Number of duplicate rows (including first) in the table is: 0


Unnamed: 0,BOROUGH,NAME,FACILITY_T,FACILITY_DOMAIN_NAME,ZIP_CODE,LNG,LAT,COUNT,DISTANCE_TO_FACILITY


In [101]:
# Count missing values per column.
grouped_data.isna().sum()

BOROUGH                 0
NAME                    0
FACILITY_T              0
FACILITY_DOMAIN_NAME    0
ZIP_CODE                0
LNG                     0
LAT                     0
COUNT                   0
DISTANCE_TO_FACILITY    0
dtype: int64

In [102]:
# Display the per-facility table after the distance column has been added.
grouped_data

Unnamed: 0,BOROUGH,NAME,FACILITY_T,FACILITY_DOMAIN_NAME,ZIP_CODE,LNG,LAT,COUNT,DISTANCE_TO_FACILITY
0,Manhattan,AVENUES NEW YORK SCHOOL,Education Facility,Private/Parochial K-12 all grades,10001,-74.003266,40.749380,1,0.814942
1,Manhattan,CHELSEA PARK,Recreational Facility,Playground,10001,-74.000483,40.749894,1,0.448993
2,Manhattan,CHELSEA RECREATION CENTER,Recreational Facility,Recreational Center,10001,-74.002285,40.748338,1,0.821716
3,Manhattan,COOKE CENTER ACADEMY SKILLS,Education Facility,Private/Parochial High School,10001,-73.995409,40.748742,1,0.406175
4,Manhattan,FIT BUSINESS & LIBERAL ARTS CTR,Education Facility,Post Secondary Degree Granting Institution,10001,-73.995448,40.747591,1,0.529525
...,...,...,...,...,...,...,...,...,...
12760,Queens,THE RAUNT CHANNEL BUOY 6,Transportation Facility,Water Navigation,11694,-73.837107,40.601163,1,3.390793
12761,Queens,WATERSIDE CHILDRENS STUDIO SCHOOL,Education Facility,Public Elementary School,11694,-73.831557,40.581009,1,1.769739
12762,Queens,WATERSIDE SCHOOL FOR LEADERSHIP,Education Facility,Public Junior High-Intermediate-Middle,11694,-73.831462,40.580803,1,1.757436
12763,Queens,WEST END TEMPLE,Religious Institution,Temple,11694,-73.864816,40.573206,1,2.788108


In [104]:
# Drop the helper COUNT column. Rebinding (instead of inplace=True) together
# with errors='ignore' keeps this cell re-runnable: a second run is a no-op
# rather than a KeyError on the already-removed column.
grouped_data = grouped_data.drop(columns=['COUNT'], errors='ignore')

In [105]:
# Group
grouped_by_zip = grouped_data.groupby('ZIP_CODE')

# One entry per ZIP code (stringified key) holding that ZIP's facility records;
# ZIP_CODE is removed from the records because it is already the key.
json_data = {
    str(zip_code): group.drop(columns='ZIP_CODE').to_dict(orient='records')
    for zip_code, group in grouped_by_zip
}

# Save
with open('cleaned_amenities.json', 'w') as f:
    json.dump(json_data, f, indent=2)

## Prepare data for modeling

In [None]:
# Keep only the modeling inputs: descriptive and coordinate columns are
# removed, then FACILITY_T is expanded into one indicator column per type.
non_feature_columns = ['NAME', 'BOROUGH', 'FACILITY_DOMAIN_NAME', 'LNG', 'LAT']
model_data = grouped_data.drop(columns=non_feature_columns)
one_hot_encoded_data = pd.get_dummies(model_data, columns=['FACILITY_T'])

In [None]:
# Cast every FACILITY_T_* indicator column to int in a single astype call.
dummy_dtypes = {
    col: int
    for col in one_hot_encoded_data.columns
    if col.startswith('FACILITY_T_')
}
one_hot_encoded_data = one_hot_encoded_data.astype(dummy_dtypes)

# Strip the get_dummies prefix so the columns carry the plain facility type.
one_hot_encoded_data.columns = one_hot_encoded_data.columns.str.replace('FACILITY_T_', '')

In [None]:
# Rename columns
# Map the working column names to snake_case names for the model input.
# rename() skips keys that are not present, so the 'COUNT' entry has no effect
# when that column was dropped in an earlier cell.
column_mapping = {
    'ZIP_CODE': 'zipcode',
    'COUNT': 'count',
    'DISTANCE_TO_FACILITY': 'distance_to_facility',
    'Cultural Facility': 'cultural_facility',
    'Education Facility': 'education_facility',
    'Health Services': 'health_services',
    'Public Safety': 'public_safety',
    'Recreational Facility': 'recreational_facility',
    'Religious Institution': 'religious_institution',
    'Transportation Facility': 'transportation_facility',
}
one_hot_encoded_data = one_hot_encoded_data.rename(columns=column_mapping)

In [None]:
# Convert data types
binary_columns = ['cultural_facility', 'education_facility', 'health_services', 'public_safety',
                  'recreational_facility', 'religious_institution', 'transportation_facility']
# Target dtype per column: indicators as int, zipcode as string, distance as float.
target_dtypes = {indicator: int for indicator in binary_columns}
target_dtypes['zipcode'] = str
target_dtypes['distance_to_facility'] = float
one_hot_encoded_data = one_hot_encoded_data.astype(target_dtypes)

In [None]:
# Write the model-ready table to disk without the DataFrame index column.
one_hot_encoded_data.to_csv('cleaned_POI.csv', index=False)