In [216]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [217]:
arrest_data = pd.read_csv('data/cleaned_arrest_data.csv')
business_data = pd.read_csv('data/cleaned_business_data.csv')
demographic_data = pd.read_csv('data/cleaned_demographic_data.csv')
noise_data = pd.read_csv('data/cleaned_noise_data.csv')
poi_data = pd.read_csv('data/cleaned_poi.csv')
taxi_data = pd.read_csv('data/cleaned_taxi_data.csv')

In [218]:
# Arrest data
arrest_data = arrest_data.groupby('ZipCode')['AvgMonthlyArrests'].mean().reset_index()

In [219]:
# Business Data
business_data = business_data.pivot(index='ZipCode', columns='BusinessType', values='Count').reset_index()
business_data = business_data.fillna(0)
business_data['TotalBusinesses'] = business_data.sum(axis=1)
business_data['BusinessTypeCount'] = (business_data.set_index('ZipCode') > 0).sum(axis=1).values

In [220]:
# Demographic data
demographic_data = demographic_data.groupby('ZipCode').agg({
    'Population': 'mean',
    'PopulationDensity': 'mean',
    'MedianHouseholdIncome': 'mean',
    'DiversityIndex': 'first',
    'FamilyHousehold': 'mean',
    'SingleHousehold': 'mean'
}).reset_index()

In [221]:
# Noise data
noise_data = noise_data.groupby('ZipCode')['AvgMonthlyNoiseComplaints'].mean().reset_index()

In [222]:
# Places of interest data
poi_data = poi_data.groupby('zipcode').agg({
    'count': 'sum',
    'cultural_facility': 'sum',
    'education_facility': 'sum',
    'health_services': 'sum',
    'public_safety': 'sum',
    'recreational_facility': 'sum',
    'religious_institution': 'sum',
    'transportation_facility': 'sum'
}).reset_index()
poi_data = poi_data.rename(columns={'zipcode': 'ZipCode'})


In [223]:
# Taxi data - use yearly counts for liveliness
taxi_data['date'] = pd.to_datetime(taxi_data['date'])
taxi_data['year'] = taxi_data['date'].dt.year
taxi_data = taxi_data.groupby(['zipcode', 'year'])['passenger_count'].sum().reset_index()
taxi_data = taxi_data.groupby('zipcode')['passenger_count'].mean().reset_index()  # Average yearly counts
taxi_data = taxi_data.rename(columns={'zipcode': 'ZipCode'})

In [224]:
# Merge datasets
merged_data = demographic_data.merge(arrest_data, on='ZipCode', how='left')
merged_data = merged_data.merge(business_data, on='ZipCode', how='left')
merged_data = merged_data.merge(noise_data, on='ZipCode', how='left')
merged_data = merged_data.merge(poi_data, on='ZipCode', how='left')
merged_data = merged_data.merge(taxi_data, on='ZipCode', how='left')

In [225]:
merged_data = merged_data.dropna(subset=['Population'])

In [226]:
merged_data.isnull().sum()

ZipCode                            0
Population                         0
PopulationDensity                  0
MedianHouseholdIncome              0
DiversityIndex                     0
FamilyHousehold                    0
SingleHousehold                    0
AvgMonthlyArrests                  7
entertainment_and_recreation       0
financial_services                 0
food_and_beverage                  0
home_services                      0
parking_and_automotive_services    0
professional_services              0
retail_services                    0
transportation                     0
TotalBusinesses                    0
BusinessTypeCount                  0
AvgMonthlyNoiseComplaints          0
count                              1
cultural_facility                  1
education_facility                 1
health_services                    1
public_safety                      1
recreational_facility              1
religious_institution              1
transportation_facility            1
p

In [227]:
# Impute AvgMonthlyArrests with median
merged_data['AvgMonthlyArrests'] = merged_data['AvgMonthlyArrests'].fillna(merged_data['AvgMonthlyArrests'].median())

In [228]:
# Impute places of interest data with 0
poi_columns = ['count', 'cultural_facility', 'education_facility', 'health_services', 
               'public_safety', 'recreational_facility', 'religious_institution', 'transportation_facility']
merged_data[poi_columns] = merged_data[poi_columns].fillna(0)

In [229]:
# Feature engineering
merged_data['BusinessDensity'] = merged_data['TotalBusinesses'] / merged_data['Population']
merged_data['ArrestsPerCapita'] = merged_data['AvgMonthlyArrests'] / merged_data['Population']
merged_data['NoiseComplaintsPerCapita'] = merged_data['AvgMonthlyNoiseComplaints'] / merged_data['Population']
merged_data['FamilyHouseholdRatio'] = merged_data['FamilyHousehold'] / (merged_data['FamilyHousehold'] + merged_data['SingleHousehold'])

facility_columns = ['cultural_facility', 'education_facility', 'health_services', 'public_safety', 'recreational_facility', 'religious_institution', 'transportation_facility']
for facility in facility_columns:
    merged_data[f'{facility}_density'] = merged_data[facility] / merged_data['Population']

In [230]:
merged_data['AmenityScore'] = merged_data[facility_columns].sum(axis=1)
merged_data['SafetyScore'] = (1 / (merged_data['ArrestsPerCapita'] + 1)) + merged_data['public_safety_density']  # Higher score means safer
merged_data['QuietScore'] = 1 / (merged_data['NoiseComplaintsPerCapita'] + 1)  # Higher score means quieter
merged_data['FamilyFriendlinessScore'] = merged_data['FamilyHouseholdRatio']
merged_data['BusinessEnvironmentScore'] = merged_data['BusinessTypeCount']

In [231]:
# Bin continuous variables
merged_data['IncomeCategory'] = pd.qcut(merged_data['MedianHouseholdIncome'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
merged_data['PopulationDensityCategory'] = pd.qcut(merged_data['PopulationDensity'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

In [232]:
scaler = MinMaxScaler()
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns
numeric_columns = numeric_columns.drop(['ZipCode', 'passenger_count'])
merged_data[numeric_columns] = scaler.fit_transform(merged_data[numeric_columns])

In [233]:
# Separately scale the passenger_count (LivelinessScore)
merged_data['LivelinessScore'] = np.log1p(merged_data['passenger_count'])
merged_data['LivelinessScore'] = (merged_data['LivelinessScore'] - merged_data['LivelinessScore'].min()) / (merged_data['LivelinessScore'].max() - merged_data['LivelinessScore'].min())

In [234]:
# Map latitudes and longitudes to zip codes 
us_zip_codes = pd.read_csv('us_zip_codes')

In [235]:
# Rename columns to match your data
us_zip_codes = us_zip_codes.rename(columns={'ZIP': 'ZipCode', 'LAT': 'Latitude', 'LNG': 'Longitude'})

In [236]:
merged_data = merged_data.merge(us_zip_codes[['ZipCode', 'Latitude', 'Longitude']], on='ZipCode', how='left')

In [237]:
# Save processed data
merged_data.to_csv('processed_data_by_zipcode.csv', index=False)

# Export knn if needed

In [None]:
from sklearn.neighbors import NearestNeighbors
import joblib

# At the end of your data processing script
features = ['LivelinessScore', 'FamilyFriendlinessScore', 'SafetyScore', 
            'BusinessEnvironmentScore', 'AmenityScore', 'DiversityIndex']

X = merged_data[features]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

knn = NearestNeighbors(n_neighbors=6, metric='euclidean')  # 6 because we want 5 neighbors + the point itself
knn.fit(X_scaled)

# Save the KNN model
joblib.dump(knn, 'knn_model.pkl')

# Save the scaler
joblib.dump(scaler, 'knn_scaler.pkl')

# Save the order of zipcodes
zipcode_order = merged_data['ZipCode'].tolist()
joblib.dump(zipcode_order, 'zipcode_order.pkl')

In [238]:
# Group by Borough and list zipcodes with their scores
borough_groups = merged_data.groupby('ZipCode')

for borough, group in borough_groups:
    print(f"\nBorough: {borough}")
    print(f"{'ZipCode':<10} {'LivelinessScore':<20} {'BusinessEnvironmentScore':<25}")
    print("-" * 55)
    for _, row in group.iterrows():
        zipcode = row['ZipCode']
        liveliness_score = row['LivelinessScore']
        business_env_score = row['BusinessEnvironmentScore']
        print(f"{zipcode:<10} {liveliness_score:<20.2f} {business_env_score:<25.2f}")


Borough: 10001
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10001      0.98                 0.83                     

Borough: 10002
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10002      0.85                 0.83                     

Borough: 10003
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10003      0.93                 0.50                     

Borough: 10004
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10004      0.77                 0.83                     

Borough: 10005
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10005      0.72                 0.67                     

Borough: 10006
ZipCode    LivelinessScore      BusinessEnvi

11201      0.65                 0.67                     

Borough: 11203
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
11203      0.49                 0.83                     

Borough: 11204
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
11204      0.37                 0.67                     

Borough: 11205
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
11205      0.62                 0.50                     

Borough: 11206
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
11206      0.60                 0.67                     

Borough: 11207
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
11207      0.52                 0.83                     

B