In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [9]:
arrest_data = pd.read_csv('data/cleaned_arrest_data.csv')
business_data = pd.read_csv('data/cleaned_business_data.csv')
demographic_data = pd.read_csv('data/cleaned_demographic_data.csv')
noise_data = pd.read_csv('data/cleaned_noise_data.csv')
poi_data = pd.read_csv('data/cleaned_poi.csv')
taxi_data = pd.read_csv('data/cleaned_taxi_data.csv')

In [10]:
# Arrest data
arrest_data = arrest_data.groupby('ZipCode')['AvgMonthlyArrests'].mean().reset_index()

In [11]:
# Business Data
business_data = business_data.pivot(index='ZipCode', columns='BusinessType', values='Count').reset_index()
business_data = business_data.fillna(0)
business_data['TotalBusinesses'] = business_data.sum(axis=1)
business_data['BusinessTypeCount'] = (business_data.set_index('ZipCode') > 0).sum(axis=1).values

In [12]:
# Demographic data
demographic_data = demographic_data.groupby('ZipCode').agg({
    'Population': 'mean',
    'PopulationDensity': 'mean',
    'MedianHouseholdIncome': 'mean',
    'DiversityIndex': 'first',
    'FamilyHousehold': 'mean',
    'SingleHousehold': 'mean'
}).reset_index()

In [13]:
# Noise data
noise_data = noise_data.groupby('ZipCode')['AvgMonthlyNoiseComplaints'].mean().reset_index()

In [14]:
# Places of interest data
poi_data = poi_data.groupby('zipcode').agg({
    'cultural_facility': 'sum',
    'education_facility': 'sum',
    'health_services': 'sum',
    'public_safety': 'sum',
    'recreational_facility': 'sum',
    'religious_institution': 'sum',
    'transportation_facility': 'sum'
}).reset_index()
poi_data = poi_data.rename(columns={'zipcode': 'ZipCode'})


In [15]:
# Taxi data - use yearly counts for liveliness
taxi_data['date'] = pd.to_datetime(taxi_data['date'])
taxi_data['year'] = taxi_data['date'].dt.year
taxi_data = taxi_data.groupby(['zipcode', 'year'])['passenger_count'].sum().reset_index()
taxi_data = taxi_data.groupby('zipcode')['passenger_count'].mean().reset_index()  
taxi_data = taxi_data.rename(columns={'zipcode': 'ZipCode'})

In [16]:
# Merge datasets
merged_data = demographic_data.merge(arrest_data, on='ZipCode', how='left')
merged_data = merged_data.merge(business_data, on='ZipCode', how='left')
merged_data = merged_data.merge(noise_data, on='ZipCode', how='left')
merged_data = merged_data.merge(poi_data, on='ZipCode', how='left')
merged_data = merged_data.merge(taxi_data, on='ZipCode', how='left')

In [17]:
merged_data = merged_data.dropna(subset=['Population'])

In [18]:
merged_data.isnull().sum()

ZipCode                            0
Population                         0
PopulationDensity                  0
MedianHouseholdIncome              0
DiversityIndex                     0
FamilyHousehold                    0
SingleHousehold                    0
AvgMonthlyArrests                  7
entertainment_and_recreation       0
financial_services                 0
food_and_beverage                  0
home_services                      0
parking_and_automotive_services    0
professional_services              0
retail_services                    0
transportation                     0
TotalBusinesses                    0
BusinessTypeCount                  0
AvgMonthlyNoiseComplaints          0
cultural_facility                  0
education_facility                 0
health_services                    0
public_safety                      0
recreational_facility              0
religious_institution              0
transportation_facility            0
passenger_count                    0
d

In [19]:
# Impute AvgMonthlyArrests with median
merged_data['AvgMonthlyArrests'] = merged_data['AvgMonthlyArrests'].fillna(merged_data['AvgMonthlyArrests'].median())

In [21]:
# Impute places of interest data with 0
poi_columns = ['cultural_facility', 'education_facility', 'health_services', 
               'public_safety', 'recreational_facility', 'religious_institution', 'transportation_facility']
merged_data[poi_columns] = merged_data[poi_columns].fillna(0)

In [22]:
# Feature engineering
merged_data['BusinessDensity'] = merged_data['TotalBusinesses'] / merged_data['Population']
merged_data['ArrestsPerCapita'] = merged_data['AvgMonthlyArrests'] / merged_data['Population']
merged_data['NoiseComplaintsPerCapita'] = merged_data['AvgMonthlyNoiseComplaints'] / merged_data['Population']
merged_data['FamilyHouseholdRatio'] = merged_data['FamilyHousehold'] / (merged_data['FamilyHousehold'] + merged_data['SingleHousehold'])

facility_columns = ['cultural_facility', 'education_facility', 'health_services', 'public_safety', 'recreational_facility', 'religious_institution', 'transportation_facility']
for facility in facility_columns:
    merged_data[f'{facility}_density'] = merged_data[facility] / merged_data['Population']

In [23]:
merged_data['AmenityScore'] = merged_data[facility_columns].sum(axis=1)
merged_data['SafetyScore'] = (1 / (merged_data['ArrestsPerCapita'] + 1)) + merged_data['public_safety_density']  # Higher score means safer
merged_data['QuietScore'] = 1 / (merged_data['NoiseComplaintsPerCapita'] + 1)  # Higher score means quieter
merged_data['FamilyFriendlinessScore'] = merged_data['FamilyHouseholdRatio']
merged_data['BusinessEnvironmentScore'] = merged_data['BusinessTypeCount']

In [24]:
# Bin continuous variables
merged_data['IncomeCategory'] = pd.qcut(merged_data['MedianHouseholdIncome'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])
merged_data['PopulationDensityCategory'] = pd.qcut(merged_data['PopulationDensity'], q=5, labels=['Very Low', 'Low', 'Medium', 'High', 'Very High'])

In [25]:
scaler = MinMaxScaler()
numeric_columns = merged_data.select_dtypes(include=[np.number]).columns
numeric_columns = numeric_columns.drop(['ZipCode', 'passenger_count'])
merged_data[numeric_columns] = scaler.fit_transform(merged_data[numeric_columns])

In [26]:
# Separately scale the passenger_count (LivelinessScore)
merged_data['LivelinessScore'] = np.log1p(merged_data['passenger_count'])
merged_data['LivelinessScore'] = (merged_data['LivelinessScore'] - merged_data['LivelinessScore'].min()) / (merged_data['LivelinessScore'].max() - merged_data['LivelinessScore'].min())

In [28]:
# Map latitudes and longitudes to zip codes 
us_zip_codes = pd.read_csv('us_zip_codes copy')

In [29]:
# Rename columns to match your data
us_zip_codes = us_zip_codes.rename(columns={'ZIP': 'ZipCode', 'LAT': 'Latitude', 'LNG': 'Longitude'})

In [30]:
merged_data = merged_data.merge(us_zip_codes[['ZipCode', 'Latitude', 'Longitude']], on='ZipCode', how='left')

In [34]:
property_data = pd.read_csv('data/cleaned_property_data.csv')

In [35]:
# Forgot to add borough
borough_info = property_data[['ZIPCODE', 'BOROUGH']].drop_duplicates()
merged_data = merged_data.merge(borough_info, left_on='ZipCode', right_on='ZIPCODE', how='left')
merged_data.drop('ZIPCODE', axis=1, inplace=True)

In [36]:
merged_data

Unnamed: 0,ZipCode,Population,PopulationDensity,MedianHouseholdIncome,DiversityIndex,FamilyHousehold,SingleHousehold,AvgMonthlyArrests,entertainment_and_recreation,financial_services,...,SafetyScore,QuietScore,FamilyFriendlinessScore,BusinessEnvironmentScore,IncomeCategory,PopulationDensityCategory,LivelinessScore,Latitude,Longitude,BOROUGH
0,10001,0.178460,0.229401,0.291567,0.657538,0.116622,0.264750,0.125995,0.260870,0.15625,...,0.944887,0.297030,0.089688,0.833333,High,Medium,0.977530,40.750633,-73.997177,Manhattan
1,10002,0.736222,0.641051,0.061627,0.855261,0.735630,0.642147,0.414068,0.228261,0.09375,...,0.957731,0.614479,0.440576,0.833333,Very Low,Very High,0.847983,40.715775,-73.986212,Manhattan
2,10003,0.501438,0.673463,0.343147,0.453662,0.278357,0.768360,0.422632,0.380435,0.00000,...,0.933428,0.561235,0.038836,0.500000,Very High,Very High,0.930073,40.731829,-73.989181,Manhattan
3,10004,0.011866,0.029665,0.517659,0.544463,0.009048,0.010225,0.200071,0.032609,0.53125,...,0.382492,0.003576,0.165744,0.833333,Very High,Very Low,0.773701,40.688630,-74.018244,Manhattan
4,10005,0.049286,0.672479,0.495625,0.553878,0.023407,0.089543,0.204854,0.065217,0.12500,...,0.728680,0.528810,0.003927,0.666667,Very High,Very High,0.724375,40.706027,-74.008835,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
173,11691,0.538534,0.139689,0.091007,0.803618,0.528865,0.262181,0.300396,0.010870,0.00000,...,0.956775,0.865858,0.699973,0.500000,Very Low,Low,0.264675,40.601278,-73.761651,Queens
174,11692,0.154765,0.121295,0.109729,0.632334,0.166632,0.055123,0.081839,0.010870,0.00000,...,0.962270,0.847654,0.763476,0.666667,Low,Low,0.166121,40.594095,-73.792896,Queens
175,11693,0.093503,0.074831,0.143973,0.744671,0.110811,0.048440,0.153579,0.076087,0.03125,...,0.885879,0.779566,0.608474,0.666667,Low,Very Low,0.230819,40.590692,-73.809749,Queens
176,11694,0.172042,0.095858,0.269134,0.358441,0.186221,0.126599,0.035458,0.043478,0.00000,...,0.990415,0.837260,0.502956,0.500000,High,Very Low,0.233822,40.578270,-73.844762,Queens


In [37]:
# Save processed data
merged_data.to_csv('processed_data_by_zipcode.csv', index=False)

# Check scores

In [38]:
# Group by Borough and list zipcodes with their scores
borough_groups = merged_data.groupby('ZipCode')

for borough, group in borough_groups:
    print(f"\nBorough: {borough}")
    print(f"{'ZipCode':<10} {'LivelinessScore':<20} {'BusinessEnvironmentScore':<25}")
    print("-" * 55)
    for _, row in group.iterrows():
        zipcode = row['ZipCode']
        liveliness_score = row['LivelinessScore']
        business_env_score = row['BusinessEnvironmentScore']
        print(f"{zipcode:<10} {liveliness_score:<20.2f} {business_env_score:<25.2f}")


Borough: 10001
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10001      0.98                 0.83                     

Borough: 10002
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10002      0.85                 0.83                     

Borough: 10003
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10003      0.93                 0.50                     

Borough: 10004
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10004      0.77                 0.83                     

Borough: 10005
ZipCode    LivelinessScore      BusinessEnvironmentScore 
-------------------------------------------------------
10005      0.72                 0.67                     

Borough: 10006
ZipCode    LivelinessScore      BusinessEnvi