In [97]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [98]:
# Load all datasets
arrest_data = pd.read_csv('data/cleaned_arrest_data.csv')
noise_data = pd.read_csv('data/cleaned_noise_data.csv')
business_data = pd.read_csv('data/cleaned_business_data.csv')
demographic_data = pd.read_csv('data/cleaned_demographic_data.csv')
poi_data = pd.read_csv('data/cleaned_poi.csv')
taxi_data = pd.read_csv('data/cleaned_taxi_data.csv')
property_prices = pd.read_csv('data/cleaned_property_data.csv')

In [99]:
# Arrest data
arrest_2023 = arrest_data[arrest_data['Year'] == 2023].groupby('ZipCode')['AvgMonthlyArrests'].mean().reset_index()

In [100]:
# Noise data
noise_2023 = noise_data[noise_data['Year'] == 2023].groupby('ZipCode')['AvgMonthlyNoiseComplaints'].mean().reset_index()

In [101]:
# Taxi data
taxi_2023 = taxi_data[taxi_data['date'].str.startswith('2023')].groupby('zipcode')['passenger_count'].sum().reset_index()
taxi_2023.columns = ['ZipCode', 'TotalPassengers2023']

In [102]:
# Business data
business_agg = business_data.groupby('ZipCode').agg({
    'Count': 'sum',
    'BusinessType': lambda x: ', '.join(x)
}).reset_index()
business_agg.columns = ['ZipCode', 'TotalBusinesses', 'BusinessTypes']

In [103]:
# POI data
poi_agg = poi_data.groupby('zipcode').agg({
    'count': 'sum',
    'distance_to_facility': 'mean',
    'cultural_facility': 'sum',
    'education_facility': 'sum',
    'health_services': 'sum',
    'public_safety': 'sum',
    'recreational_facility': 'sum',
    'religious_institution': 'sum',
    'transportation_facility': 'sum'
}).reset_index()
poi_agg.columns = ['ZipCode'] + list(poi_agg.columns[1:])

In [104]:
merged_data = demographic_data[demographic_data['Year'] == 2023].copy()

In [105]:
merged_data = merged_data.merge(arrest_2023, on='ZipCode', how='left')

In [106]:
merged_data

Unnamed: 0,ZipCode,Year,Population,PopulationDensity,MedianHouseholdIncome,travel_time_to_work_in_minutes,Male,Female,white,black,asian,american_indian,pacific_islander,other,FamilyHousehold,SingleHousehold,DiversityIndex,AvgMonthlyArrests
0,10001,2023,21634.851944,34816.507305,83733.295094,29.276734,10441.158762,11193.693182,14068.497695,1944.901627,3628.364185,76.893844,5.126256,1089.842082,3256.197980,18155.149190,0.501245,52.166667
1,10002,2023,83465.704517,94910.584256,34056.796126,35.559977,39543.940833,43921.763684,26150.058462,7008.617566,38871.376011,383.443969,32.808040,8198.934271,18547.820420,41199.721607,0.644615,155.916667
2,10003,2023,57438.676205,99642.118789,94876.750964,29.120846,27425.471021,30013.205184,43874.602126,2026.921727,8292.232135,103.550377,24.606030,1287.715574,7251.602113,48906.535277,0.353415,168.500000
3,10004,2023,3167.001121,5658.361666,132578.315296,27.330374,1650.654518,1516.346603,2221.719466,99.449372,671.539571,8.202010,1.025251,47.161558,598.746732,2613.365444,0.419255,
4,10005,2023,7315.167691,99498.583614,127818.073727,28.725415,3529.940065,3785.227627,5127.281517,248.110803,1517.371855,10.252513,2.050503,164.040201,953.483665,7456.652364,0.426082,91.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,11691,2023,61550.958982,21719.947798,40404.126635,49.758457,28871.075288,32679.883693,19074.799564,30863.138473,1227.225750,486.994345,63.565578,7402.314048,13440.018677,17998.285749,0.607169,123.166667
208,11692,2023,19008.158233,19034.814766,44448.742828,54.468195,8678.751858,10329.406375,3125.991071,12614.691418,532.105400,119.954397,9.227261,1875.184542,4491.625740,5354.887295,0.482970,34.916667
209,11693,2023,12216.893932,12251.752475,51846.955871,50.766264,5680.917194,6535.976739,6767.683522,3765.747853,381.393466,63.565578,1.025251,838.655525,3112.662804,4946.837296,0.564426,60.916667
210,11694,2023,20923.327574,15321.354727,78886.932421,47.685169,10213.552984,10709.774590,17253.953339,1919.270346,525.953893,39.984799,10.252513,803.796982,4975.544331,9719.381880,0.284370,16.916667


In [107]:
merged_data = merged_data.merge(noise_2023, on='ZipCode', how='left')

In [108]:
merged_data = merged_data.merge(poi_agg, on='ZipCode', how='left')

In [109]:
merged_data

Unnamed: 0,ZipCode,Year,Population,PopulationDensity,MedianHouseholdIncome,travel_time_to_work_in_minutes,Male,Female,white,black,...,AvgMonthlyNoiseComplaints,count,distance_to_facility,cultural_facility,education_facility,health_services,public_safety,recreational_facility,religious_institution,transportation_facility
0,10001,2023,21634.851944,34816.507305,83733.295094,29.276734,10441.158762,11193.693182,14068.497695,1944.901627,...,266.500000,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0
1,10002,2023,83465.704517,94910.584256,34056.796126,35.559977,39543.940833,43921.763684,26150.058462,7008.617566,...,729.916667,127.0,0.845870,5.0,29.0,2.0,9.0,46.0,12.0,24.0
2,10003,2023,57438.676205,99642.118789,94876.750964,29.120846,27425.471021,30013.205184,43874.602126,2026.921727,...,484.916667,130.0,0.621874,21.0,70.0,2.0,5.0,6.0,24.0,2.0
3,10004,2023,3167.001121,5658.361666,132578.315296,27.330374,1650.654518,1516.346603,2221.719466,99.449372,...,77.583333,51.0,1.529254,2.0,1.0,0.0,0.0,13.0,0.0,35.0
4,10005,2023,7315.167691,99498.583614,127818.073727,28.725415,3529.940065,3785.227627,5127.281517,248.110803,...,50.416667,32.0,0.612112,4.0,4.0,0.0,1.0,5.0,2.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,11691,2023,61550.958982,21719.947798,40404.126635,49.758457,28871.075288,32679.883693,19074.799564,30863.138473,...,237.666667,66.0,1.440766,1.0,24.0,3.0,3.0,24.0,7.0,4.0
208,11692,2023,19008.158233,19034.814766,44448.742828,54.468195,8678.751858,10329.406375,3125.991071,12614.691418,...,109.250000,20.0,1.200531,1.0,3.0,0.0,2.0,11.0,0.0,3.0
209,11693,2023,12216.893932,12251.752475,51846.955871,50.766264,5680.917194,6535.976739,6767.683522,3765.747853,...,75.000000,59.0,1.822859,2.0,11.0,0.0,4.0,23.0,3.0,16.0
210,11694,2023,20923.327574,15321.354727,78886.932421,47.685169,10213.552984,10709.774590,17253.953339,1919.270346,...,109.083333,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0


In [110]:
merged_data = merged_data.merge(taxi_2023, on='ZipCode', how='left')

In [111]:
merged_data

Unnamed: 0,ZipCode,Year,Population,PopulationDensity,MedianHouseholdIncome,travel_time_to_work_in_minutes,Male,Female,white,black,...,count,distance_to_facility,cultural_facility,education_facility,health_services,public_safety,recreational_facility,religious_institution,transportation_facility,TotalPassengers2023
0,10001,2023,21634.851944,34816.507305,83733.295094,29.276734,10441.158762,11193.693182,14068.497695,1944.901627,...,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18539258.0
1,10002,2023,83465.704517,94910.584256,34056.796126,35.559977,39543.940833,43921.763684,26150.058462,7008.617566,...,127.0,0.845870,5.0,29.0,2.0,9.0,46.0,12.0,24.0,5184155.0
2,10003,2023,57438.676205,99642.118789,94876.750964,29.120846,27425.471021,30013.205184,43874.602126,2026.921727,...,130.0,0.621874,21.0,70.0,2.0,5.0,6.0,24.0,2.0,11550509.0
3,10004,2023,3167.001121,5658.361666,132578.315296,27.330374,1650.654518,1516.346603,2221.719466,99.449372,...,51.0,1.529254,2.0,1.0,0.0,0.0,13.0,0.0,35.0,2535038.0
4,10005,2023,7315.167691,99498.583614,127818.073727,28.725415,3529.940065,3785.227627,5127.281517,248.110803,...,32.0,0.612112,4.0,4.0,0.0,1.0,5.0,2.0,16.0,1581394.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,11691,2023,61550.958982,21719.947798,40404.126635,49.758457,28871.075288,32679.883693,19074.799564,30863.138473,...,66.0,1.440766,1.0,24.0,3.0,3.0,24.0,7.0,4.0,18444.0
208,11692,2023,19008.158233,19034.814766,44448.742828,54.468195,8678.751858,10329.406375,3125.991071,12614.691418,...,20.0,1.200531,1.0,3.0,0.0,2.0,11.0,0.0,3.0,7362.0
209,11693,2023,12216.893932,12251.752475,51846.955871,50.766264,5680.917194,6535.976739,6767.683522,3765.747853,...,59.0,1.822859,2.0,11.0,0.0,4.0,23.0,3.0,16.0,14023.0
210,11694,2023,20923.327574,15321.354727,78886.932421,47.685169,10213.552984,10709.774590,17253.953339,1919.270346,...,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,14358.0


In [112]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 29 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ZipCode                         212 non-null    int64  
 1   Year                            212 non-null    int64  
 2   Population                      178 non-null    float64
 3   PopulationDensity               178 non-null    float64
 4   MedianHouseholdIncome           178 non-null    float64
 5   travel_time_to_work_in_minutes  181 non-null    float64
 6   Male                            186 non-null    float64
 7   Female                          186 non-null    float64
 8   white                           186 non-null    float64
 9   black                           186 non-null    float64
 10  asian                           186 non-null    float64
 11  american_indian                 186 non-null    float64
 12  pacific_islander                186 

In [77]:
merged_data = merged_data.merge(property_prices, left_on='ZipCode', right_on='ZIP CODE', how='left')

In [78]:
merged_data

Unnamed: 0,ZipCode,Year,Population,PopulationDensity,MedianHouseholdIncome,travel_time_to_work_in_minutes,Male,Female,white,black,...,TotalPassengers2023,ZIP CODE,SALE YEAR,AVG_SALE_PRICE,COUNT_CONDOS,COUNT_COOPS,COUNT_ONE_FAMILY_DWELLINGS,COUNT_THREE_FAMILY_DWELLINGS,COUNT_TWO_FAMILY_DWELLINGS,AVG_BUILDING_AGE
0,10001,2023,21634.851944,34816.507305,83733.295094,29.276734,10441.158762,11193.693182,14068.497695,1944.901627,...,18539258.0,10001.0,2023.0,2.399265e+06,179.0,85.0,0.0,0.0,0.0,35.950758
1,10002,2023,83465.704517,94910.584256,34056.796126,35.559977,39543.940833,43921.763684,26150.058462,7008.617566,...,5184155.0,10002.0,2023.0,1.482110e+06,189.0,102.0,2.0,0.0,3.0,39.128378
2,10003,2023,57438.676205,99642.118789,94876.750964,29.120846,27425.471021,30013.205184,43874.602126,2026.921727,...,11550509.0,10003.0,2023.0,1.640546e+06,196.0,290.0,4.0,2.0,1.0,78.306288
3,10004,2023,3167.001121,5658.361666,132578.315296,27.330374,1650.654518,1516.346603,2221.719466,99.449372,...,2535038.0,10004.0,2023.0,1.378606e+06,81.0,12.0,0.0,0.0,0.0,74.387097
4,10005,2023,7315.167691,99498.583614,127818.073727,28.725415,3529.940065,3785.227627,5127.281517,248.110803,...,1581394.0,10005.0,2023.0,1.312193e+06,121.0,7.0,0.0,0.0,0.0,77.468750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,11691,2023,61550.958982,21719.947798,40404.126635,49.758457,28871.075288,32679.883693,19074.799564,30863.138473,...,18444.0,11691.0,2023.0,7.483099e+05,6.0,7.0,64.0,14.0,53.0,66.506944
208,11692,2023,19008.158233,19034.814766,44448.742828,54.468195,8678.751858,10329.406375,3125.991071,12614.691418,...,7362.0,11692.0,2023.0,7.284113e+05,0.0,0.0,17.0,10.0,54.0,50.234568
209,11693,2023,12216.893932,12251.752475,51846.955871,50.766264,5680.917194,6535.976739,6767.683522,3765.747853,...,14023.0,11693.0,2023.0,4.944068e+05,20.0,0.0,37.0,2.0,22.0,58.432099
210,11694,2023,20923.327574,15321.354727,78886.932421,47.685169,10213.552984,10709.774590,17253.953339,1919.270346,...,14358.0,11694.0,2023.0,8.621666e+05,25.0,20.0,43.0,5.0,31.0,65.032258


In [96]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 56 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ZipCode                         212 non-null    int64  
 1   Population                      178 non-null    float64
 2   PopulationDensity               178 non-null    float64
 3   MedianHouseholdIncome           178 non-null    float64
 4   travel_time_to_work_in_minutes  181 non-null    float64
 5   Male                            186 non-null    float64
 6   Female                          186 non-null    float64
 7   white                           186 non-null    float64
 8   black                           186 non-null    float64
 9   asian                           186 non-null    float64
 10  american_indian                 186 non-null    float64
 11  pacific_islander                186 non-null    float64
 12  other                           186 

In [79]:
# Drop unnecessary columns and rename some for consistency
merged_data = merged_data.drop(columns=['Year', 'SALE YEAR', 'ZIP CODE'])
merged_data = merged_data.rename(columns={
    'AVG_SALE_PRICE': 'AvgSalePrice',
    'COUNT_CONDOS': 'CountCondos',
    'COUNT_COOPS': 'CountCoops',
    'COUNT_ONE_FAMILY_DWELLINGS': 'CountOneFamilyDwellings',
    'COUNT_THREE_FAMILY_DWELLINGS': 'CountThreeFamilyDwellings',
    'COUNT_TWO_FAMILY_DWELLINGS': 'CountTwoFamilyDwellings',
    'AVG_BUILDING_AGE': 'AvgBuildingAge'
})

In [80]:
# Handle missing values
merged_data = merged_data.fillna({
    'AvgMonthlyArrests': 0,
    'AvgMonthlyNoiseComplaints': 0,
    'count': 0,
    'distance_to_facility': 0,
    'cultural_facility': 0,
    'education_facility': 0,
    'health_services': 0,
    'public_safety': 0,
    'recreational_facility': 0,
    'religious_institution': 0,
    'transportation_facility': 0,
    'TotalPassengers2023': 0,
    'AvgSalePrice': merged_data['AvgSalePrice'].mean(),
    'AvgBuildingAge': merged_data['AvgBuildingAge'].mean()
})

In [81]:
# Initialize MinMaxScaler
scaler = MinMaxScaler()

# Feature Engineering

In [82]:
# a) Safety Score
merged_data['NormalizedArrestScore'] = 1 - scaler.fit_transform(merged_data[['AvgMonthlyArrests']])
merged_data['SafetyPOIScore'] = scaler.fit_transform(merged_data[['public_safety']])
merged_data['SafetyScore'] = (merged_data['NormalizedArrestScore'] * 0.7 + merged_data['SafetyPOIScore'] * 0.3)

In [83]:
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 38 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   ZipCode                         212 non-null    int64  
 1   Population                      178 non-null    float64
 2   PopulationDensity               178 non-null    float64
 3   MedianHouseholdIncome           178 non-null    float64
 4   travel_time_to_work_in_minutes  181 non-null    float64
 5   Male                            186 non-null    float64
 6   Female                          186 non-null    float64
 7   white                           186 non-null    float64
 8   black                           186 non-null    float64
 9   asian                           186 non-null    float64
 10  american_indian                 186 non-null    float64
 11  pacific_islander                186 non-null    float64
 12  other                           186 

In [85]:
# b) Busyness Score
merged_data['NormalizedTaxiScore'] = scaler.fit_transform(merged_data[['TotalPassengers2023']])
merged_data['NormalizedNoiseScore'] = scaler.fit_transform(merged_data[['AvgMonthlyNoiseComplaints']])
merged_data['BusynessScore'] = (merged_data['NormalizedTaxiScore'] * 0.5 + merged_data['NormalizedNoiseScore'] * 0.5)

In [86]:
# c) Amenity Score
amenity_columns = ['education_facility', 'health_services', 'recreational_facility', 'cultural_facility', 'religious_institution', 'transportation_facility']
for col in amenity_columns:
    merged_data[f'{col}_score'] = scaler.fit_transform(merged_data[[col]])

merged_data['AmenityScore'] = (
    merged_data['education_facility_score'] * 0.3 +
    merged_data['health_services_score'] * 0.2 +
    merged_data['recreational_facility_score'] * 0.1 +
    merged_data['cultural_facility_score'] * 0.1 +
    merged_data['religious_institution_score'] * 0.1 +
    merged_data['transportation_facility_score'] * 0.2
)

In [87]:
# d) House Quality Score
merged_data['NormalizedAgeScore'] = 1 - scaler.fit_transform(merged_data[['AvgBuildingAge']])
merged_data['NormalizedPriceScore'] = scaler.fit_transform(merged_data[['AvgSalePrice']])
merged_data['HouseQualityScore'] = (merged_data['NormalizedAgeScore'] * 0.4 + merged_data['NormalizedPriceScore'] * 0.6)

In [88]:
# e) Demographic Score
merged_data['NormalizedIncomeScore'] = scaler.fit_transform(merged_data[['MedianHouseholdIncome']])
merged_data['DiversityScore'] = merged_data['DiversityIndex']
merged_data['DemographicScore'] = (merged_data['NormalizedIncomeScore'] * 0.5 + merged_data['DiversityScore'] * 0.5)

In [89]:
# f) Proximity Score
merged_data['ProximityScore'] = 1 - scaler.fit_transform(merged_data[['distance_to_facility']])

In [90]:
# Calculate most common property type
property_types = ['CountCondos', 'CountCoops', 'CountOneFamilyDwellings', 'CountThreeFamilyDwellings', 'CountTwoFamilyDwellings']
merged_data['MostCommonPropertyType'] = merged_data[property_types].idxmax(axis=1).map({
    'CountCondos': 'Condo',
    'CountCoops': 'Co-op',
    'CountOneFamilyDwellings': 'One Family',
    'CountThreeFamilyDwellings': 'Three Family',
    'CountTwoFamilyDwellings': 'Two Family'
})

  merged_data['MostCommonPropertyType'] = merged_data[property_types].idxmax(axis=1).map({


In [93]:
# Select final columns
final_columns = [
    'ZipCode', 'SafetyScore', 'BusynessScore', 'AmenityScore', 'HouseQualityScore',
    'DemographicScore', 'ProximityScore', 'AvgSalePrice', 'MostCommonPropertyType',
    'PopulationDensity', 'MedianHouseholdIncome', 'travel_time_to_work_in_minutes',
    'FamilyHousehold', 'SingleHousehold', 'DiversityIndex'
]

final_data = merged_data[final_columns]

In [94]:
final_data

Unnamed: 0,ZipCode,SafetyScore,BusynessScore,AmenityScore,HouseQualityScore,DemographicScore,ProximityScore,AvgSalePrice,MostCommonPropertyType,PopulationDensity,MedianHouseholdIncome,travel_time_to_work_in_minutes,FamilyHousehold,SingleHousehold,DiversityIndex
0,10001,0.612360,0.503611,0.048628,0.478984,0.396406,0.793966,2.399265e+06,Condo,34816.507305,83733.295094,29.276734,3256.197980,18155.149190,0.501245
1,10002,0.573060,0.388518,0.331596,0.390993,0.353121,0.677485,1.482110e+06,Condo,94910.584256,34056.796126,35.559977,18547.820420,41199.721607,0.644615
2,10003,0.491920,0.434278,0.399793,0.236018,0.348281,0.762890,1.640546e+06,Co-op,99642.118789,94876.750964,29.120846,7251.602113,48906.535277,0.353415
3,10004,0.700000,0.084412,0.193392,0.231564,0.468457,0.416923,1.378606e+06,Condo,5658.361666,132578.315296,27.330374,598.746732,2613.365444,0.419255
4,10005,0.561700,0.053421,0.113315,0.212977,0.460853,0.766613,1.312193e+06,Condo,99498.583614,127818.073727,28.725415,953.483665,7456.652364,0.426082
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,11691,0.538080,0.090222,0.175882,0.214204,0.349088,0.450662,7.483099e+05,One Family,21719.947798,40404.126635,49.758457,13440.018677,17998.285749,0.607169
208,11692,0.671340,0.041449,0.039679,0.282295,0.296349,0.542259,7.284113e+05,Two Family,19034.814766,44448.742828,54.468195,4491.625740,5354.887295,0.482970
209,11693,0.657660,0.028649,0.155444,0.228202,0.354199,0.304976,4.944068e+05,One Family,12251.752475,51846.955871,50.766264,3112.662804,4946.837296,0.564426
210,11694,0.731580,0.041538,0.138800,0.229754,0.276752,0.057877,8.621666e+05,One Family,15321.354727,78886.932421,47.685169,4975.544331,9719.381880,0.284370


In [95]:
final_data.to_csv('training_data2.csv', index=False)