In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [243]:
property_df = pd.read_csv('data/cleaned_monthly_avg_prices.csv')
arrest_df = pd.read_csv('data/cleaned_arrest_data.csv')
noise_df = pd.read_csv('data/cleaned_noise_data.csv')
demographic_df = pd.read_csv('data/cleaned_demographic_data.csv')
poi_df = pd.read_csv('data/cleaned_POI.csv')

In [244]:
demographic_df = demographic_df.rename(columns={'year': 'Year'})


In [245]:
# Preprocess property data
property_df['YearMonth'] = pd.to_datetime(property_df['YearMonth'])
property_df['Year'] = property_df['YearMonth'].dt.year
property_df['Month'] = property_df['YearMonth'].dt.month

In [246]:
# Preprocess arrest and noise data
for df in [arrest_df, noise_df]:
    df['YearMonth'] = pd.to_datetime(df['YearMonth'])
    df['Year'] = df['YearMonth'].dt.year
    df['Month'] = df['YearMonth'].dt.month

In [247]:
# Aggregate POI data by ZIP_CODE
poi_agg = poi_df.groupby('zipcode').agg({
    'count': 'sum',
    'distance_to_facility': 'mean',
    'cultural_facility': 'sum',
    'education_facility': 'sum',
    'health_services': 'sum',
    'public_safety': 'sum',
    'recreational_facility': 'sum',
    'religious_institution': 'sum',
    'transportation_facility': 'sum'
}).reset_index()

In [248]:
# Merge property and arrest data
merged_df = property_df.merge(arrest_df, on=['ZipCode', 'Year', 'Month'], how='left')

In [249]:
# Merge noise data
merged_df = merged_df.merge(noise_df, on=['ZipCode', 'Year', 'Month'], how='left')

In [250]:
merged_df = merged_df.rename(columns={'ZipCode': 'zipcode'})

In [251]:
# Merge POI data
merged_df = merged_df.merge(poi_agg, on='zipcode', how='left')

In [252]:
# Merge demographic data
merged_df = merged_df.merge(demographic_df, on=['zipcode', 'Year'], how='left')

In [253]:
merged_df

Unnamed: 0,zipcode,Borough,YearMonth_x,AveragePrice,Year,Month,YearMonth_y,ArrestCount,YearMonth,NoiseComplaints,...,distance_to_facility,cultural_facility,education_facility,health_services,public_safety,recreational_facility,religious_institution,transportation_facility,population,average_household_income
0,10001,New York County,2010-01-31,1522433,2010,1,2010-01-01,166.0,2010-01-01,98.0,...,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.0,128706.5596
1,10001,New York County,2010-02-28,1514631,2010,2,2010-02-01,130.0,2010-02-01,93.0,...,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.0,128706.5596
2,10001,New York County,2010-03-31,1509422,2010,3,2010-03-01,157.0,2010-03-01,105.0,...,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.0,128706.5596
3,10001,New York County,2010-04-30,1502416,2010,4,2010-04-01,149.0,2010-04-01,143.0,...,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.0,128706.5596
4,10001,New York County,2010-05-31,1503079,2010,5,2010-05-01,150.0,2010-05-01,127.0,...,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.0,128706.5596
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29751,11694,Queens County,2024-01-31,849472,2024,1,NaT,,2024-01-01,68.0,...,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,,
29752,11694,Queens County,2024-02-29,850100,2024,2,NaT,,2024-02-01,83.0,...,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,,
29753,11694,Queens County,2024-03-31,852616,2024,3,NaT,,2024-03-01,98.0,...,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,,
29754,11694,Queens County,2024-04-30,857508,2024,4,NaT,,2024-04-01,107.0,...,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,,


In [254]:
merged_df = merged_df[merged_df['Year'] < 2024]

In [255]:
merged_df = merged_df.drop(columns=['YearMonth_x'])

# Rename YearMonth_x to YearMonth for clarity
merged_df.rename(columns={'YearMonth_y': 'YearMonth'}, inplace=True)

In [256]:
merged_df.isnull().sum()

zipcode                        0
Borough                        0
AveragePrice                   0
Year                           0
Month                          0
YearMonth                   1257
ArrestCount                 1257
YearMonth                   6233
NoiseComplaints             6233
count                        168
distance_to_facility         168
cultural_facility            168
education_facility           168
health_services              168
public_safety                168
recreational_facility        168
religious_institution        168
transportation_facility      168
population                   168
average_household_income     168
dtype: int64

In [257]:
merged_df.dropna(inplace=True)

In [258]:
merged_df.isnull().sum()

zipcode                     0
Borough                     0
AveragePrice                0
Year                        0
Month                       0
YearMonth                   0
ArrestCount                 0
YearMonth                   0
NoiseComplaints             0
count                       0
distance_to_facility        0
cultural_facility           0
education_facility          0
health_services             0
public_safety               0
recreational_facility       0
religious_institution       0
transportation_facility     0
population                  0
average_household_income    0
dtype: int64

In [259]:
unique_zipcodes = merged_df['zipcode'].nunique()
print(f"Number of unique NYC zip codes: {unique_zipcodes}")

Number of unique NYC zip codes: 166


In [260]:
merged_df

Unnamed: 0,zipcode,Borough,AveragePrice,Year,Month,YearMonth,ArrestCount,YearMonth.1,NoiseComplaints,count,distance_to_facility,cultural_facility,education_facility,health_services,public_safety,recreational_facility,religious_institution,transportation_facility,population,average_household_income
0,10001,New York County,1522433,2010,1,2010-01-01,166.0,2010-01-01,98.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600
1,10001,New York County,1514631,2010,2,2010-02-01,130.0,2010-02-01,93.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600
2,10001,New York County,1509422,2010,3,2010-03-01,157.0,2010-03-01,105.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600
3,10001,New York County,1502416,2010,4,2010-04-01,149.0,2010-04-01,143.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600
4,10001,New York County,1503079,2010,5,2010-05-01,150.0,2010-05-01,127.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29746,11694,Queens County,852217,2023,8,2023-08-01,24.0,2023-08-01,145.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358
29747,11694,Queens County,853475,2023,9,2023-09-01,11.0,2023-09-01,110.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358
29748,11694,Queens County,853296,2023,10,2023-10-01,16.0,2023-10-01,95.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358
29749,11694,Queens County,851537,2023,11,2023-11-01,12.0,2023-11-01,67.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358


In [261]:
# Map zip codes to boroughs
zip_to_borough = {
    # Bronx
    '10453': 'Bronx', '10457': 'Bronx', '10460': 'Bronx',
    '10458': 'Bronx', '10467': 'Bronx', '10468': 'Bronx',
    '10451': 'Bronx', '10452': 'Bronx', '10456': 'Bronx',
    '10454': 'Bronx', '10455': 'Bronx', '10459': 'Bronx', '10474': 'Bronx',
    '10463': 'Bronx', '10471': 'Bronx',
    '10466': 'Bronx', '10469': 'Bronx', '10470': 'Bronx', '10475': 'Bronx',
    '10461': 'Bronx', '10462': 'Bronx', '10464': 'Bronx', '10465': 'Bronx', '10472': 'Bronx', '10473': 'Bronx',

    # Brooklyn
    '11212': 'Brooklyn', '11213': 'Brooklyn', '11216': 'Brooklyn', '11233': 'Brooklyn', '11238': 'Brooklyn',
    '11209': 'Brooklyn', '11214': 'Brooklyn', '11228': 'Brooklyn',
    '11204': 'Brooklyn', '11218': 'Brooklyn', '11219': 'Brooklyn', '11230': 'Brooklyn',
    '11234': 'Brooklyn', '11236': 'Brooklyn', '11239': 'Brooklyn',
    '11223': 'Brooklyn', '11224': 'Brooklyn', '11229': 'Brooklyn', '11235': 'Brooklyn',
    '11201': 'Brooklyn', '11205': 'Brooklyn', '11215': 'Brooklyn', '11217': 'Brooklyn', '11231': 'Brooklyn',
    '11203': 'Brooklyn', '11210': 'Brooklyn', '11225': 'Brooklyn', '11226': 'Brooklyn',
    '11207': 'Brooklyn', '11208': 'Brooklyn',
    '11211': 'Brooklyn', '11222': 'Brooklyn',
    '11220': 'Brooklyn', '11232': 'Brooklyn',
    '11206': 'Brooklyn', '11221': 'Brooklyn', '11237': 'Brooklyn',

    # Manhattan
    '10026': 'Manhattan', '10027': 'Manhattan', '10030': 'Manhattan', '10037': 'Manhattan', '10039': 'Manhattan',
    '10001': 'Manhattan', '10011': 'Manhattan', '10018': 'Manhattan', '10019': 'Manhattan', '10020': 'Manhattan', '10036': 'Manhattan',
    '10029': 'Manhattan', '10035': 'Manhattan',
    '10010': 'Manhattan', '10016': 'Manhattan', '10017': 'Manhattan', '10022': 'Manhattan',
    '10012': 'Manhattan', '10013': 'Manhattan', '10014': 'Manhattan',
    '10004': 'Manhattan', '10005': 'Manhattan', '10006': 'Manhattan', '10007': 'Manhattan', '10038': 'Manhattan', '10280': 'Manhattan',
    '10002': 'Manhattan', '10003': 'Manhattan', '10009': 'Manhattan',
    '10021': 'Manhattan', '10028': 'Manhattan', '10044': 'Manhattan', '10065': 'Manhattan', '10075': 'Manhattan', '10128': 'Manhattan',
    '10023': 'Manhattan', '10024': 'Manhattan', '10025': 'Manhattan',
    '10031': 'Manhattan', '10032': 'Manhattan', '10033': 'Manhattan', '10034': 'Manhattan', '10040': 'Manhattan',

    # Queens
    '11361': 'Queens', '11362': 'Queens', '11363': 'Queens', '11364': 'Queens',
    '11354': 'Queens', '11355': 'Queens', '11356': 'Queens', '11357': 'Queens', '11358': 'Queens', '11359': 'Queens', '11360': 'Queens',
    '11365': 'Queens', '11366': 'Queens', '11367': 'Queens',
    '11412': 'Queens', '11423': 'Queens', '11432': 'Queens', '11433': 'Queens', '11434': 'Queens', '11435': 'Queens', '11436': 'Queens',
    '11101': 'Queens', '11102': 'Queens', '11103': 'Queens', '11104': 'Queens', '11105': 'Queens', '11106': 'Queens',
    '11374': 'Queens', '11375': 'Queens', '11379': 'Queens', '11385': 'Queens',
    '11691': 'Queens', '11692': 'Queens', '11693': 'Queens', '11694': 'Queens', '11695': 'Queens', '11697': 'Queens',
    '11004': 'Queens', '11005': 'Queens', '11411': 'Queens', '11413': 'Queens', '11422': 'Queens', '11426': 'Queens', '11427': 'Queens', '11428': 'Queens', '11429': 'Queens',
    '11414': 'Queens', '11415': 'Queens', '11416': 'Queens', '11417': 'Queens', '11418': 'Queens', '11419': 'Queens', '11420': 'Queens', '11421': 'Queens',
    '11368': 'Queens', '11369': 'Queens', '11370': 'Queens', '11372': 'Queens', '11373': 'Queens', '11377': 'Queens', '11378': 'Queens',

    # Staten Island
    '10302': 'Staten Island', '10303': 'Staten Island', '10310': 'Staten Island',
    '10306': 'Staten Island', '10307': 'Staten Island', '10308': 'Staten Island', '10309': 'Staten Island', '10312': 'Staten Island',
    '10301': 'Staten Island', '10304': 'Staten Island', '10305': 'Staten Island',
    '10314': 'Staten Island'
}

# Map zip codes to boroughs
def get_borough(zipcode):
    str_zip = str(zipcode).zfill(5)  
    return zip_to_borough.get(str_zip, 'Unknown') 

merged_df = merged_df.drop(columns=['Borough'])

# Apply 
merged_df['Borough'] = merged_df['zipcode'].apply(get_borough)

In [262]:
merged_df

Unnamed: 0,zipcode,AveragePrice,Year,Month,YearMonth,ArrestCount,YearMonth.1,NoiseComplaints,count,distance_to_facility,cultural_facility,education_facility,health_services,public_safety,recreational_facility,religious_institution,transportation_facility,population,average_household_income,Borough
0,10001,1522433,2010,1,2010-01-01,166.0,2010-01-01,98.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
1,10001,1514631,2010,2,2010-02-01,130.0,2010-02-01,93.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
2,10001,1509422,2010,3,2010-03-01,157.0,2010-03-01,105.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
3,10001,1502416,2010,4,2010-04-01,149.0,2010-04-01,143.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
4,10001,1503079,2010,5,2010-05-01,150.0,2010-05-01,127.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29746,11694,852217,2023,8,2023-08-01,24.0,2023-08-01,145.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29747,11694,853475,2023,9,2023-09-01,11.0,2023-09-01,110.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29748,11694,853296,2023,10,2023-10-01,16.0,2023-10-01,95.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29749,11694,851537,2023,11,2023-11-01,12.0,2023-11-01,67.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens


In [263]:
merged_df

Unnamed: 0,zipcode,AveragePrice,Year,Month,YearMonth,ArrestCount,YearMonth.1,NoiseComplaints,count,distance_to_facility,cultural_facility,education_facility,health_services,public_safety,recreational_facility,religious_institution,transportation_facility,population,average_household_income,Borough
0,10001,1522433,2010,1,2010-01-01,166.0,2010-01-01,98.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
1,10001,1514631,2010,2,2010-02-01,130.0,2010-02-01,93.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
2,10001,1509422,2010,3,2010-03-01,157.0,2010-03-01,105.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
3,10001,1502416,2010,4,2010-04-01,149.0,2010-04-01,143.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
4,10001,1503079,2010,5,2010-05-01,150.0,2010-05-01,127.0,18.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29746,11694,852217,2023,8,2023-08-01,24.0,2023-08-01,145.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29747,11694,853475,2023,9,2023-09-01,11.0,2023-09-01,110.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29748,11694,853296,2023,10,2023-10-01,16.0,2023-10-01,95.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29749,11694,851537,2023,11,2023-11-01,12.0,2023-11-01,67.0,47.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens


In [264]:
# Drop 'Year' and 'Month' columns as they are redundant
merged_df = merged_df.drop(columns=['Year', 'Month', 'count'])
merged_df = merged_df.loc[:, ~merged_df.columns.duplicated()]

In [265]:
merged_df

Unnamed: 0,zipcode,AveragePrice,YearMonth,ArrestCount,NoiseComplaints,distance_to_facility,cultural_facility,education_facility,health_services,public_safety,recreational_facility,religious_institution,transportation_facility,population,average_household_income,Borough
0,10001,1522433,2010-01-01,166.0,98.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
1,10001,1514631,2010-02-01,130.0,93.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
2,10001,1509422,2010-03-01,157.0,105.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
3,10001,1502416,2010-04-01,149.0,143.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
4,10001,1503079,2010-05-01,150.0,127.0,0.540371,0.0,9.0,0.0,0.0,5.0,4.0,0.0,18158.000000,128706.559600,Manhattan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29746,11694,852217,2023-08-01,24.0,145.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29747,11694,853475,2023-09-01,11.0,110.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29748,11694,853296,2023-10-01,16.0,95.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens
29749,11694,851537,2023-11-01,12.0,67.0,2.470934,1.0,7.0,0.0,4.0,14.0,3.0,18.0,15520.288889,103495.462358,Queens


In [266]:
merged_df.to_csv('data/training_data.csv', index=False)

In [267]:
# Ensure unique column names
def make_column_names_unique(df):
    cols = pd.Series(df.columns)
    for dup in cols[cols.duplicated()].unique():
        cols[cols[cols == dup].index.values.tolist()] = [dup + '_' + str(i) if i != 0 else dup for i in range(sum(cols == dup))]
    df.columns = cols
    return df

# Apply the function to the merged_df
merged_df = make_column_names_unique(merged_df)

In [268]:
# Save to JSON
merged_df.to_json('data/training_data.json', orient='records', indent=4)