In [93]:
from uszipcode import SearchEngine
import pandas as pd
import numpy as np
from scipy import stats
import json

In [94]:
search = SearchEngine(simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.comprehensive)

# NY 
zipcodes = search.by_state("NY", returns=None)

zipcode_info_list = []
for zipcode in zipcodes:
    zipcode_info = zipcode.to_dict()
    zipcode_info_list.append(zipcode_info)

# Convert 
df = pd.DataFrame(zipcode_info_list)

In [95]:
# Features 
features = [
    "zipcode",
    "population_by_year",
    "average_household_income_over_time"
]

In [96]:
df_demographic = df[features]

In [97]:
df_demographic

Unnamed: 0,zipcode,population_by_year,average_household_income_over_time
0,10001,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 1...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 9..."
1,10002,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 7...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 3..."
2,10003,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 3...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 1..."
3,10004,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 2...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 1..."
4,10005,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 4...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 8..."
...,...,...,...
1663,14901,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 1...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 2..."
1664,14903,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 7...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 4..."
1665,14904,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 1...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 3..."
1666,14905,"[{'key': 'Data', 'values': [{'x': 2005, 'y': 8...","[{'key': 'Data', 'values': [{'x': 2005, 'y': 5..."


In [98]:
nyc_zipcodes = [
    '10001', '10002', '10003', '10004', '10005', '10006', '10007', '10009', '10010', '10011', '10012',
    '10013', '10014', '10016', '10017', '10018', '10019', '10020', '10021', '10022', '10023', '10024',
    '10025', '10026', '10027', '10028', '10029', '10030', '10031', '10032', '10033', '10034', '10035',
    '10036', '10037', '10038', '10039', '10040', '10044', '10045', '10055', '10060', '10065', '10069',
    '10075', '10080', '10103', '10110', '10111', '10112', '10115', '10119', '10128', '10151', '10152',
    '10153', '10154', '10162', '10165', '10167', '10168', '10169', '10170', '10171', '10172', '10173',
    '10174', '10177', '10199', '10271', '10278', '10279', '10280', '10281', '10282', '10301', '10302',
    '10303', '10304', '10305', '10306', '10307', '10308', '10309', '10310', '10311', '10312', '10314',
    '10451', '10452', '10453', '10454', '10455', '10456', '10457', '10458', '10459', '10460', '10461',
    '10462', '10463', '10464', '10465', '10466', '10467', '10468', '10469', '10470', '10471', '10472',
    '10473', '10474', '10475', '11004', '11005', '11101', '11102', '11103', '11104', '11105', '11106',
    '11109', '11201', '11203', '11204', '11205', '11206', '11207', '11208', '11209', '11210', '11211',
    '11212', '11213', '11214', '11215', '11216', '11217', '11218', '11219', '11220', '11221', '11222',
    '11223', '11224', '11225', '11226', '11228', '11229', '11230', '11231', '11232', '11233', '11234',
    '11235', '11236', '11237', '11238', '11239', '11249', '11251', '11354', '11355', '11356', '11357',
    '11358', '11359', '11360', '11361', '11362', '11363', '11364', '11365', '11366', '11367', '11368',
    '11369', '11370', '11372', '11373', '11374', '11375', '11377', '11378', '11379', '11385', '11411',
    '11412', '11413', '11414', '11415', '11416', '11417', '11418', '11419', '11420', '11421', '11422',
    '11423', '11426', '11427', '11428', '11429', '11430', '11432', '11433', '11434', '11435', '11436',
    '11691', '11692', '11693', '11694', '11697'
]

In [99]:
df_demographic = df_demographic[df_demographic['zipcode'].isin(nyc_zipcodes)]

In [100]:
df_demographic = df_demographic.dropna()

In [101]:
# clean demographic data
def extract_clean_demographic_data(df, column_name):
    clean_data = []
    for index, row in df.iterrows():
        zipcode = row['zipcode']
        data = row[column_name]
        for entry in data:
            for value in entry['values']:
                year = value['x']
                if year >= 2010:  
                    clean_data.append([zipcode, year, value['y']])
    return clean_data

In [102]:
population_data = extract_clean_demographic_data(df_demographic, 'population_by_year')

In [103]:
income_data = extract_clean_demographic_data(df_demographic, 'average_household_income_over_time')

In [104]:
population_df = pd.DataFrame(population_data, columns=['zipcode', 'year', 'population'])
income_df = pd.DataFrame(income_data, columns=['zipcode', 'year', 'average_household_income'])

In [105]:
demographic_cleaned = pd.merge(population_df, income_df, on=['zipcode', 'year'])

In [106]:
demographic_cleaned

Unnamed: 0,zipcode,year,population,average_household_income
0,10001,2010,18158,128706.5596
1,10001,2011,18684,129849.7425
2,10001,2012,18280,144040.5263
3,10001,2013,18840,152670.6997
4,10001,2014,19370,167894.8864
...,...,...,...,...
1617,11697,2014,3530,94975.3846
1618,11697,2015,3740,102215.6863
1619,11697,2016,3900,99142.0814
1620,11697,2017,3920,106587.0536


In [107]:
from sklearn.linear_model import LinearRegression

# Fill missing data for 2019 to 2023 using linear regression
def fill_missing_years(df, start_year, end_year):
    filled_data = []
    for zipcode in df['zipcode'].unique():
        zipcode_df = df[df['zipcode'] == zipcode]
        for feature in ['population', 'average_household_income']:
            X = zipcode_df[['year']]
            y = zipcode_df[feature]
            model = LinearRegression()
            model.fit(X, y)
            for year in range(start_year, end_year + 1):
                prediction = model.predict(np.array([[year]]))[0]
                filled_data.append([zipcode, year, feature, prediction])
    return filled_data

filled_data = fill_missing_years(demographic_cleaned, 2019, 2023)



In [108]:
filled_df = pd.DataFrame(filled_data, columns=['zipcode', 'year', 'feature', 'value'])

In [109]:
filled_pivot = filled_df.pivot_table(index=['zipcode', 'year'], columns='feature', values='value').reset_index()
final_df = pd.concat([demographic_cleaned, filled_pivot])

In [110]:
final_df = final_df.sort_values(['zipcode', 'year'])

In [113]:
final_df.isnull().sum()

zipcode                     0
year                        0
population                  0
average_household_income    0
dtype: int64

In [114]:
final_df

Unnamed: 0,zipcode,year,population,average_household_income
0,10001,2010,18158.000000,128706.559600
1,10001,2011,18684.000000,129849.742500
2,10001,2012,18280.000000,144040.526300
3,10001,2013,18840.000000,152670.699700
4,10001,2014,19370.000000,167894.886400
...,...,...,...,...
910,11697,2019,3961.000000,110465.424356
911,11697,2020,4001.466667,113336.626616
912,11697,2021,4041.933333,116207.828876
913,11697,2022,4082.400000,119079.031136


In [115]:
final_df.to_csv('cleaned_demographic_data.csv', index=False)