In [26]:
import pandas as pd
import json
from uszipcode import SearchEngine


search = SearchEngine(simple_or_comprehensive=SearchEngine.SimpleOrComprehensiveArgEnum.comprehensive)

# Fetch NY zipcodes
zipcodes = search.by_state("NY", returns=None)

# collect info
zipcode_info_list = []
for zipcode in zipcodes:
    zipcode_info = zipcode.to_dict()
    zipcode_info_list.append(zipcode_info)

# Convert dictionaries to a DataFrame
df = pd.DataFrame(zipcode_info_list)

In [27]:
columns_to_keep = [
    "zipcode", "lat", "lng", "population",  "population_density", "median_home_value", "median_household_income"
]

In [28]:
df_cleaned = df[columns_to_keep]


In [29]:
df_cleaned.shape

(1668, 7)

In [30]:
# check duplicated rows
print('Number of duplicate (excluding original) rows is:', df_cleaned.duplicated().sum())
print('Number of duplicate rows (including first) in the table is:', df_cleaned[df_cleaned.duplicated(keep=False)].shape[0])
# Show duplicate row data that can be dropped
df_cleaned[df_cleaned.duplicated(keep=False)]

Number of duplicate (excluding original) rows is: 0
Number of duplicate rows (including first) in the table is: 0


Unnamed: 0,zipcode,lat,lng,population,population_density,median_home_value,median_household_income


In [31]:
# Check 
missing_data = df_cleaned[df_cleaned.isnull().any(axis=1) | (df_cleaned == '').any(axis=1)]

# Display 
print("Rows with missing values:")
print(missing_data)

Rows with missing values:
     zipcode    lat    lng  population  population_density  median_home_value  \
13     10015  40.71 -74.00         NaN                 NaN                NaN   
18     10020  40.76 -73.98         NaN                 NaN                NaN   
39     10041  40.71 -73.99         NaN                 NaN                NaN   
41     10045  40.71 -73.99         NaN                 NaN                NaN   
42     10048  40.71 -74.01         NaN                 NaN                NaN   
...      ...    ...    ...         ...                 ...                ...   
1520   14604  43.16 -77.61         NaN                 NaN                NaN   
1530   14614  43.16 -77.62         NaN                 NaN                NaN   
1631   14853  42.45 -76.48         NaN                 NaN                NaN   
1650   14881  42.40 -76.36         NaN                 NaN                NaN   
1667   14925  42.08 -76.80         NaN                 NaN                NaN   

 

In [32]:
df_cleaned['population_density'].fillna(df_cleaned['population_density'].median(), inplace=True)
df_cleaned['median_home_value'].fillna(df_cleaned['median_home_value'].median(), inplace=True)
df_cleaned['median_household_income'].fillna(df_cleaned['median_household_income'].median(), inplace=True)
df_cleaned['population'].fillna(df_cleaned['population'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_cleaned['population_density'].fillna(df_cleaned['population_density'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['population_density'].fillna(df_cleaned['population_density'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, i

In [36]:
# verify
df_cleaned.isnull().sum()


zipcode                    0
lat                        0
lng                        0
population                 0
population_density         0
median_home_value          0
median_household_income    0
dtype: int64

In [37]:
df_cleaned.to_csv('cleaned_price_data.csv', index=False)