In [244]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

In [245]:
# Raw house-listing table. Later cells read its PRICE, TYPE, BEDS, BATH,
# PROPERTYSQFT, LATITUDE and LONGITUDE columns, among others.
data = pd.read_csv('NY-House-Dataset.csv')

In [246]:
# ZIP-code lookup table. It is merged onto the listings through its 'ZIP Codes'
# column and contributes the Borough and Neighborhood columns.
zip_borough_neighborhood = pd.read_csv('NYC Zipcodes.csv')

In [247]:
# Force PRICE to text so the later `.str.replace(',', '')` clean-up works no
# matter which dtype read_csv inferred for the column.
data['PRICE'] = data['PRICE'].astype(str)

In [248]:
# Overview of the raw table: column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4801 entries, 0 to 4800
Data columns (total 17 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   BROKERTITLE                  4801 non-null   object 
 1   TYPE                         4801 non-null   object 
 2   PRICE                        4801 non-null   object 
 3   BEDS                         4801 non-null   int64  
 4   BATH                         4801 non-null   float64
 5   PROPERTYSQFT                 4801 non-null   float64
 6   ADDRESS                      4801 non-null   object 
 7   STATE                        4801 non-null   object 
 8   MAIN_ADDRESS                 4801 non-null   object 
 9   ADMINISTRATIVE_AREA_LEVEL_2  4801 non-null   object 
 10  LOCALITY                     4801 non-null   object 
 11  SUBLOCALITY                  4801 non-null   object 
 12  STREET_NAME                  4801 non-null   object 
 13  LONG_NAME         

In [249]:
# Broker and redundant address / administrative-area fields that the later
# cells shown in this notebook do not use.
columns_to_drop = [
    'BROKERTITLE',
    'MAIN_ADDRESS',
    'ADMINISTRATIVE_AREA_LEVEL_2',
    'FORMATTED_ADDRESS',
    'LOCALITY',
    'SUBLOCALITY',
    'STATE',
]
data_cleaned = data.drop(labels=columns_to_drop, axis='columns')

In [250]:
# Listing types excluded from the analysis. A row is removed when its TYPE
# matches one of these values exactly.
types_to_drop = [
    'Pending',
    'Contingent',
    'Land for sale',
    'For sale',
    'Foreclosure',
    'Condop for sale',
    'Coming Soon',
    'Mobile house for sale',
]
is_excluded_type = data_cleaned['TYPE'].isin(types_to_drop)
data_cleaned = data_cleaned.loc[~is_excluded_type]

In [251]:
# Coerce the numeric columns to real numbers. With errors='coerce', any value
# that cannot be parsed becomes NaN instead of raising.
#
# PRICE is text at this point, so thousands separators are stripped first.
# regex=False makes the comma an explicit literal: the default of `regex`
# differs between pandas versions, and the TYPE clean-up later in this
# notebook already passes regex=False.
data_cleaned = data_cleaned.assign(
    PRICE=pd.to_numeric(
        data_cleaned['PRICE'].str.replace(',', '', regex=False),
        errors='coerce',
    ),
    BEDS=pd.to_numeric(data_cleaned['BEDS'], errors='coerce'),
    BATH=pd.to_numeric(data_cleaned['BATH'], errors='coerce'),
    PROPERTYSQFT=pd.to_numeric(data_cleaned['PROPERTYSQFT'], errors='coerce'),
)

In [252]:
# Keep listings priced at 100000 or more. A NaN PRICE compares False, so rows
# whose price could not be parsed are dropped here as well.
data_cleaned = data_cleaned.loc[data_cleaned['PRICE'].ge(100000)]

In [253]:
# Load the ZCTA polygons used to look up each listing's ZIP code (the ZIP code
# is later taken from the 'ZCTA5CE10' column of this layer).
# NOTE(review): going by the file name this looks like a 2018 ZCTA boundary
# file at 1:500k resolution — confirm the source and vintage.
shapefile_path = 'cb_2018_us_zcta510_500k/cb_2018_us_zcta510_500k.shp'
zip_shapefile  = gpd.read_file(shapefile_path)

In [254]:
# Turn each listing into a point geometry. Point takes (x, y), i.e.
# (longitude, latitude), which is why LONGITUDE comes first.
geometry = [Point(xy) for xy in zip(data_cleaned['LONGITUDE'], data_cleaned['LATITUDE'])]
geo_data_cleaned = gpd.GeoDataFrame(data_cleaned, geometry=geometry)

# Tag the points with the shapefile's CRS so both layers agree for the join.
# NOTE(review): set_crs only labels the coordinates, it does not reproject
# them. If LATITUDE/LONGITUDE are expressed in a different CRS than the
# shapefile, declare that CRS here and convert with to_crs instead — confirm.
geo_data_cleaned = geo_data_cleaned.set_crs(zip_shapefile.crs, allow_override=True)

# Attach to every listing the ZCTA polygon that contains it. how='left' keeps
# listings that fall inside no polygon (their ZCTA columns are NaN).
# `predicate` replaces the deprecated `op` keyword, which newer geopandas
# releases no longer accept.
joined_data = gpd.sjoin(geo_data_cleaned, zip_shapefile, how='left', predicate='within')

# Keep only the listing attributes plus the matched ZIP code from the shapefile.
joined_data = joined_data[['PRICE', 'BEDS', 'TYPE', 'BATH', 'PROPERTYSQFT', 'LATITUDE', 'LONGITUDE', 'STREET_NAME', 'LONG_NAME', 'geometry', 'ZCTA5CE10']]

# Rename the ZIP code column for clarity.
joined_data = joined_data.rename(columns={'ZCTA5CE10': 'ZIPCODE'})

  if await self.run_code(code, result, async_=asy):


In [255]:
# The merge keys must share a dtype, so both ZIP code columns are cast to text.
# Missing ZIP codes become the literal string 'nan' under this cast.
joined_data = joined_data.astype({'ZIPCODE': str})
zip_borough_neighborhood = zip_borough_neighborhood.astype({'ZIP Codes': str})

In [256]:
# Left join keeps every listing. A listing whose ZIPCODE has no match in the
# lookup gets NaN in the lookup columns. The lookup's own key column
# ('ZIP Codes') is carried into the result next to ZIPCODE.
joined_data = joined_data.merge(zip_borough_neighborhood, left_on='ZIPCODE', right_on='ZIP Codes', how='left')

In [257]:
# Preview the merged table (listing attributes plus Borough / Neighborhood).
joined_data

Unnamed: 0,PRICE,BEDS,TYPE,BATH,PROPERTYSQFT,LATITUDE,LONGITUDE,STREET_NAME,LONG_NAME,geometry,ZIPCODE,Borough,Neighborhood,ZIP Codes
0,315000,2,Condo for sale,2.000000,1400.000000,40.761255,-73.974483,East 55th Street,Regis Residence,POINT (-73.97448 40.76125),10022,Manhattan,Gramercy Park and Murray Hill,10022
1,195000000,7,Condo for sale,10.000000,17545.000000,40.766393,-73.980991,New York,West 57th Street,POINT (-73.98099 40.76639),10019,Manhattan,Chelsea and Clinton,10019
2,260000,4,House for sale,2.000000,2015.000000,40.541805,-74.196109,Staten Island,Sinclair Avenue,POINT (-74.19611 40.54181),10312,Staten Island,South Shore,10312
3,55000000,7,Townhouse for sale,2.373861,14175.000000,40.767224,-73.969856,New York,East 64th Street,POINT (-73.96986 40.76722),10065,Manhattan,Upper East Side,10065
4,690000,5,House for sale,2.000000,4004.000000,40.674363,-73.958725,Brooklyn,Park Place,POINT (-73.95872 40.67436),11238,Brooklyn,Central Brooklyn,11238
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4362,599000,1,Co-op for sale,1.000000,2184.207862,40.774350,-73.955879,Manhattan,222,POINT (-73.95588 40.77435),10075,Manhattan,Upper East Side,10075
4363,245000,1,Co-op for sale,1.000000,2184.207862,40.732538,-73.860152,Queens,62nd Drive,POINT (-73.86015 40.73254),11374,Queens,West Central Queens,11374
4364,1275000,1,Co-op for sale,1.000000,2184.207862,40.745882,-74.003398,New York,West 21st Street,POINT (-74.00340 40.74588),10011,Manhattan,Chelsea and Clinton,10011
4365,598125,2,Condo for sale,1.000000,655.000000,40.742770,-73.872752,Flushing,91-23,POINT (-73.87275 40.74277),11373,Queens,West Queens,11373


In [258]:
# Preview only: the result is displayed but not assigned, so joined_data still
# contains its geometry column after this cell.
joined_data.drop(columns=['geometry'])

Unnamed: 0,PRICE,BEDS,TYPE,BATH,PROPERTYSQFT,LATITUDE,LONGITUDE,STREET_NAME,LONG_NAME,ZIPCODE,Borough,Neighborhood,ZIP Codes
0,315000,2,Condo for sale,2.000000,1400.000000,40.761255,-73.974483,East 55th Street,Regis Residence,10022,Manhattan,Gramercy Park and Murray Hill,10022
1,195000000,7,Condo for sale,10.000000,17545.000000,40.766393,-73.980991,New York,West 57th Street,10019,Manhattan,Chelsea and Clinton,10019
2,260000,4,House for sale,2.000000,2015.000000,40.541805,-74.196109,Staten Island,Sinclair Avenue,10312,Staten Island,South Shore,10312
3,55000000,7,Townhouse for sale,2.373861,14175.000000,40.767224,-73.969856,New York,East 64th Street,10065,Manhattan,Upper East Side,10065
4,690000,5,House for sale,2.000000,4004.000000,40.674363,-73.958725,Brooklyn,Park Place,11238,Brooklyn,Central Brooklyn,11238
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4362,599000,1,Co-op for sale,1.000000,2184.207862,40.774350,-73.955879,Manhattan,222,10075,Manhattan,Upper East Side,10075
4363,245000,1,Co-op for sale,1.000000,2184.207862,40.732538,-73.860152,Queens,62nd Drive,11374,Queens,West Central Queens,11374
4364,1275000,1,Co-op for sale,1.000000,2184.207862,40.745882,-74.003398,New York,West 21st Street,10011,Manhattan,Chelsea and Clinton,10011
4365,598125,2,Condo for sale,1.000000,655.000000,40.742770,-73.872752,Flushing,91-23,11373,Queens,West Queens,11373


In [259]:
# The point geometry was only needed for the spatial join; drop it so the
# table holds plain tabular columns from here on.
joined_data = joined_data.drop(labels='geometry', axis='columns')

In [260]:
# Preview the table now that the geometry column has been removed.
joined_data

Unnamed: 0,PRICE,BEDS,TYPE,BATH,PROPERTYSQFT,LATITUDE,LONGITUDE,STREET_NAME,LONG_NAME,ZIPCODE,Borough,Neighborhood,ZIP Codes
0,315000,2,Condo for sale,2.000000,1400.000000,40.761255,-73.974483,East 55th Street,Regis Residence,10022,Manhattan,Gramercy Park and Murray Hill,10022
1,195000000,7,Condo for sale,10.000000,17545.000000,40.766393,-73.980991,New York,West 57th Street,10019,Manhattan,Chelsea and Clinton,10019
2,260000,4,House for sale,2.000000,2015.000000,40.541805,-74.196109,Staten Island,Sinclair Avenue,10312,Staten Island,South Shore,10312
3,55000000,7,Townhouse for sale,2.373861,14175.000000,40.767224,-73.969856,New York,East 64th Street,10065,Manhattan,Upper East Side,10065
4,690000,5,House for sale,2.000000,4004.000000,40.674363,-73.958725,Brooklyn,Park Place,11238,Brooklyn,Central Brooklyn,11238
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4362,599000,1,Co-op for sale,1.000000,2184.207862,40.774350,-73.955879,Manhattan,222,10075,Manhattan,Upper East Side,10075
4363,245000,1,Co-op for sale,1.000000,2184.207862,40.732538,-73.860152,Queens,62nd Drive,11374,Queens,West Central Queens,11374
4364,1275000,1,Co-op for sale,1.000000,2184.207862,40.745882,-74.003398,New York,West 21st Street,10011,Manhattan,Chelsea and Clinton,10011
4365,598125,2,Condo for sale,1.000000,655.000000,40.742770,-73.872752,Flushing,91-23,11373,Queens,West Queens,11373


In [261]:
# Strip the literal text " for sale" from TYPE, e.g. "Condo for sale" -> "Condo".
# regex=False treats the pattern as plain text.
joined_data = joined_data.assign(
    TYPE=lambda frame: frame['TYPE'].str.replace(' for sale', '', regex=False)
)

In [262]:
# Snap BEDS and BATH to the nearest 0.5 by doubling, rounding to an integer,
# then halving. Ties round half-to-even (e.g. 1.25 -> 1.0), the same rule the
# built-in round() applies.
# This is done with vectorized Series arithmetic rather than a per-element
# lambda, so a NaN value stays NaN; built-in round() raises ValueError on NaN.
joined_data['BEDS'] = (joined_data['BEDS'] * 2).round() / 2
joined_data['BATH'] = (joined_data['BATH'] * 2).round() / 2

In [263]:
# Derived feature: asking price per square foot. A PROPERTYSQFT of 0 yields
# +/-inf here rather than raising.
joined_data['PRICE_PER_SQFT'] = joined_data['PRICE'].div(joined_data['PROPERTYSQFT'])

In [264]:
# First outlier pass: keep listings whose PRICE lies strictly within four
# standard deviations of the mean price.
mean_price = joined_data['PRICE'].mean()
std_price = joined_data['PRICE'].std()
price_floor = mean_price - 4*std_price
price_ceiling = mean_price + 4*std_price
within_four_sigma = joined_data['PRICE'].gt(price_floor) & joined_data['PRICE'].lt(price_ceiling)
joined_data = joined_data.loc[within_four_sigma]

In [265]:
# Second outlier pass (IQR fences): keep prices inside
# [Q1 - 1.5*IQR, Q3 + 1.5*IQR], both ends inclusive.
Q1, Q3 = joined_data['PRICE'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
inside_fences = joined_data['PRICE'].between(lower_bound, upper_bound)
joined_data = joined_data.loc[inside_fences]

In [266]:
# Store the four core numeric columns as float.
joined_data = joined_data.astype({
    'PRICE': float,
    'BEDS': float,
    'BATH': float,
    'PROPERTYSQFT': float,
})

In [267]:
# Treat +/-inf as missing, then drop every row with a missing value in any
# column.
joined_data = joined_data.replace([np.inf, -np.inf], np.nan).dropna()

# Drop every row in which any column equals 0. The check spans all columns,
# not only the numeric measurements.
contains_zero = joined_data.eq(0).any(axis=1)
joined_data = joined_data.loc[~contains_zero]

In [268]:
# Report the cheapest and most expensive listing left after cleaning.
lowest_price, highest_price = joined_data['PRICE'].min(), joined_data['PRICE'].max()
print("Lowest value sale:", lowest_price)
print("Highest value sale:", highest_price)

Lowest value sale: 100000.0
Highest value sale: 3000000.0


In [269]:
# Summary statistics (count, mean, std, quartiles, min/max) of the cleaned prices.
price_summary = joined_data['PRICE'].describe()
print("Price statistics after removing outliers:")
print(price_summary)

Price statistics after removing outliers:
count    3.845000e+03
mean     9.182846e+05
std      6.341462e+05
min      1.000000e+05
25%      4.500000e+05
50%      7.499990e+05
75%      1.200000e+06
max      3.000000e+06
Name: PRICE, dtype: float64


In [270]:
# Persist the cleaned table. index=False keeps the row index (which has gaps
# after the filtering steps) out of the file.
joined_data.to_csv('cleaned_property_data.csv', index=False)

In [272]:
# Final schema check: column names, non-null counts and dtypes of the cleaned table.
joined_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3845 entries, 0 to 4366
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PRICE           3845 non-null   float64
 1   BEDS            3845 non-null   float64
 2   TYPE            3845 non-null   object 
 3   BATH            3845 non-null   float64
 4   PROPERTYSQFT    3845 non-null   float64
 5   LATITUDE        3845 non-null   float64
 6   LONGITUDE       3845 non-null   float64
 7   STREET_NAME     3845 non-null   object 
 8   LONG_NAME       3845 non-null   object 
 9   ZIPCODE         3845 non-null   object 
 10  Borough         3845 non-null   object 
 11  Neighborhood    3845 non-null   object 
 12  ZIP Codes       3845 non-null   object 
 13  PRICE_PER_SQFT  3845 non-null   float64
dtypes: float64(7), object(7)
memory usage: 450.6+ KB


In [271]:
import json

# Save data as JSON keyed by ZIP code: {ZIPCODE: [one record dict per listing]}.
# The groups are iterated directly instead of going through groupby(...).apply,
# which is deprecated when the applied function also receives the grouping
# column. Each record still carries its ZIPCODE field, as before.
grouped_data = {
    zipcode: listings.to_dict(orient='records')
    for zipcode, listings in joined_data.groupby('ZIPCODE')
}
with open('property_data_by_zipcode.json', 'w') as json_file:
    json.dump(grouped_data, json_file)

  grouped_data = joined_data.groupby('ZIPCODE').apply(lambda x: x.to_dict(orient='records')).to_dict()
