In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
# Set some display options for better viewing
pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Load your golden dataset
df_raw = pd.read_parquet('../data/processed/nyc_sales_combined.parquet')

print(df_raw.shape)
df_raw.info()
df_raw.head()

(583557, 21)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 583557 entries, 0 to 583556
Data columns (total 21 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   borough                         583557 non-null  float64       
 1   neighborhood                    583557 non-null  object        
 2   building_class_category         583557 non-null  object        
 3   tax_class_at_present            583557 non-null  object        
 4   block                           583557 non-null  float64       
 5   lot                             583557 non-null  float64       
 6   easement                        0 non-null       float64       
 7   building_class_at_present       582990 non-null  object        
 8   address                         583557 non-null  object        
 9   apartment_number                583557 non-null  object        
 10  zip_code                        583508 non-

Unnamed: 0,borough,neighborhood,building_class_category,tax_class_at_present,block,lot,easement,building_class_at_present,address,apartment_number,zip_code,residential_units,commercial_units,total_units,land_square_feet,gross_square_feet,year_built,tax_class_at_time_of_sale,building_class_at_time_of_sale,sale_price,sale_date
0,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5395.0,8.0,,A1,18 EDWIN STREET,,10312.0,1.0,0.0,1.0,7000.0,1976.0,1980.0,1.0,A1,890000.0,2024-01-31
1,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5395.0,15.0,,A1,10 EDWIN STREET,,10312.0,1.0,0.0,1.0,7000.0,2200.0,1980.0,1.0,A1,950000.0,2024-02-29
2,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5396.0,13.0,,A1,524 SYCAMORE STREET,,10312.0,1.0,0.0,1.0,10296.0,2975.0,1986.0,1.0,A1,1250000.0,2024-09-16
3,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5396.0,41.0,,A1,21 TALLMAN STREET,,10312.0,1.0,0.0,1.0,9968.0,3940.0,1990.0,1.0,A1,0.0,2024-02-14
4,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5401.0,26.0,,A1,7 WEAVER STREET,,10312.0,1.0,0.0,1.0,10800.0,4743.0,1980.0,1.0,A1,1415000.0,2024-10-28


In [6]:
print(df_raw['building_class_category'].value_counts().head(20))

building_class_category
01 ONE FAMILY DWELLINGS              130152
02 TWO FAMILY DWELLINGS              109048
10 COOPS - ELEVATOR APARTMENTS        91781
13 CONDOS - ELEVATOR APARTMENTS       89853
03 THREE FAMILY DWELLINGS             30781
07 RENTALS - WALKUP APARTMENTS        20033
09 COOPS - WALKUP APARTMENTS          18447
04 TAX CLASS 1 CONDOS                 11406
15 CONDOS - 2-10 UNIT RESIDENTIAL     10418
44 CONDO PARKING                       9492
17 CONDO COOPS                         8105
05 TAX CLASS 1 VACANT LAND             7645
12 CONDOS - WALKUP APARTMENTS          6672
22 STORE BUILDINGS                     6118
14 RENTALS - 4-10 UNIT                 4143
29 COMMERCIAL GARAGES                  3674
08 RENTALS - ELEVATOR APARTMENTS       3225
47 CONDO NON-BUSINESS STORAGE          2723
21 OFFICE BUILDINGS                    2412
31 COMMERCIAL VACANT LAND              2394
Name: count, dtype: int64


In [8]:
# Look at the lower percentiles of your raw sale_price data
# Focus on prices that are not zero first
non_zero_prices = df_raw[df_raw['sale_price'] > 0]['sale_price']
print(non_zero_prices.describe(percentiles=[.01, .05, .10, .25]))

count       404082.00
mean       1953871.91
std       12715843.24
min              1.00
1%              10.00
5%          145000.00
10%         250000.00
25%         465000.00
50%         750000.00
max     2397501899.00
Name: sale_price, dtype: float64


In [7]:
# Look at the lower percentiles for square footage
non_zero_sqft = df_raw[df_raw['gross_square_feet'] > 0]['gross_square_feet']
print(non_zero_sqft.describe(percentiles=[.01, .05, .10, .25]))

count    329411.00
mean       5696.54
std       43583.70
min           1.00
1%          564.00
5%          918.00
10%        1092.00
25%        1374.00
50%        1950.00
max     8942176.00
Name: gross_square_feet, dtype: float64


In [9]:
# --- Data-Driven Filtering ---

# Justification for Sale Price Filter:
# Our percentile analysis showed that 5% of non-zero sales are below $145,000.
# These are highly unlikely to be arm's-length market transactions (e.g., deed transfers, sales between family).
# We will set a lower bound threshold of $100,000 as a conservative filter to remove these non-market sales.
df_filtered = df_raw[df_raw['sale_price'] > 100000].copy()
print(f"Shape after price > $100k filter: {df_filtered.shape}")


# Justification for Square Footage Filter:
# Our percentile analysis showed that 5% of properties have listed sqft below 918 sq ft, and the max is over 8 million.
# To ensure we are modeling standard habitable residential units and remove outliers/data errors,
# we will filter for properties between 250 and 20,000 gross square feet.
df_filtered = df_filtered[(df_filtered['gross_square_feet'] > 250) & (df_filtered['gross_square_feet'] < 20000)]
print(f"Shape after square footage filter (250 < sqft < 20k): {df_filtered.shape}")


# --- Select Core Residential Property Types ---

# Justification for Building Class Filter:
# Our model focuses on predicting the value of residential dwellings where people live.
# We are including single-family homes, small multi-family (2-3 units), condos, and co-ops.
# We are explicitly excluding large rental buildings, commercial properties, vacant land,
# and miscellaneous property types (e.g., parking spots) as their valuation drivers are different.
residential_categories = [
    '01 ONE FAMILY DWELLINGS',
    '02 TWO FAMILY DWELLINGS',
    '03 THREE FAMILY DWELLINGS',
    '10 COOPS - ELEVATOR APARTMENTS',
    '13 CONDOS - ELEVATOR APARTMENTS',
    '09 COOPS - WALKUP APARTMENTS',
    '04 TAX CLASS 1 CONDOS', # Often single-family style condos
    '15 CONDOS - 2-10 UNIT RESIDENTIAL',
    '17 CONDO COOPS',
    '12 CONDOS - WALKUP APARTMENTS',
    '16 CONDOS - 2-10 UNIT WITH COMMERCIAL UNIT' # Keep these as they are primarily residential
]

df_residential = df_filtered[df_filtered['building_class_category'].isin(residential_categories)].copy()
print(f"Final shape after filtering for residential categories: {df_residential.shape}")


# --- Final Feature Engineering and Type Conversion ---

# Create an 'age' feature. Replace invalid 'year_built' values (like 0) with NaN first.
df_residential['year_built'] = df_residential['year_built'].replace(0, np.nan)
df_residential['age'] = pd.to_datetime('now').year - df_residential['year_built']

# Create a 'price_per_sqft' feature, which is often a very useful metric
df_residential['price_per_sqft'] = df_residential['sale_price'] / df_residential['gross_square_feet']

# The 'borough' column is numeric; let's map it to actual names for clarity in plots
borough_map = {1: 'Manhattan', 2: 'Bronx', 3: 'Brooklyn', 4: 'Queens', 5: 'Staten Island'}
df_residential['borough_name'] = df_residential['borough'].map(borough_map)

# Convert integer-like floats to integers for cleanliness
int_cols = ['zip_code', 'residential_units', 'commercial_units', 'total_units', 'year_built', 'block', 'lot']
for col in int_cols:
    # Use Int64 (capital I) to handle potential missing values (NaNs)
    df_residential[col] = df_residential[col].astype('Int64')
    
# --- Final Inspection ---
print("\n--- Final DataFrame Info ---")
df_residential.info()

print("\n--- Final DataFrame Head ---")
display(df_residential.head())

print("\n--- Final DataFrame Description ---")
display(df_residential[['sale_price', 'gross_square_feet', 'age', 'price_per_sqft']].describe())

Shape after price > $100k filter: (387270, 21)
Shape after square footage filter (250 < sqft < 20k): (184825, 21)
Final shape after filtering for residential categories: (164865, 21)

--- Final DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
Index: 164865 entries, 0 to 583305
Data columns (total 24 columns):
 #   Column                          Non-Null Count   Dtype         
---  ------                          --------------   -----         
 0   borough                         164865 non-null  float64       
 1   neighborhood                    164865 non-null  object        
 2   building_class_category         164865 non-null  object        
 3   tax_class_at_present            164865 non-null  object        
 4   block                           164865 non-null  Int64         
 5   lot                             164865 non-null  Int64         
 6   easement                        0 non-null       float64       
 7   building_class_at_present       164865 non-null  object

Unnamed: 0,borough,neighborhood,building_class_category,tax_class_at_present,block,lot,easement,building_class_at_present,address,apartment_number,zip_code,residential_units,commercial_units,total_units,land_square_feet,gross_square_feet,year_built,tax_class_at_time_of_sale,building_class_at_time_of_sale,sale_price,sale_date,age,price_per_sqft,borough_name
0,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5395,8,,A1,18 EDWIN STREET,,10312,1,0,1,7000.0,1976.0,1980,1.0,A1,890000.0,2024-01-31,45.0,450.4,Staten Island
1,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5395,15,,A1,10 EDWIN STREET,,10312,1,0,1,7000.0,2200.0,1980,1.0,A1,950000.0,2024-02-29,45.0,431.82,Staten Island
2,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5396,13,,A1,524 SYCAMORE STREET,,10312,1,0,1,10296.0,2975.0,1986,1.0,A1,1250000.0,2024-09-16,39.0,420.17,Staten Island
4,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5401,26,,A1,7 WEAVER STREET,,10312,1,0,1,10800.0,4743.0,1980,1.0,A1,1415000.0,2024-10-28,45.0,298.33,Staten Island
6,5.0,ANNADALE,01 ONE FAMILY DWELLINGS,1,5403,4,,A2,260 SHIRLEY AVENUE,,10312,1,0,1,7500.0,1600.0,1970,1.0,A2,995000.0,2024-11-26,55.0,621.88,Staten Island



--- Final DataFrame Description ---


Unnamed: 0,sale_price,gross_square_feet,age,price_per_sqft
count,164865.0,164865.0,162393.0,164865.0
mean,1016500.12,1887.37,79.62,549.43
std,1490135.27,878.55,32.89,517.66
min,101000.0,256.0,1.0,18.01
25%,568540.0,1280.0,60.0,340.02
50%,760000.0,1697.0,90.0,456.73
75%,1040000.0,2304.0,105.0,617.28
max,87400000.0,18814.0,225.0,67049.81
