In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
import joblib

In [40]:
# Read the property CSV into the DataFrame that the cells below encode and model.
joined_data = pd.read_csv(filepath_or_buffer='cleaned_property_data.csv')

In [41]:
# One LabelEncoder per categorical column. The fitted encoders are kept as
# globals because the prediction helper further down reuses them to encode
# user-supplied values.
le_type, le_zipcode, le_borough = (LabelEncoder() for _ in range(3))

# Fit each encoder on its source column and attach the integer codes as new
# *_ENCODED columns (added in the order TYPE, ZIPCODE, BOROUGH).
joined_data = joined_data.assign(**{
    'TYPE_ENCODED': le_type.fit_transform(joined_data['TYPE']),
    'ZIPCODE_ENCODED': le_zipcode.fit_transform(joined_data['ZIPCODE']),
    'BOROUGH_ENCODED': le_borough.fit_transform(joined_data['BOROUGH']),
})

In [43]:
# Feature matrix (numeric + label-encoded columns) and the price target.
X = joined_data.loc[:, ['BEDS', 'BATH', 'TYPE_ENCODED', 'ZIPCODE_ENCODED', 'BOROUGH_ENCODED']]
y = joined_data.loc[:, 'PRICE']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Gradient-boosted regressor with 100 trees, fitted on the training split only.
model = XGBRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

In [44]:
# Score the fitted model on both splits so train and test can be compared.
y_train_pred, y_test_pred = model.predict(X_train), model.predict(X_test)

# R^2 and mean absolute error, each computed for train and test.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_mae, test_mae = (
    mean_absolute_error(y_train, y_train_pred),
    mean_absolute_error(y_test, y_test_pred),
)

# One print call, one report line per argument.
print(
    "Model Performance:",
    f"Training R2 Score: {train_r2:.4f}",
    f"Testing R2 Score: {test_r2:.4f}",
    f"Training MAE: ${train_mae:,.2f}",
    f"Testing MAE: ${test_mae:,.2f}",
    sep="\n",
)

Model Performance:
Training R2 Score: 0.8946
Testing R2 Score: 0.6332
Training MAE: $126,585.87
Testing MAE: $228,631.95


In [46]:
def predict_price_and_top_zipcodes_in_borough(borough, beds, baths, house_type, top_n=5):
    """Rank a borough's zipcodes by predicted price for one listing profile.

    Every distinct ``ZIPCODE`` value found in the ``joined_data`` rows of
    ``borough`` is scored by the fitted ``model`` with the same beds / baths /
    house type, and the highest-priced zipcodes are returned.

    Relies on the notebook-level globals ``joined_data``, ``model``,
    ``le_type``, ``le_zipcode`` and ``le_borough``.

    Parameters
    ----------
    borough : str
        Borough label; must be a value ``le_borough`` was fitted on.
    beds : int or float
        Value used for the ``BEDS`` feature.
    baths : int or float
        Value used for the ``BATH`` feature.
    house_type : str
        Property type label; must be a value ``le_type`` was fitted on.
    top_n : int, default 5
        Maximum number of zipcodes to return.

    Returns
    -------
    list of tuple
        ``(zipcode, predicted_price)`` pairs sorted by predicted price,
        highest first, at most ``top_n`` long. Empty when ``joined_data``
        has no rows for ``borough``.

    Raises
    ------
    ValueError
        Propagated from ``LabelEncoder.transform`` when ``borough`` or
        ``house_type`` is a label the encoder has not seen.
    """
    # Encode the categorical inputs once, up front: an unknown label fails
    # here before any prediction work is done.
    type_encoded = le_type.transform([house_type])[0]
    borough_encoded = le_borough.transform([borough])[0]

    # Distinct zipcodes observed for this borough, in first-seen order.
    zipcodes = joined_data.loc[joined_data['BOROUGH'] == borough, 'ZIPCODE'].unique()
    if len(zipcodes) == 0:
        return []

    # One row per candidate zipcode. Only ZIPCODE_ENCODED varies; the scalar
    # values are broadcast down the index. Column order matches the order the
    # model was trained with.
    candidates = pd.DataFrame(
        {
            'BEDS': beds,
            'BATH': baths,
            'TYPE_ENCODED': type_encoded,
            'ZIPCODE_ENCODED': le_zipcode.transform(zipcodes),
            'BOROUGH_ENCODED': borough_encoded,
        },
        index=range(len(zipcodes)),
    )

    # A single batched predict call instead of one call per zipcode.
    predictions = model.predict(candidates)

    # sorted() is stable, so tied prices keep their first-seen zipcode order.
    ranked = sorted(zip(zipcodes, predictions), key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

# Example: the five highest-priced Brooklyn zipcodes for a 2-bed / 2-bath condo.
borough, house_type = 'Brooklyn', 'Condo'
beds, baths = 2, 2

top_zipcodes = predict_price_and_top_zipcodes_in_borough(
    borough=borough,
    beds=beds,
    baths=baths,
    house_type=house_type,
)

# Header line, then one line per (zipcode, predicted price) pair.
print(f"\nTop 5 zipcodes in {borough} for a {beds} bed, {baths} bath {house_type}:")
for zipcode, price in top_zipcodes:
    print(f"Zipcode: {zipcode}, Predicted Price: ${price:,.2f}")


Top 5 zipcodes in Brooklyn for a 2 bed, 2 bath Condo:
Zipcode: 11201.0, Predicted Price: $1,787,064.38
Zipcode: 11231.0, Predicted Price: $1,711,089.50
Zipcode: 11211.0, Predicted Price: $1,693,335.62
Zipcode: 11217.0, Predicted Price: $1,679,205.38
Zipcode: 11215.0, Predicted Price: $1,621,222.00
