In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans
from sklearn.metrics import r2_score, mean_absolute_error
from scipy.spatial.distance import cdist
import joblib

In [56]:
# Read the cleaned property listings; every later cell builds on this frame.
data = pd.read_csv(filepath_or_buffer='cleaned_property_data.csv')

In [57]:
def add_location_cluster(df, n_clusters=10):
    """Label each row with a K-Means cluster fitted on its coordinates.

    A ``KMeans(n_clusters=n_clusters, random_state=42)`` model is fitted on
    the rows whose ``LATITUDE`` and ``LONGITUDE`` are both present, and the
    resulting label is written to a new ``LOCATION_CLUSTER`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``LATITUDE`` and ``LONGITUDE`` columns. It is modified in
        place: the ``LOCATION_CLUSTER`` column is added to the caller's frame.
    n_clusters : int, default 10
        Number of clusters passed to ``KMeans``.

    Returns
    -------
    tuple
        ``(df, centers)`` where ``df`` is the same frame object and
        ``centers`` is the fitted model's ``cluster_centers_``.

    Notes
    -----
    Rows with a missing coordinate are excluded from the fit and receive
    ``NaN`` in ``LOCATION_CLUSTER``. When no coordinate is missing the labels
    are assigned directly, exactly as before.
    """
    coord_cols = ['LATITUDE', 'LONGITUDE']
    # Positional mask (not index labels) so the assignment below lines up even
    # when the frame's index is non-default or contains duplicates.
    has_coords = df[coord_cols].notna().all(axis=1).to_numpy()

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(df.loc[has_coords, coord_cols])

    if has_coords.all():
        df['LOCATION_CLUSTER'] = labels
    else:
        # Assigning the shorter label array straight to the column would raise
        # a length mismatch; scatter it back to the rows it was computed from.
        cluster_col = np.full(len(df), np.nan)
        cluster_col[has_coords] = labels
        df['LOCATION_CLUSTER'] = cluster_col
    return df, kmeans.cluster_centers_

# Cluster the listings by location; the cluster count is spelled out because
# later cells create one distance feature per cluster center.
data, cluster_centers = add_location_cluster(data, n_clusters=10)

In [58]:
# Add distance to cluster centers
def add_distance_features(df, centers):
    """Add one ``DISTANCE_TO_CLUSTER_<i>`` column per cluster center.

    Each column holds the ``cdist`` distance (default metric) from the row's
    ``(LATITUDE, LONGITUDE)`` pair to center ``i``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``LATITUDE`` and ``LONGITUDE`` columns. It is modified in
        place: the distance columns are added to the caller's frame.
    centers : array-like of shape (n_centers, 2)
        Cluster centers in the same (latitude, longitude) order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the distance columns added.

    Notes
    -----
    Rows with a missing coordinate get ``NaN`` in every distance column.
    Previously those rows were dropped before ``cdist``, which made the
    distance matrix shorter than ``df`` and the column assignment fail.
    """
    coord_cols = ['LATITUDE', 'LONGITUDE']
    centers = np.asarray(centers, dtype=float)
    has_coords = df[coord_cols].notna().all(axis=1).to_numpy()

    # Start from an all-NaN matrix with one row per frame row, then fill only
    # the rows that actually have coordinates.
    distances = np.full((len(df), centers.shape[0]), np.nan)
    if has_coords.any():
        known = df.loc[has_coords, coord_cols].to_numpy(dtype=float)
        distances[has_coords] = cdist(known, centers)

    for i in range(distances.shape[1]):
        df[f'DISTANCE_TO_CLUSTER_{i}'] = distances[:, i]
    return df
# Attach the per-center distance columns to the clustered listings.
data = add_distance_features(df=data, centers=cluster_centers)

In [59]:
# Prepare features and target
categorical_features = ['TYPE', 'ZIPCODE', 'BOROUGH', 'NEIGHBORHOOD']

# One DISTANCE_TO_CLUSTER_<i> column exists per fitted cluster center, so size
# the list from cluster_centers instead of repeating the cluster count as a
# literal that would go stale if n_clusters is changed.
distance_features = [f'DISTANCE_TO_CLUSTER_{i}' for i in range(len(cluster_centers))]
numeric_features = ['BEDS', 'BATH', 'PROPERTYSQFT', 'LOCATION_CLUSTER'] + distance_features

# Model inputs (categorical columns first, then numeric) and the price target.
X = data[categorical_features + numeric_features]
y = data['PRICE']

In [60]:
# Column-wise preprocessing: numeric columns pass through untouched, while
# categorical columns are one-hot encoded with handle_unknown='ignore' so
# categories not seen during fitting do not raise at transform time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(transformers=[
    ('num', 'passthrough', numeric_features),
    ('cat', categorical_encoder, categorical_features),
])

In [61]:
# Chain the preprocessing step and a 100-tree random forest into one estimator,
# so fit/predict always apply the same column transformations.
forest = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', forest),
])

In [62]:
# Hold out 20% of the rows for evaluation (fixed seed), then fit on the rest.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts
rf_model.fit(X_train, y_train)

In [63]:
# Score the fitted pipeline on both partitions.
y_train_pred = rf_model.predict(X_train)
y_test_pred = rf_model.predict(X_test)

train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

# Express each MAE relative to the mean price of its own partition.
avg_train_price, avg_test_price = y_train.mean(), y_test.mean()
train_mae_percent = train_mae / avg_train_price * 100
test_mae_percent = test_mae / avg_test_price * 100

# Emit the report in one write; the text is the same five lines as before.
report_lines = [
    "Model Performance:",
    f"Training R2 Score: {train_r2:.4f}",
    f"Testing R2 Score: {test_r2:.4f}",
    f"Training MAE: ${train_mae:,.2f} ({train_mae_percent:.2f}%)",
    f"Testing MAE: ${test_mae:,.2f} ({test_mae_percent:.2f}%)",
]
print("\n".join(report_lines))

Model Performance:
Training R2 Score: 0.9669
Testing R2 Score: 0.6957
Training MAE: $70,219.33 (7.85%)
Testing MAE: $206,660.53 (22.89%)


In [64]:
# Define the HousePricePredictor class
class HousePricePredictor:
    """Bundle a fitted price model with the lookup data needed to query it.

    Attributes are stored exactly as passed in:

    * ``model`` - object with a ``predict(DataFrame)`` method.
    * ``cluster_centers`` - array of (latitude, longitude) cluster centers.
    * ``data`` - frame with ``ZIPCODE``, ``LATITUDE``, ``LONGITUDE`` and
      ``NEIGHBORHOOD`` columns, used to resolve a zipcode to a location.
    """

    def __init__(self, model, cluster_centers, data):
        """Store the model, the cluster centers and the lookup frame."""
        self.model = model
        self.cluster_centers = cluster_centers
        self.data = data

    def predict_price(self, borough, beds, baths, house_type, zipcode, sqft):
        """Predict a price for a property described by its basic attributes.

        The zipcode is resolved to a location using the median latitude and
        longitude of the stored rows with that zipcode; the most common
        neighborhood among those rows is used as the neighborhood feature.

        Parameters
        ----------
        borough, house_type
            Values for the ``BOROUGH`` and ``TYPE`` features.
        beds, baths, sqft
            Values for the ``BEDS``, ``BATH`` and ``PROPERTYSQFT`` features.
        zipcode
            Matched with ``==`` against the ``ZIPCODE`` column, so it must be
            comparable to that column's values (e.g. not a string when the
            column holds integers).

        Returns
        -------
        The first element of ``model.predict(...)`` for the assembled row, or
        ``None`` when the zipcode has no rows in the stored data or none of
        its rows has usable coordinates.
        """
        # Get lat lng for the zipcode
        zipcode_data = self.data[self.data['ZIPCODE'] == zipcode]
        if zipcode_data.empty:
            return None

        median_lat = zipcode_data['LATITUDE'].median()
        median_lon = zipcode_data['LONGITUDE'].median()
        # The median is NaN when every coordinate for this zipcode is missing.
        # Without a location the distance features would all be NaN and the
        # "nearest" cluster arbitrary, so treat it like an unknown zipcode.
        if pd.isna(median_lat) or pd.isna(median_lon):
            return None

        # Distances to cluster centers; the nearest center gives the cluster label
        distances = cdist([[median_lat, median_lon]], self.cluster_centers)[0]
        location_cluster = np.argmin(distances)

        # Most common neighborhood for this zipcode. mode() is empty when every
        # neighborhood value is missing; fall back to NaN instead of raising.
        neighborhood_modes = zipcode_data['NEIGHBORHOOD'].mode()
        neighborhood = neighborhood_modes.iloc[0] if not neighborhood_modes.empty else np.nan

        # Assemble the single-row model input (same column order as before:
        # base features first, then one distance column per cluster center).
        features = {
            'BEDS': [beds],
            'BATH': [baths],
            'PROPERTYSQFT': [sqft],
            'TYPE': [house_type],
            'ZIPCODE': [zipcode],
            'BOROUGH': [borough],
            'NEIGHBORHOOD': [neighborhood],
            'LOCATION_CLUSTER': [location_cluster],
        }
        for i, distance in enumerate(distances):
            features[f'DISTANCE_TO_CLUSTER_{i}'] = [distance]
        input_data = pd.DataFrame(features)

        # Predict price
        prediction = self.model.predict(input_data)[0]
        return prediction

# Wrap the fitted pipeline, the cluster centers and the lookup frame together.
predictor = HousePricePredictor(
    model=rf_model,
    cluster_centers=cluster_centers,
    data=data,
)

# Persist the whole predictor (model, centers and data frame) as one file.
# NOTE(review): pickles store classes by reference, so whatever loads this file
# presumably needs HousePricePredictor importable under the same module path
# it has here - confirm against the consuming code.
joblib.dump(predictor, 'house_price_predictor.pkl')

['house_price_predictor.pkl']