In [13]:
# Install the xgboost package through a shell escape (no version is pinned).
# NOTE(review): `%pip install xgboost` would target the running kernel's environment explicitly.
!pip install xgboost



In [14]:
#Importing Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib

In [15]:
# --- Input files read by the notebook ---
REALTOR_CSV = "realtor-data.zip.csv"  # listings table; supplies zip_code and price
ZIP_CSV = "uszips.csv"  # per-ZIP table; supplies lat, lng, population, density

# --- Artifacts written after training and read back at prediction time ---
MODEL_PATH = "house_model.pkl"  # serialized regressor
ZIP_FEATURES_PATH = "zip_features_general.csv"  # ZIP feature lookup table

In [16]:
# Load the raw datasets.
# `housing` keeps every column of the listings file for now; `zips` is reduced
# to the ZIP identifier plus the four columns later used as model features.
housing = pd.read_csv(REALTOR_CSV)
zips = (
    pd.read_csv(ZIP_CSV)[['zip', 'lat', 'lng', 'population', 'density']]
    .rename(columns={'zip': 'zip_code'})  # align the key name with the housing table
    .astype({'zip_code': int})            # integer key so both tables compare equal
)


In [17]:
# Keep only the join key and the target, discard incomplete rows, and cast the
# ZIP code to int so it is comparable with `zips['zip_code']`.
housing = (
    housing.loc[:, ['zip_code', 'price']]
    .dropna()
    .astype({'zip_code': int})
)

# Drop listings whose ZIP code has no entry in the ZIP table.
has_known_zip = housing['zip_code'].isin(zips['zip_code'])
housing = housing[has_known_zip]

In [18]:
# Cleaning: attach the ZIP-level columns to every listing, drop rows that still
# contain missing values, and replace price with log1p(price) as the target.
df = (
    housing.merge(zips, on='zip_code', how='left')
    .dropna()
    .assign(price=lambda merged: np.log1p(merged['price']))
)

# Model inputs (ZIP-level columns only) and the log-scale target.
features = ['lat', 'lng', 'population', 'density']
X = df.loc[:, features]
y = df['price']

In [19]:
# Hold out part of the data for evaluation. No test_size is passed, so
# scikit-learn's default split fraction applies; the fixed seed keeps the
# partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [20]:
# Hyperparameters for the gradient-boosted regressor, gathered in one place so
# they are easy to read and tune.
xgb_params = {
    "objective": "reg:squarederror",
    "n_estimators": 300,
    "learning_rate": 0.1,
    "max_depth": 6,
    "random_state": 42,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the training split; the target is the log1p-transformed price.
model.fit(X_train, y_train)

In [21]:
# Evaluate on the held-out split. The model is trained on log1p(price), so
# both metrics are measured in log-price space, not in dollars.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Show the scores so the evaluation is visible in the notebook output.
print(f"Test MSE (log1p price): {mse:.4f}")
print(f"Test R^2 (log1p price): {r2:.4f}")


In [22]:
# Persist what the prediction step needs: the fitted model and the ZIP table
# (written without the DataFrame index so it reloads with the same columns).
joblib.dump(value=model, filename=MODEL_PATH)
zips.to_csv(path_or_buf=ZIP_FEATURES_PATH, index=False)
print("Done.")

Done.


In [23]:
# Prediction function
def predict_price_by_zip(zip_code_input, model=None, zip_data=None):
    """Estimate a house price for a ZIP code.

    Args:
        zip_code_input: ZIP code as an int, or any value ``int()`` can convert
            (for example the string ``"75072"``).
        model: Optional fitted regressor exposing ``predict``. When omitted,
            the model is loaded from ``MODEL_PATH``.
        zip_data: Optional DataFrame with ``zip_code``, ``lat``, ``lng``,
            ``population`` and ``density`` columns. When omitted, it is read
            from ``ZIP_FEATURES_PATH``.

    Returns:
        The estimated price, i.e. ``expm1`` of the model's log-scale
        prediction, or ``None`` when the input cannot be converted to an
        integer or the ZIP code is absent from the ZIP table. A message is
        printed in every case.
    """
    feature_columns = ['lat', 'lng', 'population', 'density']

    # Validate before touching the disk so bad input never triggers a load.
    # int() raises TypeError for None/containers and OverflowError for inf,
    # not just ValueError for malformed strings.
    try:
        zip_code = int(zip_code_input)
    except (TypeError, ValueError, OverflowError):
        print("ZIP code must be numeric.")
        return None

    if zip_data is None:
        zip_data = pd.read_csv(ZIP_FEATURES_PATH)

    # Single scan: select the feature columns of every row with this ZIP.
    matches = zip_data.loc[zip_data['zip_code'] == zip_code, feature_columns]
    if matches.empty:
        print(f"ZIP code {zip_code} not found in ZIP dataset.")
        return None

    if model is None:
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code. Only load model files produced by a trusted source.
        model = joblib.load(MODEL_PATH)

    # One-row float frame with the feature columns in the order listed above.
    input_df = matches.iloc[[0]].astype(float).reset_index(drop=True)

    # The model predicts log1p(price); invert the transform.
    log_price = model.predict(input_df)[0]
    price = np.expm1(log_price)

    print(f"Estimated price: ${price:,.2f}")
    return price


In [24]:
# Example call: the function prints the estimate and also returns it, so the
# returned value is echoed as the cell result.
predict_price_by_zip(75072)

Estimated price: $407,617.12


407617.12