In [24]:
!pip install xgboost



In [25]:
#Importing Libraries
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import joblib
from sklearn.preprocessing import LabelEncoder
import numpy as np

In [26]:
#DataSet
df = pd.read_csv('realtor-data.zip.csv')  
print(df.columns)

Index(['brokered_by', 'status', 'price', 'bed', 'bath', 'acre_lot', 'street',
       'city', 'state', 'zip_code', 'house_size', 'prev_sold_date'],
      dtype='object')


In [27]:
features = ['bed', 'bath', 'acre_lot', 'house_size', 'zip_code', 'price']
df = df.dropna(subset=features)

In [28]:
q_hi = df['price'].quantile(0.99)
df = df[df['price'] < q_hi]

In [29]:
df['price'] = np.log1p(df['price'])
df['house_size'] = np.log1p(df['house_size'])
df['acre_lot'] = np.log1p(df['acre_lot'] + 1e-3) 

In [30]:
zip_avg_price = df.groupby('zip_code')['price'].mean()
df['zip_price_avg'] = df['zip_code'].map(zip_avg_price)

In [31]:
final_features = ['bed', 'bath', 'acre_lot', 'house_size', 'zip_price_avg']
X = df[final_features]
y = df['price']

In [32]:
#le = LabelEncoder()
#df['zip_code_encoded'] = le.fit_transform(df['zip_code'])

# Final feature list with encoded zip
#final_features = ['bed', 'bath', 'acre_lot', 'house_size', 'zip_code_encoded']
#X = df[final_features]
#y = df[target]

In [33]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [34]:
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    n_estimators=500,
    learning_rate=0.05,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)
xgb_model.fit(X_train, y_train)

In [37]:
predictions = xgb_model.predict(X_test)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)

print(f"MSE: {mse:.2f}")
print(f"R²: {r2:.4f}")

MSE: 0.13
R²: 0.7887


In [12]:
joblib.dump(xgb_model, 'housing_model_xgb.pkl')
joblib.dump(le, 'zip_label_encoder.pkl')

['zip_label_encoder.pkl']

In [36]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(xgb_model, X, y, cv=5, scoring='r2')
print("Average CV R²:", scores.mean())

Average CV R²: 0.7627228923903461
