In [13]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="cleaned_realtor_data.csv")

In [5]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949.0,1192.0,2019-06-28,110000.0
1,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949.0,1192.0,2019-06-28,110000.0
2,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949.0,1192.0,2019-06-28,110000.0
3,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949.0,1192.0,2019-06-28,110000.0
4,for_sale,7.0,3.0,0.09,Dorado,Puerto Rico,949.0,1192.0,2019-06-28,110000.0


In [7]:
# Count missing values per column (isnull is an alias of isna).
df.isna().sum()

status            0
bed               0
bath              0
acre_lot          0
city              0
state             0
zip_code          0
house_size        0
prev_sold_date    0
price             0
dtype: int64

In [38]:
# The CSV stores prev_sold_date as text such as "2019-06-28"; pd.to_numeric()
# cannot parse that and raises ValueError on a freshly loaded frame. Parse the
# strings as datetimes first, then store them as int64 epoch offsets
# (nanoseconds when pandas uses its datetime64[ns] resolution) so the column is
# numeric for the models.
# The dtype guard makes the cell safe to re-run once the column is numeric.
if not pd.api.types.is_numeric_dtype(df['prev_sold_date']):
    df['prev_sold_date'] = pd.to_datetime(df['prev_sold_date']).astype('int64')

In [40]:
# Print the frame's column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23604 entries, 0 to 23603
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   status          23604 non-null  int32  
 1   bed             23604 non-null  float64
 2   bath            23604 non-null  float64
 3   acre_lot        23604 non-null  float64
 4   city            23604 non-null  int32  
 5   state           23604 non-null  int32  
 6   zip_code        23604 non-null  float64
 7   house_size      23604 non-null  float64
 8   prev_sold_date  23604 non-null  int64  
 9   price           23604 non-null  float64
dtypes: float64(6), int32(3), int64(1)
memory usage: 1.5 MB


In [42]:
# Create a scikit-learn LabelEncoder, which maps labels to integer codes.
le = LabelEncoder()

In [44]:
# Columns that are label-encoded to integers before modelling.
col = [
    "status",
    "city",
    "state",
]

In [46]:
# Fit a separate LabelEncoder per column and keep each fitted instance.
# Re-fitting one shared encoder overwrites its classes_ on every iteration, so
# only the last column's mapping would survive and the other columns could not
# be inverse-transformed or applied consistently to new data.
# Run once per fresh load: re-running would fit on the integer codes instead of
# the original labels.
encoders = {}
for encode in col:
    encoders[encode] = LabelEncoder()
    df[encode] = encoders[encode].fit_transform(df[encode])

In [48]:
# The "price" column is the regression target; every other column is a feature.
y = df["price"]
x = df.drop(columns=["price"])

In [50]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
# NOTE(review): the df.head() preview shows several identical rows. If duplicates
# are widespread, a random split can put the same listing in both train and
# test and inflate the scores; consider de-duplicating first (to be confirmed).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=45,
)

In [52]:
# Seed the forest so its bootstrap sampling is repeatable; without random_state
# every run trains different trees and reports different metrics. The seed
# value matches the one used for the train/test split.
rf_model = RandomForestRegressor(n_estimators=7, random_state=45)

In [54]:
# Train the random forest on the training split.
rf_model.fit(X=x_train, y=y_train)

In [58]:
# Random forest predictions for the held-out rows.
y_pred = rf_model.predict(X=x_test)

In [60]:
# XGBoost regressor limited to 7 trees, the same count as the random forest.
xgb_model = XGBRegressor(
    n_estimators=7,
)

In [62]:
# Train the XGBoost model on the training split.
xgb_model.fit(X=x_train, y=y_train)

In [78]:
# XGBoost predictions for the held-out rows.
xgb_preds = xgb_model.predict(X=x_test)

In [80]:
def evaluate_model(y_true, y_pred, model_name):
    """Print MAE, RMSE and R2 for one model's predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.
    model_name : str
        Label shown in the header line of the report.

    Returns
    -------
    None
        The report is written to stdout; each metric uses two decimals.
    """
    print(f'\n{model_name} Performance:')

    mae = mean_absolute_error(y_true, y_pred)
    print(f'MAE: {mae:.2f}')

    # RMSE is the square root of the mean squared error.
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    print(f'RMSE: {rmse:.2f}')

    r2 = r2_score(y_true, y_pred)
    print(f'R2 Score: {r2:.2f}')

In [82]:
# Report both models against the same held-out targets.
evaluate_model(y_true=y_test, y_pred=y_pred, model_name='Random Forest')
evaluate_model(y_true=y_test, y_pred=xgb_preds, model_name='XGBoost')


Random Forest Performance:
MAE: 27456.61
RMSE: 625159.04
R2 Score: 0.84

XGBoost Performance:
MAE: 117101.07
RMSE: 378355.02
R2 Score: 0.94
