In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [2]:
# Load dataset again (update file path accordingly)
df = pd.read_csv("realtor-data.zip.csv")

In [3]:
df.head()

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.1,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0


In [4]:
df.isnull().sum()

status                0
bed               16770
bath              15586
acre_lot          29882
city                 22
state                 0
zip_code             32
house_size        35311
prev_sold_date    51883
price                18
dtype: int64

In [5]:
 df.drop(columns=['prev_sold_date'])

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,67000.0
3,for_sale,4.0,2.0,0.10,Ponce,Puerto Rico,731.0,1800.0,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,65000.0
...,...,...,...,...,...,...,...,...,...
110096,for_sale,2.0,2.0,,New York City,New York,10463.0,1200.0,399000.0
110097,for_sale,2.0,2.0,,New York City,New York,10463.0,,359000.0
110098,for_sale,2.0,2.0,,Bronx,New York,10463.0,1200.0,299000.0
110099,for_sale,,1.0,,Bronx,New York,10463.0,,120000.0


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110101 entries, 0 to 110100
Data columns (total 10 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   status          110101 non-null  object 
 1   bed             93331 non-null   float64
 2   bath            94515 non-null   float64
 3   acre_lot        80219 non-null   float64
 4   city            110079 non-null  object 
 5   state           110101 non-null  object 
 6   zip_code        110069 non-null  float64
 7   house_size      74790 non-null   float64
 8   prev_sold_date  58218 non-null   object 
 9   price           110083 non-null  float64
dtypes: float64(6), object(4)
memory usage: 8.4+ MB
None


In [7]:
# Step 1: Handle Missing Values
df.dropna(subset=['price', 'house_size', 'city'], inplace=True)  # Drop rows with critical missing values
df.fillna({'bed': df['bed'].median(), 'bath': df['bath'].median()}, inplace=True)


In [8]:
# Fill missing values in 'acre_lot' and 'house_size' based on city median (to retain locality trends)
df['acre_lot'] = df.groupby('city')['acre_lot'].transform(lambda x: x.fillna(x.median()))
df['house_size'] = df.groupby('city')['house_size'].transform(lambda x: x.fillna(x.median()))


In [9]:
# Step 2: Remove Duplicates
df.drop_duplicates(inplace=True)

In [10]:
# Step 1: Preprocess Data
# Encode categorical variables
label_encoders = {}
categorical_cols = ['city', 'state', 'zip_code']

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le  # Save the encoders for later use


In [11]:
# Define feature columns and target variable
features = ['bed', 'bath', 'house_size', 'acre_lot', 'city', 'state', 'zip_code']
target = 'price'

In [12]:
# Scale numerical features (important for models like RandomForest)
scaler = StandardScaler()
df[features] = scaler.fit_transform(df[features])


In [13]:
# Step 2: Split Data
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=43)


In [20]:
# Step 3: Train Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)


In [15]:
# Step 4: Feature Importance
feature_importances = pd.DataFrame({
    'Feature': features,
    'Importance': rf_model.feature_importances_
}).sort_values(by='Importance', ascending=False)

print("Feature Importances:\n", feature_importances)


Feature Importances:
       Feature  Importance
6    zip_code    0.317510
0         bed    0.271537
2  house_size    0.203842
1        bath    0.113159
3    acre_lot    0.057246
4        city    0.029227
5       state    0.007478


In [16]:
# Step 5: Model Predictions
y_pred = rf_model.predict(X_test)


In [17]:
# Model Evaluation
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Model Evaluation:\n MAE: {mae}\n MSE: {mse}\n RMSE: {rmse}")


Model Evaluation:
 MAE: 338218.81347254384
 MSE: 2150323253913.7207
 RMSE: 1466398.0543882758


In [18]:
r2_score(y_test, y_pred)

0.6429938600030503

In [19]:
df.to_csv("real_estate_predictions.csv", index=False)
print("Predictions saved successfully!")


Predictions saved successfully!
