Predicting House Price Based on City

Goal

Build a model to predict house price based on city

In [20]:
#Import Libraries
import pandas as pd
import warnings
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.utils.validation import check_is_fitted
warnings.simplefilter(action="ignore", category=UserWarning)

For this project, I will use a `wrangle` function to clean my data.

In [21]:
def wrangle(filepath):
    """Load the realtor CSV and return a cleaned frame ready for modelling.

    Processing steps:
      1. Read the CSV, drop ``prev_sold_date`` and then every row that still
         has a missing value.
      2. Derive ``price_in_USD`` (integer price) and ``area_in_m2``
         (``acre_lot`` converted to square metres, truncated to an integer).
      3. Keep only ``for_sale`` listings in Puerto Rico priced below 400,000
         whose lot area is greater than zero.
      4. Derive ``price_per_m2`` (rounded to the nearest whole number).
      5. Keep only rows whose area lies between the 10th and 90th percentile.
      6. Drop the columns not used downstream and reset the index.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns of the input plus ``price_in_USD``,
        ``area_in_m2`` and ``price_per_m2``, with a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If one of the columns referenced here is missing from the file.
    """
    m2_per_acre = 4_046.86

    # Read csv file
    df = pd.read_csv(filepath)
    # Drop "prev_sold_date" first so its gaps do not discard rows in dropna
    df = df.drop(columns="prev_sold_date").dropna(how="any")
    # Create a new column called "price_in_USD" using "price"
    df["price_in_USD"] = df["price"].astype(int)
    # Convert "acre_lot" to m2 (truncated to whole square metres)
    df["area_in_m2"] = (df["acre_lot"] * m2_per_acre).astype(int)

    # Subset data: "for_sale" houses in "Puerto Rico" priced below 400,000
    mask_sale = df["status"] == "for_sale"
    mask_state = df["state"] == "Puerto Rico"
    mask_price = df["price_in_USD"] < 400_000
    # Truncation turns very small lots into 0 m2; dividing by them would put
    # inf into the target, so those rows are excluded before the division.
    mask_lot = df["area_in_m2"] > 0
    df = df[mask_sale & mask_state & mask_price & mask_lot].copy()

    # Calculate price per m2
    df["price_per_m2"] = (df["price_in_USD"] / df["area_in_m2"]).round(0)

    # Subset data: remove outliers in "area_in_m2"
    low, high = df["area_in_m2"].quantile([0.1, 0.9])
    df = df[df["area_in_m2"].between(low, high)]

    # Drop the columns that are not needed after cleaning
    df = df.drop(
        columns=[
            "street", "state", "zip_code", "house_size", "brokered_by",
            "status", "price", "bed", "bath", "acre_lot",
        ]
    )
    return df.reset_index(drop=True)
    

In [None]:
# Read the raw listings CSV and clean it with wrangle().
df = wrangle("data/realtor-data.csv")

In [None]:
# Preview the last rows of the cleaned frame.
df.tail()

Splitting my data to create my test dataset

In [None]:
# Row position separating the first 80% of rows from the remaining 20%.
cutoff = int(len(df) * 0.8)

In [None]:
# Training portion: every row before the cutoff position (rows are taken in
# their existing order, not shuffled).
dt = df.iloc[:cutoff]
dt.tail()

In [None]:
# Held-out portion: rows from the cutoff position to the end of the frame.
Test_data = df.iloc[cutoff:]
Test_data.head()

Saving my `Test_data` DataFrame in CSV format

# Save the held-out rows into the data/ folder: the later "Importing my test
# Data" cell reads "data/realtor-data_test.csv", so the file must be written there.
Test_data.to_csv("data/realtor-data_test.csv", index=False)

Creating my feature matrix X_train and target vector y_train

In [None]:
# The model predicts price per square metre from the city alone.
target = "price_per_m2"
features = ["city"]

# Feature matrix (a DataFrame) and target vector (a Series) from the training rows.
X_train, y_train = dt[features], dt[target]

Calculating the baseline mean absolute error for my model

In [None]:
 # Naive baseline: always predict the mean of the training target.
 y_mean = y_train.mean()
 rounded_y_mean = round(y_mean, 2)

 # Score that constant prediction against the training target.
 y_pred_baseline = len(y_train) * [y_mean]
 Baseline_MAE = mean_absolute_error(y_train, y_pred_baseline)
 rounded_Baseline_MAE = round(Baseline_MAE, 2)

 print("Mean_House_Price:", rounded_y_mean)
 print("Baseline_MAE:", rounded_Baseline_MAE)

Now I build my model by creating a pipeline that contains all the necessary transformers
and a predictor

In [None]:
 # Encode the city as one-hot columns, impute any missing values, then fit a
 # ridge regression. The step names are the lower-cased class names, which is
 # what later cells use to look the steps up through model.named_steps.
 model = Pipeline(
     steps=[
         ("onehotencoder", OneHotEncoder(use_cat_names=True)),
         ("simpleimputer", SimpleImputer()),
         ("ridge", Ridge()),
     ]
 )

In [None]:
  # Fit every pipeline step on the training features and target.
  model.fit(X_train, y_train)

Calculating the training mean absolute error

In [None]:
 # Predict on the rows the model was fitted on and score them with MAE.
 # All lines share one indentation level; the original indented only the first.
 y_pred_training = model.predict(X_train)
 mae_training = mean_absolute_error(y_train, y_pred_training)
 print("Training MAE:", round(mae_training, 2))

A training MAE lower than my baseline MAE suggests that my model is effective at reducing error
compared to a naive model.

Importing my test data

In [None]:
# Load the held-out test rows from the data/ folder.
X_test = pd.read_csv("data/realtor-data_test.csv")

In [None]:
# Remove the price/area columns so only the model's input column(s) remain.
# Reassigning (instead of inplace=True) together with errors="ignore" keeps
# the cell safe to re-run after the columns are already gone.
X_test = X_test.drop(
    columns=["price_in_USD", "area_in_m2", "price_per_m2"],
    errors="ignore",
)

Using my model to generate a series of predictions for X_test

In [None]:
# Run the fitted pipeline on the test features and wrap the predictions in a Series.
y_test_pred = pd.Series(model.predict(X_test))

In [None]:
# Display the predictions cast to integers; the result is not assigned, so
# y_test_pred itself keeps its original values.
y_test_pred.astype(int)

In [None]:
# Pair each ridge coefficient with the name of its one-hot-encoded column and
# order the result by absolute size (smallest first, largest last).
# The encoded names get their own variable so the `features` list of input
# columns defined earlier is not overwritten.
coefficients = model.named_steps["ridge"].coef_
feature_names = model.named_steps["onehotencoder"].get_feature_names_out()
feat_imp = pd.Series(coefficients, index=feature_names).sort_values(key=abs).round(2)
feat_imp

Interpretation of the visible output above: the feature "city_Guaynabo" has the highest
positive coefficient ("274.28"), meaning it has a more significant positive impact on the target variable
("house price") when compared to "city_San Juan" ("216.68"). The closer a coefficient is to zero,
the less influence that feature ("city") has on the target variable ("house price").

Creating a horizontal bar chart that shows the 10 most influential coefficients for my model

In [None]:
# feat_imp is sorted by absolute value, so the last ten entries are the ten
# most influential coefficients.
feat_imp.tail(10).plot(kind="barh")
# Coefficients are in the units of the target (price per square metre).
plt.xlabel("Importance [USD per m2]")
plt.ylabel("Feature")
plt.title("Feature Importance for House Price");

Printing the equation that my model has determined for predicting house price based on city

y = B0 + (B1*X1) + (B2*X2) + ... + (Bn*Xn)

In [None]:
# Intercept (B0 in the equation above) learned by the ridge step.
intercept = model.named_steps["ridge"].intercept_
intercept

In [None]:
# Coefficients (B1..Bn in the equation above) of the ridge step; print only
# the first five.
coefficients = model.named_steps["ridge"].coef_
print(coefficients[:5])

In [18]:
# Build a readable version of the fitted equation from feat_imp (sorted by
# absolute coefficient): the first two terms, an ellipsis, then the last term.
feat_items = list(feat_imp.items())

# First two terms
equation_parts = [f"({round(c, 2)} * {f})" for f, c in feat_items[:2]]

# Last term, only when it is not already one of the terms listed above
if len(feat_items) > 2:
    f_last, c_last = feat_items[-1]
    equation_parts.append(f"... + ({round(c_last, 2)} * {f_last})")

# Combine and print the equation; joining the intercept with the terms avoids
# a dangling " + " when there are no feature terms at all.
equation = "price = " + " + ".join([f"{round(intercept, 2)}"] + equation_parts)
print(equation)

NameError: name 'feat_imp' is not defined