# Harris County Home Price Estimations

In [None]:
import math
from datetime import datetime
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from haversine import haversine
from itables import init_notebook_mode, show
from matplotlib import pyplot as plt
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from Load_to_DataFrame import load_data_frame

# Enable itables for the whole notebook; all_interactive=True asks it to render
# DataFrames as interactive tables.
init_notebook_mode(all_interactive=True)

In [None]:
# Load the full dataset through the project-local helper from Load_to_DataFrame.
df = load_data_frame()

In [None]:
# from math import sin, cos, sqrt, radians, atan2


# def haversine_distance(lat1, lon1, lat2, lon2):
#     """
#     Calculates the Haversine distance between two points on a sphere.
#     """
#     # R = 6371  # Earth radius in kilometers
#     R = 3956  # Earth radius in miles
#     dlat = radians(lat2 - lat1)
#     dlon = radians(lon2 - lon1)
#     a = (
#         sin(dlat / 2) ** 2
#         + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2) ** 2
#     )
#     c = 2 * atan2(sqrt(a), sqrt(1 - a))
#     return R * c

# # Reference point coordinates
# ref_lat = 29.760100
# ref_lon = -95.370100

# # Add a new column with calculated distances
# df["distance"] = df.apply(
#     lambda row: haversine_distance(ref_lat, ref_lon, row["lat"], row["long"]),
#     axis=1,
# )

In [None]:
# Define the haversine function
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in miles between two points.

    All four arguments are in decimal degrees. Only NumPy functions and plain
    arithmetic are used, so scalars work as inputs.

    Parameters
    ----------
    lat1, lon1 : latitude and longitude of the first point, in degrees.
    lat2, lon2 : latitude and longitude of the second point, in degrees.

    Returns
    -------
    The distance along the surface of a sphere of radius 3958.8 miles.
    """
    earth_radius_miles = 3958.8

    # Latitudes in radians, plus half of each angular difference.
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    half_dlat = np.radians(lat2 - lat1) / 2
    half_dlon = np.radians(lon2 - lon1) / 2

    # Haversine of the central angle between the two points.
    hav = (
        np.sin(half_dlat) ** 2
        + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(half_dlon) ** 2
    )
    central_angle = 2 * np.arctan2(np.sqrt(hav), np.sqrt(1 - hav))
    return earth_radius_miles * central_angle


# # Example DataFrame with latitude and longitude
# data = {
#     "latitude": [34.0522, 36.1699, 40.7128],  # Los Angeles, Las Vegas, New York
#     "longitude": [-118.2437, -115.1398, -74.0060]
# }
# df = pd.DataFrame(data)

# Reference point to measure from, stored as (latitude, longitude) in decimal
# degrees. Order matters: haversine() takes latitude first, and Houston sits at
# roughly 29.76 N, 95.37 W.
single_point = (29.760100, -95.370100)  # Houston
ref_lat, ref_lon = single_point

# Add a new column with the distance in miles from the reference point.
# haversine() is built from NumPy functions, so the whole "lat" / "long" columns
# are passed at once instead of calling it row by row through DataFrame.apply.
# NOTE(review): assumes "lat" and "long" are numeric columns in decimal degrees.
df["distance_miles"] = haversine(ref_lat, ref_lon, df["lat"], df["long"])

In [None]:
# Replace +/-infinity with NaN so non-finite ratios are treated as missing values
# (presumably produced by a zero square footage upstream - TODO confirm).
df["assessed_per_sqft"] = df["assessed_per_sqft"].replace([np.inf, -np.inf], np.nan)

In [None]:
# Display the frame with itables' show() for inspection.
show(df)

In [None]:
# Summary statistics for the assessed value per square foot column.
df["assessed_per_sqft"].describe()

# Reduce data set to manageable size
The current data set is over 1.1M rows. Training a model on all of it would take days on a regular computer.

To help reduce the dataset, I will start by removing all homes where im_sq_ft is less than 50, since a home with less than 50 square feet of space is not really livable.

Next I will use the IQR to assess the market value and remove outliers so the data will not be as skewed.

In [None]:
# Tukey fences for assessed_val: 1.5 x IQR below Q1 and above Q3.
# The bounds are only computed and printed here; no rows are dropped in this cell.
assessed_val = df["assessed_val"]
Q1, Q3 = assessed_val.quantile(0.25), assessed_val.quantile(0.75)
IQR = Q3 - Q1
fence = 1.5 * IQR

lower_iqr, upper_iqr = Q1 - fence, Q3 + fence
print(f"Lower IQR: {lower_iqr} | Upper IQR: {upper_iqr}")

In [None]:
# Keep only rows whose assessed value is at or below the cap.
MAX_ASSESSED_VAL = 2_000_000
df = df.loc[df["assessed_val"] <= MAX_ASSESSED_VAL]

# Candidate target columns, all taken from the capped frame.
y_market, y_assessed, y_bld_val, y_per_sqft = (
    df[column]
    for column in ("tot_mkt_val", "assessed_val", "bld_val", "assessed_per_sqft")
)

In [None]:
# 2x2 grid of histograms, one panel per candidate target.
fig, axs = plt.subplots(2, 2)

# (grid position, values, panel title) in the order the panels are drawn.
panels = [
    ((0, 0), y_market, "Total Market Price"),
    ((1, 0), y_assessed, "Assessed Price"),
    ((0, 1), y_bld_val, "Building Price"),
    ((1, 1), y_per_sqft, "Price / SqFt"),
]
for position, values, panel_title in panels:
    panel_ax = axs[position]
    panel_ax.hist(values, bins="auto")
    panel_ax.set_title(panel_title)

fig.tight_layout()
plt.show()

In [None]:
# # Removed features: ,, 'solar_panel', 'lat', 'long'
# y_market = df["tot_mkt_val"]
# y_assessed = df["assessed_val"]
# y_bld_val = df["bld_val"]
# y_per_sqft = df["assessed_per_sqft"]

In [None]:
# Tukey fences (1.5 x IQR) for assessed_val on the current contents of df.
# Only prints the bounds; nothing is filtered here.
assessed_val = df["assessed_val"]
Q1, Q3 = assessed_val.quantile(0.25), assessed_val.quantile(0.75)
IQR = Q3 - Q1
fence = 1.5 * IQR

lower_iqr, upper_iqr = Q1 - fence, Q3 + fence
print(f"Lower IQR: {lower_iqr} | Upper IQR: {upper_iqr}")

In [None]:
# Distribution of total market value.
market_ax = plt.gca()
market_ax.hist(df["tot_mkt_val"], bins="auto")
market_ax.set(
    xlabel="Total Market Value ($)",
    ylabel="Frequency",
    title="Histogram of Total Market Value",
)
plt.show()

In [None]:
# Distribution of assessed value. The x-axis is labelled to match the plotted
# column (assessed_val), not total market value.
plt.hist(df["assessed_val"], bins="auto")
plt.xlabel("Assessed Value ($)")
plt.ylabel("Frequency")
plt.title("Histogram of Assessed Value")
plt.show()

## Sample Size Reduction
Since the data set is still over 1M homes, we need to reduce the size to create a model.

In [None]:
# Draw a reproducible random sample (fixed seed) to keep model training tractable.
SAMPLE_ROWS = 25_000
sample_df = df.sample(n=SAMPLE_ROWS, random_state=42)
show(sample_df)

In [None]:
# Pairwise correlations between the sample's numeric and boolean columns.
# Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns by
# default and raises on them instead, so the usable columns are selected
# explicitly. This also works on older pandas versions.
numeric_sample = sample_df.select_dtypes(include=["number", "bool"])
corr_matrix = numeric_sample.corr()

plt.figure(figsize=(25, 12))
sns.heatmap(data=corr_matrix, annot=True, cmap="coolwarm")
plt.tight_layout()
plt.show()

In [None]:
# Columns of sample_df used as model inputs (26 in total).
FEATURE_COLUMNS = [
    "date_erected",
    "im_sq_ft",
    "land_ar",
    "perimeter",
    "bedrooms",
    "full_bath",
    "half_bath",
    "total_rooms",
    "dscr_good",
    "dscr_low",
    "dscr_very_low",
    "dscr_average",
    "dscr_excellent",
    "dscr_superior",
    "dscr_poor",
    "frame_detached_garage",
    "gunite_pool",
    "pool_heater",
    "brick_garage",
    "canopy_residential",
    "frame_abov",
    "frame_shed",
    "carport_residential",
    "foundation_repaired",
    "cracked_slab",
    "distance_miles",
]
X = sample_df[FEATURE_COLUMNS]

# Target: assessed value for the same sampled rows.
y_assessed = sample_df["assessed_val"]
print(X.shape)

# Train, Test, Split

In [None]:
# Hold out 30% of the sampled rows for the final evaluation; the fixed
# random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y_assessed, test_size=0.3, random_state=42
)

In [None]:
# Base estimator handed to the grid search; a fixed random_state makes the
# randomised trees reproducible.
etc = ExtraTreesRegressor(random_state=42)

In [None]:
# Hyper-parameter grid to search: 10 x 2 x 5 x 2 = 200 combinations.
param_grid = {
    # 100, 200, ..., 1000 trees
    "n_estimators": list(range(100, 1001, 100)),
    "min_samples_leaf": [1, 2],
    "max_features": ["sqrt", "log2", 10, 20, 26],
    "criterion": ["friedman_mse", "absolute_error"],
}

In [None]:
# 5-fold cross-validated grid search over param_grid, scored by R^2;
# n_jobs=-1 asks for all available cores.
etc_cv = GridSearchCV(etc, param_grid, cv=5, n_jobs=-1, scoring="r2")

In [None]:
# Run the search on the training split only; the test split is kept for the
# final evaluation.
etc_cv.fit(x_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score.
etc_cv.best_params_

In [None]:
# Mean cross-validated R^2 achieved by the best parameter combination.
etc_cv.best_score_

In [None]:
# One row per parameter combination; n_estimators is cast to a categorical
# column for the plot that follows.
etc_cv_results = pd.DataFrame(etc_cv.cv_results_).astype(
    {"param_n_estimators": "category"}
)
etc_cv_results.info()

In [None]:
# etc_cv_results has one row per parameter combination, so every n_estimators
# value occurs many times. Plotting the raw rows draws those bars on top of each
# other, so the score is averaged per n_estimators first and each bar is a
# single value (the mean over the other hyper-parameters).
score_by_n_estimators = etc_cv_results.groupby(
    "param_n_estimators", observed=True
)["mean_test_score"].mean()

plt.bar(
    score_by_n_estimators.index.astype(int),
    score_by_n_estimators.to_numpy(),
    width=20,
)
plt.xlabel("n_estimators")
plt.ylabel("Mean Test Score")
plt.title("Effect of n_estimators on Mean Test Score")
plt.show()

## Analysis of testing data

In [None]:
# Score the fitted search object on the held-out test split.
etc_pred = etc_cv.predict(x_test)
etc_mae, etc_mse, etc_r2 = (
    metric(y_test, etc_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
print(f"MAE: {etc_mae}\nMSE: {etc_mse}\nR Squared: {etc_r2}\n")

In [None]:
# Save model
# joblib, datetime and Path are imported in the notebook's top import cell.

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing.
MODEL_DIR = Path("Models")
MODEL_DIR.mkdir(parents=True, exist_ok=True)

# Get the current date
current_date = datetime.now()

# Format the date as YYYYmmdd
formatted_date = current_date.strftime("%Y%m%d")

# Persists the whole fitted GridSearchCV object, not just the best estimator.
joblib.dump(etc_cv, MODEL_DIR / f"etc_{formatted_date}.pkl")

# Residual Analysis

In [None]:
# Actual vs predicted values on the test split, plus residual = actual - predicted.
actual = y_test.to_list()
predicted = etc_cv.predict(x_test)
etc_residuals = pd.DataFrame({"actual": actual, "predicted": predicted}).assign(
    residuals=lambda frame: frame["actual"] - frame["predicted"]
)

In [None]:
# Scatter of predicted against actual values for the test split.
fig, ax = plt.subplots(figsize=(9, 9))

x, y = etc_residuals["actual"], etc_residuals["predicted"]

ax.scatter(x, y, s=60, alpha=0.7, edgecolors="k")
ax.set(
    xlabel="Actual Prices",
    ylabel="Predicted Prices",
    title="Actual vs Predicted Prices ($) ",
)
plt.show()