<a href="https://colab.research.google.com/github/Sambradshaw19011/CSE-450/blob/main/starter_housing_data_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install scikit-learn==1.5.2

Collecting scikit-learn==1.5.2
  Downloading scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.6.1
    Uninstalling scikit-learn-1.6.1:
      Successfully uninstalled scikit-learn-1.6.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
umap-learn 0.5.11 requires scikit-learn>=1.6, but you have scikit-learn 1.5.2 which is incompatible.
hdbscan 0.8.41 requires scikit-learn>=1.6, but you have scikit-learn 1.5.2 which is incompatible.[0m[31m
[0mSuccessfully installed 

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import r2_score

# Location of the training data (CSV hosted in the course repository).
HOUSING_CSV_URL = 'https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing.csv'

# Raw, unmodified training frame; later cells derive features from it.
housing = pd.read_csv(HOUSING_CSV_URL)

In [3]:
def build_features(df, reference_df=None, drop_outliers=False):
    """Return a copy of ``df`` with engineered housing features added.

    Added columns: ``neighborhood`` (first six characters of ``id`` as text),
    ``year``/``month``/``day``/``dayofweek`` (parsed from ``date``),
    ``total_sqft``, ``house_age``, ``renovated_flag``, ``reno_age`` and
    ``bathrooms_per_bedroom``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. The function reads the columns ``id``, ``date``,
        ``sqft_above``, ``sqft_basement``, ``yr_built``, ``yr_renovated``,
        ``bathrooms`` and ``bedrooms``. The input frame is not modified.
    reference_df : pandas.DataFrame, optional
        Frame whose 99.5th percentiles define the square-footage cut-offs.
        Only consulted when ``drop_outliers`` is true. It may be a raw frame:
        if it has no ``total_sqft`` column, that reference is derived from its
        ``sqft_above`` and ``sqft_basement`` columns.
    drop_outliers : bool, default False
        When true (and ``reference_df`` is given), remove rows whose
        square-footage columns exceed the reference 99.5th percentile, and
        rows with more than 10 bedrooms or more than 7.5 bathrooms.

    Returns
    -------
    pandas.DataFrame
        The feature-augmented copy, possibly with outlier rows removed.
    """
    df = df.copy()

    # ---------- BASIC FEATURES ----------
    df["neighborhood"] = df["id"].astype(str).str[:6]

    # Unparseable dates become NaT, so the derived date parts become NaN.
    dt = pd.to_datetime(df["date"], errors="coerce")
    df["year"] = dt.dt.year
    df["month"] = dt.dt.month
    df["day"] = dt.dt.day
    df["dayofweek"] = dt.dt.dayofweek

    df["total_sqft"] = df["sqft_above"] + df["sqft_basement"].fillna(0)

    df["house_age"] = df["year"] - df["yr_built"]
    df["renovated_flag"] = (df["yr_renovated"].fillna(0) > 0).astype(int)
    # Years since renovation; 0 for houses never renovated.
    df["reno_age"] = np.where(
        df["renovated_flag"] == 1,
        df["year"] - df["yr_renovated"],
        0
    )

    # Zero bedrooms would divide by zero; treat the ratio as 0 in that case.
    df["bathrooms_per_bedroom"] = (
        df["bathrooms"] / df["bedrooms"].replace(0, np.nan)
    ).fillna(0)

    # ---------- OUTLIER DROPPING ----------
    if drop_outliers and reference_df is not None:
        for col in ["sqft_living", "sqft_lot", "sqft_above", "sqft_basement", "total_sqft"]:
            if col not in df.columns:
                continue

            if col in reference_df.columns:
                ref_values = reference_df[col]
            elif col == "total_sqft" and {"sqft_above", "sqft_basement"}.issubset(reference_df.columns):
                # A raw reference frame has no engineered total_sqft column;
                # derive it the same way as above so this filter is not
                # silently skipped.
                ref_values = reference_df["sqft_above"] + reference_df["sqft_basement"].fillna(0)
            else:
                continue

            hi = ref_values.quantile(0.995)
            # Rows with NaN in `col` fail the comparison and are dropped too.
            df = df[df[col] <= hi]

        # Fixed sanity caps, independent of the reference frame.
        for col, hi in [("bedrooms", 10), ("bathrooms", 7.5)]:
            if col in df.columns:
                df = df[df[col] <= hi]

    return df


In [4]:
# Engineer features on the training file and trim extreme rows, using the
# same file as the reference for the percentile cut-offs.
housing_fe = build_features(housing, reference_df=housing, drop_outliers=True)

# Target: log1p(price). Predictions are mapped back to dollars with expm1 below.
y = np.log1p(housing_fe["price"])

# Predictors: everything except the target and the raw identifier/date
# columns, with non-numeric columns one-hot encoded.
predictors = housing_fe.drop(columns=["price", "id", "date"])
X = pd.get_dummies(predictors, drop_first=True)


# 80% train / 20% held back ...
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ... and the held-back part is halved into dev (10%) and test (10%).
X_dev, X_test, y_dev, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

xgb_params = dict(
    n_estimators=5000,     # number of boosted trees
    learning_rate=0.03,    # contribution of each tree (shrinkage)
    max_depth=8,           # maximum depth of an individual tree
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of features sampled per tree
    gamma=0.1,             # minimum loss reduction required to split a node
    min_child_weight=10,   # minimum summed instance weight in a leaf
    reg_alpha=0.0,         # L1 regularization strength
    reg_lambda=1.0,        # L2 regularization strength
    random_state=42,       # fixed seed for reproducible training
    n_jobs=-1,             # use all available CPU cores
)
model = XGBRegressor(**xgb_params)


# The model only ever sees the TRAIN partition.
model.fit(X_train, y_train)

# Score the DEV partition in log space, then convert both sides to dollars.
dev_pred_log = model.predict(X_dev)
y_dev_price = np.expm1(y_dev)
dev_pred_price = np.expm1(dev_pred_log)

# DEV metrics, reported in dollars.
dev_rmse, dev_r2 = (
    root_mean_squared_error(y_dev_price, dev_pred_price),
    r2_score(y_dev_price, dev_pred_price),
)

print("DEV RMSE (dollars):", dev_rmse)
print("DEV R^2 (dollars):", dev_r2)

DEV RMSE (dollars): 93352.09458225532
DEV R^2 (dollars): 0.9076769974817528


In [5]:
# One-off check on the TEST partition; run it only once tuning is finished.

# True prices back in dollars (the target was stored as log1p(price)).
y_test_price = np.expm1(y_test)

# Predict in log space, then undo the transform.
test_pred_log = model.predict(X_test)
test_pred_price = np.expm1(test_pred_log)

test_rmse, test_r2 = (
    root_mean_squared_error(y_test_price, test_pred_price),
    r2_score(y_test_price, test_pred_price),
)

print("TEST RMSE (dollars):", test_rmse)
print("TEST R^2 (dollars):", test_r2)

TEST RMSE (dollars): 105028.726413489
TEST R^2 (dollars): 0.9056613112959091


In [6]:
# Load the mini holdout file that predictions are generated for.
mini = pd.read_csv(
    "https://raw.githubusercontent.com/byui-cse/cse450-course/master/data/housing_holdout_test_mini.csv"
)

# Same feature engineering as training, but never drop holdout rows:
# every row needs a prediction.
mini_fe = build_features(
    mini,
    reference_df=housing,
    drop_outliers=False
)

# Encode every category level present in the holdout (no drop_first here).
# With drop_first=True pandas would drop the first level found in *this* file,
# which need not be the level dropped when X was built; after the reindex
# below that level's column would be zero-filled and its rows scored as the
# training baseline category.
X_mini = pd.get_dummies(mini_fe.drop(columns=["id", "date"]))

# Align to the training design matrix: keep exactly the training columns in
# the training order, zero-fill levels absent from the holdout, and discard
# any column the model was not trained on.
X_mini = X_mini.reindex(columns=X.columns, fill_value=0)

# The model predicts log1p(price); convert back to dollars.
mini_pred_log = model.predict(X_mini)
mini_pred_price = np.expm1(mini_pred_log)

print("Mini predictions generated. Count:", len(mini_pred_price))

Mini predictions generated. Count: 81


In [7]:
# Write the holdout predictions as a single-column CSV named after the team.
# NOTE(review): the output recorded for this cell shows
# 'team3-module3-mini-predictions.csv', which this code does not produce;
# confirm the intended filename and re-run the cell.
TEAM_NAME = "team3"
out_file = f"{TEAM_NAME}-module3-predictions.csv"

predictions = pd.DataFrame({"price": mini_pred_price})
predictions.to_csv(out_file, index=False)

print("Saved:", out_file)
print("Shape:", predictions.shape)

Saved: team3-module3-mini-predictions.csv
Shape: (81, 1)
