In [136]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [137]:
# Read the CSV that sits one directory above the notebook into a DataFrame.
data = pd.read_csv("../house_prices_selection.csv")

In [138]:
# Show the (rows, columns) dimensions of the raw frame.
data.shape

(1460, 32)

    DROPNA

In [139]:
# Complete-case copy: keep only rows with no missing value; `data` is not reassigned.
data_dropna = data.dropna()

In [140]:
# Show how many rows survive once incomplete rows are dropped.
data_dropna.shape

(1164, 32)

In [141]:
# Separate the target from the predictors in the complete-case frame.
y_dropna = data_dropna["SalePrice"]
X_dropna = data_dropna.drop(columns="SalePrice")

# Column groups by dtype; they decide which preprocessing step each column gets.
categorical_columns = X_dropna.select_dtypes(include="object").columns
numeric_columns = X_dropna.select_dtypes(include="number").columns

In [142]:
# Hold out 20% of the complete-case rows for evaluation; the fixed seed keeps
# the split repeatable between runs.
(
    X_train_dropna,
    X_test_dropna,
    y_train_dropna,
    y_test_dropna,
) = train_test_split(X_dropna, y_dropna, random_state=42, test_size=0.2)

In [143]:
# Per-dtype preprocessing for the complete-case data: one-hot encode the
# categorical columns (unknown categories are ignored at predict time) and
# standardize the numeric ones.
preprocessor_dropna = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
        ("num", StandardScaler(), numeric_columns),
    ]
)

# Preprocessing and the linear regression chained into a single estimator.
model_dropna = Pipeline(
    [
        ("preprocess", preprocessor_dropna),
        ("model", LinearRegression()),
    ]
)

In [144]:
# Fit the preprocessing steps and the linear regression on the complete-case training split.
model_dropna.fit(X_train_dropna, y_train_dropna)

In [145]:
# Score the complete-case model on its own held-out split.
preds_dropna = model_dropna.predict(X_test_dropna)

mse_dropna = mean_squared_error(y_test_dropna, preds_dropna)
rmse_dropna = np.sqrt(mse_dropna)  # RMSE is the square root of the MSE
mae_dropna = mean_absolute_error(y_test_dropna, preds_dropna)
r2_dropna = r2_score(y_test_dropna, preds_dropna)

# One print call, one metric per output line.
print(
    f"\nRMSE with dropna: {rmse_dropna:.3f}",
    f"MAE dropna: {mae_dropna:.3f}",
    f"MSE dropna: {mse_dropna:.3f}",
    f"R2 dropna: {r2_dropna:.4f}",
    sep="\n",
)


RMSE with dropna: 35644.286
MAE dropna: 21785.021
MSE dropna: 1270515095.004
R2 dropna: 0.8330


    SIMPLEIMPUTER

In [146]:
# Work from the full dataset this time: rows with missing values are kept.
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

# Same 80/20 ratio and seed as the dropna experiment.
(X_train_i, X_test_i, y_train_i, y_test_i) = train_test_split(
    X, y, random_state=42, test_size=0.2
)

In [147]:
# Numeric columns: fill gaps with the column median, then standardize.
# The scaler keeps the numeric preprocessing identical to the dropna pipeline,
# so the two experiments differ only in how missing values are handled.
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode (categories unseen during fit are ignored at predict time).
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group to its transformer.
preprocessor_imputer = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numeric_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

# Imputation, encoding/scaling and the linear regression as one estimator, so
# the imputation statistics are learned from the training split only.
model_imputer = Pipeline(steps=[
    ("preprocess", preprocessor_imputer),
    ("model", LinearRegression()),
])

In [148]:
# Fit the imputing pipeline on the training split of the full dataset.
model_imputer.fit(X_train_i, y_train_i)

In [149]:
# Score the imputing pipeline on its held-out split.
preds_imputer = model_imputer.predict(X_test_i)

mse_imputer = mean_squared_error(y_test_i, preds_imputer)
rmse_imputer = np.sqrt(mse_imputer)  # RMSE is the square root of the MSE
mae_imputer = mean_absolute_error(y_test_i, preds_imputer)
r2_imputer = r2_score(y_test_i, preds_imputer)

# One print call, one metric per output line.
print(
    f"\nRMSE: {rmse_imputer:.3f}",
    f"MAE: {mae_imputer:.3f}",
    f"MSE: {mse_imputer:.3f}",
    f"R2: {r2_imputer:.4f}",
    sep="\n",
)


RMSE: 33930.642
MAE: 21539.103
MSE: 1151288461.372
R2: 0.8499


In [150]:
# Side-by-side report of both missing-value strategies, one metric per pair.
# Every pair after the first starts with a blank line.
print(
    f"RMSE Dropna: {rmse_dropna:.3f}",
    f"RMSE SimpleImputer: {rmse_imputer:.3f}",
    sep="\n",
)
print(
    f"\nMAE Dropna: {mae_dropna:.3f}",
    f"MAE SimpleImputer: {mae_imputer:.3f}",
    sep="\n",
)
print(
    f"\nMSE Dropna: {mse_dropna:.3f}",
    f"MSE SimpleImputer: {mse_imputer:.3f}",
    sep="\n",
)
print(
    f"\nR² Dropna: {r2_dropna:.4f}",
    f"R² SimpleImputer: {r2_imputer:.4f}",
    sep="\n",
)

RMSE Dropna: 35644.286
RMSE SimpleImputer: 33930.642

MAE Dropna: 21785.021
MAE SimpleImputer: 21539.103

MSE Dropna: 1270515095.004
MSE SimpleImputer: 1151288461.372

R² Dropna: 0.8330
R² SimpleImputer: 0.8499


    Conclusion

    The model trained with dropna performs worse on all evaluation metrics. This happens because dropna eliminates a significant portion of the dataset (296 of 1,460 rows, about 20%), reducing the amount of useful information available to the model.

    The model trained with the SimpleImputer approach keeps all rows by filling in missing values. As a result, it has more data to learn from and achieves better predictive performance.

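    Therefore, using SimpleImputer is more appropriate for this dataset. Note, however, that the two models were evaluated on different test sets (the dropna test set contains only complete rows), so the metric comparison is indicative rather than strictly like-for-like.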

    DecisionTreeRegressor

In [151]:
# Predictors and target taken from the full dataset for the decision-tree experiment.
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

# 80/20 split with a fixed seed.
(X_train_t, X_test_t, y_train_t, y_test_t) = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Column groups are derived from the training split only.
categorical_columns = X_train_t.select_dtypes(include="object").columns
numeric_columns = X_train_t.select_dtypes(include="number").columns

In [152]:
# Preprocessing for the tree: mean-impute the numeric columns; impute the
# categorical columns with the most frequent value and one-hot encode them.
preprocessor_tree = ColumnTransformer(
    [
        ("num", SimpleImputer(strategy="mean"), numeric_columns),
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("onehot", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical_columns,
        ),
    ]
)

# Seeded decision tree on top of the preprocessing.
model_tree = Pipeline(
    [
        ("preprocess", preprocessor_tree),
        ("model", DecisionTreeRegressor(random_state=42)),
    ]
)

In [153]:
# Fit the default-parameter tree pipeline on the training split.
# NOTE(review): the grid search further down fits this pipeline again on the same
# data and only its best estimator is used for prediction, so this standalone fit
# looks redundant — confirm nothing depends on it before removing.
model_tree.fit(X_train_t, y_train_t)

In [154]:
# Candidate hyper-parameters for the tree; the "model__" prefix targets the
# pipeline step named "model". 5 * 3 * 4 * 3 * 2 = 360 combinations.
param_grid_dt = {
    "model__max_depth": [None, 5, 10, 15, 20],
    "model__min_samples_split": [2, 5, 10],
    "model__min_samples_leaf": [1, 2, 4, 8],
    "model__max_features": ["sqrt", "log2", None],
    "model__criterion": ["squared_error", "friedman_mse"],
}

# Exhaustive 5-fold search scored by (negated) RMSE, using all available cores.
grid_dt = GridSearchCV(
    model_tree,
    param_grid_dt,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

In [155]:
# Run the cross-validated search on the training split only.
grid_dt.fit(X_train_t, y_train_t)

# Report the winning hyper-parameter combination.
print(grid_dt.best_params_)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
{'model__criterion': 'squared_error', 'model__max_depth': 15, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 1, 'model__min_samples_split': 10}


In [156]:
# Keep the pipeline that the grid search selected as best.
best_tree_model = grid_dt.best_estimator_

In [157]:
# Predict on the held-out split with the tuned tree.
preds_tree = best_tree_model.predict(X_test_t)

In [163]:
# Held-out metrics for the tuned decision tree.
mse_tree = mean_squared_error(y_test_t, preds_tree)
rmse_tree = np.sqrt(mse_tree)  # RMSE is the square root of the MSE
mae_tree = mean_absolute_error(y_test_t, preds_tree)
r2_tree = r2_score(y_test_t, preds_tree)

# One print call, one metric per output line.
print(
    f"RMSE: {rmse_tree:.3f}",
    f"MAE: {mae_tree:.3f}",
    f"MSE: {mse_tree:.3f}",
    f"R²: {r2_tree:.4f}",
    sep="\n",
)

RMSE: 40809.951
MAE: 27680.337
MSE: 1665452128.506
R²: 0.7829


    RandomForestRegressor

In [159]:
# Predictors and target taken from the full dataset for the random-forest experiment.
y = data["SalePrice"]
X = data.drop(columns="SalePrice")

# 80/20 split with a fixed seed.
(X_train, X_test, y_train, y_test) = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Column groups are derived from the training split only.
categorical_columns = X_train.select_dtypes(include="object").columns
numeric_columns = X_train.select_dtypes(include="number").columns

In [160]:
# Preprocessing for the forest: median-impute the numeric columns; impute the
# categorical columns with the most frequent value and one-hot encode them.
preprocessor_rf = ColumnTransformer(
    [
        ("num", SimpleImputer(strategy="median"), numeric_columns),
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("encoder", OneHotEncoder(handle_unknown="ignore")),
                ]
            ),
            categorical_columns,
        ),
    ]
)

# Seeded random forest on top of the preprocessing; the step is named
# "regressor", which is the prefix used for its hyper-parameters.
rf_model = Pipeline(
    [
        ("preprocess", preprocessor_rf),
        ("regressor", RandomForestRegressor(random_state=42)),
    ]
)

In [164]:
# Fit the default-parameter forest pipeline on the training split.
# NOTE(review): the grid search further down fits this pipeline again on the same
# data and only its best estimator is used for prediction, so this standalone fit
# looks redundant — confirm nothing depends on it before removing.
rf_model.fit(X_train, y_train)

In [166]:
# Candidate hyper-parameters for the forest; the "regressor__" prefix targets the
# pipeline step named "regressor". 3 * 3 * 3 * 3 * 3 * 1 = 243 combinations.
param_grid_rf = {
    "regressor__n_estimators": [100, 300, 500],
    "regressor__max_depth": [None, 10, 20],
    "regressor__min_samples_leaf": [1, 2, 4],
    "regressor__min_samples_split": [2, 5, 10],
    "regressor__max_features": ["sqrt", "log2", None],
    "regressor__bootstrap": [True],
}

# Exhaustive 5-fold search scored by (negated) RMSE, using all available cores.
grid_rf = GridSearchCV(
    rf_model,
    param_grid_rf,
    scoring="neg_root_mean_squared_error",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Run the search on the training split only.
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


In [167]:
# Keep the pipeline that the grid search selected as best and predict on the held-out split.
best_rf = grid_rf.best_estimator_
preds_rf = best_rf.predict(X_test)

In [169]:
# Held-out metrics for the tuned random forest.
# The MSE is computed once and reused for the RMSE.
mse_rf = mean_squared_error(y_test, preds_rf)
rmse_rf = np.sqrt(mse_rf)
mae_rf = mean_absolute_error(y_test, preds_rf)
r2_rf = r2_score(y_test, preds_rf)

print(f"RMSE: {rmse_rf:.3f}")
print(f"MAE: {mae_rf:.3f}")
print(f"MSE: {mse_rf:.3f}")
# Four decimals for R², matching the other metric cells and the final comparison.
print(f"R²: {r2_rf:.4f}")

RMSE: 27004.197
MAE: 15956.537
MSE: 729226671.591
R²: 0.905


In [170]:
# Side-by-side report of the tuned tree and the tuned forest, one metric per pair.
# NOTE(review): run the notebook top to bottom (Restart & Run All) so the values
# printed here match the per-model metric cells.
print(
    f"RMSE DecisionTree: {rmse_tree:.3f}",
    f"RMSE RandomForest: {rmse_rf:.3f}",
    sep="\n",
)
print(
    f"MAE DecisionTree: {mae_tree:.3f}",
    f"MAE RandomForest: {mae_rf:.3f}",
    sep="\n",
)
print(
    f"MSE DecisionTree: {mse_tree:.3f}",
    f"MSE RandomForest: {mse_rf:.3f}",
    sep="\n",
)
print(
    f"R² DecisionTree: {r2_tree:.4f}",
    f"R² RandomForest: {r2_rf:.4f}",
    sep="\n",
)

RMSE DecisionTree: 33311.395
RMSE RandomForest: 28178.131
MAE DecisionTree: 21482.640
MAE RandomForest: 16398.172
MSE DecisionTree: 1109649064.620
MSE RandomForest: 794007078.615
R² DecisionTree: 0.8553
R² RandomForest: 0.8965


    The RandomForest model performed better than the DecisionTree: it achieved lower RMSE, MAE, and MSE, and a higher R² score.
    This means it predicts house prices more accurately and generalizes better.