In [150]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.discriminant_analysis import StandardScaler
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer

In [151]:
# Load the house-price selection table; the path is relative to the notebook's
# working directory.
data = pd.read_csv(filepath_or_buffer="../house_prices_selection.csv")

In [152]:
# Display the (rows, columns) tuple of the raw frame as the cell output.
data.shape

(1460, 32)

    DROPNA

In [153]:
# Strategy 1: discard every row that contains at least one missing value.
# `axis=0, how="any"` are the dropna defaults, spelled out for the reader.
data_dropna = data.dropna(axis=0, how="any")

In [154]:
# Display the (rows, columns) tuple that remains after dropping incomplete rows.
data_dropna.shape

(1164, 32)

In [155]:
# Separate the target from the predictors on the NaN-free frame.
y_dropna = data_dropna["SalePrice"]
X_dropna = data_dropna.drop(columns=["SalePrice"])

# Partition the predictor names by dtype; these two lists select the columns
# routed to each preprocessing branch.
categorical_columns = X_dropna.select_dtypes(include="object").columns
numeric_columns = X_dropna.select_dtypes(include="number").columns

In [None]:
# Hold out 20% of the complete rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
split_dropna = train_test_split(X_dropna, y_dropna, random_state=42, test_size=0.2)
X_train_dropna, X_test_dropna, y_train_dropna, y_test_dropna = split_dropna

In [156]:
# Per-column preprocessing for the dropna variant: categoricals are one-hot
# encoded (categories unseen during fit are ignored rather than raising),
# numerics are standardised.
dropna_transformers = [
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_columns),
    ("num", StandardScaler(), numeric_columns),
]
preprocessor_dropna = ColumnTransformer(dropna_transformers)

# Chain preprocessing and the regressor so both are fitted on the training
# split only.
dropna_steps = [
    ("preprocess", preprocessor_dropna),
    ("model", LinearRegression()),
]
model_dropna = Pipeline(dropna_steps)

In [157]:
# Fit preprocessing + linear regression on the dropna training split.
model_dropna.fit(X=X_train_dropna, y=y_train_dropna)

In [158]:
# Score the dropna model on its held-out split.
preds_dropna = model_dropna.predict(X_test_dropna)

mse_dropna = mean_squared_error(y_test_dropna, preds_dropna)
rmse_dropna = np.sqrt(mse_dropna)  # RMSE is the square root of the MSE above
mae_dropna = mean_absolute_error(y_test_dropna, preds_dropna)
r2_dropna = r2_score(y_test_dropna, preds_dropna)

dropna_report = [
    f"\nRMSE with dropna: {rmse_dropna:.3f}",
    f"MAE dropna: {mae_dropna:.3f}",
    f"MSE dropna: {mse_dropna:.3f}",
    f"R2 dropna: {r2_dropna:.4f}",
]
for report_line in dropna_report:
    print(report_line)


RMSE with dropna: 35644.286
MAE dropna: 21785.021
MSE dropna: 1270515095.004
R2 dropna: 0.8330


    SIMPLEIMPUTER

In [160]:
# Strategy 2 keeps every row: split the full frame (missing values included)
# into predictors and target.
y = data["SalePrice"]
X = data.drop(columns=["SalePrice"])

# Same 80/20 proportions and seed as the dropna experiment.
split_i = train_test_split(X, y, random_state=42, test_size=0.2)
X_train_i, X_test_i, y_train_i, y_test_i = split_i

In [161]:
# Numeric branch: fill missing values with the column median, then standardise.
# The scaler mirrors the numeric branch of the dropna pipeline so that the two
# models differ only in how missing values are handled.
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent category, then
# one-hot encode (categories unseen during fit are ignored rather than raising).
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

# numeric_columns / categorical_columns were derived from the dropna frame;
# dropna() only removes rows, so the same column names apply to the full frame.
preprocessor_imputer = ColumnTransformer(
    transformers=[
        ("num", numerical_transformer, numeric_columns),
        ("cat", categorical_transformer, categorical_columns),
    ]
)

# Chain preprocessing and the regressor so imputation statistics, scaling and
# encoding are all learned from the training split only.
model_imputer = Pipeline(steps=[
    ("preprocess", preprocessor_imputer),
    ("model", LinearRegression())
])

In [162]:
# Fit imputation + encoding + linear regression on the full-data training split.
model_imputer.fit(X=X_train_i, y=y_train_i)

In [164]:
# Score the SimpleImputer model on its held-out split.
preds_imputer = model_imputer.predict(X_test_i)

# Compute the MSE once and derive the RMSE from it instead of scoring twice.
mse_imputer = mean_squared_error(y_test_i, preds_imputer)
rmse_imputer = np.sqrt(mse_imputer)
mae_imputer = mean_absolute_error(y_test_i, preds_imputer)
r2_imputer = r2_score(y_test_i, preds_imputer)

# Label every metric with the strategy it belongs to, matching the dropna
# cell's wording, so the printed lines are unambiguous when read on their own.
print(f"\nRMSE with SimpleImputer: {rmse_imputer:.3f}")
print(f"MAE imputer: {mae_imputer:.3f}")
print(f"MSE imputer: {mse_imputer:.3f}")
print(f"R2 imputer: {r2_imputer:.4f}")


RMSE with SimpleImputer: 33930.642
MAE imputer: 21539.103
MSE imputer: 1151288461.372
R2 imputer: 0.8499


In [169]:
# Side-by-side summary of both strategies; a leading "\n" starts each new
# metric group on its own paragraph.
comparison_report = [
    f"RMSE Dropna: {rmse_dropna:.3f}",
    f"RMSE SimpleImputer: {rmse_imputer:.3f}",
    f"\nMAE Dropna: {mae_dropna:.3f}",
    f"MAE SimpleImputer: {mae_imputer:.3f}",
    f"\nMSE Dropna: {mse_dropna:.3f}",
    f"MSE SimpleImputer: {mse_imputer:.3f}",
    f"\nR² Dropna: {r2_dropna:.4f}",
    f"R² SimpleImputer: {r2_imputer:.4f}",
]
for comparison_line in comparison_report:
    print(comparison_line)

RMSE Dropna: 35644.286
RMSE SimpleImputer: 33930.642

MAE Dropna: 21785.021
MAE SimpleImputer: 21539.103

MSE Dropna: 1270515095.004
MSE SimpleImputer: 1151288461.372

R² Dropna: 0.8330
R² SimpleImputer: 0.8499


    Conclusion

    The model trained with dropna performs worse on all evaluation metrics. This is most likely because dropna eliminates a significant portion of the dataset (296 of 1,460 rows, about 20%), reducing the amount of useful information available to the model.

    The model trained with the SimpleImputer approach keeps all rows by filling in missing values (the median for numeric columns and the most frequent value for categorical columns). As a result, the model has more data to learn from and achieves better predictive performance.

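    Therefore, using SimpleImputer is more appropriate for this dataset. Note that the two models are scored on different test splits (one drawn from the 1,164 complete rows, the other from all 1,460 rows), so the metric gap is indicative rather than an exact like-for-like comparison.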