DROPNA Method

In [428]:
import pandas as pd

In [429]:
import sklearn
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
import numpy as np


In [430]:
df = pd.read_csv("house_prices_selection.csv")

In [431]:
df.shape

(1460, 32)

Spliting - numerical / categorical data

In [432]:
df_dropna = df.copy()

In [433]:
numerical_columns = df_dropna.select_dtypes(include=["number"]).columns.tolist()

In [434]:
categorical_columns = df_dropna.select_dtypes(include=["object", "category"]).columns.tolist()

In [435]:
print(numerical_columns)

['EVI', 'ZoningScore', 'Price_per_m^2', 'PDI', 'SalePrice', 'LotFrontage', 'LotArea', 'WoodDeckSF', 'OpenPorchSF', 'FullBath', 'GarageCars', 'GarageArea', 'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'GrLivArea', 'TotalBsmtSF', '1stFlrSF']


In [436]:
print(categorical_columns)

['KitchenQual', 'PavedDrive', 'BsmtQual', 'CentralAir', 'Foundation']


In [437]:
df_dropna.isnull().any()


EVI                  False
ZoningScore          False
MSZoning_RL          False
LotConfig_CulDSac    False
LandContour_HLS      False
LotShape_IR2         False
Condition1_Norm      False
Condition1_Feedr     False
LotShape_Reg         False
MSZoning_RM          False
Price_per_m^2        False
PDI                  False
SalePrice            False
LotFrontage           True
LotArea              False
WoodDeckSF           False
OpenPorchSF          False
FullBath             False
GarageCars           False
GarageArea           False
KitchenQual          False
PavedDrive           False
OverallQual          False
YearBuilt            False
YearRemodAdd         False
MasVnrArea            True
GrLivArea            False
TotalBsmtSF          False
1stFlrSF             False
BsmtQual              True
CentralAir           False
Foundation           False
dtype: bool

In [438]:
df_dropna.isnull().sum()


EVI                    0
ZoningScore            0
MSZoning_RL            0
LotConfig_CulDSac      0
LandContour_HLS        0
LotShape_IR2           0
Condition1_Norm        0
Condition1_Feedr       0
LotShape_Reg           0
MSZoning_RM            0
Price_per_m^2          0
PDI                    0
SalePrice              0
LotFrontage          259
LotArea                0
WoodDeckSF             0
OpenPorchSF            0
FullBath               0
GarageCars             0
GarageArea             0
KitchenQual            0
PavedDrive             0
OverallQual            0
YearBuilt              0
YearRemodAdd           0
MasVnrArea             8
GrLivArea              0
TotalBsmtSF            0
1stFlrSF               0
BsmtQual              37
CentralAir             0
Foundation             0
dtype: int64

In [439]:

df_dropna = df_dropna.dropna(axis=0)

In [440]:
df_dropna.isnull().sum()


EVI                  0
ZoningScore          0
MSZoning_RL          0
LotConfig_CulDSac    0
LandContour_HLS      0
LotShape_IR2         0
Condition1_Norm      0
Condition1_Feedr     0
LotShape_Reg         0
MSZoning_RM          0
Price_per_m^2        0
PDI                  0
SalePrice            0
LotFrontage          0
LotArea              0
WoodDeckSF           0
OpenPorchSF          0
FullBath             0
GarageCars           0
GarageArea           0
KitchenQual          0
PavedDrive           0
OverallQual          0
YearBuilt            0
YearRemodAdd         0
MasVnrArea           0
GrLivArea            0
TotalBsmtSF          0
1stFlrSF             0
BsmtQual             0
CentralAir           0
Foundation           0
dtype: int64

In [441]:
df_dropna.shape

(1164, 32)

In [442]:
X_dropna = df_dropna.drop('SalePrice', axis=1)
y_dropna = df_dropna['SalePrice']


In [443]:
X_train, X_test, y_train, y_test = train_test_split(
    X_dropna, y_dropna, test_size=0.2, random_state=42
)

In [444]:
numerical_features = X_train.select_dtypes(include=["number"]).columns.tolist()
categorical_features = X_train.select_dtypes(include=["object", "category"]).columns.tolist()

In [445]:
categorical_transformer = Pipeline([
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])






In [446]:
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numerical_features),
    ('cat', categorical_transformer, categorical_features)
])

In [447]:
model = Pipeline([
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])

In [448]:

model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [449]:
rmse_dropna = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse_dropna)

RMSE: 35777.823749415664


SimpleImputer Method

In [450]:
df_simple_imputer = df.copy()

In [451]:
df.shape

(1460, 32)

In [452]:
df_simple_imputer.shape

(1460, 32)

In [453]:
df.isnull().sum()


EVI                    0
ZoningScore            0
MSZoning_RL            0
LotConfig_CulDSac      0
LandContour_HLS        0
LotShape_IR2           0
Condition1_Norm        0
Condition1_Feedr       0
LotShape_Reg           0
MSZoning_RM            0
Price_per_m^2          0
PDI                    0
SalePrice              0
LotFrontage          259
LotArea                0
WoodDeckSF             0
OpenPorchSF            0
FullBath               0
GarageCars             0
GarageArea             0
KitchenQual            0
PavedDrive             0
OverallQual            0
YearBuilt              0
YearRemodAdd           0
MasVnrArea             8
GrLivArea              0
TotalBsmtSF            0
1stFlrSF               0
BsmtQual              37
CentralAir             0
Foundation             0
dtype: int64

In [454]:
df_simple_imputer.isnull().sum()


EVI                    0
ZoningScore            0
MSZoning_RL            0
LotConfig_CulDSac      0
LandContour_HLS        0
LotShape_IR2           0
Condition1_Norm        0
Condition1_Feedr       0
LotShape_Reg           0
MSZoning_RM            0
Price_per_m^2          0
PDI                    0
SalePrice              0
LotFrontage          259
LotArea                0
WoodDeckSF             0
OpenPorchSF            0
FullBath               0
GarageCars             0
GarageArea             0
KitchenQual            0
PavedDrive             0
OverallQual            0
YearBuilt              0
YearRemodAdd           0
MasVnrArea             8
GrLivArea              0
TotalBsmtSF            0
1stFlrSF               0
BsmtQual              37
CentralAir             0
Foundation             0
dtype: int64

In [455]:

X = df_simple_imputer.drop('SalePrice', axis=1)  
y = df_simple_imputer['SalePrice']               




In [456]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [457]:
numerical_columns = X_train.select_dtypes(include=["number"]).columns.tolist()
categorical_columns = X_train.select_dtypes(include=["object", "category"]).columns.tolist()


In [458]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean'))
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

In [459]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_columns),
        ('cat', categorical_transformer, categorical_columns)
    ]
)


In [460]:

model = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('regressor', LinearRegression())
])


In [461]:
model.fit(X_train, y_train)
predictions = model.predict(X_test)


In [462]:
rmse_simple_imputer = np.sqrt(mean_squared_error(y_test, predictions))
print("RMSE:", rmse_simple_imputer)

RMSE: 33925.979571605385


Conclusions

In [463]:
print(rmse_dropna)

35777.823749415664


In [464]:
print(rmse_simple_imputer)

33925.979571605385


In conclusion, the method using SimpleImputer performed better, because the RMSE is lower, meaning it is closer to the real values. It is also a better method than using the dropna one, because it doesn't remove missing data, but uses values to fill them (no data from the original dataset is lost, which helps the RMSE to stay low).