In [69]:
import pandas as pd
ames_housing = pd.read_csv("../datasets/house_prices.csv", na_values="?")
target_name = "SalePrice"
data = ames_housing.drop(columns=target_name)
target = ames_housing[target_name]

numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
    "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF",
    "GrLivArea", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces",
    "GarageCars", "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

data_numerical = data[numerical_features]

In [70]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score, mean_squared_error

lin_mod = LinearRegression()
dt_mod = DecisionTreeRegressor(max_depth=6)

data_numerical_imputed = SimpleImputer().fit_transform(data_numerical)
data_numerical_scaled = StandardScaler().fit_transform(data_numerical_imputed)

scores = ["r2", "mean_squared_error"]

cv_scores_lin = cross_validate(lin_mod, data_numerical_scaled, target,
                              cv=10, scoring="r2")
cv_scores_dt = cross_validate(dt_mod, data_numerical_imputed, target,
                             cv=10, scoring="r2")

In [71]:
print("Linear R2: ", cv_scores_lin["test_score"].mean())
print("DT R2: ",cv_scores_dt["test_score"].mean())

Linear R2:  0.7190704144353554
DT R2:  0.6923683497095743


In [72]:
from sklearn.model_selection import GridSearchCV

params = {"max_depth": list(range(1,16))}

dt_mod = DecisionTreeRegressor()

dt_grid_cv = GridSearchCV(DecisionTreeRegressor(), param_grid=params)
dt_grid_cv.fit(data_numerical_imputed, target)
dt_grid_cv.best_estimator_

DecisionTreeRegressor(max_depth=8)

In [74]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
import numpy as np

dt_fin = DecisionTreeRegressor()

transformer = make_column_transformer(
       (StandardScaler(),
        make_column_selector(dtype_include=np.number)),
       (OneHotEncoder(),
        make_column_selector(dtype_include=object)))

hp = transformer.fit_transform(data)
hp = SimpleImputer().fit_transform(hp)

dt_fin.fit(hp, target)

cv_scores_dt2 = cross_validate(dt_fin, hp, target,
                             cv=10, scoring="r2")

print("Categorical: ", cv_scores_dt2["test_score"].mean())
print("Numerical: ", cv_scores_dt["test_score"].mean())

print("Difference: ", cv_scores_dt2["test_score"].mean() - cv_scores_dt["test_score"].mean())


Categorical:  0.751821971422152
Numerical:  0.6923683497095743
Difference:  0.05945362171257773
