In [13]:
import pandas as pd

# Load the Ames housing table and separate the regression target
# ("SalePrice") from the remaining columns.
target_name = "SalePrice"
ames_housing = pd.read_csv(
    "../datasets/ames_housing_no_missing.csv",
    # required for pandas>2.0: na_filter=False switches off pandas' detection
    # of missing-value markers, so cells are kept as they appear in the file.
    na_filter=False,
)
target = ames_housing[target_name]
data = ames_housing.drop(columns=target_name)

In [14]:
# Names of the columns treated as numerical features. The order matters: it
# is the column order of any frame selected with this list.
numerical_features = [
    "LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1",
    "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
    "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BedroomAbvGr",
    "KitchenAbvGr", "TotRmsAbvGrd", "Fireplaces", "GarageCars",
    "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch",
    "3SsnPorch", "ScreenPorch", "PoolArea", "MiscVal",
]

# Keep only the columns listed in `numerical_features`; this is the input
# used by the numerical-only models below.
data_numerical = data[numerical_features]

In [15]:
from sklearn.model_selection import cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Baseline: standardize the numerical features, then fit a linear regression.
linear_regression = make_pipeline(StandardScaler(), LinearRegression())

# 10-fold cross-validation. `return_estimator=True` keeps the pipeline fitted
# on each fold in the results dictionary.
cv_results_linear_regression = cross_validate(
    linear_regression,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)

# One test score per fold; the bare name on the last line displays the array.
scores_lr = cv_results_linear_regression["test_score"]
scores_lr

array([0.76129977, 0.80635105, 0.81358636, 0.66592199, 0.79964891,
       0.76868787, 0.75635094, 0.71822127, 0.31479306, 0.78635221])

In [16]:
from sklearn.tree import DecisionTreeRegressor

# Decision tree with default hyperparameters; `random_state=0` makes the
# fitted trees reproducible from run to run.
tree = DecisionTreeRegressor(random_state=0)

# Same 10-fold protocol and same numerical features as the linear model, so
# the per-fold scores can be compared one to one.
cv_results_tree = cross_validate(
    tree,
    data_numerical,
    target,
    cv=10,
    n_jobs=2,
)
score_tree = cv_results_tree["test_score"]

In [17]:
# Count the folds on which the linear model's test score exceeds the tree's
# (element-wise comparison of the two per-fold score arrays).
print(
    "Linear regression is better than decision tree for "
    f"{(scores_lr > score_tree).sum()} CV iterations out of 10 folds."
)

Linear regression is better than decision tree for 9 CV iterations out of 10 folds.


In [None]:
import numpy as np
from sklearn.model_selection import GridSearchCV

# Candidate depths 1..15 (the upper bound of `np.arange` is exclusive).
params = {"max_depth": np.arange(1, 16)}

# Nested cross-validation: the inner 10-fold grid search picks `max_depth`
# on each outer training split, and the outer 10-fold loop scores the tuned
# tree on data the search never saw.
search = GridSearchCV(tree, params, cv=10)
cv_results_tree_optimal_depth = cross_validate(
    search,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,  # keep each fitted search to inspect best_params_
    n_jobs=2,
)

In [None]:
# Show the depth selected by the inner grid search on each outer fold.
for fitted_search in cv_results_tree_optimal_depth["estimator"]:
    print(fitted_search.best_params_)

In [None]:
# The nested cross-validation of the depth-tuned tree has already been run
# where `cv_results_tree_optimal_depth` is first computed. The estimator,
# parameter grid and folds are identical and the tree uses a fixed
# `random_state`, so running it again would only repeat roughly 1,500 tree
# fits to obtain the same numbers. Reuse the stored results and report the
# mean test score over the 10 outer folds.
cv_results_tree_optimal_depth["test_score"].mean()

In [None]:
# Count the outer folds on which the depth-tuned tree's test score exceeds
# the linear regression's score on the same fold.
print(
    "A tree with an optimized depth is better than linear regression for "
    f"{(cv_results_tree_optimal_depth['test_score'] > scores_lr).sum()} CV "
    "iterations out of 10 folds."
)

In [None]:
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

# Encode categorical columns as integer codes. A category not seen during
# `fit` is mapped to -1 instead of raising an error at prediction time.
categorical_processor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)

# Columns with object dtype go through the ordinal encoder; the numerical
# features are forwarded unchanged.
preprocessor = make_column_transformer(
    (categorical_processor, selector(dtype_include=object)),
    ("passthrough", numerical_features),
)

# NOTE: from here on `tree` is a Pipeline (preprocessing followed by a tree
# with max_depth=7), no longer the bare DecisionTreeRegressor defined earlier.
tree = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=7, random_state=0),
)

In [None]:
# 10-fold cross-validation of the pipeline on the full feature table
# (categorical and numerical columns); the last line displays the mean
# test score over the folds.
cv_results = cross_validate(
    tree,
    data,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)
cv_results["test_score"].mean()

In [None]:
import matplotlib.pyplot as plt

# Per-fold test scores of the two tree models being compared.
test_score_num = cv_results_tree_optimal_depth["test_score"]
test_score_all = cv_results["test_score"]

# One point per cross-validation fold and per model, sharing the x axis.
indices = np.arange(len(test_score_num))
plt.scatter(
    indices,
    test_score_num,
    color="tab:blue",
    label="numerical features only",
)
plt.scatter(
    indices, test_score_all, color="tab:red", label="all features"
)
plt.ylim((0, 1))
plt.xlabel("Cross-validation iteration")
plt.ylabel("R2 score")
_ = plt.legend(loc="lower right")  # assign to `_` to hide the Legend repr

# Number of folds on which the all-features tree beats the numerical-only,
# depth-tuned tree.
print(
    "A tree model using both numerical and categorical features is better than a "
    "tree with optimal depth using only numerical features for "
    f"{(test_score_all > test_score_num).sum()} CV "
    "iterations out of 10 folds."
)