# Combine features with decision trees

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

from feature_engine.creation import DecisionTreeFeatures



In [2]:
# Load the California house price data from scikit-learn: predictors in X,
# target in y (as_frame=True returns pandas objects).
X, y = fetch_california_housing(return_X_y=True, as_frame=True)

# Remove the 2 geographic variables. X is re-bound to the result instead of
# using inplace=True, so the frame returned by the loader is never mutated.
X = X.drop(columns=["Latitude", "Longitude"])

# display top 5 rows
X.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467


In [3]:
# Hold out 30% of the rows as a test set; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# shape of each partition
X_train.shape, X_test.shape

((14448, 6), (6192, 6))

In [4]:
# Pearson correlation between each predictor and the target, on the train set.
for col in X_train.columns:
    # [0, 1] picks the off-diagonal entry of the 2x2 correlation matrix.
    r = np.round(np.corrcoef(X_train[col], y_train)[0, 1], 2)
    print(f"corr {col} vs target: {r}")

corr MedInc vs target: 0.69
corr HouseAge vs target: 0.1
corr AveRooms vs target: 0.16
corr AveBedrms vs target: -0.05
corr Population vs target: -0.03
corr AveOccup vs target: -0.03


## Making all combinations of 2 variables

In [5]:
# Hyperparameter grid searched when fitting each tree: candidate values
# for the tree depth.
param_grid = dict(max_depth=[2, 3, 4, None])

In [6]:
# Predictors from which the tree-based features will be built.
variables = [
    "AveRooms",
    "AveBedrms",
]

In [7]:
# Regression trees on `variables`, each tuned by a 5-fold grid search over
# `param_grid` and scored with the negative mean squared error.
# features_to_combine=None leaves the choice of combinations to the transformer.
dtf = DecisionTreeFeatures(
    regression=True,
    variables=variables,
    features_to_combine=None,
    param_grid=param_grid,
    cv=5,
    scoring="neg_mean_squared_error",
)

# Fit on the training partition only.
dtf.fit(X_train, y_train)

In [8]:
# The variables (and variable combinations) on which the trees are trained:
# one entry per tree, either a single column name or a list of column names.

dtf.input_features_

['AveRooms', 'AveBedrms', ['AveRooms', 'AveBedrms']]

In [9]:
# The trained trees: one grid-search object per entry of `input_features_`,
# each wrapping a DecisionTreeRegressor (see the output below).

dtf.estimators_

[GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
              param_grid={'max_depth': [2, 3, 4, None]},
              scoring='neg_mean_squared_error'),
 GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
              param_grid={'max_depth': [2, 3, 4, None]},
              scoring='neg_mean_squared_error'),
 GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
              param_grid={'max_depth': [2, 3, 4, None]},
              scoring='neg_mean_squared_error')]

In [10]:
# Map each input feature (or feature combination, rendered as text) to the
# grid-search object that was fitted on it.
trained_trees = {
    f"{feats}": search
    for feats, search in zip(dtf.input_features_, dtf.estimators_)
}

trained_trees

{'AveRooms': GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
              param_grid={'max_depth': [2, 3, 4, None]},
              scoring='neg_mean_squared_error'),
 'AveBedrms': GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
              param_grid={'max_depth': [2, 3, 4, None]},
              scoring='neg_mean_squared_error'),
 "['AveRooms', 'AveBedrms']": GridSearchCV(cv=5, estimator=DecisionTreeRegressor(random_state=0),
              param_grid={'max_depth': [2, 3, 4, None]},
              scoring='neg_mean_squared_error')}

In [11]:
# Add the tree-derived features to both partitions with the fitted transformer.
train_t, test_t = (dtf.transform(part) for part in (X_train, X_test))

In [12]:
# Names of the columns created by the transformer. They are all named
# "tree(<feature(s)>)", so match on that prefix rather than on the bare
# substring "tree", which would also select any unrelated column whose name
# merely contains those letters.
tree_features = [col for col in test_t.columns if col.startswith("tree(")]

tree_features

['tree(AveRooms)', 'tree(AveBedrms)', "tree(['AveRooms', 'AveBedrms'])"]

In [13]:
# First rows of the tree-derived columns in the test set.
test_t.loc[:, tree_features].head()

Unnamed: 0,tree(AveRooms),tree(AveBedrms),"tree(['AveRooms', 'AveBedrms'])"
14740,1.999776,2.080254,2.099977
10101,1.999776,2.165554,2.438937
20566,1.999776,2.165554,2.099977
2670,1.786777,1.882763,1.728401
15709,1.786777,2.165554,1.821467


In [14]:
# Baseline: correlation of the original variables with the target (test set).

for col in variables:
    # [0, 1] picks the off-diagonal entry of the 2x2 correlation matrix.
    r = np.round(np.corrcoef(X_test[col], y_test)[0, 1], 2)
    print(f"corr {col} vs target: {r}")

corr AveRooms vs target: 0.14
corr AveBedrms vs target: -0.03


In [15]:
# Same check for the tree-derived features, again on the test set.

for col in tree_features:
    # [0, 1] picks the off-diagonal entry of the 2x2 correlation matrix.
    r = np.round(np.corrcoef(test_t[col], y_test)[0, 1], 2)
    print(f"corr {col} vs target: {r}")

corr tree(AveRooms) vs target: 0.37
corr tree(AveBedrms) vs target: 0.12
corr tree(['AveRooms', 'AveBedrms']) vs target: 0.47


## Combining specific features

In [16]:
# Feature groups, one tree per entry. A single feature is given as a plain
# string: parentheses around one string without a trailing comma do not make
# a tuple, so they are omitted here.
features = (
    'Population',
    ('Population', 'AveOccup'),
    ('Population', 'AveOccup', 'HouseAge'),
)

In [17]:
# Second transformer: one tree per feature group listed in `features`.
# variables=None because the inputs are fully specified by features_to_combine.
# regression=True is stated explicitly, matching the first transformer: the
# target is a continuous house price and the scoring is a squared-error metric.
dtf = DecisionTreeFeatures(
    variables=None,
    features_to_combine=features,
    cv=5,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    regression=True,
)

# Fit on the training partition only.
dtf.fit(X_train, y_train)

In [18]:
# Feature groups the trees were trained on; they mirror the entries of `features`.
dtf.input_features_

['Population',
 ['Population', 'AveOccup'],
 ['Population', 'AveOccup', 'HouseAge']]

In [19]:
# Apply the fitted transformer to both partitions.
train_t, test_t = (dtf.transform(part) for part in (X_train, X_test))

In [20]:
# Columns created by the transformer are named "tree(<feature(s)>)". Match on
# that prefix instead of the bare substring "tree", which would also select
# any unrelated column whose name merely contains those letters.
tree_features = [col for col in test_t.columns if col.startswith("tree(")]

test_t[tree_features].head()

Unnamed: 0,tree(Population),"tree(['Population', 'AveOccup'])","tree(['Population', 'AveOccup', 'HouseAge'])"
14740,2.00749,1.484939,1.443097
10101,2.00749,2.059187,2.257968
20566,2.00749,2.059187,2.257968
2670,2.148072,2.235743,2.257968
15709,2.148072,2.74739,3.111251


## Linear model

In [21]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import cross_validate

In [22]:
# Linear model with a small L1 penalty, used for both comparisons below.
lasso = Lasso(alpha=1e-4, random_state=0)

In [23]:
# Baseline: 3-fold cross-validation of the Lasso on the original predictors.
cv_results = cross_validate(lasso, X_train, y_train, cv=3)

# Summarise the per-fold test scores.
fold_scores = cv_results['test_score']
mean, std = fold_scores.mean(), fold_scores.std()
print(f"Results: {mean} +/- {std}")

Results: 0.5480403481478856 +/- 0.004214649109293269


In [24]:
# Original columns to leave out, so the Lasso sees the tree-derived features
# in their place.
variables = ["AveRooms", "AveBedrms", "Population"]

# Rebuild the transformed training set from X_train on every run and then drop
# the columns. Dropping from the existing `train_t` made this cell raise a
# KeyError when executed a second time, because the columns were already gone.
train_t = dtf.transform(X_train).drop(columns=variables)

# NOTE(review): `dtf` was fitted on all of X_train before this cross-validation,
# so the tree features have already seen the rows of every validation fold.
# Consider putting the transformer and the Lasso in one Pipeline so the trees
# are re-fitted inside each fold.
cv_results = cross_validate(lasso, train_t, y_train, cv=3)

# Summarise the per-fold test scores.
mean = cv_results['test_score'].mean()
std = cv_results['test_score'].std()
print(f"Results: {mean} +/- {std}")

Results: 0.5800993721099441 +/- 0.002845475651622909
