In [48]:
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import load_boston
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [33]:
# The two regressors compared throughout the notebook. Note that `tree` is a
# random forest (an ensemble), not a single decision tree.
linreg = LinearRegression()
tree = RandomForestRegressor(random_state=42)

In [34]:
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the scikit-learn version this notebook is pinned to.
data = load_boston()
x = data['data']
y = data['target']
# Alternative synthetic dataset, currently disabled (make_regression is not
# imported in the visible import cell):
# x, y = make_regression(n_features=300, random_state=42, n_informative=20)

In [35]:
# Baseline with all features: mean 5-fold CV score for each model. Only the
# second call names the scorer; the first falls back to the estimator's
# default (presumably also R^2 for a regressor — confirm).
cross_val_score(linreg, x, y, cv=5).mean(), cross_val_score(tree, x, y, cv=5, scoring='r2').mean()

(0.3532759243958772, 0.6270701879637265)

In [36]:
def find_best_features(estimator, x, y, cv=5, scoring='r2'):
    """Greedy forward feature selection driven by cross-validation.

    Starting from an empty selection, each round tries every column that is
    not selected yet on top of the current selection and keeps the single
    column with the highest mean cross-validated score. The search stops as
    soon as a full round produces no score strictly greater than the best
    score seen so far.

    NOTE(review): a later cell in this notebook defines another function with
    the same name and a different signature; whichever definition was executed
    last is the one the kernel uses.

    Parameters
    ----------
    estimator : object
        Passed unchanged to ``cross_val_score``.
    x : array
        Feature matrix; must expose ``.shape`` and support ``x[:, [i, j, ...]]``
        column indexing. Each column is one candidate feature.
    y : array
        Target, passed unchanged to ``cross_val_score``.
    cv : optional
        Forwarded to ``cross_val_score`` as ``cv``. Defaults to 5, the value
        that used to be hard-coded.
    scoring : optional
        Forwarded to ``cross_val_score`` as ``scoring``. Defaults to ``'r2'``,
        the value that used to be hard-coded.

    Returns
    -------
    list of int
        Selected column indices in the order they were added. Empty when ``x``
        has no columns.
    """
    selected = []
    best_score = -np.inf
    n_features = x.shape[1]
    while True:
        best_candidate = None
        for candidate in range(n_features):
            if candidate in selected:
                continue
            trial = selected + [candidate]
            score = cross_val_score(
                estimator, x[:, trial], y, cv=cv, scoring=scoring
            ).mean()
            # best_score is carried across rounds, so a column is only added
            # when it strictly improves on everything seen so far.
            if score > best_score:
                best_score = score
                best_candidate = candidate
        if best_candidate is None:
            # A whole round without improvement: the selection is final.
            break
        selected.append(best_candidate)
    return selected

In [37]:
%%time
# Forward feature selection for the linear model, run on the full dataset.
best_features = find_best_features(linreg, x, y)

Wall time: 378 ms


In [39]:
# Total number of columns vs. how many the selection kept.
x.shape[1], len(best_features)

(13, 8)

In [40]:
# Mean 5-fold CV score of the linear model restricted to the selected columns
# (estimator's default scorer, since `scoring` is not passed here).
cross_val_score(linreg, x[:, best_features], y, cv=5).mean()

0.47542937233204363

In [41]:
# Selected column indices, in the order they were added.
best_features

[12, 10, 3, 5, 0, 11, 7, 2]

In [42]:
%%time
# Same forward selection, now with the random forest; this overwrites the
# `best_features` found for the linear model.
best_features = find_best_features(tree, x, y)

Wall time: 33.6 s


In [43]:
# How many columns the selection kept.
len(best_features)

6

In [44]:
# Mean 5-fold CV score of the forest restricted to the selected columns
# (estimator's default scorer, since `scoring` is not passed here).
cross_val_score(tree, x[:, best_features], y, cv=5).mean()

0.6614603714442995

In [45]:
# Selected column indices, in the order they were added.
best_features

[12, 5, 4, 10, 3, 11]

In [47]:
# Hold out 33% of the rows for testing; the fixed random_state keeps the
# split repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

In [49]:
# Reference point: fit the forest on every column of the training split and
# score it on the held-out split.
tree.fit(x_train, y_train)
preds = tree.predict(x_test)

# (MAE, MSE, R^2) on the held-out rows.
mean_absolute_error(y_test, preds), mean_squared_error(y_test, preds), r2_score(y_test, preds)

(2.2325568862275444, 10.619262365269464, 0.8596792855244546)

In [50]:
# Same evaluation, restricted to the columns in `best_features`.
# NOTE(review): if `best_features` still comes from the CV-based selection run
# on the full x, y, the held-out rows already took part in choosing the
# columns — confirm which selection is live in the kernel at this point.
tree.fit(x_train[:, best_features], y_train)
preds = tree.predict(x_test[:, best_features])

# (MAE, MSE, R^2) on the held-out rows.
mean_absolute_error(y_test, preds), mean_squared_error(y_test, preds), r2_score(y_test, preds)

(2.323041916167664, 13.4928948502994, 0.8217077061840372)

In [53]:
def find_best_features(estimator, x_train, y_train, x_test, y_test, scorer=None):
    """Greedy forward feature selection scored on a single hold-out split.

    Starting from an empty selection, each round fits ``estimator`` on the
    current selection plus one extra candidate column of ``x_train`` and
    scores its predictions on the matching columns of ``x_test``. The
    candidate with the highest score is kept; the search stops as soon as a
    full round produces no score strictly greater than the best seen so far.

    Caveats
    -------
    * The columns are chosen to maximize the score on ``(x_test, y_test)``.
      Any score later reported on that same data is therefore optimistically
      biased; pass a validation split here if ``x_test`` is meant to be the
      final evaluation set.
    * ``estimator`` is refit in place for every candidate. On return it is
      left fitted on the last candidate subset that was tried, not on the
      returned selection, so refit it before using it.
    * NOTE(review): an earlier cell in this notebook defines a function with
      the same name (cross-validation based, different signature); running
      this cell replaces it in the kernel.

    Parameters
    ----------
    estimator : object
        Must provide ``fit(X, y)`` and ``predict(X)``.
    x_train, x_test : array
        Feature matrices supporting ``a[:, [i, j, ...]]`` column indexing;
        ``x_train.shape[1]`` defines the candidate columns.
    y_train, y_test : array
        Targets matching ``x_train`` and ``x_test``.
    scorer : callable, optional
        Called as ``scorer(y_test, predictions)``; higher must mean better.
        Defaults to ``r2_score``, the metric that used to be hard-coded.

    Returns
    -------
    list of int
        Selected column indices in the order they were added.
    """
    if scorer is None:
        # Resolved at call time so the default stays the notebook's r2_score.
        scorer = r2_score

    selected = []
    best_score = -np.inf
    n_features = x_train.shape[1]
    while True:
        best_candidate = None
        for candidate in range(n_features):
            if candidate in selected:
                continue
            trial = selected + [candidate]
            estimator.fit(x_train[:, trial], y_train)
            score = scorer(y_test, estimator.predict(x_test[:, trial]))
            # best_score is carried across rounds, so a column is only added
            # when it strictly improves on everything seen so far.
            if score > best_score:
                best_score = score
                best_candidate = candidate
        if best_candidate is None:
            # A whole round without improvement: the selection is final.
            break
        selected.append(best_candidate)
    return selected

In [54]:
%%time
# NOTE(review): the selection is scored on x_test / y_test, i.e. the split
# that is also used for the final evaluation below. Consider carving a
# validation split out of x_train / y_train and passing that instead.
best_features = find_best_features(tree, x_train, y_train, x_test, y_test)

Wall time: 7.68 s


In [55]:
# Column indices picked by the hold-out-based selection, in the order added.
best_features

[4, 5, 12, 7, 0, 10, 9, 3]

In [56]:
# Refit on the selected columns and score on the held-out split.
# NOTE(review): `best_features` was chosen by maximizing R^2 on this same
# x_test / y_test, so the numbers below are optimistically biased and are not
# directly comparable with the all-features result.
tree.fit(x_train[:, best_features], y_train)
preds = tree.predict(x_test[:, best_features])

# (MAE, MSE, R^2) on the held-out rows.
mean_absolute_error(y_test, preds), mean_squared_error(y_test, preds), r2_score(y_test, preds)

(2.1461976047904194, 10.298240664670661, 0.8639211992130647)