In [9]:
import pandas as pd

In [10]:
# Load the dataset into a DataFrame; the relative path is resolved against
# the current working directory of the kernel.
d = pd.read_csv('auto-mpg.csv')

In [11]:
# 'mpg' is the regression target; every other column of `d` is a predictor.
label = 'mpg'
features = list(filter(lambda name: name != label, d.columns))
y = d[label]
X = d[features]

# Example with a Linear Regression model

In [12]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

We add support for one-hot-encoded categorical features (here, the `origin` column) using the following `get_model` function:

In [14]:
def get_model(features, categorical=('origin',)):
    """Build an unfitted preprocessing + linear-regression pipeline.

    Columns of ``features`` that are listed in ``categorical`` are one-hot
    encoded; every other column is standardised with ``StandardScaler``.
    The transformed columns are then fed to a ``LinearRegression``.

    Parameters
    ----------
    features : iterable of str
        Column names the model will be fitted on.
    categorical : str or collection of str, default ``('origin',)``
        Column names to treat as categorical. Names that do not occur in
        ``features`` are ignored.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A new, unfitted pipeline.
    """
    # A bare string would otherwise be matched by substring in the `in` tests.
    if isinstance(categorical, str):
        categorical = (categorical,)

    features = list(features)
    categorical_features = [col for col in features if col in categorical]
    numerical_features = [col for col in features if col not in categorical]

    preprocessing = ColumnTransformer(transformers=[
        # handle_unknown='ignore': a category level that is missing from the
        # training fold is encoded as all zeros at prediction time instead
        # of raising an error.
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numerical', StandardScaler(), numerical_features),
    ])
    return make_pipeline(preprocessing, LinearRegression())

## We estimate the MSE of the model via 5-fold CV

In [15]:
from sklearn.model_selection import KFold

# 5-fold splitter; rows are shuffled with a fixed seed so the fold assignment
# is reproducible.
kfold = KFold(
    random_state=0,
    shuffle=True,
    n_splits=5,
)

## Variable selection with stepwise forward selection

In [16]:
from itertools import combinations
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

class Solution:
    """Best feature subset found so far, together with its MSE.

    A new instance starts with no features and the MSE obtained by always
    predicting the mean of ``y``. ``update`` replaces that state only with a
    strictly better candidate.

    Attributes:
        features: list of feature names of the best candidate so far
            (empty until a candidate is accepted).
        mse: mean squared error associated with ``features``.
    """

    def __init__(self, y):
        """Initialise with the mean-predictor baseline.

        Args:
            y: target values; must provide ``.mean()`` and ``len()``.
        """
        # Per-instance state: a list created in the class body would be a
        # single object shared by every instance.
        self.features = []
        pred = [y.mean()] * len(y)
        self.mse = mean_squared_error(y, pred)

    def update(self, features, mse):
        """Record ``features`` as the new best if ``mse`` is strictly lower.

        Args:
            features: iterable of feature names of the candidate.
            mse: mean squared error of the candidate.

        Returns:
            True if the candidate replaced the stored best, False otherwise
            (including when ``mse`` equals the stored value).
        """
        if mse < self.mse:
            # Store a copy so later changes to the caller's list cannot
            # alter the recorded best subset.
            self.features = list(features)
            self.mse = mse
            return True
        return False

# Greedy forward selection: in each round, try adding every remaining feature
# to the current subset, score the resulting model with cross-validated MSE,
# and keep the single addition that improves on the best MSE seen so far.
# Stop as soon as a round produces no improvement.
all_features = list(features)
current_features = list()
best = Solution(y)

# Compare sizes rather than list equality: features are appended in selection
# order, which generally differs from the column order of `all_features`.
while len(current_features) < len(all_features):
    selected_feature = None

    # Walk the candidates in column order (not over a set) so the candidate
    # order, and therefore tie-breaking, is the same on every run.
    for feature in [f for f in all_features if f not in current_features]:
        new_features = current_features + [feature]
        mses = cross_val_score(
            estimator=get_model(new_features),
            X=X[new_features], y=y,
            cv=kfold, scoring='neg_mean_squared_error')
        # The scorer reports negated MSE per fold; flip the sign back.
        mse = -np.average(mses)

        # update() accepts only a strict improvement over the best so far, so
        # the last accepted candidate is the best one of this round.
        if best.update(new_features, mse):
            selected_feature = feature

    # Explicit None check: a falsy column label (e.g. '' or 0) must not be
    # mistaken for "nothing was selected".
    if selected_feature is None:
        break
    current_features.append(selected_feature)

In [17]:
# Report the best subset found, comma-separated on a single line.
print('Selected features: ' + ', '.join(best.features))

Selected features: weight, year, origin, displacement, hp
