In [2]:
import pandas as pd

In [3]:
# Load the dataset from 'auto-mpg.csv' (path relative to the working directory)
# into a pandas DataFrame; later cells read their columns from `d`.
d = pd.read_csv('auto-mpg.csv')

In [4]:
# Name of the target column; every other column of `d` is treated as a predictor.
label = 'mpg'
features = list(filter(lambda column: column != label, d.columns))

# Design matrix (predictor columns only) and response vector.
X = d[features]
y = d[label]

# Example with a Linear Regression model

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline

def get_model(features):
    """Build an unfitted preprocessing + linear-regression pipeline.

    Parameters
    ----------
    features : iterable of str
        Names of the columns the model should use.

    Returns
    -------
    sklearn.pipeline.Pipeline
        A ColumnTransformer followed by LinearRegression. The column named
        'origin' (if it is among `features`) is one-hot encoded; every other
        listed column is standardised with StandardScaler.
    """
    has_origin = 'origin' in features
    onehot_columns = ['origin'] if has_origin else []
    scaled_columns = list(filter(lambda name: name != 'origin', features))

    preprocessor = ColumnTransformer(transformers=[
        ('categorical', OneHotEncoder(), onehot_columns),
        ('numerical', StandardScaler(), scaled_columns),
    ])
    return make_pipeline(preprocessor, LinearRegression())

## We estimate the MSE of the model via 5-fold CV

In [6]:
from sklearn.model_selection import KFold

# 5-fold splitter with shuffling and a fixed random_state. This single object is
# passed as `cv` to the cross_val_score calls in the feature-selection cell.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

## Variable selection with stepwise backward selection

In [7]:
from itertools import combinations
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

class Solution:
    """Best feature subset found so far, together with its MSE (lower is better).

    Attributes
    ----------
    features : list
        The feature names of the recorded solution.
    mse : float
        The mean squared error associated with `features`.
    """
    # Class-level fallbacks; __init__ always shadows both with instance attributes.
    features = list()
    mse = float('Inf')

    def __init__(self, y, features=None, mse=None):
        """Create a solution, either from an explicit result or from a baseline.

        Parameters
        ----------
        y : array-like with ``.mean()`` and ``len()``
            Response values. Only read when `features` or `mse` is omitted.
        features : iterable, optional
            Feature names of the initial solution.
        mse : float, optional
            MSE of the initial solution.

        If both `features` and `mse` are supplied they are stored as given.
        Otherwise the solution is the feature-less baseline that predicts the
        mean of `y` for every observation, scored with its own MSE.
        """
        # Compare against None explicitly: a perfectly valid score of 0.0 (or an
        # empty feature list) is falsy and must not be mistaken for "not supplied".
        if features is not None and mse is not None:
            # Store a copy so later in-place edits of the caller's list cannot
            # silently alter the recorded solution.
            self.features = list(features)
            self.mse = mse
        else:
            pred = [y.mean()] * len(y)
            self.features = list()
            self.mse = mean_squared_error(y, pred)

    def update(self, features, mse):
        """Replace the stored solution if `mse` is strictly lower.

        Returns
        -------
        bool
            True when the solution was replaced, False otherwise (including
            ties, and NaN scores, for which the comparison is False).
        """
        if mse < self.mse:
            self.features = list(features)  # copy, see __init__
            self.mse = mse
            return True
        return False

def _cv_mse(candidate_features):
    """Return the mean cross-validated MSE of the model built on `candidate_features`."""
    scores = cross_val_score(
        estimator=get_model(candidate_features),
        X=X[candidate_features], y=y, cv=kfold,
        scoring='neg_mean_squared_error')
    # The scorer yields negated MSE (greater is better); flip the sign back.
    return -np.average(scores)


# Backward selection: start from the full feature set and, in each round, drop
# the single feature whose removal lowers the cross-validated MSE the most.
current_features = list(features)
best = Solution(
    y=y,
    # Pass a copy: `current_features` is mutated in place by the loop below.
    features=list(current_features),
    mse=_cv_mse(current_features))

# Stop once a single feature is left. Removing it would ask for a model with
# zero input columns, which LinearRegression cannot be fitted on.
while len(current_features) > 1:
    removed_feature = None

    for feature in current_features:
        new_features = [f for f in current_features if f != feature]

        # `best` keeps the lowest score seen so far, so after the inner loop
        # `removed_feature` is the most beneficial removal of this round.
        if best.update(new_features, _cv_mse(new_features)):
            removed_feature = feature

    # No removal improved on the best score: the search has converged.
    if removed_feature is None:
        break
    current_features.remove(removed_feature)

In [9]:
# Report the winning feature subset on a single line.
print('Selected features: ', ', '.join(best.features), sep='')

Selected features: displacement, hp, weight, year, origin
