In [1]:
import pandas as pd

In [2]:
# Read the auto-mpg CSV (path is relative to the notebook's working directory).
d = pd.read_csv(filepath_or_buffer='auto-mpg.csv')

In [3]:
# 'mpg' is the regression target; every other column of `d` is a candidate predictor.
label = 'mpg'
features = [name for name in d.columns if name != label]

# Design matrix (all predictors) and target vector.
X = d[features]
y = d[label]

# Example with a Linear Regression model

In [4]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Two-step pipeline: standardize the inputs, then fit a linear regression.
# Because the scaler is a pipeline step, it is fitted together with the
# regressor every time `model.fit` is called.
model = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)

## We estimate the MSE of the model via 5-fold CV

In [5]:
from sklearn.model_selection import KFold

# 5 shuffled folds; the fixed seed keeps the fold assignment reproducible.
kfold = KFold(
    n_splits=5,
    random_state=0,
    shuffle=True,
)

## Variable selection with all possible subsets of features

In [12]:
from itertools import combinations
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np

class Solution:
    """Keep track of the best (lowest-MSE) feature subset seen so far.

    A new instance starts from the "no features" baseline: the MSE obtained
    by predicting the mean of ``y`` for every sample. Candidate subsets are
    then offered through :meth:`update`, which keeps a candidate only when it
    strictly improves on the current best MSE.

    Attributes:
        features: list with the names of the best subset found so far
            (empty while no candidate has beaten the baseline).
        mse: the MSE associated with ``features``.
    """

    def __init__(self, y):
        """Initialise the tracker with the mean-predictor baseline.

        Args:
            y: 1-D array-like of numeric targets (list, numpy array,
                pandas Series, ...).

        Raises:
            ValueError: if ``y`` is empty, is not numeric, or contains
                NaN/inf values (the baseline would be undefined and every
                later comparison against it would silently fail).
        """
        targets = np.asarray(y, dtype=float)
        if targets.size == 0:
            raise ValueError('y must contain at least one target value')
        if not np.isfinite(targets).all():
            raise ValueError('y must not contain NaN or infinite values')

        # State lives on the instance (not on the class) so that separate
        # Solution objects can never share the same mutable list.
        self.features = []
        # MSE of always predicting the mean == mean squared deviation.
        self.mse = float(np.mean((targets - targets.mean()) ** 2))

    def update(self, features, mse):
        """Offer a candidate subset; keep it only if it is strictly better.

        Args:
            features: iterable with the names of the candidate features.
            mse: the (estimated) MSE achieved with that subset.

        Returns:
            True if the candidate replaced the current best, else False.
            Ties keep the incumbent, so the first subset reaching a given
            MSE wins.
        """
        if mse < self.mse:
            # Store a list copy: consistent type, and later changes to the
            # caller's iterable cannot alter the recorded solution.
            self.features = list(features)
            self.mse = mse
            return True
        return False
            
# Start from the baseline solution built from the targets alone.
best = Solution(y)

# Exhaustive search: score every non-empty subset of the candidate features,
# grouped by subset size, and let `best` keep the one with the lowest CV MSE.
for n_features in range(1, 1 + len(features)):
    subsets = combinations(features, n_features)

    for subset in subsets:
        # Restrict the design matrix to the columns of this candidate.
        Xr = X.loc[:, list(subset)]

        # The scorer returns *negated* MSEs (one per fold), so flip the sign
        # after averaging to obtain the usual positive MSE estimate.
        mses = cross_val_score(model, Xr, y, scoring='neg_mean_squared_error', cv=kfold)
        mse = -mses.mean()

        best.update(subset, mse)

In [13]:
# Report the winning subset as a single comma-separated line.
print('Selected features: ' + ', '.join(map(str, best.features)))

Selected features: cylinders, displacement, hp, weight, year, origin
