In [32]:
import pandas as pd
import numpy as np

# Load the dataset from a CSV path given relative to the notebook's working directory.
d = pd.read_csv('../data/auto-mpg.csv')

In [33]:
# Target column name; every other column of `d` is treated as a candidate predictor.
label = 'mpg'
features = [column for column in d.columns if column != label]

In [34]:
# Split the frame into the predictor matrix and the target vector.
X, y = d[features], d[label]
# Number of candidate predictors.
p = len(features)

In [35]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error
from itertools import combinations

# Shared cross-validation splitter: 5 shuffled folds with a fixed seed so the
# same splits are used for every feature subset that gets scored.
kfold = KFold(n_splits=5, shuffle=True, random_state=0)

# Shared estimator: standardize the features, then fit a linear regression.
model = make_pipeline(
    StandardScaler(),
    LinearRegression(),
)

def get_base_mse(y):
    """Return the MSE of a constant predictor that always outputs the mean of ``y``.

    The value serves as a reference error for a model that uses no features:
    every observation is predicted as ``np.mean(y)`` and scored against ``y``
    itself.
    """
    constant_prediction = np.full(len(y), np.mean(y))
    return mean_squared_error(y, constant_prediction)

def k_fold_mse(X, y):
    """Return the mean cross-validated MSE of the shared ``model`` on ``(X, y)``.

    Scoring uses the notebook-level ``kfold`` splitter. scikit-learn reports the
    metric as a negated MSE, so the fold average is negated again to give a
    positive error.
    """
    cv_results = cross_validate(
        model, X, y, cv=kfold, scoring='neg_mean_squared_error'
    )
    neg_mse_per_fold = cv_results['test_score']
    return -np.mean(neg_mse_per_fold)

def backward_stepwise_selection(X, y):
    """Greedy backward elimination of features driven by cross-validated MSE.

    Starting from every column of ``X``, each round tries dropping each
    remaining feature in turn and permanently removes the one whose removal
    yields the lowest k-fold MSE, provided that MSE is lower than the current
    one. The search stops when no single removal improves the score or when
    only one feature is left.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate features, one per column.
    y : array-like
        Target values aligned with the rows of ``X``.

    Returns
    -------
    list
        Names of the retained columns, or ``[]`` when ``X`` has no columns or
        when the final cross-validated MSE is not below ``get_base_mse(y)``.
    """
    # Derive the candidates from the X that was actually passed in, not from
    # the notebook-level `features` list, so the function also works when it
    # is called with a different or reduced frame.
    current_features = list(X.columns)
    if not current_features:
        return []

    current_mse = k_fold_mse(X, y)

    # Stop once a single feature remains: dropping it would mean fitting the
    # pipeline on a frame with zero columns, which scikit-learn rejects. The
    # "no features at all" case is covered by the baseline check below.
    while len(current_features) > 1:
        selected_feature = None

        for feature in current_features:
            new_features = [f for f in current_features if f != feature]
            mse = k_fold_mse(X[new_features], y)

            # current_mse is tightened inside the scan, so the last feature
            # recorded here is the one whose removal gives the lowest MSE.
            if mse < current_mse:
                selected_feature = feature
                current_mse = mse

        if selected_feature is None:
            # No single removal improved the score; the search has converged.
            break
        current_features.remove(selected_feature)

    # NOTE(review): current_mse is a cross-validated estimate while the
    # baseline is computed on all of y at once; the comparison is kept as in
    # the original selection rule.
    if current_mse < get_base_mse(y):
        return current_features
    return []

In [36]:
# Run backward elimination on the full predictor matrix; `selected` holds the
# feature names the function returns.
selected = backward_stepwise_selection(X, y)

In [37]:
# Report which features were kept and which were dropped.
for f in selected:
    print(f"Selected feature: {f}")

# Walk `features` in its original order instead of iterating a set difference:
# set iteration order for strings varies between kernel sessions, which made
# the printed order of the dropped features non-reproducible.
for f in [c for c in features if c not in selected]:
    print(f"Non-selected feature: {f}")

Selected feature: cylinders
Selected feature: displacement
Selected feature: hp
Selected feature: weight
Selected feature: year
Selected feature: origin
Non-selected feature: acceleration
