In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
import statsmodels.api as sm
from scipy import stats

# Load the outlier-trimmed dataset and work on an independent copy of it.
# NOTE(review): the next line rebinds `cut_outliers` to its own 'original'
# entry, so running this cell a second time indexes the already-extracted
# object again — confirm whether later cells rely on this rebinding.
cut_outliers = cut_outliers['original']
toyota = cut_outliers.copy()

# Target is the "Price" column; every remaining column is a predictor.
y = toyota["Price"]
X = toyota.drop(columns=["Price"])

# 70/30 shuffled hold-out split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

In [None]:
def calculate_metrics(X_train, y_train, X_test, selected_features, y_test=None):
    """Fit an OLS model on a feature subset and score it on train and test data.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training predictors. Must hold *all* candidate columns, not only the
        selected ones: the full set is used to estimate the error variance
        that Mallow's Cp needs.
    y_train : array-like
        Training response.
    X_test : pandas.DataFrame
        Hold-out predictors; must contain every name in ``selected_features``.
    selected_features : list
        Column names of the subset to evaluate.
    y_test : array-like, optional
        Hold-out response used for the RMSE. When omitted, the module-level
        ``y_test`` is used so that existing four-argument calls keep working.

    Returns
    -------
    tuple
        ``(r2, rss, bic, cp, radj, rmse)`` where ``r2``, ``rss``, ``bic`` and
        ``radj`` come from the subset model fitted on the training data,
        ``cp`` is Mallow's Cp and ``rmse`` is computed on the hold-out data.
    """
    if y_test is None:
        # Backward compatibility: the previous version read the notebook-level
        # `y_test` implicitly instead of receiving it as an argument.
        y_test = globals()["y_test"]

    # has_constant="add" always prepends an intercept column. The default
    # ("skip") omits it whenever a selected feature happens to be constant in
    # that split, which can leave the train and test design matrices with a
    # different number of columns.
    design_train = sm.add_constant(X_train[selected_features], has_constant="add")
    model = sm.OLS(y_train, design_train).fit()

    design_test = sm.add_constant(X_test[selected_features], has_constant="add")
    y_pred = model.predict(design_test)

    n = len(y_train)
    p = len(selected_features)

    # Calculate metrics
    r2 = model.rsquared
    rss = model.ssr
    bic = model.bic
    radj = model.rsquared_adj
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Calculate Mallow's Cp.
    # The error variance must be estimated from the model with ALL candidate
    # predictors. Using the subset model's own residual MSE makes
    # rss / mse equal to n - p - 1, so Cp would collapse to p + 1 for every
    # subset and carry no information.
    design_full = sm.add_constant(X_train, has_constant="add")
    mse_full = sm.OLS(y_train, design_full).fit().mse_resid
    cp = (rss / mse_full) - (n - 2 * (p + 1))

    return r2, rss, bic, cp, radj, rmse

# Result containers: one list per metric, filled in step order.
# The key order below matches the tuple returned by calculate_metrics.
n_features = range(1, 25)
metrics = {name: [] for name in ('R2', 'RSS', 'BIC', 'Cp', 'R2_adj', 'RMSE')}
selected_features_history = []

# Forward stepwise selection: a fresh selector is fitted for each subset size.
for n in n_features:
    sfs = SequentialFeatureSelector(
        LinearRegression(),
        n_features_to_select=n,
        direction="forward",
    ).fit(X_train, y_train)

    # Translate the boolean support mask into column names and record them.
    selected_features = X_train.columns[sfs.get_support()].tolist()
    selected_features_history.append(selected_features)

    # Score this subset and append each value to its matching metric list.
    r2, rss, bic, cp, radj, rmse = calculate_metrics(X_train, y_train, X_test, selected_features)
    for key, value in zip(metrics, (r2, rss, bic, cp, radj, rmse)):
        metrics[key].append(value)

    print(f"\nFeatures selected for n={n}:")
    print(selected_features)

In [None]:
# Create plots for each metric
# One (metrics key, axis label, line format) entry per panel. Driving all six
# panels from this table keeps them on the same 2x3 grid; the RMSE panel was
# previously created with subplot(3, 2, 6), which placed it on a different
# grid so that it overlapped the other axes.
panel_specs = [
    ('R2', 'R²', 'b-o'),
    ('RSS', 'RSS', 'r-o'),
    ('BIC', 'BIC', 'g-o'),
    ('Cp', "Mallow's Cp", 'm-o'),
    ('R2_adj', 'Adjusted R²', 'c-o'),
    ('RMSE', 'RMSE', 'y-o'),
]

fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# axes.flat walks the grid row by row, matching the order of panel_specs.
for ax, (metric_key, label, line_fmt) in zip(axes.flat, panel_specs):
    ax.plot(n_features, metrics[metric_key], line_fmt)
    ax.set_title(f'{label} vs Number of Features')
    ax.set_xlabel('Number of Features')
    ax.set_ylabel(label)
    ax.grid(True)

fig.tight_layout()
plt.show()

In [None]:
# Report, for every selection step, the chosen features and their metrics.
separator = "-" * 50
print("Summary of Feature Selection Process:")
print(separator)

for step, chosen in enumerate(selected_features_history, start=1):
    idx = step - 1  # metric lists are 0-based, steps are reported 1-based
    print(f"\nStep {step} - Selected Features:")
    print(chosen)
    print(f"R²: {metrics['R2'][idx]:.4f}")
    print(f"Adjusted R²: {metrics['R2_adj'][idx]:.4f}")
    print(f"BIC: {metrics['BIC'][idx]:.4f}")
    print(f"Mallow's Cp: {metrics['Cp'][idx]:.4f}")
    print(f"RMSE: {metrics['RMSE'][idx]:.4f}")

Se han conseguido buenos modelos con la selección forward. Los pasos 10 y 16 cuentan con valores de RMSE menores a 900. Aunque el RMSE del paso 16 es menor, tomamos el paso 10 por el criterio de la navaja de Ockham.