# Dataset

In [None]:
from sklearn.datasets import load_diabetes

# Load the diabetes dataset bundled with scikit-learn and unpack it into the
# feature matrix (X) and the target vector (y) used by every later cell.
diabetes = load_diabetes()
X, y = diabetes.data, diabetes.target

# Quick sanity check on what was loaded.
print(f"Shape of features (X): {X.shape}")
print(f"Shape of target (y): {y.shape}")

# Filter Methods


In [None]:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2, f_regression, mutual_info_regression
import pandas as pd
import numpy as np


# Column labels for the feature matrix (assumed to match the column order of X).
feature_names = ['age', 'sex', 'bmi', 'bp', 's1', 's2', 's3', 's4', 's5', 's6']
X_df = pd.DataFrame(X, columns=feature_names)

# 1. Variance Threshold
# Single source of truth for the cut-off, used by both the selector and the
# message printed below.
VARIANCE_THRESHOLD = 0.002

print("--- Variance Threshold ---")
# VarianceThreshold compares the population variance (ddof=0) against the
# threshold, whereas pandas defaults to the sample variance (ddof=1). Use
# ddof=0 so the printed numbers are exactly the ones the selector tests.
variances = X_df.var(ddof=0)
print("Variance of each feature:\n", variances)
selector_vt = VarianceThreshold(threshold=VARIANCE_THRESHOLD)
selector_vt.fit(X_df)
# get_support() is a boolean mask over the columns that passed the threshold.
selected_features_vt = X_df.columns[selector_vt.get_support()]
print(f"\nFeatures selected by Variance Threshold (threshold={VARIANCE_THRESHOLD}): {list(selected_features_vt)}")


# 2. Correlation Coefficient
print("\n--- Correlation Coefficient ---")
# Wrap the target in a Series so corrwith can pair it row-by-row with each
# feature column (both sides carry the default 0..n-1 index).
target_series = pd.Series(y)
correlations = X_df.corrwith(target_series)
print(f"Correlation with target:\n {correlations}")


# 3. Chi-Square Test
print("\n--- Chi-Square Test ---")
# chi2 rejects negative feature values, so shift every column to start at 0.
X_non_negative = X_df - X_df.min()

# chi2 interprets y as class labels. The target here is continuous, so passing
# it directly would turn every distinct value into its own (mostly singleton)
# class and make the scores/p-values meaningless. Bin the target into
# quantile-based classes first.
CHI2_TARGET_BINS = 4
y_binned = pd.qcut(y, q=CHI2_TARGET_BINS, labels=False, duplicates="drop")

selector_chi2 = SelectKBest(score_func=chi2, k=5)
selector_chi2.fit(X_non_negative, y_binned)
scores_chi2 = pd.Series(selector_chi2.scores_, index=feature_names)
p_values_chi2 = pd.Series(selector_chi2.pvalues_, index=feature_names)
print("Chi-Square scores:\n", scores_chi2)
print("\nChi-Square p-values:\n", p_values_chi2)
# Boolean support mask -> labels of the k best-scoring columns.
selected_features_chi2 = X_df.columns[selector_chi2.get_support()]
print(f"\nFeatures selected by Chi-Square (k=5): {list(selected_features_chi2)}")


# 4. Mutual Information
from functools import partial

print("\n--- Mutual Information ---")
# mutual_info_regression is randomized, so calling it unseeded (and twice)
# lets the printed scores disagree with the scores SelectKBest ranks on, and
# makes both change from run to run. Seed it once and reuse the fitted
# selector's scores so the report and the selection always agree.
MI_RANDOM_STATE = 42
mi_score_func = partial(mutual_info_regression, random_state=MI_RANDOM_STATE)

selector_mi = SelectKBest(score_func=mi_score_func, k=5)
selector_mi.fit(X_df, y)

# Scores computed during fit, one per feature column.
mi_scores = selector_mi.scores_
mi_scores_series = pd.Series(mi_scores, index=feature_names)
print("Mutual Information scores:\n", mi_scores_series.sort_values(ascending=False))

selected_features_mi = X_df.columns[selector_mi.get_support()]
print(f"\nFeatures selected by Mutual Information (k=5): {list(selected_features_mi)}")


# 5. ANOVA F-test
print("\n--- ANOVA F-test ---")
# f_regression yields two arrays: an F-statistic and a p-value per feature.
f_scores_anova, p_values_anova = f_regression(X_df, y)
# Label both arrays with the feature names for readable output.
f_scores_series, p_values_series = (
    pd.Series(stat, index=feature_names)
    for stat in (f_scores_anova, p_values_anova)
)
print(f"ANOVA F-scores:\n {f_scores_series}")
print(f"\nANOVA p-values:\n {p_values_series}")

# Keep the five columns with the highest F-statistic.
selector_anova = SelectKBest(score_func=f_regression, k=5).fit(X_df, y)
selected_features_anova = X_df.columns[selector_anova.get_support()]
print(f"\nFeatures selected by ANOVA F-test (k=5): {selected_features_anova.tolist()}")

# Wrapper Methods

In [None]:
from sklearn.feature_selection import RFE, SequentialFeatureSelector
from sklearn.linear_model import LinearRegression

# Base model handed to both wrapper-style selectors below.
estimator = LinearRegression()

# Recursive Feature Elimination: keep five features.
rfe = RFE(estimator=estimator, n_features_to_select=5).fit(X_df, y)
selected_features_rfe = X_df.columns[rfe.get_support()]
print(f"Features selected by RFE: {selected_features_rfe.tolist()}")

# Sequential Feature Selection: keep five features (search direction and
# cross-validation settings are left at the library defaults).
sfs = SequentialFeatureSelector(estimator=estimator, n_features_to_select=5).fit(X_df, y)
selected_features_sfs = X_df.columns[sfs.get_support()]
print(f"Features selected by SFS: {selected_features_sfs.tolist()}")

# Embedded Methods


In [None]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

def _select_with_model(model, features, target):
    """Fit a SelectFromModel wrapper around ``model`` and report what it keeps.

    Parameters
    ----------
    model : estimator
        Unfitted estimator to wrap. No explicit threshold is passed, so
        SelectFromModel's default threshold for that estimator applies.
    features : pandas.DataFrame
        Feature matrix; its column labels are used to name the selection.
    target : array-like
        Target values passed to ``fit``.

    Returns
    -------
    tuple
        ``(selector, mask, selected_columns)``: the fitted selector, its
        boolean support mask, and the labels of the selected columns.
    """
    selector = SelectFromModel(model)
    selector.fit(features, target)
    mask = selector.get_support()
    return selector, mask, features.columns[mask]


# 1. Initialize embedded models
lasso = Lasso(alpha=0.1, random_state=42)
ridge = Ridge(alpha=0.1, random_state=42)
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)

# 2. Use SelectFromModel with Lasso
selector_lasso, selected_features_lasso_mask, selected_features_lasso = (
    _select_with_model(lasso, X_df, y)
)
print("Features selected by Lasso:", list(selected_features_lasso))

# 3. Use SelectFromModel with Ridge
selector_ridge, selected_features_ridge_mask, selected_features_ridge = (
    _select_with_model(ridge, X_df, y)
)
print("Features selected by Ridge:", list(selected_features_ridge))

# 4. Use SelectFromModel with Elastic Net
selector_elastic_net, selected_features_elastic_net_mask, selected_features_elastic_net = (
    _select_with_model(elastic_net, X_df, y)
)
print("Features selected by Elastic Net:", list(selected_features_elastic_net))

# 5. Use SelectFromModel with Random Forest Importance
selector_rf, selected_features_rf_mask, selected_features_rf = (
    _select_with_model(rf_regressor, X_df, y)
)
print("Features selected by Random Forest Importance:", list(selected_features_rf))
