# 07_Feature_Selection.py

This notebook was automatically converted from a Python script.

Feature Selection with Scikit-learn


Import necessary libraries


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer, make_classification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import (
    SelectKBest, 
    f_classif, 
    chi2, 
    RFE, 
    SelectFromModel,
    mutual_info_classif,
    VarianceThreshold
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score



Set random seed for reproducibility


In [None]:
# Pin NumPy's global random state so anything drawing from it is repeatable.
np.random.seed(seed=42)



Save figures in the current working directory


In [None]:
# Prefix prepended to every figure file name written by this notebook.
IMAGES_DIR = './'

# Section 1: banner plus a short motivation, one string per output line.
print(
    "# 1. Introduction to Feature Selection",
    "Feature selection is the process of selecting a subset of relevant features for use in model construction.",
    "It helps in reducing overfitting, improving accuracy, and reducing training time.",
    "In this notebook, we'll explore various feature selection techniques in scikit-learn.",
    sep="\n",
)

# Section 2 banner (leading newline leaves a blank line before it).
print("\n# 2. Dataset Preparation")


Load breast cancer dataset


In [None]:
# Load the bundled breast cancer dataset and pull out the pieces used below:
# the feature matrix, the target vector and the column names.
data = load_breast_cancer()
X, y = data.data, data.target
feature_names = data.feature_names

# Summarize the feature matrix: overall shape, column count, row count, names.
print(
    f"Dataset shape: {X.shape}",
    f"Number of features: {X.shape[1]}",
    f"Number of samples: {X.shape[0]}",
    f"Feature names: {feature_names}",
    sep="\n",
)



Split data into train and test sets


In [None]:
# Hold out 30% of the samples for evaluation; the fixed random_state makes the
# partition identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



Standardize features


In [None]:
# Standardize with statistics learned from the training split only, then apply
# the same transformation to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n# 3. Filter Methods")
print("Filter methods select features based on statistical measures, independent of any ML algorithm.")

print("\n## 3.1 Variance Threshold")
print("Removes features with low variance.")
# Learn the variance mask from the UNSCALED training data. After
# StandardScaler every non-constant feature has variance 1, so fitting the
# threshold on the standardized matrix would keep every such column and the
# filter would have no effect.
selector = VarianceThreshold(threshold=0.1)
selector.fit(X_train)
# transform() only applies the learned column mask, so it is used here on the
# standardized matrices that the downstream model is trained on.
X_train_var = selector.transform(X_train_scaled)
X_test_var = selector.transform(X_test_scaled)

# Boolean mask over the original columns -> names of the surviving features.
var_support = selector.get_support()
var_features = feature_names[var_support]
print(f"Number of selected features: {len(var_features)}")
print(f"Selected features: {var_features}")



Train model on selected features


In [None]:
# Fit a logistic regression on the columns kept by the variance filter and
# score it on the held-out split.
model = LogisticRegression(max_iter=1000).fit(X_train_var, y_train)
y_pred = model.predict(X_test_var)
print(f"Accuracy with variance threshold: {accuracy_score(y_test, y_pred):.4f}")

print(
    "\n## 3.2 SelectKBest with F-score",
    "Selects features based on univariate statistical tests.",
    sep="\n",
)
# Score every feature with the ANOVA F-test on the training split and keep the
# 10 highest-scoring columns; the test split is reduced to the same columns.
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X_train_scaled, y_train)
X_train_k = selector.transform(X_train_scaled)
X_test_k = selector.transform(X_test_scaled)

# Pair each feature name with its score, highest score first.
scores = selector.scores_
features_scores = sorted(
    zip(feature_names, scores),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 10 features based on F-score:")
for feat_name, feat_score in features_scores[:10]:
    print(f"{feat_name}: {feat_score:.4f}")



Plot the top 10 features by F-score


In [None]:
# Split the ten best (name, score) pairs into parallel lists for plotting.
top_ten = features_scores[:10]
top_features = [pair[0] for pair in top_ten]
top_scores = [pair[1] for pair in top_ten]

# Horizontal bar chart: one bar per feature, labelled with the feature name.
bar_positions = range(10)
plt.figure(figsize=(12, 6))
plt.barh(bar_positions, top_scores, align='center')
plt.yticks(bar_positions, top_features)
plt.title('Top 10 Features by F-Score')
plt.xlabel('F-Score')
plt.tight_layout()

# Write the figure to disk and release it.
plt.savefig(IMAGES_DIR + "f_score_feature_importance.png")
plt.close()



Train model on selected features


In [None]:
# Evaluate a logistic regression restricted to the 10 F-score-selected columns.
model = LogisticRegression(max_iter=1000).fit(X_train_k, y_train)
y_pred = model.predict(X_test_k)
print(f"Accuracy with SelectKBest (F-score): {accuracy_score(y_test, y_pred):.4f}")

print(
    "\n## 3.3 SelectKBest with Mutual Information",
    "Selects features based on mutual information between features and target.",
    sep="\n",
)
# Same top-10 selection as before, but ranked by mutual information with the
# target instead of the F-statistic.
selector = SelectKBest(score_func=mutual_info_classif, k=10)
selector.fit(X_train_scaled, y_train)
X_train_mi = selector.transform(X_train_scaled)
X_test_mi = selector.transform(X_test_scaled)

# Pair each feature name with its score, highest score first.
scores = selector.scores_
features_scores = sorted(
    zip(feature_names, scores),
    key=lambda pair: pair[1],
    reverse=True,
)

print("Top 10 features based on Mutual Information:")
for feat_name, feat_score in features_scores[:10]:
    print(f"{feat_name}: {feat_score:.4f}")



Train model on selected features


In [None]:
# Evaluate a logistic regression restricted to the 10 columns chosen by
# mutual information.
model = LogisticRegression(max_iter=1000).fit(X_train_mi, y_train)
y_pred = model.predict(X_test_mi)
print(f"Accuracy with SelectKBest (Mutual Information): {accuracy_score(y_test, y_pred):.4f}")

print(
    "\n# 4. Wrapper Methods",
    "Wrapper methods use a ML algorithm to evaluate different subsets of features.",
    sep="\n",
)

print(
    "\n## 4.1 Recursive Feature Elimination (RFE)",
    "Recursively removes features by training a model and removing the weakest features.",
    sep="\n",
)
# RFE driven by a logistic regression: drop one feature per iteration (step=1)
# until 10 remain, then reduce both splits to those columns.
model = LogisticRegression(max_iter=1000)
rfe = RFE(estimator=model, n_features_to_select=10, step=1)
rfe.fit(X_train_scaled, y_train)
X_train_rfe = rfe.transform(X_train_scaled)
X_test_rfe = rfe.transform(X_test_scaled)

# Boolean mask over the original columns -> names of the retained features.
rfe_support = rfe.get_support()
rfe_features = feature_names[rfe_support]
print(f"Selected features: {rfe_features}")



Train model on selected features


In [None]:
# Fit the logistic regression created for RFE on the RFE-selected columns and
# score it on the held-out split.
y_pred = model.fit(X_train_rfe, y_train).predict(X_test_rfe)
print(f"Accuracy with RFE: {accuracy_score(y_test, y_pred):.4f}")

print(
    "\n# 5. Embedded Methods",
    "Embedded methods perform feature selection as part of the model training process.",
    sep="\n",
)

print("\n## 5.1 Feature Importance with Random Forest")
# Train a 100-tree forest on all standardized features and read back the
# per-feature importance scores it exposes.
forest = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)
importances = forest.feature_importances_



Sort feature importances and get indices


In [None]:
# Column indices ordered from most to least important: ascending argsort,
# then reversed.
indices = np.flip(np.argsort(importances))



Print the feature ranking


In [None]:
# List the ten most important features as "<rank>. <name> (<importance>)".
print("Feature ranking:")
for rank, col in enumerate(indices[:10], start=1):
    print(f"{rank}. {feature_names[col]} ({importances[col]:.4f})")



Plot feature importances


In [None]:
# Bar chart of the ten largest random forest importances, labelled by feature.
top_idx = indices[:10]
bar_positions = range(10)
plt.figure(figsize=(12, 6))
plt.bar(bar_positions, importances[top_idx], align="center")
plt.xticks(bar_positions, feature_names[top_idx], rotation=90)
plt.title("Feature importances")
plt.tight_layout()
plt.savefig(IMAGES_DIR + "rf_feature_importance.png")
plt.close()

print(
    "\n## 5.2 SelectFromModel",
    "Uses a model's feature importances to select features.",
    sep="\n",
)
# Fit a fresh random forest inside SelectFromModel and select features using
# the median importance as the threshold.
selector = SelectFromModel(
    RandomForestClassifier(n_estimators=100, random_state=42),
    threshold='median',
)
selector.fit(X_train_scaled, y_train)
X_train_sfm = selector.transform(X_train_scaled)
X_test_sfm = selector.transform(X_test_scaled)

# Boolean mask over the original columns -> names of the retained features.
sfm_support = selector.get_support()
sfm_features = feature_names[sfm_support]
print(
    f"Number of selected features: {len(sfm_features)}",
    f"Selected features: {sfm_features}",
    sep="\n",
)



Train model on selected features


In [None]:
# Evaluate a logistic regression restricted to the SelectFromModel columns.
model = LogisticRegression(max_iter=1000).fit(X_train_sfm, y_train)
y_pred = model.predict(X_test_sfm)
print(f"Accuracy with SelectFromModel: {accuracy_score(y_test, y_pred):.4f}")

# Section 6 banner and a description of the benchmark that follows.
print(
    "\n# 6. Comparing Different Feature Selection Methods",
    "Let's compare the performance of different feature selection methods.",
    "We'll use a simple benchmark: a logistic regression model trained on features selected by each method.",
    sep="\n",
)



Create a synthetic dataset for comparison


In [None]:
# Binary classification problem with 1000 samples and 20 features: 10
# informative, 5 redundant, 0 repeated, which leaves 5 features that are
# neither informative nor redundant.
X_syn, y_syn = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=10,
    n_redundant=5,
    n_repeated=0,
    n_classes=2,
    random_state=42,
)

# Same 70/30 split and seed as used for the real dataset.
X_train_syn, X_test_syn, y_train_syn, y_test_syn = train_test_split(
    X_syn,
    y_syn,
    test_size=0.3,
    random_state=42,
)



Define feature selection methods to compare


In [None]:
# Selectors to benchmark, keyed by display name. Every method except the
# variance filter is configured to keep 10 of the 20 synthetic features.
methods = {
    'Variance Threshold': VarianceThreshold(threshold=0.01),
    'SelectKBest (F-score)': SelectKBest(f_classif, k=10),
    'SelectKBest (Mutual Info)': SelectKBest(mutual_info_classif, k=10),
    'RFE': RFE(estimator=LogisticRegression(max_iter=1000), n_features_to_select=10),
    # threshold=-np.inf turns off the importance cut-off so that max_features
    # alone decides the selection. With the default threshold a feature must
    # also exceed that cut-off, so fewer than 10 features could be kept and
    # the comparison with the other k=10 selectors would not be like-for-like.
    'SelectFromModel (RF)': SelectFromModel(
        RandomForestClassifier(n_estimators=100, random_state=42),
        threshold=-np.inf,
        max_features=10,
    ),
}

# Display name -> test accuracy of a logistic regression trained on the
# features chosen by that selector.
results = {}
for name, method in methods.items():
    # Select features: fit on the training split only, then apply the same
    # column selection to the test split.
    X_train_selected = method.fit_transform(X_train_syn, y_train_syn)
    X_test_selected = method.transform(X_test_syn)

    # Train and evaluate a fresh model so no state carries over between methods.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train_selected, y_train_syn)
    y_pred = model.predict(X_test_selected)
    accuracy = accuracy_score(y_test_syn, y_pred)

    results[name] = accuracy
    print(f"{name}: Accuracy = {accuracy:.4f}, Features = {X_train_selected.shape[1]}")



Plot results


In [None]:
# Bar chart of test accuracy per feature selection method.
# The labels get their own name so the `methods` dict of selector objects is
# not overwritten by a list of strings.
method_names = list(results.keys())
accuracies = list(results.values())

plt.figure(figsize=(10, 6))
plt.bar(method_names, accuracies)
plt.ylabel('Accuracy')
plt.title('Comparison of Feature Selection Methods')
# Rotate the long method names so they do not overlap.
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(f"{IMAGES_DIR}feature_selection_comparison.png")
plt.close()

# Section 7: checklist of practical advice, one bullet per output line.
print(
    "\n# 7. Best Practices and Considerations",
    "When applying feature selection:",
    "- Always split your data before feature selection to prevent data leakage",
    "- Consider the nature of your data when choosing a selection method",
    "- Remember that correlation ≠ causation",
    "- Use domain knowledge when available",
    "- Try multiple feature selection methods and compare results",
    "- Beware of multicollinearity between features",
    sep="\n",
)

# Section 8: closing summary.
print(
    "\n# 8. Conclusion",
    "Feature selection is a crucial step in the machine learning pipeline.",
    "It can improve model performance, reduce overfitting, and decrease training time.",
    "Scikit-learn provides a comprehensive set of tools for feature selection.",
    "The best method depends on your specific dataset and problem.",
    sep="\n",
)
