Feature Selection

In [None]:
# Importing required libraries
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectFromModel

In [None]:
# Fetch scikit-learn's bundled Iris dataset and unpack the pieces used below
iris = load_iris()
X, y = iris.data, iris.target  # feature matrix and target array
feature_names = iris.feature_names  # labels later used as the DataFrame columns for X

In [None]:
# Wrap the raw feature matrix in a labelled DataFrame so the preview is readable
df = pd.DataFrame(data=X, columns=feature_names)
print("Original Data:\n", df.head(n=5))

Original Data:
    sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2


In [None]:
# 1. Variance Threshold - drop features whose variance does not exceed the threshold
print("\n--- Variance Threshold ---")
# With threshold=0.0 only constant (zero-variance) columns are removed
sel = VarianceThreshold(threshold=0.0)
sel.fit(X)
X_var_threshold = sel.transform(X)
print("Selected Features (Variance Threshold):\n", X_var_threshold[:5])


--- Variance Threshold ---
Selected Features (Variance Threshold):
 [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]]


In [None]:
# 2. Univariate Feature Selection (Chi-square test)
print("\n--- Univariate Feature Selection (Chi-square) ---")
# Score every feature against the labels with chi2 and keep the 2 highest-scoring ones
chi2_selector = SelectKBest(score_func=chi2, k=2)
X_chi2 = chi2_selector.fit(X, y).transform(X)
print("Selected Features (Chi-square):\n", X_chi2[:5])


--- Univariate Feature Selection (Chi-square) ---
Selected Features (Chi-square):
 [[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]]


In [None]:
# 2b. Univariate Feature Selection (Chi-square test), repeated with k=3
# NOTE: this rebinds chi2_selector and X_chi2, so they now hold the k=3 selection.
print("\n--- Univariate Feature Selection (Chi-square) ---")
chi2_selector = SelectKBest(chi2, k=3)  # Select 3 best features
X_chi2 = chi2_selector.fit_transform(X, y)
print("Selected Features (Chi-square):\n", X_chi2[:5])


--- Univariate Feature Selection (Chi-square) ---
Selected Features (Chi-square):
 [[5.1 1.4 0.2]
 [4.9 1.4 0.2]
 [4.7 1.3 0.2]
 [4.6 1.5 0.2]
 [5.  1.4 0.2]]


In [None]:
# 3. Model-based Feature Selection (Random Forest)
print("\n--- Feature Selection using Random Forest ---")
# Fixed random_state keeps the fitted forest (and its importances) reproducible
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X, y)


--- Feature Selection using Random Forest ---


In [None]:
# Select features based on their importance.
# `rf` has already been fitted on (X, y), and the selector itself is never fitted here,
# so the forest is declared as prefit: that is the documented way to call transform()
# on a SelectFromModel without calling fit() on it first.
# threshold="mean" keeps the features whose importance is at least the mean importance.
model = SelectFromModel(rf, threshold="mean", prefit=True)
X_rf_selected = model.transform(X)
print("Selected Features (RandomForest):\n", X_rf_selected[:5])

# Display the feature importance scores from RandomForest
importances = rf.feature_importances_
for name, importance in zip(feature_names, importances):
    print(f"{name}: {importance:.4f}")

Selected Features (RandomForest):
 [[1.4 0.2]
 [1.4 0.2]
 [1.3 0.2]
 [1.5 0.2]
 [1.4 0.2]]
sepal length (cm): 0.1061
sepal width (cm): 0.0217
petal length (cm): 0.4361
petal width (cm): 0.4361


In [None]:
# --- Building blocks for the pipeline ---
# 1. Chi-square filter that keeps the top 2 features
kbest_selector = SelectKBest(chi2, k=2)

# 2. KNN classifier (3 neighbours) used as the final estimator
knn = KNeighborsClassifier(n_neighbors=3)

# 3. Small seeded random forest (10 trees) that drives the model-based selection step
rfc = RandomForestClassifier(n_estimators=10, random_state=42)

# Hold out 30% of the samples as a test set (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("\n--- Feature Selection using Chi-square ---")

# 4. Pipeline: chi-square filter -> forest-based selection -> KNN classification
pipeline = Pipeline(
    steps=[
        ('feature_selection', kbest_selector),  # keep the top 2 chi-square features
        ('random_forest', SelectFromModel(rfc, threshold='mean')),  # keep features with importance >= mean
        ('classification', knn),  # classify on whatever features survive
    ]
)

# Fit every step on the training split, then score on the held-out split
pipeline.fit(X_train, y_train)
accuracy = pipeline.score(X_test, y_test)
print("\nAccuracy of the KNN classifier with feature selection:", accuracy)

# Fit `rfc` directly on ALL training features and report one importance per feature.
# This is a separate fit from the one inside the pipeline: per the scikit-learn docs,
# SelectFromModel fits its own clone of `rfc`, and there it only sees the chi2-selected columns.
importances = rfc.fit(X_train, y_train).feature_importances_
for feature, score in zip(feature_names, importances):
    print(f"{feature}: {score:.4f}")


--- Feature Selection using Chi-square ---

Accuracy of the KNN classifier with feature selection: 0.9777777777777777
sepal length (cm): 0.1344
sepal width (cm): 0.0356
petal length (cm): 0.4204
petal width (cm): 0.4096
