Forward Elimination

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load the Iris dataset
data = load_iris()
X, y = data.data, data.target

def forward_selection(X, y):
    selected_features = []
    remaining_features = list(range(X.shape[1]))
    model = GaussianNB()  # Initialize the Naive Bayes model

    # Loop until all features are evaluated
    while remaining_features:
        best_acc = 0
        best_feature = None

        # Try adding each remaining feature and evaluate performance
        for feature in remaining_features:
            X_selected = X[:, selected_features + [feature]]
            X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)

            # Track the best feature that improves accuracy
            if acc > best_acc:
                best_acc = acc
                best_feature = feature

        # If a feature improves accuracy, add it to the selected features
        if best_feature is not None:
            selected_features.append(best_feature)
            remaining_features.remove(best_feature)

    return selected_features

# Perform Forward Selection
selected_features = forward_selection(X, y)
print("Selected features:", selected_features)

# Evaluate the model with selected features
X_selected = X[:, selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
model = GaussianNB()  # Reinitialize the model
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Model accuracy with selected features:", accuracy_score(y_test, y_pred))


Selected features: [2, 3, 0, 1]
Model accuracy with selected features: 0.9777777777777777


Backward Elimination

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Initialize the Naive Bayes model
model = GaussianNB()

def backward_elimination(X, y, model):
    selected_features = list(range(X.shape[1]))  # All features initially

    # Loop until all features are evaluated
    while len(selected_features) > 1:
        best_acc = 0
        worst_feature = None

        # Try removing each feature and evaluate performance
        for feature in selected_features:
            selected_features_temp = [f for f in selected_features if f != feature]
            X_selected = X[:, selected_features_temp]
            X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            acc = accuracy_score(y_test, y_pred)

            # Track the worst feature (the one whose removal improves accuracy)
            if acc > best_acc:
                best_acc = acc
                worst_feature = feature

        # If removing a feature improves accuracy, remove it
        if worst_feature is not None:
            selected_features.remove(worst_feature)

    return selected_features

# Perform Backward Elimination
selected_features = backward_elimination(X, y, model)
print("Selected features:", selected_features)

# Evaluate the model with selected features
X_selected = X[:, selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
print("Model accuracy with selected features:", model.score(X_test, y_test))


Selected features: [3]
Model accuracy with selected features: 1.0


RFE

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

# Initialize the Logistic Regression model (since it has coef_ for feature importance)
model = LogisticRegression(max_iter=1000)

def rfe_feature_selection(X, y, model):
    # Perform RFE (Recursive Feature Elimination) for feature selection
    rfe = RFE(estimator=model, n_features_to_select=1, step=1)
    rfe = rfe.fit(X, y)

    # Get the selected features (those with ranking 1)
    selected_features = [i for i in range(X.shape[1]) if rfe.support_[i]]

    return selected_features

# Perform RFE for feature selection
selected_features = rfe_feature_selection(X, y, model)
print("Selected features:", selected_features)

# Evaluate the model with selected features
X_selected = X[:, selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
print("Model accuracy with selected features:", model.score(X_test, y_test))


Selected features: [2]
Model accuracy with selected features: 1.0


RFE CV

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFECV
from sklearn.metrics import accuracy_score

# Initialize the Logistic Regression model (since it has coef_ for feature importance)
model = LogisticRegression(max_iter=1000)

def rfecv_feature_selection(X, y, model):
    # Perform RFE with Cross-Validation (RFE-CV) for feature selection
    rfecv = RFECV(estimator=model, step=1, cv=5)
    rfecv.fit(X, y)

    # Get the selected features (those with ranking 1)
    selected_features = [i for i in range(X.shape[1]) if rfecv.support_[i]]

    return selected_features, rfecv

# Perform RFE-CV for feature selection
selected_features, rfecv = rfecv_feature_selection(X, y, model)
print("Selected features:", selected_features)
print("Optimal number of features:", rfecv.n_features_)

# Evaluate the model with selected features
X_selected = X[:, selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_selected, y, test_size=0.3, random_state=42)
model.fit(X_train, y_train)
print("Model accuracy with selected features:", model.score(X_test, y_test))



Selected features: [0, 1, 2, 3]
Optimal number of features: 4
Model accuracy with selected features: 1.0
