In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import accuracy_score

# Read the preprocessed CSV and one-hot encode it (first level of each
# categorical column is dropped)
dataset1 = pd.read_csv("prep.csv", index_col=None)
df2 = pd.get_dummies(dataset1, drop_first=True)

# Target is the 'classification_yes' dummy column; everything else is a feature
dep_Y = df2['classification_yes']
indep_X = df2.drop(columns='classification_yes')

# Hold out 25% of the rows for testing (fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    indep_X, dep_Y, test_size=0.25, random_state=0
)

# Fit the scaler on the training rows only, then apply it to both splits
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Run sequential feature selection for one model and score it on the test split
def evaluate_model_with_selection(model, X_train, y_train, X_test, y_test, direction='forward', n_features=5,
                                  feature_names=None):
    """Select features sequentially, refit the model on them, and score it.

    A ``SequentialFeatureSelector`` (accuracy scoring, 5-fold CV) is fit on
    the training data to pick ``n_features`` columns. ``model`` is then fit
    in place on only those columns and evaluated on the test split.

    Args:
        model: Estimator handed to the selector and then fit on the
            selected columns.
        X_train: Training feature matrix (array or DataFrame).
        y_train: Training labels.
        X_test: Test feature matrix with the same columns as ``X_train``.
        y_test: Test labels.
        direction: ``'forward'`` or ``'backward'``; passed to the selector.
        n_features: Number of features the selector should keep.
        feature_names: Optional names, one per column of ``X_train``. When
            omitted, ``X_train.columns`` is used if it exists; otherwise the
            module-level ``indep_X.columns`` is used, which matches the
            behaviour before this parameter existed.

    Returns:
        Tuple ``(selected_feature_names, accuracy)``: the list of selected
        column names and the accuracy on the test split.

    Raises:
        ValueError: If the number of feature names differs from the number
            of columns seen by the selector.
    """
    sfs = SequentialFeatureSelector(
        model,
        n_features_to_select=n_features,
        direction=direction,
        scoring='accuracy',
        cv=5,
    )
    sfs.fit(X_train, y_train)

    # Boolean mask with one entry per input column
    support_mask = sfs.get_support()

    # Resolve names from the data actually passed in before falling back to
    # the notebook-level frame, so the function is not tied to that global.
    if feature_names is None:
        feature_names = getattr(X_train, 'columns', None)
    if feature_names is None:
        feature_names = indep_X.columns
    if len(feature_names) != len(support_mask):
        raise ValueError(
            f"Expected {len(support_mask)} feature names, got {len(feature_names)}"
        )
    selected_feature_names = [
        name for name, keep in zip(feature_names, support_mask) if keep
    ]

    # transform() keeps exactly the supported columns and accepts both
    # arrays and DataFrames, unlike positional `X[:, mask]` indexing.
    X_train_selected = sfs.transform(X_train)
    X_test_selected = sfs.transform(X_test)

    # Refit on the reduced feature set and score on the held-out rows
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)

    accuracy = accuracy_score(y_test, y_pred)
    return selected_feature_names, accuracy

# Candidate estimators, each created with random_state=0
log_model = LogisticRegression(random_state=0, solver='lbfgs')
svc_model = SVC(random_state=0, kernel='linear')
dt_model = DecisionTreeClassifier(random_state=0)
rf_model = RandomForestClassifier(random_state=0)

# Empty result tables for forward and backward selection (same column layout)
results_forward = pd.DataFrame(columns=['Model', 'Selected Features', 'Accuracy'])
results_backward = results_forward.copy()

# Run forward and backward selection for every model and tabulate the results
def perform_feature_selection(models, X_train, y_train, X_test, y_test, n_features=5):
    """Evaluate each model with forward and backward feature selection.

    For every model, ``evaluate_model_with_selection`` is called once with
    ``direction='forward'`` and once with ``direction='backward'`` (in that
    order), and one row per call is collected.

    Args:
        models: Iterable of estimators to evaluate.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        n_features: Number of features to select in each run. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(forward_df, backward_df)`` of DataFrames with the columns
        ``'Model'``, ``'Selected Features'`` (comma-separated names) and
        ``'Accuracy'``. Both are empty when ``models`` is empty.
    """
    columns = ['Model', 'Selected Features', 'Accuracy']
    # Insertion order fixes the evaluation order: forward first, then backward
    rows_by_direction = {'forward': [], 'backward': []}

    for model in models:
        model_name = type(model).__name__
        for direction, rows in rows_by_direction.items():
            selected, accuracy = evaluate_model_with_selection(
                model, X_train, y_train, X_test, y_test,
                direction=direction, n_features=n_features,
            )
            rows.append([model_name, ', '.join(selected), accuracy])

    forward_df = pd.DataFrame(rows_by_direction['forward'], columns=columns)
    backward_df = pd.DataFrame(rows_by_direction['backward'], columns=columns)

    return forward_df, backward_df

# Estimators to compare
models = [log_model, svc_model, dt_model, rf_model]

# Run both selection directions for every estimator
forward_df, backward_df = perform_feature_selection(models, X_train, y_train, X_test, y_test)

# Print one table per selection direction, each under its heading
for heading, table in (
    ("Forward Selection Results:", forward_df),
    ("\nBackward Selection Results:", backward_df),
):
    print(heading)
    print(table)

Forward Selection Results:
                    Model                  Selected Features  Accuracy
0      LogisticRegression         al, sc, hrmo, sg_c, dm_yes      0.96
1                     SVC           al, hrmo, rc, sg_c, sg_d      0.97
2  DecisionTreeClassifier         su, rc, sg_c, sg_d, dm_yes      0.97
3  RandomForestClassifier  al, rc, sg_b, pcc_present, dm_yes      0.96

Backward Selection Results:
                    Model                    Selected Features  Accuracy
0      LogisticRegression        su, hrmo, sg_c, sg_d, htn_yes      0.97
1                     SVC       sc, hrmo, sg_c, dm_yes, pe_yes      0.96
2  DecisionTreeClassifier      al, sc, sg_c, sg_d, pcc_present      0.99
3  RandomForestClassifier  hrmo, sg_c, sg_d, dm_yes, appet_yes      0.99
