In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, RFE, chi2, SelectKBest
from sklearn.linear_model import LogisticRegression

# =========================================================================
# 1. METHOD: FEATURE IMPORTANCE (RANDOM FOREST) for the feature-selection step

# Fit a Random Forest on the training split; the fitted model exposes one
# importance score per input column. `fit` returns the estimator itself, so
# construction and training are chained.
rf_selector = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Label each score with its column name, then rank from most to least important.
importance_df = pd.Series(
    data=rf_selector.feature_importances_,
    index=X_train.columns,
)
importance_df = importance_df.sort_values(ascending=False)

print("--- 1. Random Forest Feature Importance (Top 10) ---")
print(importance_df.head(10))


# =========================================================================
# 2. METHOD: RECURSIVE FEATURE ELIMINATION (RFE)

# Logistic Regression is the estimator whose coefficients RFE uses to rank
# features at every elimination round.
rfe_model = LogisticRegression(solver='liblinear', random_state=42)

# Eliminate one feature per round (step=1) until 10 remain, fitting on the
# training split.
rfe_selector = RFE(
    estimator=rfe_model,
    n_features_to_select=10,
    step=1,
).fit(X_train, y_train)

# Boolean mask of retained columns -> their names.
rfe_selected_features = X_train.columns[rfe_selector.get_support()]

print("\n--- 2. RFE Selected Features (Top 10) ---")
print(rfe_selected_features.tolist())


# =========================================================================
# 3. METHOD: CHI-SQUARE TEST

# Note: the Chi-Square test requires non-negative inputs. The scaled continuous
# features can take negative values, so the test is applied only to the
# remaining columns, which are assumed to be binary (0/1) one-hot encoded
# features -- TODO confirm that every non-numerical column is non-negative.

# Every column that is not listed in `numerical_cols` is treated as binary.
binary_cols = X_train.columns[~X_train.columns.isin(numerical_cols)]

# Keep at most 10 features. Capping k at the number of available columns
# avoids the ValueError (older scikit-learn) or warning (newer scikit-learn)
# that SelectKBest emits when k exceeds the number of input features.
chi2_k = min(10, len(binary_cols))

# Score the binary features against the target with Chi-Square.
chi2_selector = SelectKBest(chi2, k=chi2_k)
chi2_selector.fit(X_train[binary_cols], y_train)

# Translate the boolean support mask back into column names.
chi2_selected_features = binary_cols[chi2_selector.get_support()]

print("\n--- 3. Chi-Square Selected Features (Top 10 Binary) ---")
print(chi2_selected_features.tolist())


# =========================================================================
# 4. FINAL SELECTION & TRANSFORMATION
# =========================================================================

# The final feature set is taken from the Random Forest ranking alone:
# `importance_df` is already sorted in descending order, so its first 12
# index labels are the 12 highest-scoring features.
final_selected_features = importance_df.index[:12].tolist()

# Restrict both splits to the same selected columns, in the same order.
X_train_fs, X_test_fs = (
    split[final_selected_features] for split in (X_train, X_test)
)