In [5]:
# ======================================================================
# feature_selection.ipynb  |  Disease-Prediction Mini-Hackathon
# FOCUSED: Comprehensive feature selection pipeline only
# Author: <your name>  |  Python 3.10.11
# ======================================================================

# %% [markdown]
# # 1. Setup & Configuration

# %%
import os, warnings, logging, joblib, json
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
from pathlib import Path
from sklearn.feature_selection import (
    SelectKBest, chi2, mutual_info_classif, 
    RFE, RFECV, SelectFromModel, VarianceThreshold
)
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

warnings.filterwarnings("ignore")
SEED = 42
np.random.seed(SEED)

# Paths
ROOT = Path.cwd().parent if Path.cwd().name.lower()=="notebook" else Path.cwd()
PROC = ROOT / "data" / "processed" 
MODELS = ROOT / "models"
MODELS.mkdir(exist_ok=True)

print("🔍 Feature Selection Pipeline")
print("=" * 50)

# %% [markdown]
# # 2. Data Loading

# %%
X_train = pd.read_csv(PROC / "X_train.csv")
y_train = pd.read_csv(PROC / "y_train.csv").squeeze()
X_valid = pd.read_csv(PROC / "X_valid.csv") 
y_valid = pd.read_csv(PROC / "y_valid.csv").squeeze()

print(f"Training shape: {X_train.shape}")
print(f"Original features: {X_train.shape[1]}")

# %% [markdown]
# # 3. Feature Selection Methods

# %%
print("\n🔍 FEATURE SELECTION PIPELINE")
print("=" * 50)

# Method 1: Remove Low Variance Features
print("Step 1: Variance Threshold...")
variance_selector = VarianceThreshold(threshold=0.01)
variance_selector.fit(X_train)
variance_features = X_train.columns[variance_selector.get_support()].tolist()
X_train_filtered = X_train[variance_features]
print(f"   Kept {len(variance_features)} features (removed {X_train.shape[1] - len(variance_features)})")

# Method 2: Chi-Square
print("Step 2: Chi-Square selection...")
chi2_selector = SelectKBest(chi2, k=min(60, len(variance_features)))
chi2_selector.fit(X_train_filtered, y_train)
chi2_features = X_train_filtered.columns[chi2_selector.get_support()].tolist()
print(f"   Selected {len(chi2_features)} features")

# Method 3: Mutual Information  
print("Step 3: Mutual Information...")
mi_selector = SelectKBest(mutual_info_classif, k=min(60, len(variance_features)))
mi_selector.fit(X_train_filtered, y_train)
mi_features = X_train_filtered.columns[mi_selector.get_support()].tolist()
print(f"   Selected {len(mi_features)} features")

# Method 4: Random Forest Importance
print("Step 4: Random Forest importance...")
rf_selector = RandomForestClassifier(n_estimators=100, random_state=SEED, n_jobs=-1)
rf_selector.fit(X_train_filtered, y_train)
rf_importance = pd.Series(rf_selector.feature_importances_, index=variance_features).sort_values(ascending=False)
rf_features = rf_importance.head(60).index.tolist()
print(f"   Selected {len(rf_features)} features")

# Method 5: RFE
print("Step 5: Recursive Feature Elimination...")
rfe_estimator = ExtraTreesClassifier(n_estimators=50, random_state=SEED, n_jobs=-1)
rfe_selector = RFECV(
    estimator=rfe_estimator,
    step=5,
    cv=StratifiedKFold(3, shuffle=True, random_state=SEED),
    scoring='accuracy',
    n_jobs=-1
)
rfe_selector.fit(X_train_filtered, y_train)
rfe_features = X_train_filtered.columns[rfe_selector.support_].tolist()
print(f"   Selected {len(rfe_features)} features")

# Method 6: Lasso
print("Step 6: L1 Regularization...")
lasso_selector = SelectFromModel(
    LogisticRegression(penalty='l1', solver='liblinear', C=0.1, random_state=SEED, max_iter=1000),
    threshold='median'
)
lasso_selector.fit(X_train_filtered, y_train)
lasso_features = X_train_filtered.columns[lasso_selector.get_support()].tolist()
print(f"   Selected {len(lasso_features)} features")

# %% [markdown]
# # 4. Consensus Analysis

# %%
print("\n📊 CONSENSUS ANALYSIS")
print("=" * 50)

# Voting system
all_methods = {
    'Chi2': chi2_features,
    'Mutual_Info': mi_features, 
    'Random_Forest': rf_features,
    'RFE': rfe_features,
    'Lasso': lasso_features
}

feature_votes = {}
for method, features in all_methods.items():
    for feature in features:
        if feature not in feature_votes:
            feature_votes[feature] = {'votes': 0, 'methods': []}
        feature_votes[feature]['votes'] += 1
        feature_votes[feature]['methods'].append(method)

# Consensus DataFrame
consensus_df = pd.DataFrame([
    {
        'feature': feature,
        'vote_count': data['votes'],
        'methods': ', '.join(data['methods'])
    }
    for feature, data in feature_votes.items()
]).sort_values('vote_count', ascending=False)

# Final feature selection (≥2 votes)
final_features = consensus_df[consensus_df.vote_count >= 2]['feature'].tolist()

# If too few, add top single-vote features
if len(final_features) < 30:
    additional = consensus_df[
        (consensus_df.vote_count == 1) & 
        (~consensus_df.feature.isin(final_features))
    ].head(30 - len(final_features))['feature'].tolist()
    final_features.extend(additional)

print(f"Final selected features: {len(final_features)}")
print(f"Reduction: {X_train.shape[1]} → {len(final_features)} ({len(final_features)/X_train.shape[1]:.1%})")

display(consensus_df.head(15))

# %% [markdown]
# # 5. Save Results

# %%
print("\n💾 SAVING FEATURE SELECTION RESULTS")
print("=" * 50)

# Save selected features (multiple formats)
joblib.dump(final_features, MODELS / "selected_features.pkl")
pd.DataFrame({'feature_name': final_features}).to_csv(MODELS / "selected_features.csv", index=False)
consensus_df.to_csv(MODELS / "feature_consensus.csv", index=False)

# Save processed data with selected features
X_train_selected = X_train[final_features]
X_valid_selected = X_valid[final_features]

X_train_selected.to_csv(PROC / "X_train_selected.csv", index=False)
X_valid_selected.to_csv(PROC / "X_valid_selected.csv", index=False)

# Summary
summary = {
    'original_features': X_train.shape[1],
    'selected_features': len(final_features),
    'reduction_ratio': len(final_features) / X_train.shape[1],
    'methods_used': list(all_methods.keys())
}

with open(MODELS / "feature_selection_summary.json", 'w') as f:
    json.dump(summary, f, indent=2)

print(f"✅ Selected features saved!")
print(f"✅ Processed data ready for model_training.ipynb")
print(f"📁 Files: selected_features.pkl, X_train_selected.csv, X_valid_selected.csv")


🔍 Feature Selection Pipeline
Training shape: (3936, 134)
Original features: 134

🔍 FEATURE SELECTION PIPELINE
Step 1: Variance Threshold...
   Kept 132 features (removed 2)
Step 2: Chi-Square selection...
   Selected 60 features
Step 3: Mutual Information...
   Selected 60 features
Step 4: Random Forest importance...
   Selected 60 features
Step 5: Recursive Feature Elimination...
   Selected 97 features
Step 6: L1 Regularization...
   Selected 66 features

📊 CONSENSUS ANALYSIS
Final selected features: 93
Reduction: 134 → 93 (69.4%)


Unnamed: 0,feature,vote_count,methods
59,symptom_count,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
2,pain_behind_the_eyes,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
35,red_spots_over_body,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
37,abnormal_menstruation,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
40,family_history,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
25,muscle_weakness,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
42,rusty_sputum,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
43,lack_of_concentration,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
44,receiving_blood_transfusion,5,"Chi2, Mutual_Info, Random_Forest, RFE, Lasso"
82,nausea,4,"Mutual_Info, Random_Forest, RFE, Lasso"



💾 SAVING FEATURE SELECTION RESULTS
✅ Selected features saved!
✅ Processed data ready for model_training.ipynb
📁 Files: selected_features.pkl, X_train_selected.csv, X_valid_selected.csv
