# Best Feature Selection Methods

## Load 3 Datasets

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
import time

In [None]:
# Load datasets
TB_HC_OD = pd.read_csv(r'..\..\Datasets\train\balanced\TB_HC_OD.csv')
PTB_EPTB = pd.read_csv(r'..\..\Datasets\train\balanced\PTB_EPTB.csv')
ATB_LTB = pd.read_csv(r'..\..\Datasets\train\balanced\ATB_LTB.csv')

## Separate features and target

In [27]:
## Separate features and target
X_TB_HC_OD = TB_HC_OD.drop(columns=['TB_Status'])
y_TB_HC_OD = TB_HC_OD['TB_Status']

X_PTB_EPTB = PTB_EPTB.drop(columns=['TB_Status'])
y_PTB_EPTB = PTB_EPTB['TB_Status']

X_ATB_LTB = ATB_LTB.drop(columns=['TB_Status'])
y_ATB_LTB = ATB_LTB['TB_Status']

In [28]:
# Encode target labels
def encode_target(y):
    le = LabelEncoder()
    return le.fit_transform(y)

## Select Best Feature Selection Algorithm

In [29]:
# OPTIMIZED function to evaluate feature selection methods
def evaluate_feature_selection_optimized(X, y, dataset_name):
    print(f"\n{'='*50}")
    print(f"Evaluating {dataset_name}")
    print(f"{'='*50}")
    
    start_time = time.time()
    y_encoded = encode_target(y)
    
    # Reduced feature counts for faster execution
    if dataset_name == "TB_HC_OD":
        feature_counts = [15, 20, 25, 30, 35, 40]
    elif dataset_name == "PTB_EPTB":
        feature_counts = [8, 10, 12, 15, 18, 20]
    else:  # ATB_LTB
        feature_counts = [5, 8, 10, 12, 15]
    
    results = []
    best_score = 0
    best_config = {}
    
    # Use only fastest algorithms
    algorithms = {
        'RandomForest': RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
        'GradientBoosting': GradientBoostingClassifier(n_estimators=50, random_state=42)
    }
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for algo_name, model in algorithms.items():
        print(f"\nTesting {algo_name}...")
        
        for k in feature_counts:
            try:
                # Fast feature importance calculation
                model.fit(X, y_encoded)
                feature_importances = model.feature_importances_
                top_k_indices = np.argsort(feature_importances)[-k:]
                X_selected = X.iloc[:, top_k_indices]
                
                # Quick cross-validation with reduced folds
                score = cross_val_score(
                    RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
                    X_selected, y_encoded, cv=kf, n_jobs=-1
                ).mean()
                
                results.append({
                    'Algorithm': algo_name,
                    'Feature_Count': k,
                    'CV_Score': score
                })
                
                print(f"  Features: {k}, CV Score: {score:.4f}")
                
                # Track best configuration
                if score > best_score:
                    best_score = score
                    best_config = {'algorithm': algo_name, 'features': k, 'score': score}
                    
                # Early stopping if we reach 90%+
                if score >= 0.90:
                    break
                    
            except Exception as e:
                print(f"  Error with {k} features: {e}")
                continue
    
    # Also test SelectKBest for comparison
    print(f"\nTesting SelectKBest...")
    for k in feature_counts[:3]:  # Test only first 3 for speed
        try:
            selector = SelectKBest(score_func=f_classif, k=k)
            X_selected = selector.fit_transform(X, y_encoded)
            
            score = cross_val_score(
                RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1),
                X_selected, y_encoded, cv=kf, n_jobs=-1
            ).mean()
            
            results.append({
                'Algorithm': 'SelectKBest_f_classif',
                'Feature_Count': k,
                'CV_Score': score
            })
            
            print(f"  Features: {k}, CV Score: {score:.4f}")
            
            if score > best_score:
                best_score = score
                best_config = {'algorithm': 'SelectKBest_f_classif', 'features': k, 'score': score}
                
        except Exception as e:
            print(f"  Error with {k} features: {e}")
            continue
    
    end_time = time.time()
    print(f"\n⏰ Execution time: {end_time - start_time:.2f} seconds")
    
    # Display best result
    print(f"\n🎯 BEST CONFIGURATION for {dataset_name}:")
    print(f"   Algorithm: {best_config['algorithm']}")
    print(f"   Feature Count: {best_config['features']}")
    print(f"   CV Score: {best_config['score']:.4f}")
    
    return pd.DataFrame(results), best_config



In [30]:
# Evaluate all datasets
print("🚀 STARTING OPTIMIZED FEATURE SELECTION EVALUATION")
print("Target: ≥90% CV Score with Minimal Features")
print("Total Feature Budget: ≤50 across all datasets")

results_TB_HC_OD, best_TB_HC_OD = evaluate_feature_selection_optimized(X_TB_HC_OD, y_TB_HC_OD, "TB_HC_OD")
results_PTB_EPTB, best_PTB_EPTB = evaluate_feature_selection_optimized(X_PTB_EPTB, y_PTB_EPTB, "PTB_EPTB")
results_ATB_LTB, best_ATB_LTB = evaluate_feature_selection_optimized(X_ATB_LTB, y_ATB_LTB, "ATB_LTB")



🚀 STARTING OPTIMIZED FEATURE SELECTION EVALUATION
Target: ≥90% CV Score with Minimal Features
Total Feature Budget: ≤50 across all datasets

Evaluating TB_HC_OD

Testing RandomForest...
  Features: 15, CV Score: 0.7473
  Features: 20, CV Score: 0.7524
  Features: 25, CV Score: 0.7793
  Features: 30, CV Score: 0.7918
  Features: 35, CV Score: 0.7941
  Features: 40, CV Score: 0.7955

Testing GradientBoosting...
  Features: 15, CV Score: 0.8006
  Features: 20, CV Score: 0.8173
  Features: 25, CV Score: 0.8141
  Features: 30, CV Score: 0.8229
  Features: 35, CV Score: 0.8322
  Features: 40, CV Score: 0.8321

Testing SelectKBest...
  Features: 15, CV Score: 0.7353
  Features: 20, CV Score: 0.7460
  Features: 25, CV Score: 0.7640

⏰ Execution time: 16969.99 seconds

🎯 BEST CONFIGURATION for TB_HC_OD:
   Algorithm: GradientBoosting
   Feature Count: 35
   CV Score: 0.8322

Evaluating PTB_EPTB

Testing RandomForest...
  Features: 8, CV Score: 0.6657
  Features: 10, CV Score: 0.6658
  Features:

In [31]:
# Summary
total_features = best_TB_HC_OD['features'] + best_PTB_EPTB['features'] + best_ATB_LTB['features']
print(f"\n📊 FINAL SUMMARY:")
print(f"TB_HC_OD: {best_TB_HC_OD['features']} features, CV: {best_TB_HC_OD['score']:.4f}")
print(f"PTB_EPTB: {best_PTB_EPTB['features']} features, CV: {best_PTB_EPTB['score']:.4f}")
print(f"ATB_LTB: {best_ATB_LTB['features']} features, CV: {best_ATB_LTB['score']:.4f}")
print(f"TOTAL FEATURES: {total_features} (Target: ≤50)")
print(f"ALL ≥90%: {all([best_TB_HC_OD['score'] >= 0.90, best_PTB_EPTB['score'] >= 0.90, best_ATB_LTB['score'] >= 0.90])}")




📊 FINAL SUMMARY:
TB_HC_OD: 35 features, CV: 0.8322
PTB_EPTB: 20 features, CV: 0.7653
ATB_LTB: 10 features, CV: 0.9028
TOTAL FEATURES: 65 (Target: ≤50)
ALL ≥90%: False


In [32]:
# Save best configurations for feature selection
best_configs = {
    'TB_HC_OD': best_TB_HC_OD,
    'PTB_EPTB': best_PTB_EPTB, 
    'ATB_LTB': best_ATB_LTB
}

import json
with open(r'..\..\Saved_files\best_feature_configs.json', 'w') as f:
    json.dump(best_configs, f, indent=2)

print("\n✅ Best configurations saved!")


✅ Best configurations saved!


In [None]:
## Optimize PTB_EPTB Dataset Only - Extended Feature Range
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import LabelEncoder
import time

# Load PTB_EPTB dataset only
PTB_EPTB = pd.read_csv(r'..\..\Datasets\train\balanced\PTB_EPTB.csv')
X_PTB_EPTB = PTB_EPTB.drop(columns=['TB_Status'])
y_PTB_EPTB = PTB_EPTB['TB_Status']

# Encode target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y_PTB_EPTB)

def optimize_ptb_eptb_features(X, y_encoded, dataset_name="PTB_EPTB"):
    print(f"\n{'='*60}")
    print(f"OPTIMIZING {dataset_name} FOR ≥80% CV SCORE")
    print(f"{'='*60}")
    
    start_time = time.time()
    
    # Extended feature range testing
    feature_counts = [25, 30, 35, 40, 45, 50, 60]
    
    results = []
    best_score = 0
    best_config = {}
    
    algorithms = {
        'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
        'SelectKBest_f_classif': 'f_classif',
        'SelectKBest_mutual_info': 'mutual_info_classif'
    }
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    
    for algo_name, algorithm in algorithms.items():
        print(f"\n🔍 Testing {algo_name}...")
        
        for k in feature_counts:
            try:
                if algo_name in ['GradientBoosting', 'RandomForest']:
                    # Tree-based feature importance
                    model = algorithm
                    model.fit(X, y_encoded)
                    feature_importances = model.feature_importances_
                    top_k_indices = np.argsort(feature_importances)[-k:]
                    X_selected = X.iloc[:, top_k_indices]
                    
                else:
                    # SelectKBest with different scoring functions
                    selector = SelectKBest(score_func=algorithm, k=k)
                    X_selected = selector.fit_transform(X, y_encoded)
                
                # Cross-validation with RandomForest
                score = cross_val_score(
                    RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
                    X_selected, y_encoded, cv=kf, n_jobs=-1
                ).mean()
                
                results.append({
                    'Algorithm': algo_name,
                    'Feature_Count': k,
                    'CV_Score': score
                })
                
                print(f"   Features: {k:3d}, CV Score: {score:.4f} {'✅' if score >= 0.80 else ''}")
                
                # Track best configuration
                if score > best_score:
                    best_score = score
                    best_config = {
                        'algorithm': algo_name, 
                        'features': k, 
                        'score': score
                    }
                
                # Early stopping if we reach target
                if score >= 0.80:
                    print(f"   🎯 TARGET ACHIEVED with {k} features!")
                    # Continue to see if we can do better with fewer features
                    if k <= 30:  # If we hit target with ≤30 features, we're good
                        break
                    
            except Exception as e:
                print(f"   Error with {k} features: {e}")
                continue
        
        # Stop testing this algorithm if we found a good solution
        if best_score >= 0.80 and best_config['features'] <= 40:
            print(f"   ✅ Optimal solution found with {algo_name}")
            break
    
    end_time = time.time()
    print(f"\n⏰ Optimization time: {end_time - start_time:.2f} seconds")
    
    # Display detailed results
    results_df = pd.DataFrame(results)
    
    print(f"\n📊 RESULTS SUMMARY for {dataset_name}:")
    print(f"   Best Algorithm: {best_config['algorithm']}")
    print(f"   Best Feature Count: {best_config['features']}")
    print(f"   Best CV Score: {best_config['score']:.4f}")
    
    # Show top 5 configurations
    top_5 = results_df.nlargest(5, 'CV_Score')
    print(f"\n🏆 TOP 5 CONFIGURATIONS:")
    for idx, row in top_5.iterrows():
        print(f"   {row['Algorithm']:25} | {row['Feature_Count']:3d} features | CV: {row['CV_Score']:.4f}")
    
    return results_df, best_config

# Run optimization for PTB_EPTB
results_PTB_EPTB_optimized, best_PTB_EPTB_optimized = optimize_ptb_eptb_features(X_PTB_EPTB, y_encoded)

# Update the best configurations
best_configs_updated = {
    'TB_HC_OD': {'algorithm': 'GradientBoosting', 'features': 35, 'score': 0.8322},
    'PTB_EPTB': best_PTB_EPTB_optimized,
    'ATB_LTB': {'algorithm': 'GradientBoosting', 'features': 10, 'score': 0.9028}
}

# Save updated configurations
import json
with open(r'..\..\Saved_files\best_feature_configs_updated.json', 'w') as f:
    json.dump(best_configs_updated, f, indent=2)

print(f"\n✅ Updated configurations saved!")
print(f"📈 Previous PTB_EPTB score: 0.7653")
print(f"📈 Optimized PTB_EPTB score: {best_PTB_EPTB_optimized['score']:.4f}")

# Calculate total features
total_features = (best_configs_updated['TB_HC_OD']['features'] + 
                 best_configs_updated['PTB_EPTB']['features'] + 
                 best_configs_updated['ATB_LTB']['features'])

print(f"📊 TOTAL FEATURES: {total_features}")
print(f"🎯 TARGET ACHIEVED: {best_PTB_EPTB_optimized['score'] >= 0.80}")


OPTIMIZING PTB_EPTB FOR ≥80% CV SCORE

🔍 Testing GradientBoosting...
   Features:  25, CV Score: 0.7748 
   Features:  30, CV Score: 0.7653 
   Features:  35, CV Score: 0.7724 
   Features:  40, CV Score: 0.7629 
   Features:  45, CV Score: 0.7938 
   Features:  50, CV Score: 0.7913 
   Features:  60, CV Score: 0.7771 

🔍 Testing RandomForest...
   Features:  25, CV Score: 0.6847 
   Features:  30, CV Score: 0.6659 
   Features:  35, CV Score: 0.6730 
   Features:  40, CV Score: 0.6944 
   Features:  45, CV Score: 0.6872 
   Features:  50, CV Score: 0.6778 
   Features:  60, CV Score: 0.7038 

🔍 Testing SelectKBest_f_classif...
   Error with 25 features: The 'score_func' parameter of SelectKBest must be a callable. Got 'f_classif' instead.
   Error with 30 features: The 'score_func' parameter of SelectKBest must be a callable. Got 'f_classif' instead.
   Error with 35 features: The 'score_func' parameter of SelectKBest must be a callable. Got 'f_classif' instead.
   Error with 40 feat