**Using Filter + Embedded (ROC + Lasso)**


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LassoCV

def calculate_roc_auc_scores(X, y):
    """Calculate a direction-agnostic ROC-AUC for every feature/label pair.

    Each feature column is passed to ``roc_auc_score`` as the ranking score
    for each label column. A score below 0.5 is reflected to ``1 - score`` so
    that a feature that ranks the classes in reverse is rated the same as one
    that ranks them in order.

    Args:
        X: DataFrame of features; columns are read positionally via ``.iloc``.
        y: DataFrame of labels, one column per label.

    Returns:
        ``numpy.ndarray`` of shape ``(n_features, n_labels)``. Pairs for which
        the AUC could not be computed are left at 0.5 (random performance).
    """
    n_features = X.shape[1]
    n_labels = y.shape[1]
    # Start every cell at 0.5 so pairs that cannot be scored count as "random".
    scores = np.full((n_features, n_labels), 0.5)

    for label_idx in range(n_labels):
        y_label = y.iloc[:, label_idx]
        for feature_idx in range(n_features):
            feature = X.iloc[:, feature_idx]
            try:
                score = roc_auc_score(y_label, feature)
            except (ValueError, TypeError):
                # Only data problems are tolerated here (e.g. an undefined
                # AUC or a non-numeric column); anything else should surface
                # instead of being silently turned into 0.5.
                continue
            if np.isnan(score):
                # A NaN would propagate through max() and into the mean taken
                # by callers, so treat it like a failed calculation.
                continue
            # Adjust score if it's below 0.5 (worse than random)
            scores[feature_idx, label_idx] = max(score, 1 - score)

    return scores

def filter_roc_auc(X, y, k='auto'):
    """Filter stage: keep the ``k`` features with the highest mean ROC-AUC.

    Args:
        X: DataFrame of features.
        y: DataFrame of labels, one column per label.
        k: Number of features to keep, or ``'auto'`` for the default of 30.
            Values larger than the number of columns in ``X`` are clamped.

    Returns:
        Tuple ``(selected_features, mean_scores)`` where ``selected_features``
        is a list of positional column indices ordered by ascending mean
        score, and ``mean_scores`` holds the label-averaged ROC-AUC of every
        feature in ``X``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    print("\nPerforming ROC-AUC based filtering...")

    if k == 'auto':
        k = 30
    if k < 1:
        # Without this check k == 0 would slice as [-0:] and silently keep
        # every feature, and a negative k would drop the *best* features.
        raise ValueError(f"k must be a positive integer or 'auto', got {k!r}")
    k = min(k, X.shape[1])

    # Calculate ROC-AUC scores for each feature-label combination
    roc_scores = calculate_roc_auc_scores(X, y)

    # Average ROC-AUC scores across labels
    mean_scores = np.mean(roc_scores, axis=1)

    # Select top k features (argsort is ascending, so take the tail)
    selected_features = np.argsort(mean_scores)[-k:]

    print(f"Selected {len(selected_features)} features in filter stage")
    print(f"ROC-AUC scores range: {mean_scores[selected_features].min():.3f} - {mean_scores[selected_features].max():.3f}")

    return list(selected_features), mean_scores

def embedded_lasso_selection(X_train, y_train, selected_features, percentile=60):
    """Embedded stage: rank pre-filtered features by Lasso coefficient size.

    One ``LassoCV`` model is fitted per label column on the pre-filtered
    features. The absolute coefficients of each model are normalised to sum
    to one, averaged across labels, and features whose averaged importance
    reaches the given percentile of those importances are kept.

    Args:
        X_train: DataFrame of training features.
        y_train: DataFrame of training labels, one column per label.
        selected_features: Positional column indices of ``X_train`` that
            survived the filter stage.
        percentile: Percentile (0-100) of the averaged importances used as
            the inclusion threshold.

    Returns:
        ``numpy.ndarray`` with the subset of ``selected_features`` whose
        importance is at or above the threshold.
    """
    print("\nPerforming embedded selection with Lasso...")

    X_filtered = X_train.iloc[:, selected_features]
    n_labels = y_train.shape[1]
    final_importance = np.zeros(len(selected_features))

    # Lasso selection for each label
    for label_idx in range(n_labels):
        # Fit LassoCV
        lasso = LassoCV(cv=5, random_state=42)
        lasso.fit(X_filtered, y_train.iloc[:, label_idx])

        importance = np.abs(lasso.coef_)
        total = np.sum(importance)
        if total > 0:
            final_importance += importance / total
        # When every coefficient is shrunk to zero the label contributes no
        # importance. Dividing by the zero total would produce NaN, which
        # makes the percentile threshold NaN and rejects every feature.

    # Average Lasso importance across labels
    if n_labels > 0:
        final_importance /= n_labels

    # NOTE(review): if the threshold comes out as 0, the >= comparison also
    # keeps features with zero importance — confirm that this is intended.
    importance_threshold = np.percentile(final_importance, percentile)
    final_mask = final_importance >= importance_threshold
    final_features = np.array(selected_features)[final_mask]

    print(f"Selected {len(final_features)} features in embedded stage")
    print(f"Importance threshold: {importance_threshold:.4f}")

    return final_features

def hybrid_feature_selection(X, y, feature_names, test_size=0.2, random_state=42,
                             k='auto', percentile=60):
    """Complete pipeline for multi-label feature selection.

    Splits the data, then runs the ROC-AUC filter followed by the Lasso
    embedded stage. Both stages see only the training split.

    Args:
        X: DataFrame of features.
        y: DataFrame of labels, one column per label.
        feature_names: Sequence of names indexable by the positional column
            indices of ``X``.
        test_size: Passed to ``train_test_split``.
        random_state: Passed to ``train_test_split``.
        k: Number of features kept by the filter stage (``'auto'`` uses that
            stage's default).
        percentile: Importance percentile used by the embedded stage.

    Returns:
        Dict with keys ``selected_feature_names``, ``selected_feature_indices``,
        ``X_train_selected``, ``X_test_selected``, ``y_train``, ``y_test`` and
        ``roc_scores`` (the per-feature mean ROC-AUC from the filter stage).
    """
    # Split the data first
    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        stratify=y.iloc[:, 0]  # Stratify on first label as approximation
    )

    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Test set: {X_test.shape[0]} samples")

    # Step 1: Filter using ROC-AUC
    selected_feature_indices, roc_scores = filter_roc_auc(X_train, y_train, k=k)

    # Step 2: Embedded selection using Lasso
    final_feature_indices = embedded_lasso_selection(
        X_train, y_train, selected_feature_indices, percentile=percentile
    )

    # Get feature names for final selected features
    final_feature_names = [feature_names[idx] for idx in final_feature_indices]

    return {
        'selected_feature_names': final_feature_names,
        'selected_feature_indices': final_feature_indices,
        'X_train_selected': X_train.iloc[:, final_feature_indices],
        'X_test_selected': X_test.iloc[:, final_feature_indices],
        'y_train': y_train,
        'y_test': y_test,
        'roc_scores': roc_scores
    }

def main():
    """Run feature selection on the resampled CSV data and save the results.

    Reads ``X_resampled_no_crix.csv`` and ``Y_resampled_no_crix.csv`` from the
    working directory, prints the selected features, and writes the reduced
    feature matrix, the labels, and the list of selected feature names.
    """
    # Local import: only this entry point needs it.
    from pathlib import Path

    # Load resampled data from CSV
    X_resampled = pd.read_csv('X_resampled_no_crix.csv')
    y_resampled = pd.read_csv('Y_resampled_no_crix.csv')

    # Get feature names from X_resampled columns
    feature_names = X_resampled.columns.tolist()

    # Run feature selection
    results = hybrid_feature_selection(X_resampled, y_resampled, feature_names)

    # Print results
    print("\nFeature Selection Results:")
    print(f"Original features: {X_resampled.shape[1]}")
    print(f"Selected features: {len(results['selected_feature_names'])}")

    print("\nSelected features:")
    for i, feature_name in enumerate(results['selected_feature_names'], 1):
        print(f"{i}. {feature_name}")

    # Prepare selected feature dataset (all rows, not only the training split)
    X_selected = X_resampled.iloc[:, results['selected_feature_indices']]

    # Save selected feature dataset
    X_selected.to_csv('X_selected_no_crix.csv', index=False)
    y_resampled.to_csv('Y_selected_no_crix.csv', index=False)

    # Save selected feature names. Create the output directory first so a
    # missing folder does not fail the run after all the work is done.
    names_path = Path('feature_selected2/selected_feature_names.txt')
    names_path.parent.mkdir(parents=True, exist_ok=True)
    with open(names_path, 'w', encoding='utf-8') as f:
        f.writelines(f"{feature}\n" for feature in results['selected_feature_names'])

# Run the pipeline only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()

Training set: 577469 samples
Test set: 144368 samples

Performing ROC-AUC based filtering...
Selected 30 features in filter stage
ROC-AUC scores range: 0.529 - 0.807

Performing embedded selection with Lasso...
Selected 12 features in embedded stage
Importance threshold: 0.0266

Feature Selection Results:
Original features: 40
Selected features: 12

Selected features:
1. PNAS
2. DIT
3. NAS
4. NOAM
5. TCC
6. DAC
7. WOC
8. NOA
9. FDP
10. NOM
11. AMW
12. CBO
