In [None]:
# ANOVA F-test Implementation
# Based on LAB5 materials - for numerical features with classification target

def apply_anova_ftest(X, y, k=10):
    """
    Rank numerical features with the ANOVA F-test and keep the k best.

    Parameters:
    X: DataFrame with numerical features (its column names are used as
       feature names)
    y: target variable (classification labels), one entry per row of X
    k: number of top features to select, or 'all'; an integer larger than
       the number of columns in X is reduced to that number

    Returns:
    selected_features: list of selected feature names, in the column order of X
    results_df: DataFrame with columns 'feature', 'f_score' and 'selected',
        one row per column of X, sorted by F-score in descending order
    selector: fitted SelectKBest selector, usable for transform
    """
    print("Applying ANOVA F-test...")
    print("ANOVA F-test measures the linear dependency between features and target")
    print("Higher F-scores indicate stronger linear relationship")

    # Never request more features than X provides: depending on the
    # scikit-learn version an oversized k either raises or only warns.
    # String values such as 'all' are passed through untouched.
    if not isinstance(k, str):
        k = min(k, X.shape[1])

    # Only the fitted selector is needed here; the transformed matrix is not used.
    selector = SelectKBest(score_func=f_classif, k=k)
    selector.fit(X, y)

    # Boolean mask over the columns of X marking the selected features
    support_mask = selector.get_support()
    selected_features = X.columns[support_mask].tolist()

    # One row per feature, best score first
    results_df = pd.DataFrame({
        'feature': X.columns,
        'f_score': selector.scores_,
        'selected': support_mask
    }).sort_values('f_score', ascending=False)

    print(f"\nSelected {len(selected_features)} features using ANOVA F-test:")
    # results_df is already sorted by score, so the first selected rows are
    # the strongest features and the printed numbering is a true ranking.
    top_selected = results_df.loc[results_df['selected']].head(5)
    for rank, row in enumerate(top_selected.itertuples(index=False), start=1):
        print(f"  {rank}. {row.feature}: F-score = {row.f_score:.2f}")

    if len(selected_features) > 5:
        print(f"  ... and {len(selected_features) - 5} more features")

    return selected_features, results_df, selector

# Example usage with numerical features
if not df.empty:
    # Candidate features: numeric columns, minus the target, identifier
    # columns (named 'id' or ending in '_id'), and columns without a single
    # observed value (mean imputation cannot fill those, and the F-test
    # rejects NaN input).
    numerical_features = [
        col for col in df.select_dtypes(include=[np.number]).columns
        if col != 'loan_status'
        and col.lower() != 'id'
        and not col.lower().endswith('_id')
        and df[col].notna().any()
    ]

    if len(numerical_features) >= 5 and target_column:
        # Fill remaining gaps with each column's mean
        X_numerical = df[numerical_features].fillna(df[numerical_features].mean())

        target = df[target_column]
        if target.nunique() == 2:
            # nunique() ignores NaN, so a two-class target can still hold
            # missing labels; drop those rows from both X and y.
            labelled = target.notna()
            X_numerical = X_numerical.loc[labelled]
            y = target.loc[labelled]
        else:
            # Encode the class labels as integers (the target stays
            # multi-class); missing labels form their own 'Unknown' class.
            le = LabelEncoder()
            y = le.fit_transform(target.fillna('Unknown'))

        print(f"Demonstrating ANOVA F-test on {len(numerical_features)} numerical features")

        # Apply ANOVA F-test
        selected_anova, anova_results, anova_selector = apply_anova_ftest(
            X_numerical, y, k=min(10, len(numerical_features))
        )

        # Visualize the highest-scoring features (at most 15)
        top_features = anova_results.head(15)
        positions = range(len(top_features))

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.bar(positions, top_features['f_score'])
        ax.set_xticks(positions)
        ax.set_xticklabels(top_features['feature'], rotation=45, ha='right')
        # Report the number of bars actually drawn rather than a fixed 15
        ax.set_title(f'Top {len(top_features)} Features by ANOVA F-test Score')
        ax.set_xlabel('Features')
        ax.set_ylabel('F-Score')
        fig.tight_layout()
        plt.show()
    else:
        print("ANOVA F-test demonstration requires numerical features and target variable")
else:
    print("No data available for ANOVA F-test demonstration")

## Univariate Statistical Tests
### ANOVA F-test for Numerical Features
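Based on LAB5 materials. The ANOVA F-test scores each numerical feature by comparing how much its mean differs between the target classes with how much it varies within each class; a higher F-score indicates a stronger linear dependency between the feature and the target.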

# Feature Selection for Lending Club Data
## COMP647 Assignment 03
### Student ID: 1163127

This notebook implements feature selection techniques including:
- Univariate statistical tests (ANOVA, Chi-Square)
- Wrapper methods (Forward/Backward selection)
- Embedded methods (Random Forest importance)

Based on LAB5 materials and course teachings.

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Feature selection libraries
from sklearn.feature_selection import (
    SelectKBest, f_classif, chi2, 
    RFE, RFECV,
    SelectFromModel
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Seed NumPy's global random generator so repeated runs give the same results
np.random.seed(42)

# Display settings in a single call: no limit on columns, at most 20 rows
pd.set_option('display.max_columns', None, 'display.max_rows', 20)

In [None]:
# Load preprocessed data from previous notebooks

# Defined up front so it exists even when loading fails or the file is
# empty; otherwise any later reference would raise NameError.
target_column = None

try:
    # Load the processed sample data
    df = pd.read_csv('../data/processed/accepted_sample_10000.csv')
    print(f"Data loaded successfully: {df.shape}")
    print(f"Columns: {len(df.columns)}")
except FileNotFoundError:
    print("Processed data not found. Please run Assignment 02 notebooks first.")
    # Fall back to an empty frame so code guarded by `df.empty` is skipped
    df = pd.DataFrame()

# Display basic info about the dataset
if not df.empty:
    print("\nDataset Info:")
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that would emit a stray "None" line).
    df.info()

    # Identify target variable (loan status)
    target_column = 'loan_status' if 'loan_status' in df.columns else None
    if target_column:
        print(f"\nTarget variable: {target_column}")
        print("Target distribution:")
        print(df[target_column].value_counts())
    else:
        print("\nWarning: loan_status column not found. Will create binary target for demonstration.")