In [10]:
# Feature Importance Selection Module
 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ------------------------------
# Load Dataset
# ------------------------------
def load_dataset(filename, target_column):
    """Read a CSV file, one-hot encode it, and split features from target.

    The whole frame -- target included -- is passed through
    ``pd.get_dummies(..., drop_first=True)`` *before* the split. As a
    consequence ``target_column`` must be the column name as it exists
    after encoding (pandas names dummy columns ``<column>_<value>``), not
    necessarily the name found in the raw CSV header.

    Args:
        filename: Path or file-like object accepted by ``pd.read_csv``.
        target_column: Name of the dependent-variable column in the
            encoded frame.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the encoded frame without
        ``target_column`` and ``y`` is that column.

    Raises:
        KeyError: If ``target_column`` is not present after encoding. The
            message lists the columns that are available.
    """
    # Read CSV
    df = pd.read_csv(filename)

    # One-hot encode categorical features (first level of each is dropped)
    df = pd.get_dummies(df, drop_first=True)

    # Split into independent (X) and dependent (y)
    try:
        X = df.drop(columns=target_column)
    except KeyError as err:
        # The usual cause is passing the raw CSV name of a categorical
        # target; show the encoded names so the caller can pick the right one.
        raise KeyError(
            f"target column {target_column!r} not found after one-hot "
            f"encoding; available columns: {list(df.columns)}"
        ) from err
    y = df[target_column]

    return X, y


# ------------------------------
# Feature Importance Selection
# ------------------------------
def feature_importance_selection(X, y, n_features=5, *, n_estimators=100,
                                 random_state=0):
    """Rank features with a random forest and keep the top ``n_features``.

    A ``RandomForestClassifier`` is fitted on ``(X, y)`` and its
    ``feature_importances_`` are used to rank the columns of ``X``. The
    full ranking and the selected names are printed to stdout.

    Args:
        X: Feature matrix. Must be a DataFrame, since ``X.columns`` is read
            and the columns are selected by name.
        y: Target labels passed to ``RandomForestClassifier.fit``.
        n_features: Number of top-ranked features to keep. Must not be
            negative. If it exceeds the number of columns, all columns are
            kept.
        n_estimators: Number of trees in the forest (keyword-only).
        random_state: Seed forwarded to the forest (keyword-only).

    Returns:
        A ``(X_selected, feature_importance_df, top_features)`` tuple:
        ``X`` restricted to the selected columns, a DataFrame with
        ``Feature`` and ``Importance`` columns sorted by descending
        importance, and the list of selected column names.

    Raises:
        ValueError: If ``n_features`` is negative.
    """
    # Reject negatives before fitting: DataFrame.head(-k) means "all but the
    # last k rows", which would silently select the wrong set of features.
    # None is let through unchanged for backward compatibility.
    if n_features is not None and n_features < 0:
        raise ValueError(
            f"n_features must be non-negative, got {n_features!r}"
        )

    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
    )
    rf.fit(X, y)

    # Get feature importance
    importances = rf.feature_importances_

    feature_importance_df = pd.DataFrame({
        'Feature': X.columns,
        'Importance': importances
    }).sort_values(by='Importance', ascending=False)

    # Select top-n
    top_features = feature_importance_df.head(n_features)['Feature'].tolist()
    X_selected = X[top_features]

    print("\n Feature Importance Ranking:")
    print(feature_importance_df)

    print(f"\n Top {n_features} selected features:")
    print(top_features)

    return X_selected, feature_importance_df, top_features


# ------------------------------
# Example Run
# ------------------------------
if __name__ == "__main__":
    # Load dataset; the target name is the one produced by one-hot encoding.
    X, y = load_dataset("prep.csv", "classification_yes")

    # Split into train/test (25% held out, fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

    # Scale features: the scaler is fitted on the training split only and
    # then applied to the test split.
    scaler = StandardScaler()

    # Convert back to DataFrame (after scaling). The scaler returns plain
    # arrays, so the column names AND the original row index are restored
    # explicitly; without the index, X_train/X_test would be renumbered
    # 0..n-1 and would no longer align by label with y_train/y_test.
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train),
        columns=X_train.columns,
        index=X_train.index,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test),
        columns=X_test.columns,
        index=X_test.index,
    )

    # Feature selection (ranked on the training split only)
    X_selected, importance_df, top_feats = feature_importance_selection(X_train, y_train, n_features=3)



 Feature Importance Ranking:
        Feature  Importance
10          pcv    0.223889
9          hrmo    0.208717
6            sc    0.104623
12           rc    0.078391
4           bgr    0.060795
2            al    0.053595
5            bu    0.046562
21      htn_yes    0.042001
22       dm_yes    0.040559
7           sod    0.030193
14         sg_c    0.022539
15         sg_d    0.016589
13         sg_b    0.015881
0           age    0.010809
1            bp    0.008730
18    pc_normal    0.005421
11           wc    0.005321
17   rbc_normal    0.005299
25       pe_yes    0.005014
8           pot    0.004996
24    appet_yes    0.004753
3            su    0.004155
26      ane_yes    0.000933
19  pcc_present    0.000217
20   ba_present    0.000017
16         sg_e    0.000000
23      cad_yes    0.000000

 Top 3 selected features:
['pcv', 'hrmo', 'sc']


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):
