In [None]:
%pip install boruta

In [None]:
'''
 -----------------------------------------------------------
          Artificial Intelligence Workshop RUG
 -----------------------------------------------------------
            R.M. (Rolando) Gonzales Martinez
 -----------------------------------------------------------
 ~~~~~~~ Credit scoring model with Machine Learning ~~~~~~~~
             Feature Selection with Boruta
 -----------------------------------------------------------
'''
import pandas as pd
# Load the loan dataset from the Excel workbook (path is relative to the
# notebook's working directory).
df = pd.read_excel("bankloans.xlsx")
# Print the first rows as a quick sanity check of the load.
print(df.head())
# Column dictionary (as described by the author; not verified against the
# workbook from this cell):
# age: Age in years
# education: Level of education: (1) did not complete high school, (2) high school degree, (3) some college, (4) college degree, (5) post-undergraduate degree
# employears: Years with current employer
# address: Years at current address
# salary: Salary in thousands
# creddebt: Credit card debt in thousands
# othdebt: Other debt in thousands
# default: Credit default (target variable)

In [None]:
# Feature selection with Boruta
#
# Pipeline of this cell:
#   1. impute + one-hot encode the raw loan features,
#   2. fit BorutaPy wrapped around a random forest,
#   3. tabulate, per feature, whether it was selected / tentative and its rank.
# Expects `df` (the loan DataFrame) to have been loaded by an earlier cell.
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from boruta import BorutaPy

# Define features and target
# NOTE(review): the data dictionary also lists 'address' (years at current
# address), which is not included here — confirm whether that is intentional.
numerical_cols = ['age', 'employears', 'salary', 'creddebt', 'othdebt']
categorical_cols = ['education']
target_col = 'default'

# Separate features and target.
# Rows without a label cannot be used for supervised selection, and
# `.astype(int)` raises on missing values, so drop them first. When the
# target has no missing values this leaves the data unchanged.
labelled_df = df.dropna(subset=[target_col])
X = labelled_df[numerical_cols + categorical_cols]
y = labelled_df[target_col].astype(int)

# Impute numerical columns with the column mean
num_imputer = SimpleImputer(strategy='mean')
X_num = num_imputer.fit_transform(X[numerical_cols])

# Impute categorical columns with the most frequent level, then one-hot encode
cat_imputer = SimpleImputer(strategy='most_frequent')
X_cat_imputed = cat_imputer.fit_transform(X[categorical_cols])

# sparse_output=False yields a dense array so it can be stacked with X_num
encoder = OneHotEncoder(drop='if_binary', sparse_output=False)
X_cat = encoder.fit_transform(X_cat_imputed)

# Combine numerical and categorical (numerical columns first, then dummies)
X_processed = np.hstack([X_num, X_cat])

# Fit Boruta
# The forest uses the same seed as BorutaPy (0) so reruns are reproducible.
forest = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')
boruta = BorutaPy(estimator=forest, n_estimators='auto', random_state=0)
boruta.fit(X_processed, y)

# Feature names, in the same column order as X_processed
cat_feature_names = encoder.get_feature_names_out(categorical_cols)
feature_names = numerical_cols + list(cat_feature_names)

# Select important features (those Boruta confirmed)
selected_features = [feature for feature, support in zip(feature_names, boruta.support_) if support]

# Per-feature summary: confirmed flag, tentative flag and Boruta ranking
boruta_result_df = pd.DataFrame({
    "Feature": feature_names,
    "Selected": boruta.support_,
    "Tentative": boruta.support_weak_,
    "Ranking": boruta.ranking_
})

boruta_result_df