In [1]:
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import logging
from typing import Dict, Any, List
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import numpy as np
from sklearn.model_selection import train_test_split


In [2]:
df = pd.read_csv("../data/01_raw/bank-additional-full.csv", sep=";")

<hr>
<hr>

# RFE

In [3]:
X_train_preprocessed = pd.read_csv("../data/03_primary/X_train_preprocessed.csv")
y_train_encoded = pd.read_csv("../data/03_primary/y_train_encoded.csv")

In [4]:
X_train_preprocessed.columns

Index(['age', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'emp_rate_x_employed', 'loan_risk_score', 'economic_pressure_index',
       'default', 'housing', 'loan', 'cpi_above_75th', 'cci_above_75th',
       'young_housing_loan', 'middle_aged_housing_loan', 'senior_housing_loan',
       'young_loan', 'middle_aged_loan', 'senior_loan', 'contacted_before',
       'is_student_or_retired', 'successful_prev_contact', 'has_any_loan',
       'job', 'education', 'campaign_bin', 'cci_top_value',
       'marital_edu_combo', 'marital_divorced', 'marital_married',
       'marital_single', 'marital_unknown', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success', 'age_binned_quantile_0.0',
       'age_binned_quantile_1.0', 'age_binned_quantile_2.0',
       'age_binned_quantile_3.0', 'age_binned_quantile_4.0', 'previous_bin_0',
       'previous_bin_1', 'previous_bin_2', 'previous_bin_3',
       'cpi_top_v

In [5]:
len(X_train_preprocessed.columns)

60

In [None]:
params = {
    "model_params": {
        "n_estimators": 50,       # Menor para ser rápido (aumentar se necessário)
        "max_depth": [3, 4, 5],   # Menos profundidades para testar
        "learning_rate": 0.1,
        "random_state": 42,
        "n_jobs": -1,
        "use_label_encoder": False,
        "eval_metric": "mlogloss"
    },
    "n_features": [10, 14, 18]    # Menos valores para ser rápido
}


In [15]:
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
from typing import List

def feature_selection_rfe_all(X, y, model_params: dict) -> List[str]:
    best_score = 0.0
    best_features = []
    best_n_features = 0
    y = np.ravel(y)
    
    # Dividir treino e validação
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    
    total_features = X.shape[1]
    feature_names = X.columns

    for n in range(1, total_features + 1):
        print(f"\nA testar {n} features...")
        base_model = XGBClassifier(**model_params, verbosity=0)
        rfe = RFE(estimator=base_model, n_features_to_select=n, step=0.2)
        rfe.fit(X_train_split, y_train_split)
        selected_features = feature_names[rfe.support_].tolist()

        model = XGBClassifier(**model_params, verbosity=0)
        model.fit(X_train_split[selected_features], y_train_split)

        y_train_pred = model.predict(X_train_split[selected_features])
        y_val_pred = model.predict(X_val_split[selected_features])

        train_f1 = f1_score(y_train_split, y_train_pred, average="macro")
        val_f1 = f1_score(y_val_split, y_val_pred, average="macro")
        gap = abs(train_f1 - val_f1)

        print(f"Train F1: {train_f1:.4f}, Val F1: {val_f1:.4f}, Gap: {gap:.4f}")

        if val_f1 > best_score:
            best_score = val_f1
            best_features = selected_features
            best_n_features = n
            print(f"✅ Novo melhor modelo com {n} features: {best_features}")

    print(f"\n🎯 Melhor conjunto: {best_features}")
    print(f"⭐️ Número ideal de features: {best_n_features}")
    print(f"⭐️ Melhor F1 de validação: {best_score:.4f}")
    return best_features

# Exemplo de uso:
model_params = {
    "n_estimators": 50,
    "max_depth": 4,
    "learning_rate": 0.1,
    "random_state": 42,
    "n_jobs": -1,
    "use_label_encoder": False,
    "eval_metric": "mlogloss"
}

best_features = feature_selection_rfe_all(X_train_preprocessed, y_train_encoded, model_params)




A testar 1 features...
Train F1: 0.6033, Val F1: 0.5883, Gap: 0.0150
✅ Novo melhor modelo com 1 features: ['nr.employed']

A testar 2 features...
Train F1: 0.7546, Val F1: 0.7453, Gap: 0.0093
✅ Novo melhor modelo com 2 features: ['duration', 'nr.employed']

A testar 3 features...
Train F1: 0.7580, Val F1: 0.7439, Gap: 0.0142

A testar 4 features...
Train F1: 0.7633, Val F1: 0.7487, Gap: 0.0146
✅ Novo melhor modelo com 4 features: ['duration', 'emp.var.rate', 'nr.employed', 'month']

A testar 5 features...
Train F1: 0.7753, Val F1: 0.7622, Gap: 0.0130
✅ Novo melhor modelo com 5 features: ['duration', 'emp.var.rate', 'euribor3m', 'nr.employed', 'month']

A testar 6 features...
Train F1: 0.7777, Val F1: 0.7669, Gap: 0.0107
✅ Novo melhor modelo com 6 features: ['duration', 'pdays', 'emp.var.rate', 'euribor3m', 'nr.employed', 'month']

A testar 7 features...
Train F1: 0.7796, Val F1: 0.7668, Gap: 0.0128

A testar 8 features...
Train F1: 0.7768, Val F1: 0.7661, Gap: 0.0107

A testar 9 featu

In [6]:
params = {
    "model_params": {
        "n_estimators": 500,
        "max_depth": [5,6,7,8],
        "random_state": 42,
        "n_jobs": -1
    },
    "n_features": [14,16,18, 20,24,28,32,36]
}

In [7]:
def feature_selection_rfe(X: pd.DataFrame, y: pd.Series, params: Dict) -> List[str]:
    logger = logging.getLogger(__name__)

    model_params = params["model_params"]
    n_features_list = params["n_features"]

    # Normalize max_depth to a list
    max_depth_list = model_params["max_depth"]
    if isinstance(max_depth_list, int) or max_depth_list is None:
        max_depth_list = [max_depth_list]

    best_score = 0.0
    best_features = []

    y = np.ravel(y)

    # Validation split within training data
    X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    for n in n_features_list:
        for d in max_depth_list:
            logger.info(f"📌 Evaluating RFE with top {n} features and max_depth={d}...")
            model_params["max_depth"] = d

            base_model = RandomForestClassifier(**model_params)
            rfe = RFE(estimator=base_model, n_features_to_select=n)
            rfe.fit(X_train_split, y_train_split)

            selected_features = X.columns[rfe.support_].tolist()
            logger.info(f"🔍 Selected features: {selected_features}")

            # Retrain model only on selected features
            model = RandomForestClassifier(**model_params)
            model.fit(X_train_split[selected_features], y_train_split)

            y_train_pred = model.predict(X_train_split[selected_features])
            y_val_pred = model.predict(X_val_split[selected_features])

            train_f1 = f1_score(y_train_split, y_train_pred, average="macro")
            val_f1 = f1_score(y_val_split, y_val_pred, average="macro")
            gap_check = abs(train_f1 - val_f1)

            print(f"Max Depth: {d}, Features: {n}")
            print(f"Train F1 (macro): {train_f1:.4f}")
            print(f"Validation F1 (macro): {val_f1:.4f}")   
            print(f"Gap (train - val): {gap_check:.4f}")
            logger.info(f"Train f1_macro: {train_f1:.4f} | Val f1_macro: {val_f1:.4f} | 🔍 Gap: {gap_check:.4f}")

            if val_f1 > best_score:
                best_score = val_f1
                best_features = selected_features
                logger.info(f"✅ New best model with {n} features selected")

    logger.info(f"🎯 Best val f1_macro: {best_score:.4f} with {len(best_features)} features")
    return best_features


In [None]:
best_features = feature_selection_rfe(X_train_preprocessed, y_train_encoded, params)
print("🎯 Selected Features:", best_features)

KeyboardInterrupt: 

In [42]:
### tweaking feats
best_features = feature_selection_rfe(X_train_preprocessed, y_train_encoded, params)
print("🎯 Selected Features:", best_features)

Max Depth: 5, Features: 14
Train F1 (macro): 0.6923
Validation F1 (macro): 0.6609
Gap (train - val): 0.0313
Max Depth: 6, Features: 14
Train F1 (macro): 0.7348
Validation F1 (macro): 0.7000
Gap (train - val): 0.0348
Max Depth: 7, Features: 14
Train F1 (macro): 0.7668
Validation F1 (macro): 0.7265
Gap (train - val): 0.0403
Max Depth: 8, Features: 14
Train F1 (macro): 0.7916
Validation F1 (macro): 0.7354
Gap (train - val): 0.0563
Max Depth: 5, Features: 16
Train F1 (macro): 0.7075
Validation F1 (macro): 0.6760
Gap (train - val): 0.0315
Max Depth: 6, Features: 16
Train F1 (macro): 0.7483
Validation F1 (macro): 0.7072
Gap (train - val): 0.0412
Max Depth: 7, Features: 16
Train F1 (macro): 0.7760
Validation F1 (macro): 0.7292
Gap (train - val): 0.0468
Max Depth: 8, Features: 16
Train F1 (macro): 0.7989
Validation F1 (macro): 0.7437
Gap (train - val): 0.0552
Max Depth: 5, Features: 18
Train F1 (macro): 0.6971
Validation F1 (macro): 0.6673
Gap (train - val): 0.0298
Max Depth: 6, Features: 18
T

# ANALYSIS OF OUTPUTS OF NODE.PY FROM FEATURE SELECTION - UNIFY FINAL LIST OF FEATURES

In [6]:
import pickle

def load_features(file_path):
    with open(file_path, "rb") as f:
        return pickle.load(f)

rfe_features = load_features("../data/04_feature/rfe_selected_features.pkl")
chi2_features = load_features("../data/04_feature/chi2_selected_features.pkl")
boruta_features = load_features("../data/04_feature/boruta_selected_features.pkl")
var_threshold_features = load_features("../data/04_feature/variance_threshold_selected_features.pkl")

In [7]:

print("RFE:", len(rfe_features))
print("Chi2:", len(chi2_features))
print("Boruta:", len(boruta_features))
print("Variance Threshold:", len(var_threshold_features))


common_features = set(rfe_features) & set(chi2_features) & set(boruta_features) & set(var_threshold_features)
print(f"Common Features Across All Methods ({len(common_features)}):\n", common_features)



RFE: 19
Chi2: 44
Boruta: 14
Variance Threshold: 60
Common Features Across All Methods (11):
 {'is_student_or_retired', 'default', 'cpi_top_value_92.893', 'cpi_top_value_93.994', 'cci_top_value', 'age_binned_quantile_3.0', 'poutcome_failure', 'euribor3m', 'nr.employed', 'pdays', 'successful_prev_contact'}


## RFE PICKS:

In [8]:
print((rfe_features))

['age', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'default', 'senior_housing_loan', 'is_student_or_retired', 'successful_prev_contact', 'cci_top_value', 'poutcome_failure', 'age_binned_quantile_2.0', 'age_binned_quantile_3.0', 'cpi_top_value_92.893', 'cpi_top_value_93.994', 'euribor_bin_high']


## CHI2 

In [9]:
print(chi2_features)

['age', 'campaign', 'pdays', 'previous', 'euribor3m', 'nr.employed', 'loan_risk_score', 'economic_pressure_index', 'default', 'cpi_above_75th', 'cci_above_75th', 'young_housing_loan', 'middle_aged_housing_loan', 'senior_housing_loan', 'young_loan', 'middle_aged_loan', 'senior_loan', 'contacted_before', 'is_student_or_retired', 'successful_prev_contact', 'cci_top_value', 'marital_married', 'marital_single', 'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success', 'age_binned_quantile_0.0', 'age_binned_quantile_2.0', 'age_binned_quantile_3.0', 'age_binned_quantile_4.0', 'previous_bin_0', 'previous_bin_1', 'previous_bin_2', 'previous_bin_3', 'cpi_top_value_92.893', 'cpi_top_value_93.918', 'cpi_top_value_93.994', 'cpi_top_value_other', 'euribor_bin_high', 'euribor_bin_low', 'euribor_bin_very_high', 'euribor_bin_very_low', 'education_mapped_basic_education', 'education_mapped_higher_education']


## BORUTA

In [10]:
print(boruta_features)

['pdays', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'default', 'is_student_or_retired', 'successful_prev_contact', 'cci_top_value', 'poutcome_failure', 'age_binned_quantile_3.0', 'cpi_top_value_92.893', 'cpi_top_value_93.994']


In [13]:
for col in rfe_features:
    if col not in boruta_features:
        print(f"Feature {col} is not selected by Boruta.")

Feature age is not selected by Boruta.
Feature previous is not selected by Boruta.
Feature senior_housing_loan is not selected by Boruta.
Feature age_binned_quantile_2.0 is not selected by Boruta.
Feature euribor_bin_high is not selected by Boruta.


These were the 5 features not selected by boruta

In [14]:
for col in boruta_features:
    if col not in rfe_features:
        print(f"Feature {col} is not selected by Boruta.")

## VAR

In [11]:
print(var_threshold_features)

['age', 'campaign', 'pdays', 'previous', 'emp.var.rate', 'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed', 'emp_rate_x_employed', 'loan_risk_score', 'economic_pressure_index', 'default', 'housing', 'loan', 'cpi_above_75th', 'cci_above_75th', 'young_housing_loan', 'middle_aged_housing_loan', 'senior_housing_loan', 'young_loan', 'middle_aged_loan', 'senior_loan', 'contacted_before', 'is_student_or_retired', 'successful_prev_contact', 'has_any_loan', 'job', 'education', 'campaign_bin', 'cci_top_value', 'marital_edu_combo', 'marital_divorced', 'marital_married', 'marital_single', 'marital_unknown', 'poutcome_failure', 'poutcome_nonexistent', 'poutcome_success', 'age_binned_quantile_0.0', 'age_binned_quantile_1.0', 'age_binned_quantile_2.0', 'age_binned_quantile_3.0', 'age_binned_quantile_4.0', 'previous_bin_0', 'previous_bin_1', 'previous_bin_2', 'previous_bin_3', 'cpi_top_value_92.893', 'cpi_top_value_93.918', 'cpi_top_value_93.994', 'cpi_top_value_other', 'euribor_bin_high',

In [12]:
for col in X_train_preprocessed.columns:
    if col not in var_threshold_features:
        print(f"Feature {col} is not selected by any method.")


- there wer no features with variance = 0

## FINAL SET OF FEATURES:

- should contain all features from boruta and rfe

- variance threshold did not eliminate any features

- chi-2: 44 features chosen, we can cross check with rfe and boruta

In [17]:
for col in chi2_features:
    if col not in rfe_features:
        print(f"Feature {col} is not selected by RFE.")
print("Number of features not selected by RFE:", len(chi2_features) - len(rfe_features))

Feature campaign is not selected by RFE.
Feature loan_risk_score is not selected by RFE.
Feature economic_pressure_index is not selected by RFE.
Feature cpi_above_75th is not selected by RFE.
Feature cci_above_75th is not selected by RFE.
Feature young_housing_loan is not selected by RFE.
Feature middle_aged_housing_loan is not selected by RFE.
Feature young_loan is not selected by RFE.
Feature middle_aged_loan is not selected by RFE.
Feature senior_loan is not selected by RFE.
Feature contacted_before is not selected by RFE.
Feature campaign_bin is not selected by RFE.
Feature marital_married is not selected by RFE.
Feature marital_single is not selected by RFE.
Feature poutcome_nonexistent is not selected by RFE.
Feature poutcome_success is not selected by RFE.
Feature age_binned_quantile_0.0 is not selected by RFE.
Feature age_binned_quantile_4.0 is not selected by RFE.
Feature previous_bin_0 is not selected by RFE.
Feature previous_bin_1 is not selected by RFE.
Feature previous_bin

- We can try for now include all features chosen from rfe and boruta and those selected by chi2

In [17]:
rfe_features

['age',
 'pdays',
 'previous',
 'emp.var.rate',
 'cons.price.idx',
 'cons.conf.idx',
 'euribor3m',
 'nr.employed',
 'default',
 'senior_housing_loan',
 'is_student_or_retired',
 'successful_prev_contact',
 'cci_top_value',
 'poutcome_failure',
 'age_binned_quantile_2.0',
 'age_binned_quantile_3.0',
 'cpi_top_value_92.893',
 'cpi_top_value_93.994',
 'euribor_bin_high']

In [18]:
chi2_features

['age',
 'campaign',
 'pdays',
 'previous',
 'euribor3m',
 'nr.employed',
 'loan_risk_score',
 'economic_pressure_index',
 'default',
 'cpi_above_75th',
 'cci_above_75th',
 'young_housing_loan',
 'middle_aged_housing_loan',
 'senior_housing_loan',
 'young_loan',
 'middle_aged_loan',
 'senior_loan',
 'contacted_before',
 'is_student_or_retired',
 'successful_prev_contact',
 'cci_top_value',
 'marital_married',
 'marital_single',
 'poutcome_failure',
 'poutcome_nonexistent',
 'poutcome_success',
 'age_binned_quantile_0.0',
 'age_binned_quantile_2.0',
 'age_binned_quantile_3.0',
 'age_binned_quantile_4.0',
 'previous_bin_0',
 'previous_bin_1',
 'previous_bin_2',
 'previous_bin_3',
 'cpi_top_value_92.893',
 'cpi_top_value_93.918',
 'cpi_top_value_93.994',
 'cpi_top_value_other',
 'euribor_bin_high',
 'euribor_bin_low',
 'euribor_bin_very_high',
 'euribor_bin_very_low',
 'education_mapped_basic_education',
 'education_mapped_higher_education']

In [22]:
final_set = list(set(rfe_features) | set(chi2_features))
dupes = list(set(rfe_features) & set(chi2_features))

print("Final Set of Features:", final_set)
print("Number of features in final set:", len(final_set))
print("Duplicate Features (in both):", dupes)


Final Set of Features: ['is_student_or_retired', 'cpi_top_value_92.893', 'poutcome_success', 'cci_top_value', 'marital_single', 'previous_bin_1', 'age', 'age_binned_quantile_0.0', 'education_mapped_basic_education', 'poutcome_nonexistent', 'middle_aged_housing_loan', 'previous_bin_0', 'previous', 'euribor_bin_high', 'emp.var.rate', 'age_binned_quantile_4.0', 'cpi_top_value_93.994', 'cpi_top_value_93.918', 'euribor_bin_very_high', 'loan_risk_score', 'nr.employed', 'education_mapped_higher_education', 'cons.conf.idx', 'pdays', 'successful_prev_contact', 'senior_loan', 'age_binned_quantile_2.0', 'senior_housing_loan', 'cci_above_75th', 'young_housing_loan', 'euribor_bin_very_low', 'young_loan', 'middle_aged_loan', 'contacted_before', 'default', 'euribor_bin_low', 'age_binned_quantile_3.0', 'campaign', 'cpi_above_75th', 'previous_bin_3', 'poutcome_failure', 'marital_married', 'euribor3m', 'economic_pressure_index', 'previous_bin_2', 'cons.price.idx', 'cpi_top_value_other']
Number of featur

In [23]:
final_set

['is_student_or_retired',
 'cpi_top_value_92.893',
 'poutcome_success',
 'cci_top_value',
 'marital_single',
 'previous_bin_1',
 'age',
 'age_binned_quantile_0.0',
 'education_mapped_basic_education',
 'poutcome_nonexistent',
 'middle_aged_housing_loan',
 'previous_bin_0',
 'previous',
 'euribor_bin_high',
 'emp.var.rate',
 'age_binned_quantile_4.0',
 'cpi_top_value_93.994',
 'cpi_top_value_93.918',
 'euribor_bin_very_high',
 'loan_risk_score',
 'nr.employed',
 'education_mapped_higher_education',
 'cons.conf.idx',
 'pdays',
 'successful_prev_contact',
 'senior_loan',
 'age_binned_quantile_2.0',
 'senior_housing_loan',
 'cci_above_75th',
 'young_housing_loan',
 'euribor_bin_very_low',
 'young_loan',
 'middle_aged_loan',
 'contacted_before',
 'default',
 'euribor_bin_low',
 'age_binned_quantile_3.0',
 'campaign',
 'cpi_above_75th',
 'previous_bin_3',
 'poutcome_failure',
 'marital_married',
 'euribor3m',
 'economic_pressure_index',
 'previous_bin_2',
 'cons.price.idx',
 'cpi_top_value_o

In [25]:
with open("../data/04_feature/final_selected_features.pkl", "wb") as f:
    pickle.dump(final_set, f)