***IMPORTS***

In [None]:
from typing import List, Tuple, Literal, Any, Union
from dataclasses import dataclass
import pandas as pd
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import gc

def show(*args):
    """Render ``args`` through IPython's ``display`` when IPython can be imported; otherwise ``print`` them."""
    try:
        from IPython.display import display as render

        # Kept inside the ``try`` so an ImportError raised while rendering
        # still falls back to plain printing.
        render(*args)
    except ImportError:
        print(*args)

***LOADING DATA***

In [None]:
def load_data() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Load the bank churn data from the CSV files in ``./data``.

    Returns:
        A ``(bank_data, bank_query)`` tuple read from ``./data/train.csv``
        and ``./data/test.csv`` respectively.

    Raises:
        FileNotFoundError: if either CSV file is missing. The original
            pandas error is attached as ``__cause__`` so the offending path
            stays visible in the traceback.
    """
    try:
        bank_data = pd.read_csv("./data/train.csv")
        bank_query = pd.read_csv("./data/test.csv")
    except FileNotFoundError as err:
        # FileNotFoundError is still an ``Exception``, so callers that catch
        # the broad type keep working, while the specific type and the chained
        # cause make the failure easier to diagnose.
        raise FileNotFoundError(
            "Kaggle data files not found. Please download the data from https://www.kaggle.com/competitions/playground-series-s4e1/data and place them in the data folder."
        ) from err
    return bank_data, bank_query
    

***UTILS***

In [None]:
@dataclass
class ColumnTransformerWrapper:
    """Thin wrapper around ``ColumnTransformer`` that returns a labelled ``DataFrame``.

    The result of ``fit_transform`` keeps the index of the input frame and
    uses the transformer's output feature names as column labels.
    """

    # (name, transformer, columns) triples, forwarded to ColumnTransformer as-is.
    transformers: List[Tuple[str, Any, List[str]]]
    # What to do with columns not listed in ``transformers``.
    remainder: Literal["drop", "passthrough"] = "passthrough"

    def fit_transform(self, X: pd.DataFrame, y: Union[Any, None] = None) -> pd.DataFrame:  # type: ignore
        """Fit a fresh ``ColumnTransformer`` on ``X`` and return the transformed frame.

        Args:
            X: input frame; its index is reused for the result.
            y: optional target, forwarded to ``ColumnTransformer.fit_transform``.

        Returns:
            A ``DataFrame`` with one column per output feature.
        """
        # sparse_threshold=0 forces a dense array: encoders such as
        # OneHotEncoder emit sparse matrices, and a sparse stacked result
        # cannot be wrapped in a DataFrame with index/columns below.
        ct = ColumnTransformer(
            self.transformers,
            remainder=self.remainder,
            sparse_threshold=0,
        )
        transformed = ct.fit_transform(X, y)  # type: ignore

        return pd.DataFrame(
            transformed,
            index=X.index,
            columns=ct.get_feature_names_out(),
        )


def preprocess_data(data: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Turn the raw bank frame into a feature matrix and a target frame.

    The frame is indexed by ``id`` and sorted by that index. Numeric columns
    are passed through unchanged, categorical columns are one-hot encoded with
    the first level dropped, and every other column is discarded.

    Args:
        data: raw frame containing the ``id``, feature and ``Exited`` columns.
            It is not modified.

    Returns:
        ``(X, y)`` where both frames share the same sorted ``id`` index, so
        row *i* of ``X`` always corresponds to row *i* of ``y``.

    Raises:
        KeyError: if a required column (including ``Exited``) is missing.
    """
    cat_features = ["Geography", "Gender"]
    num_features = [
        "CreditScore",
        "Age",
        "Tenure",
        "Balance",
        "NumOfProducts",
        "EstimatedSalary",
        "HasCrCard",
        "IsActiveMember",
    ]
    id_features = ["id"]
    target_features = ["Exited"]

    # set_index/sort_index both return new frames, so no defensive copy of
    # ``data`` is needed to leave the caller's frame untouched.
    data_indexed = data.set_index(id_features).sort_index()

    ctw = ColumnTransformerWrapper(
        transformers=[
            ("num", "passthrough", num_features),
            ("cat", OneHotEncoder(drop="first"), cat_features),
        ],
        remainder="drop",
    )
    X = ctw.fit_transform(data_indexed)
    # Take the target from the SAME indexed/sorted frame as X. Reading it from
    # the raw frame would leave y in the original row order with the original
    # index, misaligned with X whenever ids are not already sorted 0..n-1.
    y = data_indexed[target_features]

    return X, y


def split_data(
    X: pd.DataFrame, y: pd.DataFrame, test_size: float, random_state: Union[int, None]
) -> List[Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]]:
    """Split features and target into one train/test partition.

    The result is a one-element list of ``(X_train, y_train, X_test, y_test)``
    so callers can iterate over it the same way they would iterate over the
    folds of a K-fold cross-validation.
    """
    parts = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # train_test_split yields (X_train, X_test, y_train, y_test); regroup so
    # each feature frame sits next to its target.
    features_fit, features_eval, target_fit, target_eval = parts

    return [(features_fit, target_fit, features_eval, target_eval)]


def create_model(**hyper_params):
    """Build a ``RandomForestClassifier`` configured with the given keyword hyper-parameters."""
    return RandomForestClassifier(**hyper_params)


def brute_force_features(
    X_train: pd.DataFrame, 
    y_train: pd.DataFrame,
    X_test: pd.DataFrame,
    y_test: pd.DataFrame,
    bitmap_start: int = 1,
) -> Tuple[float, List[str]]:
    """
    Exhaustively search feature subsets on a single train/test split.

    Every integer in ``[bitmap_start, 2**n_features)`` is read as a bit mask
    over the columns of ``X_train`` (bit *i* set -> column *i* is used). A
    fresh model is fitted for each mask and scored by accuracy on the test
    split. Returns ``(best_score, best_features)``; if there are more than 20
    columns the search is skipped and ``(0.0, [])`` is returned.
    """
    show(X_train.columns)
    top_score : float = 0.0
    top_subset : List[str] = []

    columns = X_train.columns
    n_columns : int = len(columns)
    mask_limit : int = 2**n_columns

    if n_columns > 20:
        show("The number of features is too large for brute force search.")
        return top_score, top_subset

    for mask in range(bitmap_start, mask_limit):
        show(f"Trying bitmap: {mask}/{mask_limit} ")
        # Decode the mask: keep the column at position ``pos`` when bit ``pos`` is set.
        subset = [name for pos, name in enumerate(columns) if (mask >> pos) & 1]
        if len(subset) == 0:
            continue

        clf = create_model()
        clf.fit(X_train[subset], y_train.squeeze(axis=1))

        # Shape the predictions like ``y_test`` before scoring.
        predicted = pd.DataFrame(
            data=clf.predict(X_test[subset]),
            index=y_test.index,
            columns=y_test.columns,
        )

        current = accuracy_score(y_test, predicted)
        if current > top_score:
            top_score, top_subset = current, subset
            show(f"Best score: {top_score:.2f}")
            show(f"Best features: {top_subset}")

        # Release the fitted forest before building the next one.
        del clf
        gc.collect()

    return top_score, top_subset


def execute_test_brute_force_run(bitmap_start : int) -> Tuple[float, List[str]]:
    """Run the single-split brute-force feature search end to end.

    Loads the data, preprocesses the training frame, makes one 80/20 split
    with a fixed seed of 42 and searches feature subsets starting at
    ``bitmap_start``. Returns the score and features of the last split processed.
    """
    bank_data, _ = load_data()
    X, y = preprocess_data(bank_data)

    for split in split_data(X, y, 0.2, 42):
        X_train, y_train, X_test, y_test = split
        best_score, best_features = brute_force_features(
            X_train, y_train, X_test, y_test, bitmap_start
        )
        show(f"Best score: {best_score:.2f}")
        show(f"Best features: {best_features}")

    return best_score, best_features


In [None]:
def brute_force_features_with_k_fold_validation(
    X: pd.DataFrame,
    y: pd.DataFrame,
    bitmap_start: int = 1,
    n_splits: int = 5,
) -> Tuple[float, List[str]]:
    """Exhaustively search feature subsets, scoring each by K-fold accuracy.

    Every integer in ``[bitmap_start, 2**n_features)`` is read as a bit mask
    over the columns of ``X`` (bit *i* set -> column *i* is used). For each
    mask a fresh model is fitted per fold and the mean fold accuracy is the
    subset's score.

    Args:
        X: feature frame.
        y: single-column target frame aligned with ``X``.
        bitmap_start: first mask to evaluate, which allows resuming an
            interrupted search. Values below 1 are treated as 1 because
            mask 0 selects no features.
        n_splits: number of (unshuffled) folds.

    Returns:
        ``(best_score, best_features)``. ``(0.0, [])`` is returned when no
        subset was evaluated, including when ``X`` has more than 20 columns
        (same limit as ``brute_force_features``).
    """
    show(X.columns)

    best_score: float = 0.0
    best_features: List[str] = []

    size: int = len(X.columns)
    bitmap_size: int = 2**size

    if size > 20:
        show("The number of features is too large for brute force search.")
        return best_score, best_features

    # Folds are unshuffled and therefore identical for every subset:
    # compute the index arrays once instead of once per bitmap.
    folds = list(KFold(n_splits=n_splits, shuffle=False).split(X))

    # Honour ``bitmap_start`` (resume point) and stop before 2**size, whose
    # low ``size`` bits are all zero and would select no features.
    for bitmap in range(max(bitmap_start, 1), bitmap_size):
        features = [
            column for i, column in enumerate(X.columns) if (bitmap >> i) & 1
        ]
        show(f"Trying bitmap: {bitmap}/{bitmap_size} : {features}")

        X_subset = X[features]

        scores = []
        for train_index, test_index in folds:
            X_train, X_test = X_subset.iloc[train_index], X_subset.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            model = create_model()
            model.fit(X_train, y_train.squeeze(axis=1))

            # Shape the predictions like ``y_test`` before scoring.
            y_pred = pd.DataFrame(
                data=model.predict(X_test),
                index=y_test.index,
                columns=y_test.columns,
            )

            scores.append(accuracy_score(y_test, y_pred))

            del model

        score = sum(scores) / len(scores)
        # Free the discarded forests before moving on to the next subset.
        gc.collect()

        if score > best_score:
            best_score = score
            best_features = features
            show(f"Best score: {best_score:.4f}")
            show(f"Best features: {best_features}")

    return best_score, best_features

def execute_with_k_fold_validation(bitmap_start : int) -> Tuple[float, List[str]]:
    """Run the K-fold brute-force feature search end to end.

    Loads the data, preprocesses the training frame and hands it, together
    with ``bitmap_start``, to the K-fold search. Returns that search's
    ``(best_score, best_features)`` result.
    """
    train_frame, _ = load_data()
    features, target = preprocess_data(train_frame)

    return brute_force_features_with_k_fold_validation(features, target, bitmap_start)

***EXECUTION***

In [None]:
# --- NOTICE ---
# The code below is the entry point of the script.
# BITMAP_START is forwarded as `bitmap_start` to the runner called at the
# bottom of this cell; it is intended to let a long search be stopped and
# later resumed from that bitmap.
BITMAP_START = 100
# The bare string below is a no-op expression used as a hand-maintained log of
# the results observed so far; it has no effect at runtime.
"""
Furtherst progress: 1300
Best score of brute force search so far: 0.86
Best features of brute force search so far: 
['num__Age', 'num__NumOfProducts', 'num__IsActiveMember', 'cat__Geography_Germany']
['num__Age', 'num__NumOfProducts', 'num__IsActiveMember', 'cat__Geography_Germany', 'cat__Geography_Spain']"
['num__Age', 'num__NumOfProducts', 'num__IsActiveMember', 'cat__Geography_Germany', 'cat__Gender_Male']"

With K-fold validation:
0.85
['num__Age', 'num__NumOfProducts']"
"""


# Alternative runner that uses a single train/test split instead of K-fold
# (uncomment to use it in place of the call below):
# best_score, best_features = execute_test_brute_force_run(bitmap_start=BITMAP_START)
best_score, best_features = execute_with_k_fold_validation(bitmap_start=BITMAP_START)