In [1]:
import sys 
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import numpy as np
from definitions import *
import copy

import matplotlib.pyplot as plt

import warnings

# Suppress LightGBM categorical_feature warning
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature keyword has been found*")
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature in param dict is overridden*")

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from fairlearn.metrics import demographic_parity_difference, demographic_parity_ratio, true_positive_rate_difference, true_positive_rate, false_positive_rate_difference
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb



def eq_odd(y_test, y_pred, group_test):
    """Return the sum of the TPR and FPR differences across sensitive groups.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels.
        group_test: Sensitive-feature values, forwarded to fairlearn as
            ``sensitive_features``.

    Returns:
        ``true_positive_rate_difference + false_positive_rate_difference``.
        Note that this adds the two gaps together rather than taking the
        larger of the two.
    """
    tpr_gap = true_positive_rate_difference(y_test, y_pred, sensitive_features=group_test)
    fpr_gap = false_positive_rate_difference(y_test, y_pred, sensitive_features=group_test)
    return tpr_gap + fpr_gap





In [2]:
# Name of the dataset whose stored results are rendered below.
dataset_name = "dutch"

# LaTeX macro for the dataset: a backslash followed by the dataset name.
dataset_name_latex = "\\{}".format(dataset_name)

# The credit dataset's macro carries an extra "dataset" suffix.
if dataset_name == "credit":
    dataset_name_latex = dataset_name_latex + "dataset"
# Build the project Dataset wrapper and work on a copy of its original
# dataframe so the wrapper's own frame is never modified from this notebook.
dataset_generator = Dataset(dataset_name)
all_data = dataset_generator.original_dataframe.copy()

# Stored result arrays for all models of this dataset.
arrays = np.load('../results/{}/arrays/arrays_all_models.npz'.format(dataset_name))
average, std = arrays['average'], arrays['std']

# Currently disabled: feature-importance arrays and the matching feature names.
# feat_imp_average = arrays['feat_imp_average']
# feat_imp_std = arrays['feat_imp_std']
# feature_names = ['age', 'workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']


# Declared dtype of every column, in the dataframe's column order.
column_types_map = [dataset_generator.dtype_map[column] for column in all_data.columns]

# True when every column is declared as 'category'.
all_categorical = all(column_type == 'category' for column_type in column_types_map)

# Generative methods to report on.
# Shorter alternative that was used at some point: ["tvae", "cart", "smote"]
generative_methods = ["gaussian_copula", "ctgan", "tvae", "cart", "smote"]

# SMOTE is left out when the dataset has only categorical columns.
if all_categorical:
    print("Only categorical features, dropping SMOTE")
    generative_methods.remove("smote")

# Settings shared by every classification problem; the per-model entries are
# added to copies of this dict.
# NOTE(review): "\\f1" here differs from the "\\fone" macro used in
# metric_names_actual further down this cell - confirm which one is intended.
problem_classification = dict(
    metrics=[accuracy_score, f1_score, roc_auc_score],
    metric_names=["\\acc", "\\f1", "\\rocauc"],
    fairness_metrics=[eq_odd],
    fairness_metric_names=["\\eqoddtable"],
    generative_methods=generative_methods,
    sampling_methods=['\\classonly', '\\classprotectedtable', '\\protectedonly', '\\sameclass'],
)



# Continuous inputs are standardised.
numeric_transformer = Pipeline(
    steps=[('scaler', StandardScaler())]
)

# Categorical inputs are one-hot encoded; categories unseen during fit are ignored.
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))]
)

# Copy of the dataset's list of categorical input columns.
categorical_cols = dataset_generator.categorical_input_cols.copy()

# Route each group of columns through its transformer.
transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, dataset_generator.continuous_input_cols),
        ('cat', categorical_transformer, categorical_cols),
    ]
)

# Both classifiers are built on the same preprocessing object and a fixed seed.
clf_RF = Pipeline(
    steps=[
        ('preprocessor', transformations),
        ('classifier', RandomForestClassifier(random_state=42)),
    ]
)
clf_DT = Pipeline(
    steps=[
        ('preprocessor', transformations),
        ('classifier', DecisionTreeClassifier(random_state=42)),
    ]
)

# Disabled: LightGBM wrapped in its own preprocessing pipeline.
# clf_lgbm = Pipeline(steps=[('preprocessor', transformations_lgbm),
#                     ('classifier', LGBMClassifier(categorical_feature=dataset_generator.categorical_input_col_locations, verbose=-1))])

# LaTeX macros for the model names.
# Plain-text equivalents: ["LightGBM", "XGBoost", "Decision Tree", "Random Forest"]
model_names_classification = ["\\lgbm", "\\xgb", "\\dt", "\\rf"]

# LightGBM and XGBoost are listed as classes; the decision tree and random
# forest are the pipelines defined above.
models_classification = [LGBMClassifier, xgb.XGBClassifier, clf_DT, clf_RF]

# One keyword-argument dict per entry of models_classification, in the same order.
# presumably these are passed to the model when it is built downstream - TODO confirm
args = [
    {"categorical_feature": dataset_generator.categorical_input_col_locations, "verbose": -1},
    {"enable_categorical": True, "tree_method": 'hist'},
    {},
    {},
]

# One problem dict per model: a shallow copy of the shared settings extended
# with the model (deep-copied), its display name and its arguments.
problems_classification = []
for model, name, arg in zip(models_classification, model_names_classification, args):
    problem = dict(
        problem_classification,
        model=copy.deepcopy(model),
        model_name=name,
        args=arg,
    )
    problems_classification.append(problem)


# LaTeX macros for the metric columns, in table order.
# Plain-text alternative:
# metric_names_actual = [r"Accuracy $\uparrow$", r"F1 $\uparrow$", r"ROC AUC $\uparrow$", r"Equalized Odds $\downarrow$"]
metric_names_actual = ["\\acc", "\\fone", "\\rocauc", "\\eqoddtable"]

# Whether the best value of each metric is the largest or the smallest one;
# entries line up position-by-position with metric_names_actual.
metrics_optimal = ["max", "max", "max", "min"]

# Plain-text alternative:
# names_train = ["{}".format(dataset_name), "Augmented {} (TVAE)".format(dataset_name), "Augmented {} (CART)".format(dataset_name), "Augmented {} (SMOTENC)".format(dataset_name)]

# LaTeX macros for the training-data variants. Every backslash is escaped:
# "\g" and "\c" are not valid Python escape sequences, so writing them with a
# single backslash only works by accident and triggers a
# DeprecationWarning/SyntaxWarning on recent Python versions.
names_train = ["\\gaussiancopulatable", "\\ctgan", "\\tvae", "\\cart", "\\smote"]

# Drop the SMOTE entry from the training-variant names when the dataset has
# only categorical columns.
if all_categorical:
    print("Only categorical features, dropping SMOTE")
    names_train.remove("\\smote")

# Only the first element returned by split_population is kept.
test_sets, _ = dataset_generator.split_population(all_data)
protected_attributes = ["Sex"]

# Only the overall test set is reported.
# Per-group alternative: names_test = [f"Sex={value}" for value in test_sets.keys()]
names_test = ["Overall"]




Dataset dutch has ['age', 'household_position', 'household_size', 'prev_residence_place', 'citizenship', 'country_birth', 'edu_level', 'economic_status', 'cur_eco_activity', 'marital_status'] categorical and [] numerical columns.
Only categorical features, dropping SMOTE
Only categorical features, dropping SMOTE


  for attr_values, indices in dataframe.groupby(protected_attributes).groups.items():


In [3]:
# NOTE(review): presumably the metric column indices whose best values get
# highlighted - confirm against generate_latex_table_max_all_methods.
color_best_matrix = [0, 1, 2, 3]

# Render the stored averages and standard deviations as a LaTeX longtable
# and print its source.
latex_table = generate_latex_table_max_all_methods(
    average,
    std,
    names_train,
    names_test,
    problems_classification,
    metric_names_actual=metric_names_actual,
    test_data=True,
    metrics_optimal=metrics_optimal,
    dataset_name=dataset_name_latex,
    longtable=True,
    color_best_matrix=color_best_matrix,
)
print(latex_table)

\begin{center}
\begin{longtable}{l l l c c c c}
\caption[\dutch]{\dutch}
\label{tab:results_dutch}\\ 
\hline
\clf & \samplingmethod & \training & \multicolumn{4}{c}{\metrics} \\
\cline{4-7}& &  & \acc & \fone & \rocauc & \eqoddtable \\
\endfirsthead
\endhead
\hline \multicolumn{7}{c}{{Continued on next page}} \\ \hline
\endfoot
\hline \hline
\endlastfoot
\hline\multirow{17}{*}{\lgbm} & & \multirow{1}{*}{Real} & \textbf{0.823} \tiny{$\pm$ 0.002} & \textbf{0.805} \tiny{$\pm$ 0.002} & \textbf{0.821} \tiny{$\pm$ 0.002} & \textbf{0.087} \tiny{$\pm$ 0.009} \\
\cline{3-7}\noalign{\vskip\doublerulesep\vskip-\arrayrulewidth}\cline{3-7} & \multirow{4}{*}{\classonly} & \multirow{1}{*}{\gaussiancopulatable} & \textbf{0.823} \tiny{$\pm$ 0.002} & \underline{0.804} \tiny{$\pm$ 0.002} & \underline{0.820} \tiny{$\pm$ 0.002} & 0.090 \tiny{$\pm$ 0.010} \\
 & & \multirow{1}{*}{\ctgan} & \underline{0.821} \tiny{$\pm$ 0.002} & 0.800 \tiny{$\pm$ 0.002} & 0.817 \tiny{$\pm$ 0.002} & 0.096 \tiny{$\pm$ 0.009} \\