In [1]:
import sys 
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import numpy as np
from definitions import *
import copy

import matplotlib.pyplot as plt

import warnings

# Suppress LightGBM categorical_feature warning
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature keyword has been found*")
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature in param dict is overridden*")

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, balanced_accuracy_score
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb






In [2]:
# --- Experiment configuration ------------------------------------------------
dataset_name = "adult"
protected_attribute = "race"

# LaTeX macro used as the table caption; only the "credit" dataset carries the
# extra "dataset" suffix in its macro name.
dataset_name_latex = "\\" + dataset_name + ("dataset" if dataset_name == "credit" else "")

# Build the dataset wrapper and work on a private copy of its dataframe.
dataset_generator = Dataset(dataset_name)
all_data = dataset_generator.original_dataframe.copy()

# --- Load the stored result arrays -------------------------------------------
arrays_path = '../results/{}/arrays/arrays_all_models_all_fairness_metrics.npz'.format(dataset_name)
arrays = np.load(arrays_path)
average = arrays['average']
# std = arrays['std']
print(average.shape)

# Aggregate along the leading axis (presumably one entry per problem/model --
# TODO confirm against the script that wrote the archive).
average_over_problems = average.mean(axis=0)
std_over_problems = average.std(axis=0)
print(average_over_problems.shape)



Dataset adult_fnlwgt_educational-num has ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] categorical and ['age', 'capital-gain', 'capital-loss', 'hours-per-week'] numerical columns.
(4, 21, 1, 6)
(21, 1, 6)


In [3]:



# Declared dtype of every column of the full dataframe, in column order.
column_types_map = [dataset_generator.dtype_map[col] for col in all_data.columns]

# True when every column is declared as 'category'; SMOTE is dropped in that case.
all_categorical = all(dtype == 'category' for dtype in column_types_map)

generative_methods = ["gaussian_copula", "ctgan", "tvae", "cart", "smote"]

if all_categorical:
    print("Only categorical features, dropping SMOTE")
    generative_methods.remove("smote")

# Template shared by every classification problem; model-specific entries are
# added to copies of this dict further down.
problem_classification = {
    "metrics": [accuracy_score, f1_score, roc_auc_score],
    # NOTE(review): metric_names_actual uses "\\fone" for F1 while this list
    # uses "\\f1" -- confirm which macro name is intended.
    "metric_names": ["\\acc", "\\f1", "\\rocauc"],
    "fairness_metrics": [eq_odd, stat_par, eq_opp],
    # LaTeX macros for: Equalized odds, Statistical Parity, Equal Opportunity.
    # (The dict literal previously listed this key twice; only this value was
    # ever kept, so the dead first entry has been removed.)
    "fairness_metric_names": ["\\eqoddtable", "\\statpartable", "\\eqopptable"],
    "generative_methods": generative_methods,
    "sampling_methods": ['\\classonly', '\\classprotectedtable', '\\protectedonly', '\\sameclass'],
}



# Per-column-type preprocessing: scale the continuous inputs, one-hot encode
# the categorical ones (categories unseen at fit time are ignored).
numeric_transformer = Pipeline(
    steps=[('scaler', StandardScaler())],
)
categorical_transformer = Pipeline(
    steps=[('onehot', OneHotEncoder(handle_unknown='ignore'))],
)

# Copy so that the dataset object's own column list is never mutated from here.
categorical_cols = dataset_generator.categorical_input_cols.copy()

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, dataset_generator.continuous_input_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
)

# Both tree-based classifiers are built on the same preprocessing step and use
# a fixed random_state.
clf_DT = Pipeline(
    steps=[
        ('preprocessor', transformations),
        ('classifier', DecisionTreeClassifier(random_state=42)),
    ],
)
clf_RF = Pipeline(
    steps=[
        ('preprocessor', transformations),
        ('classifier', RandomForestClassifier(random_state=42)),
    ],
)

# LaTeX macros for the models, in order: LightGBM, XGBoost, Decision Tree,
# Random Forest.
model_names_classification = ["\\lgbm", "\\xgb", "\\dt", "\\rf"]

# The first two entries are bare classifier classes; the last two are the
# already-built sklearn pipelines.
models_classification = [LGBMClassifier, xgb.XGBClassifier, clf_DT, clf_RF]

# One keyword-argument dict per model, aligned with models_classification
# (presumably forwarded to the model downstream -- verify against the consumer).
args = [
    {"categorical_feature": dataset_generator.categorical_input_col_locations, "verbose": -1},
    {"enable_categorical": True, "tree_method": 'hist'},
    {},
    {},
]

# Each problem is a shallow copy of the shared template extended with its own
# (deep-copied) model, display name and argument dict.
problems_classification = []
for model, name, arg in zip(models_classification, model_names_classification, args):
    problem = {
        **problem_classification,
        "model": copy.deepcopy(model),
        "model_name": name,
        "args": arg,
    }
    problems_classification.append(problem)


# LaTeX macros for the six reported columns: three performance metrics followed
# by three fairness metrics.
metric_names_actual = ["\\acc", "\\fone", "\\rocauc", "\\eqoddtable", "\\statpartable", "\\eqopptable"]

# Direction in which each column above is considered better (same order).
metrics_optimal = ["max", "max", "max", "min", "min", "min"]

# LaTeX macros for the training-data generators. Every backslash is escaped:
# "\g" and "\c" are invalid escape sequences, which Python only tolerates with
# a warning and which are slated to become a syntax error.
names_train = ["\\gaussiancopulatable", "\\ctgan", "\\tvae", "\\cart", "\\smote"]

if all_categorical:
    print("Only categorical features, dropping SMOTE")
    names_train.remove("\\smote")

# NOTE(review): test_sets is not used by any live code in this cell; the call
# is kept in case later cells rely on it -- confirm before removing.
test_sets, _ = dataset_generator.split_population(all_data)
# names_test = [f"Sex={value}" for value in test_sets.keys()]
names_test = ["Overall"]




  for attr_values, indices in dataframe.groupby(protected_attributes).groups.items():


In [4]:
# Indices 0..5, one per entry of metric_names_actual (presumably the columns in
# which the helper highlights the best values -- verify against its definition).
color_best_matrix = list(range(6))

# Render the averaged results as a LaTeX table and print it for copy/paste.
latex_table = generate_latex_table_max_all_methods_avg_problems(
    average_over_problems,
    std_over_problems,
    names_train,
    names_test,
    problems_classification,
    metric_names_actual=metric_names_actual,
    test_data=True,
    metrics_optimal=metrics_optimal,
    dataset_name=dataset_name_latex,
    longtable=False,
    color_best_matrix=color_best_matrix,
    include_std=False,
)
print(latex_table)

(21, 1, 6)
(6,)
\begin{table*}[h]
\caption{\adult}
\label{tab:results_adult}
\centering
\begin{tabular}{l l c c c c c c}
\hline
\samplingmethod & \training & \multicolumn{6}{c}{\metrics} \\
\cline{3-8}&  & \acc & \fone & \rocauc & \eqoddtable & \statpartable & \eqopptable \\
\hline & \multirow{1}{*}{Real} & \textbf{0.849} & \textbf{0.678} & 0.778 & 0.137 & 0.180 & 0.062 \\
\hline
\hline
\multirow{5}{*}{\classonly} & \multirow{1}{*}{\gaussiancopulatable} & 0.844 & 0.666 & 0.772 & \cellcolor{blue!15}0.123 & \cellcolor{blue!15}0.174 & \cellcolor{blue!15}0.051 \\
 & \multirow{1}{*}{\ctgan} & 0.837 & \underline{0.676} & \cellcolor{blue!15}0.786 & \cellcolor{blue!15}0.128 & 0.188 & \cellcolor{blue!15}0.044 \\
 & \multirow{1}{*}{\tvae} & 0.835 & 0.668 & \cellcolor{blue!15}0.782 & 0.154 & 0.198 & \cellcolor{blue!15}0.058 \\
 & \multirow{1}{*}{\cart} & 0.818 & 0.674 & \cellcolor{blue!15}\textbf{0.799} & \cellcolor{blue!15}0.119 & 0.193 & \cellcolor{blue!15}\underline{0.025} \\
 & \multirow{1}{*