In [29]:
%load_ext autoreload

# Enable autoreload
%autoreload 2

import sys 
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import numpy as np
from definitions import *
import copy



import matplotlib.pyplot as plt

import warnings

# Suppress LightGBM categorical_feature warning
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature keyword has been found*")
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature in param dict is overridden*")

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score, balanced_accuracy_score
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb






The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [30]:
# --- Configuration ----------------------------------------------------------
dataset_name = "adult"

# Protected-attribute settings the results file was produced for. The joined
# names are part of the file name.
# NOTE(review): the leading axis of the saved arrays is assumed to follow this
# order (the original code hard-coded slice [2:3] for "both") - confirm against
# the script that writes the .npz file.
saved_protected_attributes = ["sex", "race", "both"]

# The single setting reported by this notebook.
selected_protected_attribute = "both"

# LaTeX macro used for the dataset in the generated table (e.g. "\adult").
dataset_name_latex = "\\" + dataset_name
if dataset_name == "credit":
    dataset_name_latex += "dataset"

# --- Data -------------------------------------------------------------------
dataset_generator = Dataset(dataset_name)
all_data = dataset_generator.original_dataframe.copy()

results_path = '../results/{}/arrays/arrays_all_models_all_fairness_metrics_protected_{}_small.npz'.format(
    dataset_name, "_".join(saved_protected_attributes)
)

# Derive the slice position from the attribute name instead of a magic number,
# and keep the leading axis (length-1 slice) as the downstream code expects.
selected_index = saved_protected_attributes.index(selected_protected_attribute)

# NpzFile keeps the archive open until closed; the context manager releases the
# file handle once the two arrays have been read into memory.
with np.load(results_path) as arrays:
    average = arrays['average'][selected_index:selected_index + 1]
    std = arrays['std'][selected_index:selected_index + 1]

# Downstream cells only see the selected setting.
protected_attributes_all = [selected_protected_attribute]


# average_over_problems = np.mean(average, axis=0)
# std_over_problems = np.std(average, axis=0)


Dataset adult_fnlwgt_educational-num has ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 'native-country'] categorical and ['age', 'capital-gain', 'capital-loss', 'hours-per-week'] numerical columns.


In [31]:
# --- Generative methods -------------------------------------------------------
column_types_map = [dataset_generator.dtype_map[col] for col in all_data.columns]

# True when every column of the dataset is typed 'category'.
all_categorical = all(dtype == 'category' for dtype in column_types_map)

# Generative-method identifier -> LaTeX macro used as its row label in the table.
# Keeping both in one mapping guarantees that `generative_methods` and
# `names_train` stay aligned.
generative_method_labels = {
    "gaussian_copula": "\\gaussiancopulatable",
    "ctgan": "\\ctgan",
    "tvae": "\\tvae",
    "cart": "\\cart",
    "smote": "\\smote",
}
generative_methods = list(generative_method_labels)

if all_categorical:
    print("Only categorical features, dropping SMOTE")
    generative_methods.remove("smote")

names_train = [generative_method_labels[method] for method in generative_methods]

# --- Problem template ---------------------------------------------------------
# NOTE(review): "\\f1" here differs from "\\fone" in `metric_names_actual_all`
# below; left unchanged because the consumer of this entry is not visible here.
problem_classification = {"metrics":[accuracy_score, f1_score, roc_auc_score],
                    "metric_names":["\\acc", "\\f1", "\\rocauc"],
                    "fairness_metrics": [eq_odd, stat_par, eq_opp],
                    # A plain-text "fairness_metric_names" entry used to precede this
                    # one; as a duplicate key it was silently discarded by Python.
                    "fairness_metric_names": ["\\eqoddtable","\\statpartable", "\\eqopptable"],
                    "generative_methods":generative_methods,
                    "sampling_methods":['\\classonly', '\\classprotectedtable', '\\protectedonly', '\\sameclass']}

# --- Models -------------------------------------------------------------------
# Preprocessing shared by the scikit-learn pipelines: scale continuous inputs,
# one-hot encode categorical inputs.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

categorical_cols = dataset_generator.categorical_input_cols.copy()

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, dataset_generator.continuous_input_cols),
        ('cat', categorical_transformer, categorical_cols)])

clf_RF = Pipeline(steps=[('preprocessor', transformations),
                ('classifier', RandomForestClassifier(random_state=42))])
clf_DT = Pipeline(steps=[('preprocessor', transformations),
                    ('classifier', DecisionTreeClassifier(random_state=42))])

# LaTeX macros for the model names; order matches `models_classification` and `args`.
model_names_classification = ["\\lgbm", "\\xgb", "\\dt", "\\rf"]

# The first two entries are classes, the last two are ready-built pipelines.
models_classification = [LGBMClassifier, xgb.XGBClassifier, clf_DT, clf_RF]

# Keyword arguments stored alongside each model, in the same order.
args = [{"categorical_feature":dataset_generator.categorical_input_col_locations, "verbose":-1}, {"enable_categorical":True, "tree_method":'hist'}, {}, {}]

# One problem description per model. The template is copied shallowly, so the
# metric/method lists are shared between problems; only the model is deep-copied.
problems_classification = []
for model, name, arg in zip(models_classification, model_names_classification, args):
    problem = problem_classification.copy()
    problem["model"] = copy.deepcopy(model)
    problem["model_name"] = name
    problem["args"] = arg
    problems_classification.append(problem)

# --- Metric selection -----------------------------------------------------------
# Labels and optimisation direction for every metric along the last array axis.
metric_names_actual_all = ["\\acc", "\\fone", "\\rocauc", "\\eqoddtable","\\statpartable", "\\eqopptable"]

metrics_optimal_all = ["max", "max", "max", "min", "min", "min"]

# Positions (within one attribute's group of metrics) that are reported; index 1 is skipped.
metrics_keep_all = [0, 2, 3, 4, 5]
# Despite the name, this is the total number of metrics per protected attribute;
# it is the stride between consecutive attributes' metric groups below.
len_metrics_keep_all = len(metric_names_actual_all)

# NOTE(review): `test_sets` is not used in this section; the call is kept in case
# `split_population` is needed for its side effects - confirm and remove if not.
test_sets, _ = dataset_generator.split_population(all_data)
names_test = ["Overall"]

# Join the per-attribute slices along the last (metric) axis, then take entry 0
# of the leading axis of the result.
# NOTE(review): the meaning of that leading axis is not visible here - confirm.
average_use = np.concatenate(average, axis=-1)[0]
std_use = np.concatenate(std, axis=-1)[0]

# Repeat the kept metrics once per protected attribute, offsetting the column
# indices by one full metric group per attribute.
metric_names_actual = []
metrics_optimal = []
metrics_keep = []
for pp, _ in enumerate(protected_attributes_all):
    metric_names_actual.extend([metric_names_actual_all[i] for i in metrics_keep_all])
    metrics_optimal.extend([metrics_optimal_all[i] for i in metrics_keep_all])
    metrics_keep.extend([i + len_metrics_keep_all*pp for i in metrics_keep_all])

print(metric_names_actual)
print(metrics_optimal)
print(metrics_keep)

average_use = average_use[..., metrics_keep]
std_use = std_use[..., metrics_keep]
average_use.shape

['\\acc', '\\rocauc', '\\eqoddtable', '\\statpartable', '\\eqopptable']
['max', 'max', 'min', 'min', 'min']
[0, 2, 3, 4, 5]


  for attr_values, indices in dataframe.groupby(protected_attributes).groups.items():


(21, 1, 5)

In [32]:
# One index per kept metric column.
# NOTE(review): presumably the columns whose best value gets highlighted - confirm
# against the table generator.
color_best_matrix = range(average_use.shape[-1])

# Settings that involve "race" or "both" use the dedicated table generator;
# everything else falls back to the sex/race variant.
needs_both_layout = any(attr in protected_attributes_all for attr in ("both", "race"))
if needs_both_layout:
    build_table = generate_latex_table_max_all_methods_sex_race_both
else:
    build_table = generate_latex_table_max_all_methods_sex_race

latex_table = build_table(
    average_use,
    std_use,
    names_train,
    names_test,
    problems_classification,
    protected_attributes_all,
    metric_names_actual=metric_names_actual,
    test_data=True,
    metrics_optimal=metrics_optimal,
    dataset_name=dataset_name_latex,
    longtable=False,
    color_best_matrix=color_best_matrix,
    include_std=False,
)
print(latex_table)

\begin{table*}[h]
\caption{\adult}
\label{tab:results_adult}
\centering
\begin{tabular}{l l c c c c c}
\hline
\samplingmethod & \training & \multicolumn{5}{c}{\metrics} \\
&  & \multicolumn{5}{c}{both} \\
& & \acc & \rocauc & \eqoddtable & \statpartable & \eqopptable \\
\hline & \multirow{1}{*}{Real} & \textbf{0.867} & \underline{0.798} & 0.205 & 0.221 & 0.131 \\
\hline
\multirow{5}{*}{\classonly} & \multirow{1}{*}{\gaussiancopulatable} & 0.855 & 0.764 & 0.209 & \cellcolor{blue!15}0.186 & 0.146 \\
 & \multirow{1}{*}{\ctgan} & 0.849 & 0.774 & \cellcolor{blue!15}0.192 & \cellcolor{blue!15}0.192 & \cellcolor{blue!15}0.122 \\
 & \multirow{1}{*}{\tvae} & 0.848 & 0.773 & 0.242 & \cellcolor{blue!15}0.213 & 0.153 \\
 & \multirow{1}{*}{\cart} & 0.835 & \cellcolor{blue!15}\textbf{0.817} & 0.222 & 0.243 & \cellcolor{blue!15}\underline{0.114} \\
 & \multirow{1}{*}{\smote} & 0.840 & 0.790 & 0.247 & 0.230 & 0.143 \\
\hline
\multirow{5}{*}{\classprotectedtable} & \multirow{1}{*}{\gaussiancopulatable}