In [1]:
import sys 
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import numpy as np
from definitions import *
import copy

import matplotlib.pyplot as plt

import warnings

# Suppress LightGBM categorical_feature warning
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature keyword has been found*")
warnings.filterwarnings("ignore", category=UserWarning, message="categorical_feature in param dict is overridden*")

from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
from fairlearn.metrics import demographic_parity_difference, demographic_parity_ratio, true_positive_rate_difference, true_positive_rate, false_positive_rate_difference
from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
import xgboost as xgb



def eq_odd(y_test, y_pred, group_test):
    """Return the summed TPR and FPR gaps across sensitive groups.

    Both gaps are computed with fairlearn's ``true_positive_rate_difference``
    and ``false_positive_rate_difference``, passing ``group_test`` as the
    ``sensitive_features`` argument.

    Note: the result is the *sum* of the two gaps. This differs from
    fairlearn's own ``equalized_odds_difference``, which by default reports
    the larger of the two.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels.
        group_test: Sensitive-feature value for each sample.

    Returns:
        ``true_positive_rate_difference + false_positive_rate_difference``.
    """
    tpr_gap = true_positive_rate_difference(
        y_test, y_pred, sensitive_features=group_test
    )
    fpr_gap = false_positive_rate_difference(
        y_test, y_pred, sensitive_features=group_test
    )
    return tpr_gap + fpr_gap





In [2]:
# Print one LaTeX results table per sampling method for the chosen dataset.
# The metric arrays are read from ../results/<dataset>/arrays/arrays_<method>.npz.
dataset_name = "dutch"
sampling_methods = ['class', 'class_protected', 'protected', 'same_class']
dataset_generator = Dataset(dataset_name)
all_data = dataset_generator.original_dataframe.copy()


print("\n\n\n\n\n")

# ---------------------------------------------------------------------------
# Setup that does not depend on the sampling method: built once, before the
# loop, instead of being rebuilt on every iteration.
# ---------------------------------------------------------------------------

column_types_map = [dataset_generator.dtype_map[col] for col in all_data.columns]

# Check if all columns have the data type 'category'
all_categorical = all(dtype == 'category' for dtype in column_types_map)

generative_methods = ["tvae", "cart", "smote"]
if all_categorical:
    # presumably SMOTE(-NC) needs at least one continuous feature — TODO confirm
    print("Only categorical features, dropping SMOTE")
    generative_methods.remove("smote")

# Template shared by every model; each entry of `problems_classification`
# is a shallow copy of this dict plus the model-specific keys.
problem_classification = {
    "metrics": [accuracy_score, f1_score, roc_auc_score],
    "metric_names": ["Accuracy", "F1", "ROC AUC"],
    "fairness_metrics": [eq_odd],
    "fairness_metric_names": ["Equalized odds"],
    "generative_methods": generative_methods,
}

numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

categorical_cols = dataset_generator.categorical_input_cols.copy()

transformations = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, dataset_generator.continuous_input_cols),
        ('cat', categorical_transformer, categorical_cols)])

clf_RF = Pipeline(steps=[('preprocessor', transformations),
                         ('classifier', RandomForestClassifier(random_state=42))])
clf_DT = Pipeline(steps=[('preprocessor', transformations),
                         ('classifier', DecisionTreeClassifier(random_state=42))])

# NOTE(review): "Decission Tree" is misspelled and is passed on to the table
# generator; left unchanged in case the label is matched elsewhere — confirm
# and correct to "Decision Tree".
model_names_classification = ["LightGBM", "XGBoost", "Decission Tree", "Random Forest"]

# LightGBM / XGBoost are passed as classes together with constructor kwargs
# in `args`; the two sklearn models are passed as ready-made pipelines.
models_classification = [LGBMClassifier, xgb.XGBClassifier, clf_DT, clf_RF]

args = [
    {"categorical_feature": dataset_generator.categorical_input_col_locations, "verbose": -1},
    {"enable_categorical": True, "tree_method": 'hist'},
    {},
    {},
]

# One problem description per model. The deep copy keeps the two pipelines
# (which share the `transformations` instance) independent of each other.
problems_classification = [
    {
        **problem_classification,
        "model": copy.deepcopy(model),
        "model_name": name,
        "args": arg,
    }
    for model, name, arg in zip(models_classification, model_names_classification, args)
]

metric_names_actual = [r"Accuracy $\uparrow$", r"F1 $\uparrow$", r"ROC AUC $\uparrow$", r"Equalized Odds $\downarrow$"]

metrics_optimal = ["max", "max", "max", "min"]

# NOTE(review): the SMOTENC label is always listed, even when "smote" was
# removed from `generative_methods` above — verify this matches the number of
# training sets stored in the result arrays.
names_train = [
    "{}".format(dataset_name),
    "Augmented {} (TVAE)".format(dataset_name),
    "Augmented {} (CART)".format(dataset_name),
    "Augmented {} (SMOTENC)".format(dataset_name),
]

# NOTE(review): `test_sets` and `protected_attributes` are not used below; they
# only served the per-group test-name variant that was commented out
# (names_test = [f"Sex={value}" for value in test_sets.keys()]).
test_sets, _ = dataset_generator.split_population(all_data)
protected_attributes = ["Sex"]
names_test = ["Overall"]

# ---------------------------------------------------------------------------
# Per sampling method: load the stored metric arrays and print the table.
# ---------------------------------------------------------------------------
for sampling_method in sampling_methods:
    results_path = '../results/{}/arrays/arrays_{}.npz'.format(dataset_name, sampling_method)

    # np.load on an .npz returns an NpzFile holding an open file handle;
    # the context manager closes it once the arrays have been read.
    with np.load(results_path) as arrays:
        average = arrays['average']
        std = arrays['std']
        # The feature-importance arrays ('feat_imp_average', 'feat_imp_std')
        # were previously read here as well; re-enable if the feature
        # importance plot is restored — TODO confirm the archive contains them.

    latex_table = generate_latex_table_max(
        average,
        std,
        names_train,
        names_test,
        problems_classification,
        metric_names_actual=metric_names_actual,
        test_data=True,
        metrics_optimal=metrics_optimal,
        sampling_method=sampling_method,
    )
    print(latex_table)

Dataset dutch has ['age', 'household_position', 'household_size', 'prev_residence_place', 'citizenship', 'country_birth', 'edu_level', 'economic_status', 'cur_eco_activity', 'marital_status'] categorical and ['dummy'] numerical columns.






\begin{table}[h]
\centering
\begin{tabular}{l l c c c c}
\hline
Model & Train data & \multicolumn{4}{c}{Test data} \\
&  & \multicolumn{4}{c}{Overall}\\
\cline{3-6}&  & Accuracy $\uparrow$ & F1 $\uparrow$ & ROC AUC $\uparrow$ & Equalized Odds $\downarrow$ \\
\hline\multirow{4}{*}{LightGBM} & \multirow{1}{*}{dutch} & \textbf{0.823} \scriptsize{$\pm$ 0.002} & \textbf{0.805} \scriptsize{$\pm$ 0.002} & \textbf{0.821} \scriptsize{$\pm$ 0.002} & \textbf{0.087} \scriptsize{$\pm$ 0.009} \\
 & \multirow{1}{*}{Augmented dutch (TVAE)} & 0.817 \scriptsize{$\pm$ 0.002} & 0.797 \scriptsize{$\pm$ 0.002} & 0.814 \scriptsize{$\pm$ 0.002} & \underline{0.088} \scriptsize{$\pm$ 0.011} \\
 & \multirow{1}{*}{Augmented dutch (CART)} & 0.821 \scriptsize{$\pm$ 0.002} & 0.

  for attr_values, indices in dataframe.groupby(protected_attributes).groups.items():


In [3]:
# import matplotlib.pyplot as plt

# # Assuming you have defined problems_classification, names_train, feat_imp_average, and feature_names

# num_problems = len(problems_classification)
# num_train_sets = len(names_train)

# scale = 5
# fontsize_title= 25
# fontsize_suptitle= 18
# fontsize_tick= 15
# fig = plt.figure(constrained_layout=True, figsize=(scale*num_train_sets, scale*num_problems))
# fig.suptitle('Feature importances', fontsize=fontsize_title)

# # Create subfigures
# subfigs = fig.subfigures(nrows=num_problems, ncols=1)

# for row, subfig in enumerate(subfigs):
#     problem_name = problems_classification[row]["model_name"]
#     subfig.suptitle(problem_name, fontsize=fontsize_title)

#     # Create subplots per subfigure
#     axs = subfig.subplots(nrows=1, ncols=num_train_sets)

#     for col, ax in enumerate(axs):
#         train_name = names_train[col]
#         feature_importances_avg = feat_imp_average[row][col][0]
#         feature_importances_std = feat_imp_std[row][col][0]  # Update this line to access std
#         sorted_lists = sorted(zip(feature_importances_avg, feature_importances_std, feature_names), reverse=True)
#         sorted_feature_importances_avg, sorted_feature_importances_std, sorted_feature_names = zip(*sorted_lists)

#         # print(sorted_feature_importances_avg)
#         # print(sorted_feature_importances_std)
#         ax.bar(sorted_feature_names, sorted_feature_importances_avg, color='skyblue', yerr=sorted_feature_importances_std)
#         # ax.set_xlabel('Features')
#         # ax.set_ylabel('Frequency')
#         ax.set_title(train_name, fontsize=fontsize_suptitle, loc='right')  # Add train_name as subplot title
#         ax.tick_params(axis='x', rotation=35)
#         ax.tick_params(axis='x', labelsize=fontsize_tick, labelrotation=90)  # Decrease the font size of x-ticks

# plt.subplots_adjust(hspace=5)  # Adjust the spacing between subplots
# plt.savefig("../results/feature_importance/feature_importance_{}.jpg".format(sampling_method), dpi=300, bbox_inches="tight", pad_inches=0)

# plt.show()


In [4]:
# protected_attributes = ["sex"]
# split_dfs, additional_sizes = adult_dataset_generator.split_population(adult_dataset_generator.original_dataframe, protected_attributes)

# split_df_keys, split_df_vals = zip(*split_dfs.items())

# augmented_dfs, augmented_dfs_plot = get_synthetic_splits(adult_dataset_generator, split_dfs, generative_method="cart", generative_seed=0, return_plot=True, sampling_method=sampling_method)


# final_augmented_dataset = pd.concat(augmented_dfs)
# final_augmented_dataset_plot = pd.concat(augmented_dfs_plot)

# final_augmented_dataset_plot['income_method'] = final_augmented_dataset_plot['income'].astype(str) + ' (' + final_augmented_dataset_plot['method'] + ')'



In [5]:
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Assuming you have two different DataFrames: df1 and df2
# # Replace df1 and df2 with your actual DataFrame names

# # Set up the plot with two subplots in two columns
# fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 6), sharey=True)

# # Define the palette using Seaborn's color palette generator
# palette = sns.color_palette("husl", 2)  # Using 'husl' palette with 2 colors

# gender_order = ['Female', 'Male']  # Adjust as per your actual category order

# # Plot for df1
# sns.histplot(data=adult_dataset_generator.original_dataframe, x='sex', hue='income', palette=palette, hue_order=['>50K', '<=50K'], multiple="stack", ax=axes[0], discrete = True)
# axes[0].set_title('Original dataset (Adult)')
# axes[0].set_xlabel('Gender')
# axes[0].set_ylabel('Count')

# # Plot for df2
# sns.histplot(data=final_augmented_dataset_plot, x='sex', hue='income', palette=palette, hue_order=['>50K', '<=50K'], multiple="stack", ax=axes[1], discrete = True)
# axes[1].set_title('Augmented dataset (Adult-augmented)')
# axes[1].set_xlabel('Gender')
# axes[1].set_ylabel('Count')

# # Adjust layout
# plt.tight_layout()

# for ax in axes:
#     for bar in ax.patches:
#         # Find the total height of bars for the current x-coordinate
#         total_height = sum(p.get_height() for p in ax.patches if p.get_x() == bar.get_x())
#         # Calculate the percentage
#         percentage = (bar.get_height() / total_height) * 100
#         # Annotate the bar with the percentage
#         ax.annotate(f'{percentage:.1f}%', 
#                     xy=(bar.get_x() + bar.get_width() / 2, bar.get_y() + bar.get_height() / 2),
#                     xytext=(0, 0),  # no offset: the label is anchored exactly at xy
#                     textcoords="offset points",
#                     ha='center', va='bottom', fontsize=8)

# plt.savefig("../results/sampling/{}.jpg".format(sampling_method), dpi=300, bbox_inches="tight", pad_inches=0)

# plt.show()
