In [1]:
import sys 
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.exceptions import NotFittedError
import copy


In [2]:
import warnings

# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import xgboost as xgb

from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


In [4]:
# Build the project's Dataset wrapper for the "adult" dataset.
adult_dataset_generator = Dataset("adult")
# Work on a copy of the wrapper's original dataframe so later cells do not touch the frame it holds.
all_data = adult_dataset_generator.original_dataframe.copy()

In [5]:
def compute_metrics(y_test, y_pred, problem):
    """Evaluate every metric callable listed in ``problem["metrics"]``.

    Each callable is invoked as ``metric(y_test, y_pred)``; the results are
    returned as a list in the same order as the callables.
    """
    scores = []
    for metric_fn in problem["metrics"]:
        scores.append(metric_fn(y_test, y_pred))
    return scores
    
def train_eval(X_train, y_train, X_test, y_test, problem):
    """Fit a fresh model for ``problem`` and score it on one test set.

    Args:
        X_train: Training features, passed straight to ``model.fit``.
        y_train: Training labels, passed straight to ``model.fit``.
        X_test: Test features, passed straight to ``model.predict``.
        y_test: Ground-truth test labels handed to the metric callables.
        problem: Dict providing ``"model"`` (a callable that builds the
            estimator), ``"args"`` (keyword arguments for that callable) and
            ``"metrics"`` (consumed by ``compute_metrics``).

    Returns:
        tuple: ``(metrics, y_pred)`` where ``metrics`` is the list produced by
        ``compute_metrics`` and ``y_pred`` is the output of ``model.predict``.
    """
    # The estimator is built anew on every call. The former "already fitted"
    # probe (predict before fit inside a bare ``except``) could never fire: it
    # referenced undefined names and the bare ``except`` swallowed every error,
    # including the SystemExit raised by ``exit(1)``. It has been removed.
    model = problem["model"](**problem["args"])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    metrics = compute_metrics(y_test, y_pred, problem)
    return metrics, y_pred


In [6]:
def _augment_training_groups(dataset_generator, split_dfs):
    """Concatenate the training groups with synthetic rows that top up smaller classes.

    The target size is the count of the most frequent ``income`` value inside
    the largest group of ``split_dfs``. For every (group, income value) pair
    that is present but smaller than that target, a synthesizer is trained on
    the pair's rows (without the ``income`` and ``sex`` columns) and asked for
    the missing number of rows; the generated rows are then labelled with the
    pair's income value and group key.
    """
    largest_group = max(split_dfs.values(), key=len)
    target_count = largest_group["income"].value_counts().max()

    frames = []
    for group_value, group_df in split_dfs.items():
        frames.append(group_df)
        # value_counts() only lists income values that occur in this group.
        for class_label, class_count in group_df["income"].value_counts().items():
            shortfall = target_count - class_count
            if shortfall <= 0:
                continue
            class_rows = group_df[group_df["income"] == class_label].drop(columns=["income", "sex"])
            synthesizer = dataset_generator.train_synthesizer(class_rows, encode=True)
            synthetic = dataset_generator.generate_data(synthesizer, num=shortfall)
            synthetic["income"] = class_label
            # NOTE(review): the group key is written to the hard-coded "sex" column,
            # so this only matches protected_attributes == ["sex"] — confirm before
            # using other protected attributes.
            synthetic["sex"] = group_value
            frames.append(synthetic.copy())
    return pd.concat(frames)


def run_experiments(problem_classification, adult_dataset_generator, all_data, num_repeats = 1, num_folds = 2, protected_attributes = ["sex"]):
    """Cross-validate every problem on the real and on the synthetically augmented training data.

    For each repeated K-fold split, two training sets are built (the real fold
    and the fold augmented with synthetic rows) and each is evaluated on every
    test subset returned by ``split_population`` plus the full test fold
    (stored under the key ``"all"``).

    Args:
        problem_classification: List/tuple of problem dicts (as consumed by
            ``train_eval``). For backward compatibility, any other value (e.g.
            the bare metrics template dict) makes the function fall back to the
            module-level ``problems_classification`` list.
        adult_dataset_generator: Project ``Dataset`` object providing
            ``encode``, ``split_population``, ``train_synthesizer`` and
            ``generate_data``.
        all_data: DataFrame containing an ``income`` target column.
        num_repeats: Number of K-fold repetitions.
        num_folds: Number of folds per repetition.
        protected_attributes: Forwarded to ``split_population`` when splitting
            the training fold.

    Returns:
        tuple: ``(mean, std)`` arrays indexed as
        ``[problem, train_set, test_set, metric]``, aggregated over all folds.
        ``train_set`` 0 is the real data, 1 the augmented data.
    """
    if isinstance(problem_classification, (list, tuple)):
        problems = problem_classification
    else:
        # Historical call sites pass the metrics template dict; keep their behaviour.
        problems = problems_classification

    average_problems = []
    std_problems = []
    for problem in problems:

        rkf = RepeatedKFold(n_splits=num_folds, n_repeats=num_repeats, random_state=42)
        fold_metrics = []
        for train_index, test_index in rkf.split(all_data):

            # split() yields positional indices, so select rows by position rather than by label.
            data_train, data_test = all_data.iloc[train_index], all_data.iloc[test_index]
            data_train_encoded = adult_dataset_generator.encode(data_train, keep_dtypes=True)
            # NOTE(review): the result of this call was never used; the call is kept
            # in case encode() has side effects — confirm and remove if it does not.
            adult_dataset_generator.encode(data_test)

            X_train_real = data_train.drop(columns=["income"])
            y_train_real = data_train_encoded["income"].astype("int")

            test_sets, _ = adult_dataset_generator.split_population(data_test)
            test_sets["all"] = data_test

            split_dfs, _ = adult_dataset_generator.split_population(data_train, protected_attributes=protected_attributes)

            augmented_trainingset = _augment_training_groups(adult_dataset_generator, split_dfs)
            augmented_trainingset_encoded = adult_dataset_generator.encode(augmented_trainingset, keep_dtypes=True)

            X_train_augmented = augmented_trainingset.drop(columns=["income"])
            y_train_augmented = augmented_trainingset_encoded["income"].astype("int")

            training_setups = (
                (X_train_real, y_train_real),
                (X_train_augmented, y_train_augmented),
            )
            metrics_split = []
            for X_train, y_train in training_setups:
                setup_metrics = []
                for test_set in test_sets.values():
                    test_set_encoded = adult_dataset_generator.encode(test_set)
                    X_test = test_set.drop(columns=["income"])
                    y_test = test_set_encoded["income"].astype("int")

                    results, _ = train_eval(X_train, y_train, X_test, y_test, problem)
                    setup_metrics.append(results)
                metrics_split.append(setup_metrics)
            fold_metrics.append(metrics_split)

        # Shape: (folds, train_sets, test_sets, metrics); aggregate over the fold axis.
        fold_metrics = np.array(fold_metrics)
        average_problems.append(np.mean(fold_metrics, axis=0))
        std_problems.append(np.std(fold_metrics, axis=0))
    return np.array(average_problems), np.array(std_problems)

In [7]:
# Metric callables and their display names, shared by every model configuration.
problem_classification = {
    "metrics": [accuracy_score, precision_score, recall_score, f1_score],
    "metric_names": ["Accuracy", "P", "R", "F1"],
}

# Three parallel lists: model class, display name and constructor kwargs.
# Only CatBoost is evaluated; the other imported classifiers (xgb.XGBClassifier,
# DecisionTreeClassifier, RandomForestClassifier) can be added by extending all three lists.
models_classification = [CatBoostClassifier]
model_names_classification = ["catboost"]
args = [
    {
        "random_state": 42,
        "loss_function": "Logloss",
        "verbose": False,
        "iterations": 100,
        "learning_rate": 0.01,
        "cat_features": adult_dataset_generator.categorical_input_cols,
    }
]

# One problem dict per model: the shared metrics plus the model-specific entries.
problems_classification = []
for model, name, arg in zip(models_classification, model_names_classification, args):
    problem = {
        **problem_classification,
        "model": copy.deepcopy(model),
        "model_name": name,
        "args": arg,
    }
    problems_classification.append(problem)

In [8]:
# Pass the list of fully configured problems (model, name, args), not the bare metrics template dict.
average, std = run_experiments(problems_classification, adult_dataset_generator, all_data, num_repeats = 5, num_folds = 3, protected_attributes = ["sex"])

In [9]:
def generate_latex_table1(all_metrics_mean, all_metrics_std, names_train, names_test, problems, test_data=False, metric_names_actual=[]):
    """Render metric means and standard deviations as a LaTeX ``table``.

    Every entry of ``problems`` produces one block of rows. Inside a block each
    training set takes two rows: the means, then the standard deviations in
    parentheses. The metrics of all test sets are placed side by side in a row.
    The output uses ``\\multirow`` and ``\\multicolumn``.

    Args:
        all_metrics_mean: Nested sequence or array indexed as
            ``[problem][train_set][test_set]``, each entry an iterable of means.
        all_metrics_std: Same layout as ``all_metrics_mean``, holding the
            standard deviations.
        names_train: Row labels, one per training set.
        names_test: Column-group labels, one per test set.
        problems: Sequence of dicts; ``"model_name"`` is read from each entry
            and ``problems[0]["metric_names"]`` is the fallback metric header.
        test_data: When true, emit the grouped "Test data" header.
        metric_names_actual: Metric names for the header. When empty, the
            header falls back to ``problems[0]["metric_names"]``.

    Returns:
        str: The LaTeX source of the table.
    """
    if test_data:
        n_cols = 2 + len(metric_names_actual) * len(names_test)
    else:
        n_cols = len(problems[0]["metric_names"]) + 2

    parts = [
        "\\begin{table}[h]\n",
        "\\centering\n",
        "\\begin{tabular}{l l " + " ".join(["c"] * (n_cols - 2)) + "}\n",
        "\\hline\n",
    ]

    # Header rows.
    if test_data:
        if len(metric_names_actual) > 0:
            n_metrics = len(metric_names_actual)
            parts.append("Model & Train data & \\multicolumn{" + str(len(names_test) * n_metrics) + "}{c}{Test data} \\\\\n")
            parts.append("& ")
            for name_t in names_test:
                parts.append(" & \\multicolumn{" + str(n_metrics) + "}{c}{" + name_t + "}")
            parts.append("\\\\\n")
            parts.append("\\cline{3-" + str(n_cols) + "}")
            # The metric names are repeated once per test set.
            metric_header = " & " + " & ".join(metric_names_actual)
            parts.append("& " + metric_header * len(names_test) + " \\\\\n")
        else:
            parts.append("Model & Train data & Test data & " + " & ".join(problems[0]["metric_names"]) + " \\\\\n")
    else:
        if len(metric_names_actual) > 0:
            header_names = metric_names_actual
        else:
            header_names = problems[0]["metric_names"]
        parts.append("Model & Train data & " + " & ".join(header_names) + " \\\\\n")

    parts.append("\\hline")

    # Each training set renders as two rows (means, then stds), so the model
    # label has to span 2 * len(names_train) rows, not 2 * len(problems).
    rows_per_model = 2 * len(names_train)

    for problem_i, problem in enumerate(problems):
        parts.append("\\multirow{" + str(rows_per_model) + "}{*}{" + problem["model_name"] + "}")

        for i, train_name in enumerate(names_train):
            parts.append(" & \\multirow{2}{*}{" + train_name + "}")
            avgs_c = ""
            stds_c = ""
            for j in range(len(names_test)):
                avg_metric = all_metrics_mean[problem_i][i][j]
                std_metric = all_metrics_std[problem_i][i][j]
                avgs_c += " & " + " & ".join("{:.3f}".format(x) for x in avg_metric)
                stds_c += " & " + " & ".join("({:.3f})".format(x) for x in std_metric)

            parts.append(avgs_c + " \\\\\n")
            # Leading " & " leaves the model column empty; stds_c's own leading
            # " & " leaves the train-data column empty.
            parts.append(" & " + stds_c + " \\\\\n")

        parts.append("\\hline\n")

    parts.append("\\end{tabular}\n")
    parts.append("\\caption{Comparison}\n")
    parts.append("\\label{tab:eval}\n")
    parts.append("\\end{table}")

    return "".join(parts)

In [10]:
# Labels for the table: metric columns, training-set rows and the protected attribute.
metric_names_actual = ["Accuracy", "P", "R", "F1"]
names_train = ["Adult", "Augmented Adult"]
protected_attributes = ["Sex"]

# One column group per subgroup returned by split_population, followed by the full test set.
test_sets, _ = adult_dataset_generator.split_population(all_data)
names_test = [f"Sex={value}" for value in test_sets]
print(names_test)
names_test += ["Overall"]

latex_table = generate_latex_table1(
    average,
    std,
    names_train,
    names_test,
    problems_classification,
    test_data=True,
    metric_names_actual=metric_names_actual,
)
print(latex_table)


['Sex=Female', 'Sex=Male']
\begin{table}[h]
\centering
\begin{tabular}{l l c c c c c c c c c c c c}
\hline
Model & Train data & \multicolumn{12}{c}{Test data} \\
&  & \multicolumn{4}{c}{Sex=Female} & \multicolumn{4}{c}{Sex=Male} & \multicolumn{4}{c}{Overall}\\
\cline{3-14}&  & Accuracy & P & R & F1 & Accuracy & P & R & F1 & Accuracy & P & R & F1 \\
\hline\multirow{2}{*}{catboost} & \multirow{2}{*}{Adult} & 0.915 & 0.887 & 0.261 & 0.403 & 0.770 & 0.906 & 0.277 & 0.424 & 0.818 & 0.903 & 0.275 & 0.421 \\
 &  & (0.002) & (0.022) & (0.017) & (0.021) & (0.007) & (0.013) & (0.011) & (0.013) & (0.005) & (0.013) & (0.009) & (0.012) \\
 & \multirow{2}{*}{Augmented Adult} & 0.870 & 0.449 & 0.809 & 0.577 & 0.711 & 0.517 & 0.865 & 0.647 & 0.764 & 0.506 & 0.857 & 0.636 \\
 &  & (0.011) & (0.024) & (0.017) & (0.018) & (0.006) & (0.010) & (0.006) & (0.007) & (0.005) & (0.009) & (0.006) & (0.007) \\
\hline
\end{tabular}
\caption{Comparison}
\label{tab:eval}
\end{table}


In [11]:
# Display the group labels returned by split_population; they feed the "Sex=..." column-group names.
test_sets.keys()

dict_keys(['Female', 'Male'])