In [None]:
import sys 
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import os
import numpy as np
from sklearn.exceptions import NotFittedError
import copy

# Global switch for GPU execution. When True, the notebook imports the tree
# estimators from cuml instead of scikit-learn and adds task_type="GPU" to the
# CatBoost arguments.
RUN_GPU = False


In [None]:
from fairlearn.metrics import demographic_parity_difference, demographic_parity_ratio, true_positive_rate_difference, true_positive_rate, false_positive_rate_difference

def eq_odd(y_test, y_pred, group_test):
    """Equalized-odds style fairness score used in this notebook.

    Adds the between-group true-positive-rate difference to the between-group
    false-positive-rate difference, both computed with fairlearn.

    Args:
        y_test: Ground-truth labels, forwarded as the first positional argument.
        y_pred: Predicted labels, forwarded as the second positional argument.
        group_test: Group membership of each sample, passed to fairlearn as
            ``sensitive_features``.

    Returns:
        The sum of the two rate differences.

    NOTE(review): this is the *sum* of the two gaps; confirm that a sum (rather
    than, e.g., the larger of the two) is the intended aggregation.
    """
    tpr_gap = true_positive_rate_difference(
        y_test, y_pred, sensitive_features=group_test
    )
    fpr_gap = false_positive_rate_difference(
        y_test, y_pred, sensitive_features=group_test
    )
    return tpr_gap + fpr_gap

In [None]:
import warnings

# Ignore FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import xgboost as xgb

from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# When GPU execution is enabled, rebind RandomForestClassifier and
# DecisionTreeClassifier (imported from scikit-learn just above) to the classes
# of the same name from cuml, so the pipelines built later use those instead.
if RUN_GPU:
    from cuml import RandomForestClassifier, DecisionTreeClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator


In [None]:
# Instantiate the project's Dataset wrapper for the "adult" dataset; it also
# supplies the column lists and population split used further down.
adult_dataset_generator = Dataset("adult")
# Keep a separate copy of the wrapper's original frame for the experiments
# (presumably a pandas DataFrame, so .copy() leaves the wrapper's data untouched).
all_data = adult_dataset_generator.original_dataframe.copy()

In [None]:
from definitions import *

In [None]:


# Evaluation settings shared by every model-specific problem definition.
# An earlier configuration also reported precision ("P") and recall ("R"); to
# restore them, add precision_score / recall_score to "metrics" and the matching
# labels to "metric_names".
problem_classification = {
    "metrics": [accuracy_score, f1_score, roc_auc_score],
    "metric_names": ["Accuracy", "F1", "ROC AUC"],
    "fairness_metrics": [eq_odd],
    "fairness_metric_names": ["Equalized odds"],
    "generative_methods": ["tvae", "cart", "smote"],
}

# Preprocessing for the scikit-learn style models: standardise continuous
# inputs, one-hot encode categorical inputs (unknown categories are ignored).
numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(
    steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))]
)

# The protected attribute "sex" is not handed to the one-hot encoder, so the
# tree pipelines below never receive it as an encoded feature.
categorical_cols = adult_dataset_generator.categorical_input_cols.copy()
categorical_cols.remove("sex")

transformations = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, adult_dataset_generator.continuous_input_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)

# Full prediction pipelines: preprocessing followed by a seeded classifier.
clf_RF = Pipeline(
    steps=[
        ("preprocessor", transformations),
        ("classifier", RandomForestClassifier(random_state=42)),
    ]
)
clf_DT = Pipeline(
    steps=[
        ("preprocessor", transformations),
        ("classifier", DecisionTreeClassifier(random_state=42)),
    ]
)

# CatBoost is listed as a class and comes with constructor keyword arguments,
# whereas the two pipelines are ready-made estimators with empty argument dicts
# (presumably run_experiments instantiates the class itself - verify there).
# xgb.XGBClassifier was part of earlier variants of this list.
models_classification = [CatBoostClassifier, clf_DT, clf_RF]

# NOTE(review): cat_features uses the full categorical column list, which still
# contains "sex", unlike categorical_cols above - confirm this is intended.
catboost_args = {
    "random_state": 42,
    "loss_function": "Logloss",
    "verbose": False,
    "iterations": 100,
    "learning_rate": 0.01,
    "cat_features": adult_dataset_generator.categorical_input_cols,
}
if RUN_GPU:
    catboost_args["task_type"] = "GPU"

# One argument dict per entry of models_classification, in the same order.
args = [catboost_args, {}, {}]

# Display names, aligned by position with models_classification and args.
# NOTE(review): "Decission Tree" is misspelled, but it is a runtime label that
# may be matched elsewhere - left unchanged here.
model_names_classification = ["Catboost", "Decission Tree", "Random Forest"]

# Expand the shared settings into one problem dict per model. Each problem gets
# its own deep copy of the model so no fitted state is shared between problems.
problems_classification = []
for model, name, arg in zip(models_classification, model_names_classification, args):
    problem = dict(
        problem_classification,
        model=copy.deepcopy(model),
        model_name=name,
        args=arg,
    )
    problems_classification.append(problem)

In [None]:
# Run the full experiment grid over all configured problems on the Adult data,
# with "sex" as the protected attribute. The call returns the aggregated scores
# (average / std) and the aggregated feature importances.
# For a quick smoke test, lower num_repeats / num_folds (an earlier variant used
# num_repeats=1 and num_folds=2).
# NOTE(review): run_experiments is not defined in this notebook; presumably it is
# provided by the star import from `definitions`.
average, std, feat_imp_average, feat_imp_std = run_experiments(
    problems_classification,
    adult_dataset_generator,
    all_data,
    num_repeats=5,
    num_folds=3,
    protected_attributes=["sex"],
    visualize_tree=True,
)

In [None]:
# Persist the aggregated scores and feature importances produced by the
# experiment run into a single .npz archive.
results_path = '../results/arrays_test_many_repeats_no_fnlwgt.npz'
# np.savez does not create missing parent directories. Make sure the target
# folder exists first, so the output of a long experiment run is not lost to a
# FileNotFoundError on a fresh checkout.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
np.savez(
    results_path,
    average=average,
    std=std,
    feat_imp_average=feat_imp_average,
    feat_imp_std=feat_imp_std,
)


In [None]:
# Show the dimensions of the aggregated score array returned by run_experiments.
average.shape

In [None]:
# Labels used when rendering the results as a LaTeX table.
metric_names_actual = ["Accuracy", "F1", "Equalized Odds"]
names_train = [
    "Adult",
    "Augmented Adult (TVAE)",
    "Augmented Adult (CART)",
    "Augmented Adult (SMOTENC)",
]

# Only the first value returned by split_population is kept. Its keys could be
# used to build per-group column labels (f"Sex={value}"), but the table below
# currently reports the overall column only.
test_sets, _ = adult_dataset_generator.split_population(all_data)
protected_attributes = ["Sex"]

names_test = ["Overall"]

# generate_latex_table1 is not defined in this notebook (presumably it comes from
# the star import of `definitions`); print() emits the raw LaTeX source so it can
# be copied verbatim.
latex_table = generate_latex_table1(
    average,
    std,
    names_train,
    names_test,
    problems_classification,
    metric_names_actual=metric_names_actual,
    test_data=True,
)
print(latex_table)
