In [1]:
import sys 
sys.path.append("..")
from src.dataset import Dataset
import pandas as pd
import os
import numpy as np
from sklearn.exceptions import NotFittedError
import copy

import warnings

# Silence the two UserWarnings LightGBM emits about the `categorical_feature`
# argument. Each pattern is a regular expression matched against the start of
# the warning text.
_LGBM_WARNING_PATTERNS = (
    "categorical_feature keyword has been found*",
    "categorical_feature in param dict is overridden*",
)
for _pattern in _LGBM_WARNING_PATTERNS:
    warnings.filterwarnings("ignore", category=UserWarning, message=_pattern)


# When True, a later import cell replaces the scikit-learn tree models with
# the cuML (GPU) classes of the same name.
RUN_GPU = False


In [2]:
from fairlearn.metrics import demographic_parity_difference, demographic_parity_ratio, true_positive_rate_difference, true_positive_rate, false_positive_rate_difference

def eq_odd(y_test, y_pred, group_test):
    """Equalized-odds gap, computed as TPR difference plus FPR difference.

    Parameters
    ----------
    y_test : true labels.
    y_pred : predicted labels.
    group_test : group membership, forwarded to fairlearn as
        ``sensitive_features``.

    Returns
    -------
    The sum of fairlearn's ``true_positive_rate_difference`` and
    ``false_positive_rate_difference`` for the given predictions.

    Note
    ----
    This sums the two gaps; fairlearn's own ``equalized_odds_difference``
    reports the larger of the two by default, so values are not directly
    comparable.
    """
    tpr_gap = true_positive_rate_difference(y_test, y_pred, sensitive_features=group_test)
    fpr_gap = false_positive_rate_difference(y_test, y_pred, sensitive_features=group_test)
    return tpr_gap + fpr_gap

In [3]:
import warnings

# Ignore every FutureWarning from here on; the filter is global, so it also
# applies to all cells executed after this one.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, precision_score, recall_score
import xgboost as xgb

from catboost import CatBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
# from lightgbm import Dataset as lgbmdataset


if RUN_GPU:
    from cuml import RandomForestClassifier, DecisionTreeClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator


In [5]:
# Name of the dataset to load; passed straight to the project's Dataset class.
dataset = "dutch"
dataset_generator = Dataset(dataset)

# Work on a copy so the frame held by the generator is never modified here.
all_data = dataset_generator.original_dataframe.copy()

# Declared type of each column, in the same order as `all_data.columns`.
dtype_lookup = dataset_generator.dtype_map
column_types_map = [dtype_lookup[column] for column in all_data.columns]

Dataset dutch has ['age', 'household_position', 'household_size', 'prev_residence_place', 'citizenship', 'country_birth', 'edu_level', 'economic_status', 'cur_eco_activity', 'marital_status'] categorical and [] numerical columns.


In [6]:
from definitions import *

In [7]:
# True only when every column of the dataset is typed as 'category'.
all_categorical = all(dtype == 'category' for dtype in column_types_map)
protected_attribute = "sex"

# Synthetic-data generators to evaluate. "smote" is only included when at
# least one column is not categorical.
generative_methods = ["gaussian_copula", "ctgan", "tvae", "cart"]
if all_categorical:
    print("Only categorical features, dropping SMOTE")
else:
    generative_methods.append("smote")

# Settings shared by every model; the per-model keys ("model", "model_name",
# "args") are filled in at the end of this cell.
problem_classification = {
    "metrics": [accuracy_score, f1_score, roc_auc_score],
    "metric_names": ["Accuracy", "F1", "ROC AUC"],
    "fairness_metrics": [eq_odd],
    "fairness_metric_names": ["Equalized odds"],
    "generative_methods": generative_methods,
    "sampling_methods": ['class', 'class_protected', 'protected', 'same_class'],
}
                    



# Preprocessing per feature type: numeric columns are standardised, categorical
# columns are one-hot encoded (categories not seen during fit are ignored).
numeric_transformer = Pipeline([('scaler', StandardScaler())])
categorical_transformer = Pipeline([('onehot', OneHotEncoder(handle_unknown='ignore'))])


# categorical_transformer_lgbm = Pipeline(steps=[
#     ('ordinal', PositiveOrdinalEncoder())
# ])

# Copy so later changes to this list cannot touch the generator's own attribute.
categorical_cols = dataset_generator.categorical_input_cols.copy()

# Route each group of input columns to its transformer.
numeric_branch = ('num', numeric_transformer, dataset_generator.continuous_input_cols)
categorical_branch = ('cat', categorical_transformer, categorical_cols)
transformations = ColumnTransformer(transformers=[numeric_branch, categorical_branch])

# transformations_lgbm = ColumnTransformer(
#     transformers=[
#         ('num', numeric_transformer, dataset_generator.continuous_input_cols),
#         ('cat', categorical_transformer_lgbm, categorical_cols)])

def _with_preprocessing(classifier):
    """Return a Pipeline that runs `transformations` and then `classifier`."""
    return Pipeline(steps=[('preprocessor', transformations),
                           ('classifier', classifier)])


# Full prediction pipelines: preprocessing followed by a seeded tree model.
# Both pipelines are built around the same `transformations` object.
clf_RF = _with_preprocessing(RandomForestClassifier(random_state=42))
clf_DT = _with_preprocessing(DecisionTreeClassifier(random_state=42))

# clf_lgbm = Pipeline(steps=[('preprocessor', transformations_lgbm),
#                     ('classifier', LGBMClassifier(categorical_feature=dataset_generator.categorical_input_col_locations, verbose=-1))])  

                    
# One row per model: (display name, estimator, extra keyword arguments).
# Keeping the three pieces in a single table guarantees they stay aligned;
# separate parallel lists would be silently truncated by zip() if one of them
# were edited without the others.
# LightGBM and XGBoost are given as classes (presumably instantiated with the
# keyword arguments by run_experiments_all_sampling — confirm in definitions);
# the decision tree and random forest are the prebuilt pipelines from above.
model_specs = [
    ("LightGBM", LGBMClassifier,
     {"categorical_feature": dataset_generator.categorical_input_col_locations, "verbose": -1}),
    ("XGBoost", xgb.XGBClassifier, {"enable_categorical": True, "tree_method": 'hist'}),
    ("Decision Tree", clf_DT, {}),
    ("Random Forest", clf_RF, {}),
]

# The original list names are still exposed, derived from the table.
model_names_classification = [spec[0] for spec in model_specs]
models_classification = [spec[1] for spec in model_specs]
args = [spec[2] for spec in model_specs]

# Build one problem description per model from the shared template.
problems_classification = []
for name, model, arg in model_specs:
    # Shallow copy: the metric / method lists are shared between problems.
    problem = problem_classification.copy()
    # Deep copy so each problem owns its pipeline instance (classes are
    # returned unchanged by deepcopy).
    problem["model"] = copy.deepcopy(model)
    problem["model_name"] = name
    problem["args"] = arg
    problems_classification.append(problem)


Only categorical features, dropping SMOTE


In [8]:
# Run every configured problem on the dataset with 5 repeats of 3 folds.
experiment_outputs = run_experiments_all_sampling(
    problems_classification,
    dataset_generator,
    all_data,
    num_repeats=5,
    num_folds=3,
    protected_attributes=[protected_attribute],
)
average, std, feat_imp_average, feat_imp_std = experiment_outputs


Split 0 / 15
0        1
2        1
3        0
9        1
10       1
        ..
60400    1
60413    1
60415    1
60417    1
60419    1
Name: occupation, Length: 20159, dtype: int64
occupation
1    12613
0     7546
Name: count, dtype: int64
		 Synthetic samples gaussian_copula
		 Synthetic samples ctgan
		 Synthetic samples tvae
		 Synthetic samples cart
		 Synthetic samples gaussian_copula
		 Synthetic samples ctgan
		 Synthetic samples tvae
		 Synthetic samples cart
		 Synthetic samples gaussian_copula
		 Synthetic samples ctgan
		 Synthetic samples tvae
		 Synthetic samples cart
		 Synthetic samples gaussian_copula
		 Synthetic samples ctgan
		 Synthetic samples tvae
		 Synthetic samples cart
	 Evaluating
Split 1 / 15
1        0
4        0
5        1
7        0
11       0
        ..
60407    1
60411    0
60412    0
60414    1
60416    0
Name: occupation, Length: 20242, dtype: int64
occupation
0    13626
1     6616
Name: count, dtype: int64
		 Synthetic samples gaussian_copula
		 Synth

KeyboardInterrupt: 

In [None]:
# Quick check of the dimensions of the `average` result returned by the experiment run.
average.shape

In [None]:
# NOTE(review): `dsafasdf` is not defined anywhere in this notebook, so this
# cell raises NameError and stops "Run All" before the save / report cells
# below. Looks like either a deliberate stop or stray keystrokes — confirm,
# then delete it or replace it with an explicit, documented guard.
dsafasdf

In [None]:
# Save the aggregated experiment outputs to disk.
# NOTE(review): the file name mentions "fnlwgt", which looks like it was carried
# over from a notebook on a different dataset, while this notebook loads
# `dataset = "dutch"` — confirm this does not overwrite another dataset's results.
results_path = '../results/arrays_test_many_repeats_no_fnlwgt.npz'
# np.savez does not create missing folders; make sure the target directory
# exists so a long experiment run is not lost to a FileNotFoundError.
os.makedirs(os.path.dirname(results_path), exist_ok=True)
np.savez(results_path, average=average, std=std, feat_imp_average=feat_imp_average, feat_imp_std=feat_imp_std)


In [None]:
# Same dimension check as before the save cell (repeated).
average.shape

In [None]:
# Labels used when rendering the results table.
# NOTE(review): these labels mention "Adult" and "SMOTENC" and list three
# metrics, whereas this notebook loads the "dutch" dataset, drops SMOTE, and
# configures three metrics plus one fairness metric. They look carried over
# from another notebook — confirm they match what generate_latex_table1 expects.
metric_names_actual = ["Accuracy", "F1", "Equalized Odds"]
names_train = ["Adult", "Augmented Adult (TVAE)", "Augmented Adult (CART)", "Augmented Adult (SMOTENC)"]

# Use this notebook's generator: `adult_dataset_generator` is never defined
# here, so the original line raised NameError on a fresh kernel.
test_sets, _ = dataset_generator.split_population(all_data)
protected_attributes = ["Sex"]

# names_test = [f"Sex={value}" for value in test_sets.keys()]
# Only the overall test set is reported.
names_test = ["Overall"]
latex_table = generate_latex_table1(average, std, names_train, names_test, problems_classification, metric_names_actual=metric_names_actual, test_data=True)
print(latex_table)
