# Cross-validation exploration

This notebook analyzes the results of cross-validation and presents them in the format of the paper.

In [None]:
import pandas as pd
from collections import defaultdict
from CV import get_latex, get_latex_performance, get_latex_fairness
from pprint import pprint

In [None]:
# Raw cross-validation results; the file is semicolon-separated.
df = pd.read_csv(
    'CV/CV5_no_protected_attribute/cross_validation.csv',
    sep=';',
)

# Short keys for the (classifier, bias mitigator) combinations reported below.
MODEL_NAMES = (
    'logreg',
    'rf',
    'logregreweight',
    'rfreweight',
    'prejudiceremover',
)

In [None]:
def get_mean_stdev_for_single_model(selection: pd.DataFrame, model: str, deltas=False) -> pd.DataFrame:
    """Summarise every numeric column of ``selection`` as a ``'mean +/- std'`` string.

    Args:
        selection: Rows to aggregate (one row per cross-validation fold in this
            notebook). Non-numeric columns and the identifier columns
            ``'Bias Mitigator'`` / ``'fold'`` are ignored.
        model: Label used as the index of the single returned row and in the
            printed significance message.
        deltas: When True, ``selection`` holds differences against a baseline and
            the function prints which columns satisfy ``2 * std < |mean|``.

    Returns:
        A one-row DataFrame indexed by ``model`` with one column per aggregated
        metric, each cell formatted as ``'{mean:.3f} +/- {std:.3f}'``.
    """
    # Identifier columns are not measurements: drop them before aggregating so
    # they neither break the reduction nor show up in the significance report.
    metrics = selection.drop(columns=['Bias Mitigator', 'fold'], errors='ignore')
    # numeric_only=True is required on pandas >= 2.0, where reducing a frame that
    # still contains string columns (e.g. 'Classifier') raises a TypeError.
    means = metrics.mean(numeric_only=True)
    stds = metrics.std(numeric_only=True)
    mean_std = pd.concat([means, stds], axis=1, keys=['Mean', 'StdDev'])

    # The following check for statistical significance only applies to deltas
    if deltas:
        # NaN comparisons evaluate to False, so undefined rows are never reported.
        statistically_significant = mean_std[mean_std['StdDev'] * 2 < mean_std['Mean'].abs()]
        if len(statistically_significant) > 0:
            print('Statistical significance in ' + model, statistically_significant.index)
        else:
            print('No statistically significant rows')

    formatted = pd.Series(
        ['{:.3f} +/- {:.3f}'.format(mean, std)
         for mean, std in zip(mean_std['Mean'], mean_std['StdDev'])],
        index=mean_std.index,
        name=model,
        dtype=object,
    )
    # Metrics become columns; the model label becomes the row index.
    return formatted.to_frame().transpose()

In [None]:
def select_rows(df: pd.DataFrame, model: str) -> pd.DataFrame:
    """Return the rows of ``df`` that belong to one model key.

    Args:
        df: Frame with at least the columns ``'Classifier'`` and
            ``'Bias Mitigator'``.
        model: Model key. ``'logreg'`` / ``'rf'`` select rows without a bias
            mitigator, keys ending in ``'reweight'`` select the reweighing rows,
            and any other key selects the ``'Prejudice Remover'`` rows. Keys
            starting with ``'rf'`` map to ``'Random Forest'``, all others to
            ``'Logistic Regression'``.

    Returns:
        The matching subset of ``df`` (original index preserved).
    """
    mitigator_column = df['Bias Mitigator']
    if model in ('logreg', 'rf'):
        mitigator_mask = mitigator_column.isnull()
    elif model.endswith('reweight'):
        # This notebook spells the mitigator both 'Reweighting' (here) and
        # 'Reweighing' (get_mitigator_name). Accept either spelling so a mismatch
        # with the CSV cannot silently produce an empty selection.
        mitigator_mask = mitigator_column.isin(('Reweighting', 'Reweighing'))
    else:
        mitigator_mask = mitigator_column == 'Prejudice Remover'
    clf = 'Random Forest' if model.startswith('rf') else 'Logistic Regression'
    return df[mitigator_mask & (df['Classifier'] == clf)]

def get_mean_stdev(model: str) -> pd.DataFrame:
    """Build the one-row ``'mean +/- std'`` report for ``model``.

    The rows are taken from the notebook-level ``df`` via ``select_rows`` and
    aggregated with ``get_mean_stdev_for_single_model`` (no significance check).
    """
    return get_mean_stdev_for_single_model(select_rows(df, model), model)

In [None]:
def get_classifier_name(model_name: str) -> str:
    """Map a model key to its display classifier name.

    Keys starting with ``'rf'`` yield ``'Random Forest'``; every other key
    yields ``'Logistic Regression'``.
    """
    return 'Random Forest' if model_name.startswith('rf') else 'Logistic Regression'
    
def get_mitigator_name(model_name: str) -> str:
    """Map a model key to its display bias-mitigator name.

    Returns ``'Prejudice Remover'`` for the key ``'prejudiceremover'``,
    ``'Reweighing'`` for keys ending in ``'reweight'``, and an empty string for
    anything else (i.e. no mitigator).
    """
    if model_name == 'prejudiceremover':
        return 'Prejudice Remover'
    return 'Reweighing' if model_name.endswith('reweight') else ''

In [None]:
# Stack the one-row reports of all models, in MODEL_NAMES order; the index holds the model keys.
metrics_table = pd.concat(list(map(get_mean_stdev, MODEL_NAMES)))

In [None]:
# Replace the model-key index with readable 'Classifier' / 'Bias Mitigator' columns.
# The model keys live in the index only until this cell has run once, so the guard
# keeps the cell safe to re-execute (the previous in-place version failed on a
# second run because the index had already been replaced by integers).
if metrics_table.index.isin(MODEL_NAMES).all():
    model_keys = metrics_table.index.to_series()
    metrics_table = metrics_table.reset_index(drop=True).assign(
        **{
            'Classifier': model_keys.map(get_classifier_name).to_numpy(),
            'Bias Mitigator': model_keys.map(get_mitigator_name).to_numpy(),
        }
    )

In [None]:
# Show the per-model summary: one row per model key, cells formatted as 'mean +/- std'.
metrics_table

# Difference between original model and bias mitigators

In [None]:
# Effect of each bias mitigator relative to the unmitigated baseline.
# For every classifier and fold: delta = (metric with mitigator) - (baseline metric).
# The per-fold deltas are then summarised as 'mean +/- std' per mitigator.

# Everything that is not an identifier column is treated as a metric.
metric_columns = [col for col in df.columns if col not in ('Classifier', 'Bias Mitigator', 'fold')]

by_clf = []
for clf in df['Classifier'].unique():
    clf_df = df[df['Classifier'] == clf]
    df_by_fold = []
    for fold in clf_df['fold'].unique():
        clf_fold_df = clf_df[clf_df['fold'] == fold]
        is_baseline = clf_fold_df['Bias Mitigator'].isnull()
        mitigated = clf_fold_df[~is_baseline]
        if mitigated.empty:
            # Nothing to compare against the baseline in this fold.
            continue
        baseline = clf_fold_df[is_baseline]
        # Exactly one baseline row and one row per mitigator are expected per fold.
        assert len(baseline) == 1, \
            'expected exactly one baseline row for {} in fold {}'.format(clf, fold)
        assert mitigated['Bias Mitigator'].is_unique, \
            'duplicate bias mitigator rows for {} in fold {}'.format(clf, fold)
        # Subtract the single baseline row from every mitigated row (aligned on metric name).
        delta_df = mitigated[metric_columns] - baseline[metric_columns].iloc[0]
        delta_df.insert(0, 'Bias Mitigator', mitigated['Bias Mitigator'])
        delta_df['fold'] = fold
        df_by_fold.append(delta_df)
    delta_all_folds = pd.concat(df_by_fold, ignore_index=True)

    reports = []
    for lbl, grp in delta_all_folds.groupby('Bias Mitigator'):
        # Pass only the metric columns: the string label column must not reach the
        # mean/std aggregation (TypeError on pandas >= 2.0), and 'fold' is not a metric.
        report = get_mean_stdev_for_single_model(grp[metric_columns], clf + '_' + lbl, deltas=True)
        report['Bias Mitigator'] = lbl
        reports.append(report)
    clf_diffs = pd.concat(reports)
    clf_diffs['Classifier'] = clf
    by_clf.append(clf_diffs)
final_diffs = pd.concat(by_clf)

In [None]:
# Show the mitigator-vs-baseline deltas: one row per '<classifier>_<mitigator>' label.
final_diffs

# Format tables for LaTeX

In [None]:
# Emit the LaTeX source of the performance table for the per-model summary.
print(get_latex_performance(metrics_table))

In [None]:
# Emit the LaTeX source of the fairness table for the per-model summary.
print(get_latex_fairness(metrics_table))

In [None]:
# Emit the LaTeX source of the performance table for the mitigator-vs-baseline deltas.
print(get_latex_performance(final_diffs, True))

In [None]:
# Emit the LaTeX source of the fairness table for the mitigator-vs-baseline deltas.
print(get_latex_fairness(final_diffs, True))