# `metrics` documentation

This Jupyter notebook demonstrates the functionality of the `metrics` module of our fairness package, which evaluates differences in the performance of predictive health models across groups.

In [1]:
import fairness_pkg.metrics as fm

In this notebook we will use observations from 30 fictional participants, each of whom is given an age label, a sex label, a model prediction, and a true diagnosis, to show how functions from our package should be used.

In [2]:
# Demo cohort of 30 fictional participants. The four lists are parallel:
# position i in each list describes the same participant.

# Age group of each participant.
age_labels = [
    'Younger', 'Older', 'Older', 'Younger', 'Older',
    'Older', 'Younger', 'Older', 'Older', 'Younger',
    'Older', 'Younger', 'Older', 'Younger', 'Younger',
    'Older', 'Older', 'Older', 'Younger', 'Older',
    'Younger', 'Older', 'Younger', 'Older', 'Younger',
    'Younger', 'Younger', 'Older', 'Older', 'Older',
]

# Sex of each participant.
sex_labels = [
    'Male', 'Female', 'Male', 'Male', 'Female',
    'Male', 'Male', 'Female', 'Male', 'Female',
    'Female', 'Male', 'Female', 'Male', 'Female',
    'Male', 'Female', 'Female', 'Female', 'Female',
    'Female', 'Female', 'Female', 'Male', 'Male',
    'Male', 'Male', 'Male', 'Male', 'Female',
]

# Binary model output for each participant.
model_predictions = [
    1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
    0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
    0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
]

# Binary ground-truth diagnosis for each participant.
true_diagnoses = [
    1, 1, 1, 0, 1, 0, 0, 1, 0, 1,
    1, 0, 0, 0, 1, 1, 0, 0, 0, 1,
    0, 1, 1, 0, 1, 0, 0, 1, 1, 1,
]

# Maps each grouping attribute to its per-participant labels; this is the
# object passed as `subject_labels_dict` to the intersectional functions.
all_group_labels_dict = dict(Age=age_labels, Sex=sex_labels)

print(all_group_labels_dict)

{'Age': ['Younger', 'Older', 'Older', 'Younger', 'Older', 'Older', 'Younger', 'Older', 'Older', 'Younger', 'Older', 'Younger', 'Older', 'Younger', 'Younger', 'Older', 'Older', 'Older', 'Younger', 'Older', 'Younger', 'Older', 'Younger', 'Older', 'Younger', 'Younger', 'Younger', 'Older', 'Older', 'Older'], 'Sex': ['Male', 'Female', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Female', 'Female', 'Female', 'Female', 'Female', 'Male', 'Male', 'Male', 'Male', 'Male', 'Male', 'Female']}


The functions `group_acc`, `group_acc_diff`, and `group_acc_ratio` can be used to explore differences between simple, non-intersectional groups.

In [3]:
# Accuracy of the model predictions for the participants labelled 'Female'.
female_acc = fm.group_acc(
    group_label='Female',
    subject_labels=sex_labels,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(female_acc)

0.5333333333333333


In [4]:
# Accuracy of the model predictions for the participants labelled 'Male'.
male_acc = fm.group_acc(
    group_label='Male',
    subject_labels=sex_labels,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(male_acc)

0.6666666666666666


In [5]:
# Compare the 'Female' and 'Male' accuracies as a difference.
sex_acc_diff = fm.group_acc_diff(
    'Female', 'Male', sex_labels, model_predictions, true_diagnoses
)
print(sex_acc_diff)

0.1333333333333333


In [6]:
# Compare the 'Female' and 'Male' accuracies as a plain (non-log) ratio.
sex_acc_ratio = fm.group_acc_ratio(
    'Female', 'Male', sex_labels, model_predictions, true_diagnoses,
    natural_log=False,
)
print(sex_acc_ratio)

1.25


In [7]:
# Same comparison with natural_log=True, which reports the ratio on the
# natural-log scale.
sex_acc_log_ratio = fm.group_acc_ratio(
    'Female', 'Male', sex_labels, model_predictions, true_diagnoses,
    natural_log=True,
)
print(sex_acc_log_ratio)

0.22314355131420976


The functions `intersect_acc`, `all_intersect_accs`, `max_intersect_acc_diff` and `max_intersect_acc_ratio` can be used to explore differences between intersectional groups.

To demonstrate the `intersect_acc` function we will first create dictionaries for each possible sex and age combination.

In [8]:
# One dictionary per Age x Sex combination. Each maps an attribute name
# (matching a key of `all_group_labels_dict`) to the label that selects
# the intersectional group.
older_female_dict = {'Age': 'Older', 'Sex': 'Female'}
older_male_dict = {'Age': 'Older', 'Sex': 'Male'}
younger_female_dict = {'Age': 'Younger', 'Sex': 'Female'}
younger_male_dict = {'Age': 'Younger', 'Sex': 'Male'}

Now we can try using the intersectional accuracy functions.

In [9]:
# Accuracy for participants who are both 'Older' and 'Female'.
older_female_acc = fm.intersect_acc(
    group_labels_dict=older_female_dict,
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(older_female_acc)

0.5


In [10]:
# Accuracy for participants who are both 'Older' and 'Male'.
older_male_acc = fm.intersect_acc(
    group_labels_dict=older_male_dict,
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(older_male_acc)

0.5714285714285714


In [11]:
# Accuracy for participants who are both 'Younger' and 'Female'.
younger_female_acc = fm.intersect_acc(
    group_labels_dict=younger_female_dict,
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(younger_female_acc)

0.6


In [12]:
# Accuracy for participants who are both 'Younger' and 'Male'.
younger_male_acc = fm.intersect_acc(
    group_labels_dict=younger_male_dict,
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(younger_male_acc)

0.75


In [13]:
# Accuracy of every Age x Sex combination in a single call, without having
# to build the per-group dictionaries by hand.
intersect_accs = fm.all_intersect_accs(
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(intersect_accs)

{'Older + Female': 0.5, 'Older + Male': 0.5714285714285714, 'Younger + Female': 0.6, 'Younger + Male': 0.75}


In [14]:
# Maximum accuracy difference across the intersectional groups.
max_acc_diff = fm.max_intersect_acc_diff(
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
)
print(max_acc_diff)

0.25


In [15]:
# Maximum accuracy ratio across the intersectional groups, as a plain
# (non-log) ratio.
max_acc_ratio = fm.max_intersect_acc_ratio(
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
    natural_log=False,
)
print(max_acc_ratio)

1.5


In [16]:
# Maximum accuracy ratio across the intersectional groups, reported on the
# natural-log scale.
max_acc_log_ratio = fm.max_intersect_acc_ratio(
    subject_labels_dict=all_group_labels_dict,
    predictions=model_predictions,
    true_statuses=true_diagnoses,
    natural_log=True,
)
print(max_acc_log_ratio)

0.4054651081081644
