### Premise :
Discriminations found in the data vs. found in the trained model: we must distinguish discriminations that are actually present in the data from those that exist only in the trained model. Someone who runs an analysis is likely to be more interested in a method that performs well at finding discrimination in the data itself. Relying on an intermediary model, as many methods do, might yield discriminations that are not real (i.e. that are not present in the data).

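- Proportion of discriminations that come from the data (nb discriminations found that are in the data / nb unique discriminations found)
- Proportion of new discriminations that are not in the data (nb discriminations found that are not in the data / nb unique discriminations found)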

In [2]:
import pandas as pd
from data_generator.main import generate_data
from path import HERE
import sqlite3

# Build the synthetic dataset through the project's generator. The cell output
# ("Using cached data") suggests it may be served from a cache -- TODO confirm.
data_obj = generate_data()

# SQLite database file read below; presumably it stores the outcome of the
# "global" testing experiment (inferred from the path) -- TODO confirm.
DB_PATH = HERE.joinpath("experiments/analyzing_methods/global/global_testing_res.db")

# NOTE(review): this connection is never closed in the visible cells.
conn = sqlite3.connect(DB_PATH)

# Load the "_results" and "_testdata" tables into DataFrames. Both table names
# share the same hex identifier, so they presumably belong to the same run.
df_res = pd.read_sql("SELECT * FROM main.table_fe2d9c73c8d144c3958cd9c0382ad043_results", conn)
df_test = pd.read_sql("SELECT * FROM main.table_fe2d9c73c8d144c3958cd9c0382ad043_testdata", conn)

Using cached data


In [3]:
from typing import Tuple
from functools import lru_cache


@lru_cache(maxsize=4096)
def matches_pattern(pattern: str, value: str) -> bool:
    """Return True when `value` matches `pattern`, where `*` is a wildcard.

    Every character of `pattern` other than `*` is matched literally; each
    `*` matches any run of characters (possibly empty, never a newline).
    The whole of `value` must match, not just a prefix.

    Results are memoised, so both arguments must be hashable strings.
    """
    import re
    # Escape special regex characters except *
    regex_pattern = re.escape(pattern).replace('\\*', '.*')
    # fullmatch rather than '^...$': '$' also matches just before a trailing
    # newline, which would wrongly accept e.g. "abc\n" for the pattern "abc".
    return re.fullmatch(regex_pattern, value) is not None


def is_individual_part_of_the_original_indv(indv_key, indv_key_list):
    """Tell whether `indv_key` is contained in `indv_key_list`.

    Containment is decided by the `in` operator of whatever container is
    passed, so the cost and the exact semantics are those of that container.

    NOTE(review): for a pandas Series, `in` tests the index labels rather
    than the values; pass a set or list of keys when values are meant.
    """
    found = indv_key in indv_key_list
    return found


def is_couple_part_of_a_group(couple_key, group_key_list):
    """List the group keys whose wildcard pattern matches a couple key.

    `couple_key` is expected to look like "<individual>-<individual>". The
    couple is tried in both orders, so a group pattern matches no matter
    which of the two individuals is written first.

    Returns the matching entries of `group_key_list` in their original
    order. A couple key that does not split into exactly two parts on "-"
    triggers a printed warning and yields an empty list.
    """
    parts = couple_key.split('-')
    if len(parts) != 2:
        print(f"Warning: Unexpected couple key format: {couple_key}")
        return []

    first, second = parts
    forward = f"{first}-{second}"
    backward = f"{second}-{first}"

    return [
        candidate
        for candidate in group_key_list
        if matches_pattern(candidate, forward) or matches_pattern(candidate, backward)
    ]


def _compile_wildcard(pattern):
    """Compile a `*`-wildcard pattern into a regex; all other characters are literal."""
    import re
    return re.compile(re.escape(pattern).replace('\\*', '.*'))


def evaluate_discrimination_detection(
        synthetic_data: pd.DataFrame,
        results_df: pd.DataFrame
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Compare detected individuals/couples with the groups of the synthetic data.

    Args:
        synthetic_data: must provide the columns `indv_key`, `group_key` and
            `subgroup_key`. A group key has the form "<pattern>-<pattern>",
            where `*` inside a pattern is a wildcard.
        results_df: must provide the columns `indv_key` and `couple_key`.
            A couple key has the form "<indv_key>-<indv_key>".

    Returns:
        A tuple `(group_analysis_df, results_df, summary_df)`:

        * `group_analysis_df` - one row per distinct group key, listing the
          result individuals/couples that literally belong to the group in
          the synthetic data ("exact") and those that only match the group's
          wildcard patterns ("new"), together with their counts.
        * `results_df` - a string-cast copy of the input with the extra
          columns `is_original_data`, `is_individual_part_of_a_group`,
          `is_couple_part_of_a_group` and `matching_groups`.
        * `summary_df` - a single-row frame of global counts and averages.

        Three empty DataFrames are returned when `results_df` is empty.

    Raises:
        ValueError: if a group key does not consist of exactly two
            "-"-separated patterns.

    Couple keys that do not split into exactly two parts are reported with a
    printed warning and treated as matching no group.
    """
    if results_df.empty:
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Convert to string type once
    synthetic_data = synthetic_data.astype({
        'indv_key': str,
        'group_key': str,
        'subgroup_key': str
    })
    results_df = results_df.astype({
        'indv_key': str,
        'couple_key': str
    })

    group_keys = list(synthetic_data['group_key'].unique())

    # Members of every group as sets, built in a single pass over the data.
    members_by_group = {group_key: set() for group_key in group_keys}
    for group_key, indv_key in zip(synthetic_data['group_key'], synthetic_data['indv_key']):
        members_by_group[group_key].add(indv_key)

    # Reverse lookup: individual -> groups it belongs to in the synthetic data.
    groups_of_member = {}
    for group_key in group_keys:
        for member in members_by_group[group_key]:
            groups_of_member.setdefault(member, []).append(group_key)

    # Compile each group's regexes once instead of once per comparison:
    # (first pattern, second pattern, whole "first-second" key).
    group_regexes = {}
    for group_key in group_keys:
        patterns = group_key.split('-')
        if len(patterns) != 2:
            raise ValueError(f"Unexpected group key format: {group_key}")
        group_regexes[group_key] = (
            _compile_wildcard(patterns[0]),
            _compile_wildcard(patterns[1]),
            _compile_wildcard(group_key),
        )

    # All matching is done per *unique* key; rows are filled in by lookup.
    unique_indv = list(results_df['indv_key'].unique())
    unique_couples = list(results_df['couple_key'].unique())

    # Groups whose first or second pattern matches the individual.
    indv_pattern_groups = {
        key: [
            group_key
            for group_key, (first, second, _) in group_regexes.items()
            if first.fullmatch(key) or second.fullmatch(key)
        ]
        for key in unique_indv
    }

    # Groups whose whole key matches the couple in either order, plus the
    # groups that contain both individuals of the couple in the synthetic data.
    couple_pattern_groups = {}
    couple_exact_groups = {}
    for key in unique_couples:
        parts = key.split('-')
        if len(parts) != 2:
            print(f"Warning: Unexpected couple key format: {key}")
            couple_pattern_groups[key] = []
            couple_exact_groups[key] = set()
            continue
        swapped = f"{parts[1]}-{parts[0]}"
        couple_pattern_groups[key] = [
            group_key
            for group_key, (_, _, whole) in group_regexes.items()
            if whole.fullmatch(key) or whole.fullmatch(swapped)
        ]
        couple_exact_groups[key] = (
            set(groups_of_member.get(parts[0], ()))
            & set(groups_of_member.get(parts[1], ()))
        )

    # Distribute the unique keys over per-group buckets; iterating the keys
    # in the outer loop keeps every bucket in first-appearance order.
    exact_indv = {group_key: [] for group_key in group_keys}
    new_indv = {group_key: [] for group_key in group_keys}
    exact_couples = {group_key: [] for group_key in group_keys}
    new_couples = {group_key: [] for group_key in group_keys}

    for key in unique_indv:
        own_groups = groups_of_member.get(key, ())
        for group_key in own_groups:
            exact_indv[group_key].append(key)
        for group_key in indv_pattern_groups[key]:
            if group_key not in own_groups:
                new_indv[group_key].append(key)

    for key in unique_couples:
        shared_groups = couple_exact_groups[key]
        for group_key in shared_groups:
            exact_couples[group_key].append(key)
        for group_key in couple_pattern_groups[key]:
            if group_key not in shared_groups:
                new_couples[group_key].append(key)

    # Create group analysis DataFrame
    group_columns = [
        'group_key',
        'synthetic_group_size',
        'individuals_part_of_original_data',
        'couples_part_of_original_data',
        'new_individuals_part_of_a_group_regex',
        'new_couples_part_of_a_group_regex',
        'num_exact_individual_matches',
        'num_exact_couple_matches',
        'num_new_group_individuals',
        'num_new_group_couples',
    ]
    group_rows = [
        {
            'group_key': group_key,
            'synthetic_group_size': len(members_by_group[group_key]),
            'individuals_part_of_original_data': exact_indv[group_key],
            'couples_part_of_original_data': exact_couples[group_key],
            'new_individuals_part_of_a_group_regex': new_indv[group_key],
            'new_couples_part_of_a_group_regex': new_couples[group_key],
            'num_exact_individual_matches': len(exact_indv[group_key]),
            'num_exact_couple_matches': len(exact_couples[group_key]),
            'num_new_group_individuals': len(new_indv[group_key]),
            'num_new_group_couples': len(new_couples[group_key]),
        }
        for group_key in group_keys
    ]
    group_analysis_df = pd.DataFrame(group_rows, columns=group_columns)

    # Per-row annotations. Membership is tested against plain dict/set keys:
    # `key in <pandas Series>` would test the index labels, not the values.
    indv_column = results_df['indv_key']
    couple_column = results_df['couple_key']
    results_additions = pd.DataFrame(
        {
            'is_original_data': [key in groups_of_member for key in indv_column],
            'is_individual_part_of_a_group': [
                bool(indv_pattern_groups[key]) for key in indv_column
            ],
            'is_couple_part_of_a_group': [
                bool(couple_pattern_groups[key]) for key in couple_column
            ],
            # A fresh list per row so rows never share one mutable object.
            'matching_groups': [list(indv_pattern_groups[key]) for key in indv_column],
        },
        index=results_df.index,
    )
    results_df = pd.concat([results_df, results_additions], axis=1)

    def column_mean(column: str) -> float:
        # With no groups there is nothing to average; report NaN.
        if not group_keys:
            return float('nan')
        return group_analysis_df[column].mean()

    # Create a summary DataFrame
    summary_df = pd.DataFrame({
        'total_synthetic_records': len(synthetic_data),
        'total_result_records': len(results_df),
        'total_groups': len(group_keys),
        'avg_group_size': column_mean('synthetic_group_size'),
        'avg_exact_matches_per_group': column_mean('num_exact_individual_matches'),
        'avg_new_matches_per_group': column_mean('num_new_group_individuals')
    }, index=[0])

    return group_analysis_df, results_df, summary_df


In [4]:
# evaluate_discrimination_detection returns three DataFrames, in this order:
# per-group analysis, annotated results, one-row summary. Unpacking them into
# only two names raises ValueError.
# NOTE(review): the mapping of the existing names onto the return order is
# inferred from the names -- confirm against the cells that use them.
group_details, eval_results, summary_df = evaluate_discrimination_detection(df_test, df_res)



KeyboardInterrupt

