In [18]:
import pandas as pd
import numpy as np
from fairlearn.metrics import equalized_odds_difference, demographic_parity_difference
from sklearn.metrics import accuracy_score, f1_score
from scipy.stats import t
import os

# Metrics (Mean Accuracy, 95% CI, F1-Score, EO, DP)

## All data samples

In [20]:
def process_file(file_path):
    """
    Processes a single CSV file to compute mean accuracy, CI, F1-score, and fairness metrics.

    Only rows whose ``dataset_split`` equals ``'test'`` are evaluated. Every column
    whose name contains ``"predicted_label"`` is scored separately against
    ``ground_truth`` and the per-column metrics are then averaged.

    Fairness metrics (equalized-odds difference and demographic-parity difference)
    are computed one-vs-rest for each of the ``gender`` / ``race`` / ``age`` columns
    present in the file. The positive class is the first label that appears in the
    test-split ground truth, and that same class is used for labels and predictions.

    Parameters:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame or None: A one-row DataFrame indexed by the file's base name.
        Accuracy and its CI bounds are percentages rounded to 2 decimals; all other
        metrics are rounded to 3 decimals. ``None`` is returned when the file cannot
        be processed (the reason is printed).
    """
    try:
        data = pd.read_csv(file_path)

        # Validate before touching any column so that a missing column is reported
        # with the descriptive message below instead of a bare KeyError.
        required_columns = {'dataset_split', 'ground_truth'}
        missing_columns = required_columns - set(data.columns)
        if missing_columns:
            raise ValueError(f"{file_path} is missing required columns: {missing_columns}")

        data = data[data['dataset_split'] == 'test']
        if data.empty:
            raise ValueError(f"{file_path} has no rows with dataset_split == 'test'")

        true_labels = data['ground_truth']
        sensitive_features = [col for col in data.columns if col in ['gender', 'race', 'age']]
        prediction_columns = [col for col in data.columns if "predicted_label" in col]
        if not prediction_columns:
            raise ValueError(f"{file_path} has no columns containing 'predicted_label'")

        def accuracy_with_ci(y_true, y_pred, confidence=0.95):
            # Normal-approximation standard error of a proportion, widened with a
            # Student-t quantile on n - 1 degrees of freedom.
            accuracy = accuracy_score(y_true, y_pred)
            n = len(y_true)
            se = np.sqrt(accuracy * (1 - accuracy) / n)
            h = se * t.ppf((1 + confidence) / 2, n - 1)
            return accuracy, (accuracy - h, accuracy + h)

        # The fairness metrics need binary labels. Labels and predictions must be
        # binarized against the SAME positive class; deriving the class separately
        # from each series (first unique value of each) can silently compare
        # "is class A" in the ground truth with "is class B" in the predictions.
        positive_label = true_labels.unique()[0]

        def binarize_labels(labels):
            return (labels == positive_label).astype(int)

        # Independent of the prediction column, so computed once.
        true_labels_binary = binarize_labels(true_labels)

        results = []
        for pred_col in prediction_columns:
            predictions = data[pred_col]
            accuracy, ci = accuracy_with_ci(true_labels, predictions)
            f1_micro = f1_score(true_labels, predictions, average='micro')

            predictions_binary = binarize_labels(predictions)

            fairness_metrics = {}
            for sensitive_feature in sensitive_features:
                eod = equalized_odds_difference(
                    y_true=true_labels_binary,
                    y_pred=predictions_binary,
                    sensitive_features=data[sensitive_feature]
                )
                dpd = demographic_parity_difference(
                    y_true=true_labels_binary,
                    y_pred=predictions_binary,
                    sensitive_features=data[sensitive_feature]
                )
                fairness_metrics[f"EO_{sensitive_feature}"] = eod
                fairness_metrics[f"DP_{sensitive_feature}"] = dpd

            results.append({
                'Prediction Column': pred_col,
                'Accuracy': accuracy,
                '95% CI Lower': ci[0],
                '95% CI Upper': ci[1],
                'F1-Score': f1_micro,
                **fairness_metrics
            })

        results_df = pd.DataFrame(results)
        fixed_columns = ['Prediction Column', 'Accuracy', '95% CI Lower', '95% CI Upper', 'F1-Score']
        metrics_order = [f"EO_{feat}" for feat in sensitive_features] + [f"DP_{feat}" for feat in sensitive_features]
        sorted_columns = fixed_columns + metrics_order
        results_df = results_df[sorted_columns]

        # Average every numeric metric over the prediction columns.
        mean_results = results_df.mean(numeric_only=True).to_frame(name='Mean').T

        # Convert specific metrics to percentage and apply appropriate rounding
        percentage_metrics = ['Accuracy', '95% CI Lower', '95% CI Upper']
        mean_results[percentage_metrics] = mean_results[percentage_metrics] * 100
        mean_results[percentage_metrics] = mean_results[percentage_metrics].round(2)

        # Round other metrics to 3 decimal places (list keeps the column order stable)
        other_metrics = [col for col in mean_results.columns if col not in percentage_metrics]
        mean_results[other_metrics] = mean_results[other_metrics].round(3)

        mean_results.index = [os.path.basename(file_path)]

        return mean_results

    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller skip the file.
        print(f"Error processing {file_path}: {e}")
        return None


# Input CSV files, evaluated in the order listed
file_paths = [
    "vit.csv", "vit_aug.csv",
    "cnn.csv", "cnn_aug.csv",
    "vgg.csv", "vgg_aug.csv",
    "vit_aug_under.csv", "vit_elastic.csv", "vit_elastic_under.csv",
    "t1.csv", "t2.csv", "t3(s1).csv", "t4.csv", "t5.csv", "t6.csv",
    "s2.csv", "s3.csv", "s4.csv", "s5.csv",
    "a1.csv", "a2.csv", "a3.csv", "a4.csv",
]

# Evaluate every file; files that could not be processed (None) are skipped
all_results = []
for file_path in file_paths:
    result = process_file(file_path)
    if result is None:
        continue
    all_results.append(result)

# Stack the one-row summaries into a single table (one row per file)
final_results_df = pd.concat(all_results)

# Persist the table, keeping the file names as the index column
output_file = "./results_metrics.csv"
final_results_df.to_csv(output_file)

print(f"Results saved to {output_file}")

Results saved to ./results_metrics.csv


## Samples from underrepresented groups

In [6]:
def process_file(file_path):
    """
    Processes a single CSV file, scoring accuracy on the minority subset only.

    NOTE: this redefines ``process_file`` from the earlier "All data samples" cell;
    whichever cell ran last determines which version is in effect.

    Only rows whose ``dataset_split`` equals ``'test'`` are used. A row belongs to
    the minority subset when its ``ground_truth`` is 1, 2 or 5 AND at least one of
    ``gender == 0``, ``race == 1`` or ``age in {0, 4}`` holds.
    (NOTE(review): the meaning of these codes is not defined in this cell; labels
    1/2/5 presumably correspond to Fear/Disgust/Anger per the emotion mapping used
    later in this notebook - confirm.)

    Accuracy, its 95% CI and the micro F1-score are computed on the minority subset.
    The fairness metrics (equalized-odds difference and demographic-parity
    difference) are computed one-vs-rest on the full test split, using the first
    label that appears in the test-split ground truth as the positive class for
    both labels and predictions. All metrics are averaged over every column whose
    name contains ``"predicted_label"``.

    Parameters:
        file_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame or None: A one-row DataFrame indexed by the file's base name.
        Accuracy and its CI bounds are percentages rounded to 2 decimals; all other
        metrics are rounded to 3 decimals. ``None`` is returned when the file cannot
        be processed (the reason is printed).
    """
    try:
        data = pd.read_csv(file_path)

        # Validate before touching any column so that a missing column is reported
        # with the descriptive message below instead of a bare KeyError.
        required_columns = {'dataset_split', 'ground_truth', 'gender', 'race', 'age'}
        missing_columns = required_columns - set(data.columns)
        if missing_columns:
            raise ValueError(f"{file_path} is missing required columns: {missing_columns}")

        data = data[data['dataset_split'] == 'test']
        if data.empty:
            raise ValueError(f"{file_path} has no rows with dataset_split == 'test'")

        # Define minority group conditions (vectorised form of the row-wise rule:
        # target label AND any of the three demographic conditions).
        is_minority = data['ground_truth'].isin([1, 2, 5]) & (
            (data['gender'] == 0) |
            (data['race'] == 1) |
            data['age'].isin([0, 4])
        )
        minority_data = data[is_minority]
        if minority_data.empty:
            raise ValueError(f"{file_path} has no minority-group rows in the test split")

        true_labels_minority = minority_data['ground_truth']
        true_labels_full = data['ground_truth']
        sensitive_features = [col for col in data.columns if col in ['gender', 'race', 'age']]
        prediction_columns = [col for col in data.columns if "predicted_label" in col]
        if not prediction_columns:
            raise ValueError(f"{file_path} has no columns containing 'predicted_label'")

        def accuracy_with_ci(y_true, y_pred, confidence=0.95):
            # Normal-approximation standard error of a proportion, widened with a
            # Student-t quantile on n - 1 degrees of freedom.
            accuracy = accuracy_score(y_true, y_pred)
            n = len(y_true)
            se = np.sqrt(accuracy * (1 - accuracy) / n)
            h = se * t.ppf((1 + confidence) / 2, n - 1)
            return accuracy, (accuracy - h, accuracy + h)

        # The fairness metrics need binary labels. Labels and predictions must be
        # binarized against the SAME positive class; deriving the class separately
        # from each series (first unique value of each) can silently compare
        # "is class A" in the ground truth with "is class B" in the predictions.
        positive_label = true_labels_full.unique()[0]

        def binarize_labels(labels):
            return (labels == positive_label).astype(int)

        # Independent of the prediction column, so computed once.
        true_labels_binary_full = binarize_labels(true_labels_full)

        results = []
        for pred_col in prediction_columns:
            # Metrics for minority group
            predictions_minority = minority_data[pred_col]
            accuracy_minority, ci_minority = accuracy_with_ci(true_labels_minority, predictions_minority)
            f1_micro_minority = f1_score(true_labels_minority, predictions_minority, average='micro')

            # Fairness metrics for the full dataset
            predictions_binary_full = binarize_labels(data[pred_col])

            fairness_metrics = {}
            for sensitive_feature in sensitive_features:
                eod = equalized_odds_difference(
                    y_true=true_labels_binary_full,
                    y_pred=predictions_binary_full,
                    sensitive_features=data[sensitive_feature]
                )
                dpd = demographic_parity_difference(
                    y_true=true_labels_binary_full,
                    y_pred=predictions_binary_full,
                    sensitive_features=data[sensitive_feature]
                )
                fairness_metrics[f"EO_{sensitive_feature}"] = eod
                fairness_metrics[f"DP_{sensitive_feature}"] = dpd

            results.append({
                'Prediction Column': pred_col,
                'Accuracy': accuracy_minority,
                '95% CI Lower': ci_minority[0],
                '95% CI Upper': ci_minority[1],
                'F1-Score': f1_micro_minority,
                **fairness_metrics
            })

        results_df = pd.DataFrame(results)
        fixed_columns = ['Prediction Column', 'Accuracy', '95% CI Lower',
                         '95% CI Upper', 'F1-Score']
        metrics_order = [f"EO_{feat}" for feat in sensitive_features] + [f"DP_{feat}" for feat in sensitive_features]
        sorted_columns = fixed_columns + metrics_order
        results_df = results_df[sorted_columns]

        # Average every numeric metric over the prediction columns.
        mean_results = results_df.mean(numeric_only=True).to_frame(name='Mean').T

        # Convert specific metrics to percentage and apply appropriate rounding
        percentage_metrics = ['Accuracy', '95% CI Lower', '95% CI Upper']
        mean_results[percentage_metrics] = mean_results[percentage_metrics] * 100
        mean_results[percentage_metrics] = mean_results[percentage_metrics].round(2)

        # Round other metrics to 3 decimal places (list keeps the column order stable)
        other_metrics = [col for col in mean_results.columns if col not in percentage_metrics]
        mean_results[other_metrics] = mean_results[other_metrics].round(3)

        mean_results.index = [os.path.basename(file_path)]

        return mean_results

    except Exception as e:
        # Best-effort batch processing: report the failure and let the caller skip the file.
        print(f"Error processing {file_path}: {e}")
        return None

# Input CSV files, evaluated in the order listed
file_paths = [
    "vit.csv", "vit_aug.csv",
    "cnn.csv", "cnn_aug.csv",
    "vgg.csv", "vgg_aug.csv",
    "vit_aug_under.csv", "vit_elastic.csv", "vit_elastic_under.csv",
    "t1.csv", "t2.csv", "t3(s1).csv", "t4.csv", "t5.csv", "t6.csv",
    "s2.csv", "s3.csv", "s4.csv", "s5.csv",
    "a1.csv", "a2.csv", "a3.csv", "a4.csv",
]

# Evaluate every file; files that could not be processed (None) are skipped
all_results = []
for file_path in file_paths:
    result = process_file(file_path)
    if result is None:
        continue
    all_results.append(result)

# Stack the one-row summaries into a single table (one row per file)
final_results_df = pd.concat(all_results)

# Persist the table, keeping the file names as the index column
output_file = "./results_metrics_under.csv"
final_results_df.to_csv(output_file)

print(f"Results saved to {output_file}")

Results saved to ./results_metrics_under.csv


# Emotion Class-wise Results with Metrics

In [14]:
# Read the prediction file and keep only the rows of the test split
file_path = "./vit.csv"
data = pd.read_csv(file_path)
data = data.loc[data['dataset_split'] == 'test']

# Ground truth, the sensitive attributes to audit, and every prediction column
true_labels = data['ground_truth']
sensitive_features = ['gender', 'race', 'age']
prediction_columns = [name for name in data.columns if "predicted_label" in name]

# Class index -> emotion name (the position in the list is the class number)
emotion_mapping = dict(enumerate([
    "Surprise",
    "Fear",
    "Disgust",
    "Happiness",
    "Sadness",
    "Anger",
    "Neutral",
]))

# Function to calculate accuracy with t-based CI
def accuracy_with_ci(y_true, y_pred, confidence=0.95):
    """
    Return the accuracy together with a symmetric confidence interval around it.

    The standard error is the normal-approximation value sqrt(p * (1 - p) / n); the
    margin multiplies it by the two-sided Student-t quantile with n - 1 degrees of
    freedom. The bounds are not clipped to [0, 1].

    Parameters:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        confidence (float): Confidence level of the interval.

    Returns:
        tuple: ``(accuracy, (lower, upper))``.
    """
    point_estimate = accuracy_score(y_true, y_pred)
    sample_size = len(y_true)
    std_error = np.sqrt(point_estimate * (1 - point_estimate) / sample_size)
    margin = std_error * t.ppf((1 + confidence) / 2, sample_size - 1)
    lower_bound = point_estimate - margin
    upper_bound = point_estimate + margin
    return point_estimate, (lower_bound, upper_bound)

# ---- Per-prediction-column metrics on the full label set ---------------------
overall_results = []

for pred_col in prediction_columns:
    predictions = data[pred_col]

    # Multi-class accuracy (with CI) and micro-averaged F1 for this column
    accuracy, ci = accuracy_with_ci(true_labels, predictions)
    f1_micro = f1_score(true_labels, predictions, average='micro')

    # One-vs-rest fairness gaps for every (sensitive feature, emotion class) pair
    fairness_metrics = {}
    for sensitive_feature in sensitive_features:
        group_membership = data[sensitive_feature]
        for class_num in emotion_mapping:
            is_class_true = (true_labels == class_num).astype(int)
            is_class_pred = (predictions == class_num).astype(int)
            key_suffix = f"{sensitive_feature}_class{class_num}"

            fairness_metrics[f"EO_{key_suffix}"] = equalized_odds_difference(
                y_true=is_class_true,
                y_pred=is_class_pred,
                sensitive_features=group_membership
            )
            fairness_metrics[f"DP_{key_suffix}"] = demographic_parity_difference(
                y_true=is_class_true,
                y_pred=is_class_pred,
                sensitive_features=group_membership
            )

    # Headline metrics are stored as percentages, followed by the fairness gaps
    column_summary = {
        'Prediction Column': pred_col,
        'Accuracy': accuracy * 100,
        '95% CI Lower': ci[0] * 100,
        '95% CI Upper': ci[1] * 100,
        'F1-Score': f1_micro * 100,
    }
    column_summary.update(fairness_metrics)
    overall_results.append(column_summary)

overall_results_df = pd.DataFrame(overall_results)

# ---- Class-wise metrics, averaged over all prediction columns ----------------
classwise_results = []

for class_num, emotion in emotion_mapping.items():
    true_labels_binary = (true_labels == class_num).astype(int)

    # One-vs-rest accuracy / CI (in percent) and binary F1 for each prediction column
    accuracies, ci_lowers, ci_uppers, f1_scores = [], [], [], []
    for pred_col in prediction_columns:
        predictions_binary = (data[pred_col] == class_num).astype(int)

        accuracy, ci = accuracy_with_ci(true_labels_binary, predictions_binary)
        accuracies.append(accuracy * 100)
        ci_lowers.append(ci[0] * 100)
        ci_uppers.append(ci[1] * 100)
        f1_scores.append(
            f1_score(true_labels_binary, predictions_binary, average='binary', pos_label=1)
        )

    class_row = {
        "Emotion Class": emotion,
        "Mean Accuracy (%)": round(np.mean(accuracies), 2),
        "CI Lower (%)": round(np.mean(ci_lowers), 2),
        "CI Upper (%)": round(np.mean(ci_uppers), 2),
        "F1-Score": round(np.mean(f1_scores), 3),
    }

    # Mean fairness gap of this class over the prediction columns
    for metric in ("EO", "DP"):
        for feature in ("gender", "race", "age"):
            per_column_gap = overall_results_df[f"{metric}_{feature}_class{class_num}"]
            class_row[f"{metric}_{feature.capitalize()}"] = round(per_column_gap.mean(), 3)

    classwise_results.append(class_row)

classwise_results_df = pd.DataFrame(classwise_results)

# Fix the column order of the exported table
column_order = [
    "Emotion Class", "Mean Accuracy (%)", "CI Lower (%)", "CI Upper (%)", "F1-Score",
    "EO_Gender", "EO_Race", "EO_Age",
    "DP_Gender", "DP_Race", "DP_Age",
]
classwise_results_df = classwise_results_df[column_order]

output_path = "./vit_class_wise.csv"
classwise_results_df.to_csv(output_path, index=False)

print(f"Results saved to {output_path}")

Results saved to ./vit_class_wise.csv


# Balanced Score of Accuracy and Fairness

In [16]:
def calculate_balanced_scores_from_csv(input_file, output_file, beta1=0.5, beta2=0.5):
    """
    Extract mean metrics from the CSV file, calculate balanced scores, and save the required columns.

    For every fairness metric ``m`` the balanced score of a row is

        beta1 * (1 - m) + beta2 * Accuracy / 100

    so ``Accuracy`` is expected as a percentage and the fairness metrics as raw
    differences. Scores are rounded to 3 decimals.

    The row identifier is taken from the ``'Unnamed: 0'`` column that pandas creates
    when a CSV was saved with an unnamed index. If that column is absent, the first
    column is used instead, provided it is not one of the metric columns.

    Parameters:
        input_file (str): Path to the CSV file containing mean results.
        output_file (str): Path to save the results with balanced scores.
        beta1 (float): Weight of the fairness term ``1 - metric`` in the balanced score.
        beta2 (float): Weight for mean accuracy in the balanced score.

    Returns:
        pd.DataFrame or None: DataFrame with a ``'File Name'`` column followed by one
        ``Balanced_<metric>`` column per fairness metric. ``None`` is returned when the
        input cannot be processed (the reason is printed).
    """
    try:
        # Load the CSV file
        data = pd.read_csv(input_file)

        # Columns needed for the balanced score calculation
        fairness_columns = ['EO_gender', 'EO_race', 'EO_age', 'DP_gender', 'DP_race', 'DP_age']
        required_columns = ['Accuracy'] + fairness_columns
        missing_columns = set(required_columns) - set(data.columns)
        if missing_columns:
            raise ValueError(f"The input file is missing required columns: {missing_columns}")

        # Locate the identifier column instead of assuming pandas' auto-generated
        # header, so files saved with a named index column are handled as well.
        if 'Unnamed: 0' in data.columns:
            id_column = 'Unnamed: 0'
        else:
            id_column = data.columns[0]
            if id_column in required_columns:
                raise ValueError(
                    "The input file has no identifier column: its first column "
                    f"'{id_column}' is a metric column"
                )

        # Build the output as a fresh frame: identifier first, then one balanced
        # score per fairness metric (same operator order as the formula above).
        scores = {'File Name': data[id_column]}
        for fairness_metric in fairness_columns:
            balanced = beta1 * (1 - data[fairness_metric]) + beta2 * data['Accuracy'] / 100
            scores[f"Balanced_{fairness_metric}"] = balanced.round(3)
        balanced_scores = pd.DataFrame(scores)

        # Save the final DataFrame to a CSV
        balanced_scores.to_csv(output_file, index=False)
        print(f"Balanced scores saved to {output_file}")

        return balanced_scores

    except Exception as e:
        # Best-effort: report the failure and signal it to the caller with None.
        print(f"Error processing {input_file}: {e}")
        return None


# Source table of per-file mean metrics and destination for the balanced scores
input_csv_file = "./results_metrics_under.csv"
output_csv_file = "./balanced_scores_under.csv"

# Compute the balanced scores with the default weights and write them to disk
final_balanced_scores = calculate_balanced_scores_from_csv(
    input_file=input_csv_file,
    output_file=output_csv_file,
)

Balanced scores saved to ./balanced_scores_under.csv
