In [None]:
import pipe
import torch
import src.models as models

# Build the network and restore the trained weights.
# `map_location="cpu"` lets a checkpoint that was saved from a GPU be opened on a
# machine without CUDA; the weights are copied into the freshly built (CPU) model
# either way, so this does not change where the model lives.
model = models.OB_05Model()
model.load_state_dict(torch.load("../output/model/best_model.pth", map_location="cpu"))
model.eval()

In [None]:
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from src.data_loader import create_data_loader

# Deterministic preprocessing for evaluation.
# The random flip/rotation augmentations were removed: applied at evaluation time
# they make every run score a differently perturbed copy of the images, so the
# reported metrics (and the statistical tests built on them) are not reproducible.
transform = transforms.Compose([
    transforms.Resize((90, 90)),  # Resize images to 90x90
    transforms.Grayscale(num_output_channels=1),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


In [None]:
from src.visualization.plot_histograms import plot_image_dimensions_histogram

def get_image_paths(directory, extensions=None):
    """Recursively collect the paths of image files below ``directory``.

    Args:
        directory: Root folder to walk; sub-folders are included.
        extensions: File suffixes to accept, e.g. ``['.jpg', '.png']``.
            Matching is case-insensitive. A single string is treated as one
            suffix. Defaults to ``.jpg``, ``.jpeg``, ``.png``, ``.bmp``, ``.gif``.

    Returns:
        list: One path per matching file (folder path joined with the file
        name), in the order ``os.walk`` yields them.
    """
    if extensions is None:
        extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']
    elif isinstance(extensions, str):
        # A bare string would otherwise be iterated character by character.
        extensions = [extensions]

    # File names are lower-cased before matching, so the suffixes must be too;
    # otherwise a caller passing '.JPG' would never match anything.
    suffixes = tuple(ext.lower() for ext in extensions)

    image_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            # str.endswith accepts a tuple and is False for an empty one.
            if file.lower().endswith(suffixes):
                image_paths.append(os.path.join(root, file))
    return image_paths

# plot_image_dimensions_histogram(get_image_paths(r"../dataset/ANGRY_MODIFIED/young_images"))

In [None]:
import torch
import numpy as np
import torch.nn as nn
import src.utils as utils

from torch.utils.data import DataLoader
from src.types import EvaluationResults


# Run on the GPU when one is available, otherwise fall back to the CPU.
__device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def evaluate_model(model: nn.Module, dataloader: DataLoader, num_classes: int = 4) -> EvaluationResults:
    """Run ``model`` over ``dataloader`` and accumulate a confusion matrix.

    The model is switched to eval mode and moved to the selected device. After
    every batch the metrics of the confusion matrix accumulated so far are
    printed.

    Args:
        model: Network to evaluate; its output is reduced with a max over
            dimension 1 to obtain the predicted class index.
        dataloader: Yields ``(images, labels)`` batches; labels are used as
            class indices and must lie in ``[0, num_classes)``.
        num_classes: Side length of the square confusion matrix (default 4).

    Returns:
        EvaluationResults: Wraps the confusion matrix, which is indexed as
        ``[predicted, expected]``.
    """
    model.eval().to(__device)

    confusion_matrix = np.zeros((num_classes, num_classes), dtype=int)

    # evaluation/testing
    with torch.no_grad():
        for current_batch, (images, labels) in enumerate(dataloader, start=1):
            images = images.to(__device)

            # Forward pass; the prediction is the index of the largest output.
            outputs = model(images)
            _, predicted = torch.max(outputs, 1)

            predicted_classes = predicted.cpu().numpy()
            expected_classes = labels.cpu().numpy()

            # np.add.at accumulates repeated (predicted, expected) pairs, which a
            # plain fancy-indexed `+= 1` would count only once.
            np.add.at(confusion_matrix, (predicted_classes, expected_classes), 1)

            __print_confusion_matrix_metrics(current_batch, confusion_matrix)

    return EvaluationResults(confusion_matrix=confusion_matrix)

def __print_confusion_matrix_metrics(batch_number: int, confusion_matrix: np.ndarray):
    """Print macro/micro precision, recall and F1 plus accuracy for a confusion matrix.

    Args:
        batch_number: Number shown in the report header.
        confusion_matrix: Matrix passed to the ``utils.cm_macro`` and
            ``utils.cm_micro`` ``calculate_overall_metrics`` helpers.
    """
    macro = utils.cm_macro.calculate_overall_metrics(confusion_matrix)
    macro_p, macro_r, macro_f1, macro_acc = macro
    micro = utils.cm_micro.calculate_overall_metrics(confusion_matrix)
    micro_p, micro_r, micro_f1, micro_acc = micro

    # Both variants are expected to report the same accuracy; average them anyway.
    accuracy = (macro_acc + micro_acc) / 2

    report_lines = [
        f'\nTesting batch #{batch_number}:',
        f'\tMACRO precision: {macro_p:.4f}',
        f'\tMACRO recall: {macro_r:.4f}',
        f'\tMACRO f1_score: {macro_f1:.4f}\n',
        f'\tMICRO precision: {micro_p:.4f}',
        f'\tMICRO recall: {micro_r:.4f}',
        f'\tMICRO f1_score: {micro_f1:.4f}\n',
        f'\tAccuracy: {accuracy:.4f}\n\n',
    ]
    print('\n'.join(report_lines))

# Testing 

In [None]:
from src.visualization.model_evaluation import TestingVisualizations

# Evaluate the model on the "young" subset of the age-bias dataset.
# Despite the `_trainset` suffix this data is only evaluated, never trained on;
# the name is kept because later cells refer to it.
young_trainset = datasets.ImageFolder(root=r"../dataset/bias/age/young", transform=transform)
young_dataloader = create_data_loader(young_trainset)
eval_results_young = evaluate_model(model=model, dataloader=young_dataloader)

# Optional tables (disabled). Kept as comments instead of a bare string literal,
# which Jupyter would echo as this cell's output.
# _ = TestingVisualizations.generate_overall_metrics_table(eval_results_young)
# _ = TestingVisualizations.generate_metrics_per_class_table(eval_results_young)

In [None]:
from src.visualization.model_evaluation import TestingVisualizations

# Evaluate the model on the "middle_aged" subset of the age-bias dataset.
# Despite the `_trainset` suffix this data is only evaluated, never trained on;
# the name is kept because later cells refer to it.
middle_aged_trainset = datasets.ImageFolder(root=r"../dataset/bias/age/middle_aged", transform=transform)
middle_aged_dataloader = create_data_loader(middle_aged_trainset)
eval_results_middle_aged = evaluate_model(model=model, dataloader=middle_aged_dataloader)

# Optional tables (disabled). Kept as comments instead of a bare string literal,
# which Jupyter would echo as this cell's output.
# _ = TestingVisualizations.generate_overall_metrics_table(eval_results_middle_aged)
# _ = TestingVisualizations.generate_metrics_per_class_table(eval_results_middle_aged)

In [None]:
from src.visualization.model_evaluation import TestingVisualizations

# Evaluate the model on the "old" subset of the age-bias dataset.
# Despite the `_trainset` suffix this data is only evaluated, never trained on;
# the name is kept because later cells refer to it.
old_trainset = datasets.ImageFolder(root=r"../dataset/bias/age/old", transform=transform)
old_dataloader = create_data_loader(old_trainset)
eval_results_old = evaluate_model(model=model, dataloader=old_dataloader)

# Optional tables (disabled). Kept as comments instead of a bare string literal,
# which Jupyter would echo as this cell's output.
# _ = TestingVisualizations.generate_overall_metrics_table(eval_results_old)
# _ = TestingVisualizations.generate_metrics_per_class_table(eval_results_old)

In [None]:
# Evaluate the model on the "male" subset of the gender-bias dataset.
male_trainset = datasets.ImageFolder(
    transform=transform,
    root=r"../dataset/bias/gender/male",
)
male_dataloader = create_data_loader(male_trainset)
eval_results_male = evaluate_model(dataloader=male_dataloader, model=model)

In [None]:
# Evaluate the model on the "female" subset of the gender-bias dataset.
female_trainset = datasets.ImageFolder(
    transform=transform,
    root=r"../dataset/bias/gender/female",
)
female_dataloader = create_data_loader(female_trainset)
eval_results_female = evaluate_model(dataloader=female_dataloader, model=model)

# Statistical analysis, Age

In [None]:
# Metrics table for the "young" group; the trailing `.style` expression is what
# the cell displays. Its first row is reused by the statistical test cells.
young_metrics = eval_results_young.get_metrics_table_as_df()
young_metrics.style

In [None]:
# Metrics table for the "middle-aged" group; the trailing `.style` expression is
# what the cell displays. Its first row is reused by the statistical test cells.
middle_aged_metrics = eval_results_middle_aged.get_metrics_table_as_df()
middle_aged_metrics.style

In [None]:
# Metrics table for the "old" group; the trailing `.style` expression is what
# the cell displays. Its first row is reused by the statistical test cells.
old_metrics = eval_results_old.get_metrics_table_as_df()
old_metrics.style

In [None]:
import pandas as pd
import pipe as pipe

# First row of each age group's metrics table, converted to plain floats.
young_metrics_list = [float(value) for value in young_metrics.iloc[0].tolist()]
middle_aged_metrics_list = [float(value) for value in middle_aged_metrics.iloc[0].tolist()]
old_metrics_list = [float(value) for value in old_metrics.iloc[0].tolist()]

# Long-format frame: one row per score, labelled with the group it came from.
data = young_metrics_list + middle_aged_metrics_list + old_metrics_list
groups = (
    ['Young'] * len(young_metrics_list)
    + ['Middle-aged'] * len(middle_aged_metrics_list)
    + ['Old'] * len(old_metrics_list)
)
df = pd.DataFrame({'Score': data, 'Group': groups})

In [None]:
from scipy.stats import kruskal
import scikit_posthocs as sp

# Kruskal-Wallis H-test across the three age groups.
kruskal_stat, kruskal_p_value = kruskal(
    young_metrics_list,
    middle_aged_metrics_list,
    old_metrics_list,
)

print(f"Kruskal-Wallis Test: H={kruskal_stat}, p-value={kruskal_p_value}")

# Only run the pairwise Nemenyi post-hoc test when the omnibus test is significant.
if not kruskal_p_value < 0.05:
    print("The differences among the groups are not statistically significant.")
else:
    print("The differences among the groups are statistically significant.")
    
    nemenyi_test = sp.posthoc_nemenyi(df, val_col='Score', group_col='Group')
    print("\nNemenyi test results:\n", nemenyi_test)

# Statistical analysis, Gender

In [None]:
# Metrics table for the "male" group; the trailing `.style` expression is what
# the cell displays. Its first row is reused by the statistical test cells.
male_metrics = eval_results_male.get_metrics_table_as_df()
male_metrics.style

In [None]:
# Metrics table for the "female" group; the trailing `.style` expression is what
# the cell displays. Its first row is reused by the statistical test cells.
female_metrics = eval_results_female.get_metrics_table_as_df()
female_metrics.style

In [None]:
import pandas as pd
import pipe as pipe

# First row of each gender group's metrics table, converted to plain floats.
male_metrics_list = [float(value) for value in male_metrics.iloc[0].tolist()]
female_metrics_list = [float(value) for value in female_metrics.iloc[0].tolist()]

# Long-format frame: one row per score, labelled with the group it came from.
data = male_metrics_list + female_metrics_list
groups = (
    ['Male'] * len(male_metrics_list)
    + ['Female'] * len(female_metrics_list)
)
df_gender = pd.DataFrame({'Score': data, 'Group': groups})

In [None]:
from scipy.stats import kruskal
import scikit_posthocs as sp

# Kruskal-Wallis H-test between the two gender groups.
kruskal_stat, kruskal_p_value = kruskal(
    male_metrics_list,
    female_metrics_list,
)

print(f"Kruskal-Wallis Test: H={kruskal_stat}, p-value={kruskal_p_value}")

# Interpret the result; the Nemenyi post-hoc test only runs on a significant outcome.
if not kruskal_p_value < 0.05:
    print("The differences among the groups are not statistically significant.")
else:
    print("The differences among the groups are statistically significant.")

    nemenyi_test = sp.posthoc_nemenyi(df_gender, val_col='Score', group_col='Group')
    print("\nNemenyi test results:\n", nemenyi_test)

# Formatting results

In [None]:
import pipe as pipe

def extract_metrics(metrics_list):
    """Return the first three entries of ``metrics_list`` followed by its last entry.

    Sequences shorter than four items are not an error; the slices simply
    overlap (or are empty), exactly as Python slicing dictates.
    """
    leading = metrics_list[:3]
    trailing = metrics_list[-1:]
    return leading + trailing

def restructure_metrics_list_list(metrics_list_list):
    """Regroup per-group metric rows into four per-metric lists.

    Args:
        metrics_list_list: Iterable of rows; positions 0-3 of every row are read
            as precision, recall, F1 score and accuracy. It is consumed in a
            single pass, so generators are fine.

    Returns:
        list: ``[precisions, recalls, f1_scores, accuracies]``, each holding one
        value per input row in input order.
    """
    columns = [[], [], [], []]
    for row in metrics_list_list:
        for position, column in enumerate(columns):
            column.append(row[position])
    return columns

def format_metrics(metrics_list, decimal_places=4):
    """Turn five per-group values into the ten text cells of one table column.

    The first three values are treated as the age groups and the last two as the
    gender groups. The result is laid out as: three age values, their mean, a
    blank, two gender values, their mean, a blank, and the mean of all values.

    Args:
        metrics_list: Numeric values, ordered age groups first, gender groups last.
        decimal_places: Number of digits after the decimal point in every cell.

    Returns:
        list: Ten strings (fixed-point numbers and empty spacer cells).
    """
    spec = f".{decimal_places}f"

    def as_text(number):
        return format(number, spec)

    age_values = metrics_list[:3]
    gender_values = metrics_list[-2:]

    age_avg = as_text(sum(age_values) / 3)
    gender_avg = as_text(sum(gender_values) / 2)
    total_avg = as_text(sum(metrics_list) / len(metrics_list))

    cells = [as_text(value) for value in age_values]
    cells += [age_avg, '']
    cells += [as_text(value) for value in gender_values]
    cells += [gender_avg, '', total_avg]
    return cells

# formatting the performance metrics
metrics = [
    young_metrics_list,
    middle_aged_metrics_list,
    old_metrics_list,
    male_metrics_list,
    female_metrics_list
]

# Keep precision/recall/F1/accuracy of every group, regroup them per metric and
# format each per-metric list into table cells.
metrics_formatted = [
    format_metrics(per_metric_values)
    for per_metric_values in restructure_metrics_list_list(
        extract_metrics(group_metrics) for group_metrics in metrics
    )
]

precisions, recalls, f1_scores, accuracies = metrics_formatted

# formatting the no. of images
trainsets = [
    young_trainset,
    middle_aged_trainset,
    old_trainset,
    male_trainset,
    female_trainset,
]

# Dataset sizes go through the same column formatter, with one decimal place.
image_counts = [len(_trainset) for _trainset in trainsets]
images = format_metrics(image_counts, decimal_places=1)

In [None]:
import pandas as pd

# Summary table: one row per demographic group, plus mean rows and blank spacer rows.
data = {
    'Attribute': ['Age', '', '', '', '‎', 'Gender', '', '', '‎', 'Overall System Mean'],
    'Group': ['Young', 'Middle-aged', 'Senior', '(Mean)', '', 'Male', 'Female', '(Mean)', '', ''],
    '#Images': images,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1-Score': f1_scores,
}

df = pd.DataFrame(data)

# Shared look for header (th) and body (td) cells.
cell_props = [('border', '1px solid black'), ('text-align', 'center'), ('padding', '8px')]

# Build the styled table and make it the cell's output. Previously `styled_df`
# was created and then discarded in favour of a plain `df.style`, so none of the
# borders/alignment were ever shown. The row labels are hidden through the Styler
# (`Styler.hide`, pandas >= 1.4) instead of overwriting the index with duplicate
# blank labels, because per-cell styling needs a unique index.
styled_df = (
    df.style
    .hide(axis="index")
    .set_table_styles([
        {'selector': 'th', 'props': cell_props},
        {'selector': 'td', 'props': cell_props},
    ])
    .set_properties(**{
        'border': '1px solid black',
        'text-align': 'center',
        'padding': '8px'
    })
)
styled_df

In [1]:
import pipe
import torch
import src.models as models
from src.bias.bias_detection import test_for_biases

# Build the network and restore the trained weights.
# `map_location="cpu"` lets a checkpoint that was saved from a GPU be opened on a
# machine without CUDA; the weights are copied into the freshly built (CPU) model
# either way, so this does not change where the model lives.
model = models.OB_05Model()
model.load_state_dict(torch.load("../output/model/best_model.pth", map_location="cpu"))
model.eval()

# Run the packaged bias analysis; it prints the test results shown below and
# returns the table displayed in the next cell.
df = test_for_biases(model)

Kruskal-Wallis Test for 'age' bias attribute: H=10.69441816461685, p-value=0.004761421207453485
The differences among the groups are statistically significant.

Nemenyi test results:
              Middle-aged       Old     Young
Middle-aged     1.000000  0.188459  0.356909
Old             0.188459  1.000000  0.004885
Young           0.356909  0.004885  1.000000


Kruskal-Wallis Test for 'gender' bias attribute: H=10.250574712643674, p-value=0.0013664202406668473
The differences among the groups are statistically significant.

Nemenyi test results:
           Female      Male
Female  1.000000  0.001366
Male    0.001366  1.000000


In [2]:
# Display the table returned by `test_for_biases` in the previous cell.
df.style

Unnamed: 0,Attribute,Group,#Images,Accuracy,Precision,Recall,F1-Score
,Age,Young,892.0,0.9294,0.9185,0.9133,0.9149
,,Middle-aged,1177.0,0.9159,0.9114,0.9164,0.9127
,,Senior,235.0,0.9106,0.897,0.9231,0.9058
,,(Mean),768.0,0.9186,0.9089,0.9176,0.9112
,‎,,,,,,
,Gender,Male,1284.0,0.8933,0.8894,0.9016,0.8936
,,Female,1029.0,0.9359,0.9206,0.9227,0.9213
,,(Mean),1156.5,0.9146,0.905,0.9121,0.9075
,‎,,,,,,
,Overall System Mean,,923.4,0.917,0.9074,0.9154,0.9097
