In [1]:
import pipe
import torch
import src.models as models

model = models.OB_05Model()
model.load_state_dict(torch.load("../output/model/best_model.pth"))
model.eval()

OB_05Model(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu1): LeakyReLU(negative_slope=0.1)
  (maxpool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu2): LeakyReLU(negative_slope=0.1)
  (maxpool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu3): LeakyReLU(negative_slope=0.1)
  (maxpool3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv4): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn4): BatchNorm2d

In [2]:
import os
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from src.data_loader import create_data_loader

transform = transforms.Compose([
    transforms.Resize((90, 90)),  # Resize images to 90x90
    transforms.Grayscale(num_output_channels=1),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])


In [3]:
from src.visualization.plot_histograms import plot_image_dimensions_histogram

def get_image_paths(directory, extensions=None):
    if extensions is None:
        extensions = ['.jpg', '.jpeg', '.png', '.bmp', '.gif']
        
    image_paths = []
    for root, _, files in os.walk(directory):
        for file in files:
            if any(file.lower().endswith(ext) for ext in extensions):
                image_paths.append(os.path.join(root, file))
    return image_paths

# plot_image_dimensions_histogram(get_image_paths(r"../dataset/ANGRY_MODIFIED/young_images"))

In [4]:
import torch
import numpy as np
import torch.nn as nn
import src.utils as utils

from torch.utils.data import DataLoader
from src.types import EvaluationResults


__device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def evaluate_model(model: nn.Module, dataloader: DataLoader) -> EvaluationResults:
    model.eval().to(__device)

    confusion_matrix = np.zeros((4, 4), dtype=int)

    current_batch = 1

    # evaluation/testing
    with torch.no_grad():
        for images, labels in dataloader:
            images, labels = images.to(__device), labels.to(__device)

            print(len(images))

            # Forward pass
            outputs = model(images)
            # Get predictions from the maximum value
            _, predicted = torch.max(outputs.data, 1)
            expected = labels.tolist()
            actual = predicted.tolist()

            # add to confusion matrix
            for expected, actual in list(zip(expected, actual)):
                confusion_matrix[actual, expected] += 1

            __print_confusion_matrix_metrics(current_batch, confusion_matrix)
            current_batch += 1

    return EvaluationResults(confusion_matrix=confusion_matrix)

def __print_confusion_matrix_metrics(batch_number: int, confusion_matrix: np.ndarray):
    macro_precision, macro_recall, macro_f1_score, macro_accuracy = utils.cm_macro.calculate_overall_metrics(
        confusion_matrix)
    micro_precision, micro_recall, micro_f1_score, micro_accuracy = utils.cm_micro.calculate_overall_metrics(
        confusion_matrix)
    accuracy = (macro_accuracy + micro_accuracy) / 2  # should be the same for both

    print(
        f'\nTesting batch #{batch_number}:\n'
        f'\tMACRO precision: {macro_precision:.4f}\n'
        f'\tMACRO recall: {macro_recall:.4f}\n'
        f'\tMACRO f1_score: {macro_f1_score:.4f}\n\n'
        f'\tMICRO precision: {micro_precision:.4f}\n'
        f'\tMICRO recall: {micro_recall:.4f}\n'
        f'\tMICRO f1_score: {micro_f1_score:.4f}\n\n'
        f'\tAccuracy: {accuracy:.4f}\n\n'
    )

# Testing 

In [5]:
from src.visualization.model_evaluation import TestingVisualizations

young_trainset = datasets.ImageFolder(root=r"../dataset/bias/age/young", transform=transform)
young_dataloader = create_data_loader(young_trainset) 
eval_results_young = evaluate_model(model=model, dataloader=young_dataloader)

"""
_ = TestingVisualizations.generate_overall_metrics_table(eval_results_young)
_ = TestingVisualizations.generate_metrics_per_class_table(eval_results_young)
"""

32

Testing batch #1:
	MACRO precision: 0.8875
	MACRO recall: 0.8810
	MACRO f1_score: 0.8818

	MICRO precision: 0.9062
	MICRO recall: 0.9062
	MICRO f1_score: 0.9062

	Accuracy: 0.9062


32

Testing batch #2:
	MACRO precision: 0.9187
	MACRO recall: 0.9058
	MACRO f1_score: 0.9087

	MICRO precision: 0.9375
	MICRO recall: 0.9375
	MICRO f1_score: 0.9375

	Accuracy: 0.9375


32

Testing batch #3:
	MACRO precision: 0.9325
	MACRO recall: 0.9303
	MACRO f1_score: 0.9295

	MICRO precision: 0.9479
	MICRO recall: 0.9479
	MICRO f1_score: 0.9479

	Accuracy: 0.9479


32

Testing batch #4:
	MACRO precision: 0.9353
	MACRO recall: 0.9346
	MACRO f1_score: 0.9340

	MICRO precision: 0.9453
	MICRO recall: 0.9453
	MICRO f1_score: 0.9453

	Accuracy: 0.9453


32

Testing batch #5:
	MACRO precision: 0.9193
	MACRO recall: 0.9241
	MACRO f1_score: 0.9211

	MICRO precision: 0.9313
	MICRO recall: 0.9313
	MICRO f1_score: 0.9313

	Accuracy: 0.9313


32

Testing batch #6:
	MACRO precision: 0.9109
	MACRO recall: 0.9217
	

'\n_ = TestingVisualizations.generate_overall_metrics_table(eval_results_young)\n_ = TestingVisualizations.generate_metrics_per_class_table(eval_results_young)\n'

In [6]:
from src.visualization.model_evaluation import TestingVisualizations

middle_aged_trainset = datasets.ImageFolder(root=r"../dataset/bias/age/middle_aged", transform=transform)
middle_aged_dataloader = create_data_loader(middle_aged_trainset)
eval_results_middle_aged = evaluate_model(model=model, dataloader=middle_aged_dataloader)

"""
_ = TestingVisualizations.generate_overall_metrics_table(eval_results_middle_aged)
_ = TestingVisualizations.generate_metrics_per_class_table(eval_results_middle_aged)
"""

32

Testing batch #1:
	MACRO precision: 0.9318
	MACRO recall: 0.8929
	MACRO f1_score: 0.8923

	MICRO precision: 0.9062
	MICRO recall: 0.9062
	MICRO f1_score: 0.9062

	Accuracy: 0.9062


32

Testing batch #2:
	MACRO precision: 0.8833
	MACRO recall: 0.8792
	MACRO f1_score: 0.8808

	MICRO precision: 0.8906
	MICRO recall: 0.8906
	MICRO f1_score: 0.8906

	Accuracy: 0.8906


32

Testing batch #3:
	MACRO precision: 0.8636
	MACRO recall: 0.8720
	MACRO f1_score: 0.8661

	MICRO precision: 0.8750
	MICRO recall: 0.8750
	MICRO f1_score: 0.8750

	Accuracy: 0.8750


32

Testing batch #4:
	MACRO precision: 0.9034
	MACRO recall: 0.9078
	MACRO f1_score: 0.9046

	MICRO precision: 0.9062
	MICRO recall: 0.9062
	MICRO f1_score: 0.9062

	Accuracy: 0.9062


32

Testing batch #5:
	MACRO precision: 0.9161
	MACRO recall: 0.9239
	MACRO f1_score: 0.9190

	MICRO precision: 0.9187
	MICRO recall: 0.9187
	MICRO f1_score: 0.9187

	Accuracy: 0.9187


32

Testing batch #6:
	MACRO precision: 0.9178
	MACRO recall: 0.9208
	

'\n_ = TestingVisualizations.generate_overall_metrics_table(eval_results_middle_aged)\n_ = TestingVisualizations.generate_metrics_per_class_table(eval_results_middle_aged)\n'

In [7]:
from src.visualization.model_evaluation import TestingVisualizations

old_trainset = datasets.ImageFolder(root=r"../dataset/bias/age/old", transform=transform)
old_dataloader = create_data_loader(old_trainset)
eval_results_old = evaluate_model(model=model, dataloader=old_dataloader)

"""
_ = TestingVisualizations.generate_overall_metrics_table(eval_results_old)
_ = TestingVisualizations.generate_metrics_per_class_table(eval_results_old)
"""

32

Testing batch #1:
	MACRO precision: 0.9164
	MACRO recall: 0.8688
	MACRO f1_score: 0.8829

	MICRO precision: 0.9062
	MICRO recall: 0.9062
	MICRO f1_score: 0.9062

	Accuracy: 0.9062


32

Testing batch #2:
	MACRO precision: 0.8708
	MACRO recall: 0.8933
	MACRO f1_score: 0.8789

	MICRO precision: 0.8906
	MICRO recall: 0.8906
	MICRO f1_score: 0.8906

	Accuracy: 0.8906


32

Testing batch #3:
	MACRO precision: 0.8901
	MACRO recall: 0.9079
	MACRO f1_score: 0.8964

	MICRO precision: 0.8958
	MICRO recall: 0.8958
	MICRO f1_score: 0.8958

	Accuracy: 0.8958


32

Testing batch #4:
	MACRO precision: 0.8770
	MACRO recall: 0.9077
	MACRO f1_score: 0.8848

	MICRO precision: 0.8828
	MICRO recall: 0.8828
	MICRO f1_score: 0.8828

	Accuracy: 0.8828


32

Testing batch #5:
	MACRO precision: 0.8872
	MACRO recall: 0.9174
	MACRO f1_score: 0.8980

	MICRO precision: 0.9000
	MICRO recall: 0.9000
	MICRO f1_score: 0.9000

	Accuracy: 0.9000


32

Testing batch #6:
	MACRO precision: 0.8875
	MACRO recall: 0.9216
	

'\n_ = TestingVisualizations.generate_overall_metrics_table(eval_results_old)\n_ = TestingVisualizations.generate_metrics_per_class_table(eval_results_old)\n'

In [8]:
male_trainset = datasets.ImageFolder(root=r"../dataset/bias/gender/male", transform=transform)
male_dataloader = create_data_loader(male_trainset)
eval_results_male = evaluate_model(model=model, dataloader=male_dataloader)

32

Testing batch #1:
	MACRO precision: 0.8646
	MACRO recall: 0.8646
	MACRO f1_score: 0.8646

	MICRO precision: 0.8750
	MICRO recall: 0.8750
	MICRO f1_score: 0.8750

	Accuracy: 0.8750


32

Testing batch #2:
	MACRO precision: 0.9208
	MACRO recall: 0.9214
	MACRO f1_score: 0.9208

	MICRO precision: 0.9219
	MICRO recall: 0.9219
	MICRO f1_score: 0.9219

	Accuracy: 0.9219


32

Testing batch #3:
	MACRO precision: 0.9080
	MACRO recall: 0.9185
	MACRO f1_score: 0.9123

	MICRO precision: 0.9167
	MICRO recall: 0.9167
	MICRO f1_score: 0.9167

	Accuracy: 0.9167


32

Testing batch #4:
	MACRO precision: 0.9238
	MACRO recall: 0.9328
	MACRO f1_score: 0.9277

	MICRO precision: 0.9297
	MICRO recall: 0.9297
	MICRO f1_score: 0.9297

	Accuracy: 0.9297


32

Testing batch #5:
	MACRO precision: 0.9190
	MACRO recall: 0.9248
	MACRO f1_score: 0.9213

	MICRO precision: 0.9250
	MICRO recall: 0.9250
	MICRO f1_score: 0.9250

	Accuracy: 0.9250


32

Testing batch #6:
	MACRO precision: 0.9121
	MACRO recall: 0.9252
	

In [9]:
female_trainset = datasets.ImageFolder(root=r"../dataset/bias/gender/female", transform=transform)
female_dataloader = create_data_loader(female_trainset)
eval_results_female = evaluate_model(model=model, dataloader=female_dataloader)

32

Testing batch #1:
	MACRO precision: 0.8036
	MACRO recall: 0.8225
	MACRO f1_score: 0.8095

	MICRO precision: 0.8750
	MICRO recall: 0.8750
	MICRO f1_score: 0.8750

	Accuracy: 0.8750


32

Testing batch #2:
	MACRO precision: 0.8438
	MACRO recall: 0.8505
	MACRO f1_score: 0.8467

	MICRO precision: 0.8906
	MICRO recall: 0.8906
	MICRO f1_score: 0.8906

	Accuracy: 0.8906


32

Testing batch #3:
	MACRO precision: 0.9017
	MACRO recall: 0.9055
	MACRO f1_score: 0.9034

	MICRO precision: 0.9271
	MICRO recall: 0.9271
	MICRO f1_score: 0.9271

	Accuracy: 0.9271


32

Testing batch #4:
	MACRO precision: 0.9303
	MACRO recall: 0.9330
	MACRO f1_score: 0.9315

	MICRO precision: 0.9453
	MICRO recall: 0.9453
	MICRO f1_score: 0.9453

	Accuracy: 0.9453


32

Testing batch #5:
	MACRO precision: 0.9338
	MACRO recall: 0.9382
	MACRO f1_score: 0.9359

	MICRO precision: 0.9500
	MICRO recall: 0.9500
	MICRO f1_score: 0.9500

	Accuracy: 0.9500


32

Testing batch #6:
	MACRO precision: 0.9445
	MACRO recall: 0.9485
	

# Statistical analysis, Age

In [10]:
young_metrics = eval_results_young.get_metrics_table_as_df()
young_metrics.style

Unnamed: 0_level_0,macro,macro,macro,micro,micro,micro,Unnamed: 7_level_0
metrics,precision,recall,f1_score,precision,recall,f1_score,accuracy
model,0.917681,0.912624,0.914073,0.9282511210762332,0.9282511210762332,0.9282511210762332,0.928251


In [11]:
middle_aged_metrics = eval_results_middle_aged.get_metrics_table_as_df()
middle_aged_metrics.style

Unnamed: 0_level_0,macro,macro,macro,micro,micro,micro,Unnamed: 7_level_0
metrics,precision,recall,f1_score,precision,recall,f1_score,accuracy
model,0.911,0.917784,0.912889,0.9158878504672896,0.9158878504672896,0.9158878504672896,0.915888


In [12]:
old_metrics = eval_results_old.get_metrics_table_as_df()
old_metrics.style

Unnamed: 0_level_0,macro,macro,macro,micro,micro,micro,Unnamed: 7_level_0
metrics,precision,recall,f1_score,precision,recall,f1_score,accuracy
model,0.876369,0.910477,0.886637,0.8936170212765957,0.8936170212765957,0.8936170212765957,0.893617


In [13]:
import pandas as pd
import pipe as pipe

young_metrics_list = list(young_metrics.iloc[0].tolist() | pipe.map(lambda x: float(x)))
middle_aged_metrics_list = list(middle_aged_metrics.iloc[0].tolist() | pipe.map(lambda x: float(x)))
old_metrics_list = list(old_metrics.iloc[0].tolist() | pipe.map(lambda x: float(x)))

data = young_metrics_list + middle_aged_metrics_list + old_metrics_list
groups = ['Young'] * len(young_metrics_list) + ['Middle-aged'] * len(middle_aged_metrics_list) + ['Old'] * len(old_metrics_list)
df = pd.DataFrame({'Score': data, 'Group': groups})

In [14]:
from scipy.stats import kruskal
import scikit_posthocs as sp

kruskal_stat, kruskal_p_value = kruskal(young_metrics_list, middle_aged_metrics_list, old_metrics_list)

print(f"Kruskal-Wallis Test: H={kruskal_stat}, p-value={kruskal_p_value}")

if kruskal_p_value < 0.05:
    print("The differences among the groups are statistically significant.")
    
    nemenyi_test = sp.posthoc_nemenyi(df, val_col='Score', group_col='Group')
    print("\nNemenyi test results:\n", nemenyi_test)
else:
    print("The differences among the groups are not statistically significant.")

Kruskal-Wallis Test: H=14.811731315042582, p-value=0.0006076778566064679
The differences among the groups are statistically significant.

Nemenyi test results:
              Middle-aged       Old     Young
Middle-aged     1.000000  0.029589  0.553609
Old             0.029589  1.000000  0.000914
Young           0.553609  0.000914  1.000000


# Statistical analysis, Gender

In [15]:
male_metrics = eval_results_male.get_metrics_table_as_df()
male_metrics.style

Unnamed: 0_level_0,macro,macro,macro,micro,micro,micro,Unnamed: 7_level_0
metrics,precision,recall,f1_score,precision,recall,f1_score,accuracy
model,0.891114,0.908321,0.897155,0.897196261682243,0.897196261682243,0.897196261682243,0.897196


In [16]:
female_metrics = eval_results_female.get_metrics_table_as_df()
female_metrics.style

Unnamed: 0_level_0,macro,macro,macro,micro,micro,micro,Unnamed: 7_level_0
metrics,precision,recall,f1_score,precision,recall,f1_score,accuracy
model,0.927755,0.928987,0.92809,0.9426627793974732,0.9426627793974732,0.9426627793974732,0.942663


In [17]:
import pandas as pd
import pipe as pipe

male_metrics_list = list(male_metrics.iloc[0].tolist() | pipe.map(lambda x: float(x)))
female_metrics_list = list(female_metrics.iloc[0].tolist() | pipe.map(lambda x: float(x)))

data = male_metrics_list + female_metrics_list 
groups = ['Male'] * len(male_metrics_list) + ['Female'] * len(female_metrics_list)
df_gender = pd.DataFrame({'Score': data, 'Group': groups})

In [18]:
from scipy.stats import kruskal
import scikit_posthocs as sp

kruskal_stat, kruskal_p_value = kruskal(male_metrics_list, female_metrics_list)

print(f"Kruskal-Wallis Test: H={kruskal_stat}, p-value={kruskal_p_value}")

# Interpret the result
if kruskal_p_value < 0.05:
    print("The differences among the groups are statistically significant.")

    nemenyi_test = sp.posthoc_nemenyi(df_gender, val_col='Score', group_col='Group')
    print("\nNemenyi test results:\n", nemenyi_test)
else:
    print("The differences among the groups are not statistically significant.")

Kruskal-Wallis Test: H=10.250574712643674, p-value=0.0013664202406668473
The differences among the groups are statistically significant.

Nemenyi test results:
           Female      Male
Female  1.000000  0.001366
Male    0.001366  1.000000


# Formatting results

In [101]:
import pipe as pipe

def extract_metrics(metrics_list):
    return metrics_list[:3] + metrics_list[-1:]

def restructure_metrics_list_list(metrics_list_list):
    _precisions = []
    _recalls = []
    _f1_scores = []
    _accuracies = []
    for metrics_list in metrics_list_list:
        _precisions.append(metrics_list[0])
        _recalls.append(metrics_list[1])
        _f1_scores.append(metrics_list[2])
        _accuracies.append(metrics_list[3])
        
    return [_precisions, _recalls, _f1_scores, _accuracies]

def format_metrics(metrics_list, decimal_places=4):
    age_avg = sum(metrics_list[:3]) / 3
    gender_avg = sum(metrics_list[-2:]) / 2
    total_avg = sum(metrics_list) / len(metrics_list)
    
    age_avg = format(age_avg, f".{decimal_places}f")
    gender_avg = format(gender_avg, f".{decimal_places}f")
    total_avg = format(total_avg, f".{decimal_places}f")
        
    return (list(metrics_list[:3] | pipe.map(lambda x: format(x, f".{decimal_places}f"))) 
            + [age_avg, ''] 
            + list(metrics_list[-2:] | pipe.map(lambda x: format(x, f".{decimal_places}f"))) 
            + [gender_avg, '', total_avg]) 

# formatting the performance metrics
metrics = [
    young_metrics_list,
    middle_aged_metrics_list,
    old_metrics_list,
    male_metrics_list,
    female_metrics_list
]

metrics_formatted = list(metrics 
                         | pipe.map(extract_metrics) 
                         | pipe.Pipe(restructure_metrics_list_list) 
                         | pipe.map(format_metrics))

precisions = metrics_formatted[0]
recalls = metrics_formatted[1]
f1_scores = metrics_formatted[2]
accuracies = metrics_formatted[3]

# formatting the no. of images
trainsets = [
    young_trainset,
    middle_aged_trainset,
    old_trainset,
    male_trainset,
    female_trainset,
]

images = format_metrics(list(trainsets | pipe.map(lambda _trainset: _trainset.__len__())), decimal_places=1)

In [120]:
import pandas as pd

data = {
    'Attribute': ['Age', '', '', '', '‎', 'Gender', '', '', '‎', 'Overall System Mean'],
    'Group': ['Young', 'Middle-aged', 'Senior', '(Mean)', '', 'Male', 'Female', '(Mean)', '', ''],
    '#Images': images,
    'Accuracy': accuracies,
    'Precision': precisions,
    'Recall': recalls,
    'F1-Score': f1_scores,
}

df = pd.DataFrame(data)
styled_df = df.style.set_table_styles([
    {'selector': 'th', 'props': [('border', '1px solid black'), ('text-align', 'center'), ('padding', '8px')]},
    {'selector': 'td', 'props': [('border', '1px solid black'), ('text-align', 'center'), ('padding', '8px')]}
]).set_properties(**{
    'border': '1px solid black',
    'text-align': 'center',
    'padding': '8px'
})
df.index = [''] * len(df)
df.style

Unnamed: 0,Attribute,Group,#Images,Accuracy,Precision,Recall,F1-Score
,Age,Young,892.0,0.9283,0.9177,0.9126,0.9141
,,Middle-aged,1177.0,0.9159,0.911,0.9178,0.9129
,,Senior,235.0,0.8936,0.8764,0.9105,0.8866
,,(Mean),768.0,0.9126,0.9017,0.9136,0.9045
,‎,,,,,,
,Gender,Male,1284.0,0.8972,0.8911,0.9083,0.8972
,,Female,1029.0,0.9427,0.9278,0.929,0.9281
,,(Mean),1156.5,0.9199,0.9094,0.9187,0.9126
,‎,,,,,,
,Overall System Mean,,923.4,0.9155,0.9048,0.9156,0.9078
