# Let's train a classifier

In [1]:
import os
# os.system("pip install pandas")
# os.system("pip install torchvision")
# Make CUDA kernel launches synchronous so device-side errors are reported at
# the call that caused them. The variable has to live in THIS process's
# environment: os.system("CUDA_LAUNCH_BLOCKING=1") only assigns it inside a
# throwaway subshell and therefore had no effect. It is set here, before torch
# is imported, so it is in place before CUDA gets initialised.
# NOTE: synchronous launches slow training down; drop this line once debugging is done.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import time

import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from torchvision import transforms
from torchvision.models import resnet34, ResNet34_Weights

import time

import matplotlib.pyplot as plt
from PIL import Image
from tqdm import tqdm


# Reproducibility: switch on cuDNN's deterministic flag, but only when a CUDA
# device is actually usable.
gpu_present = torch.cuda.is_available()
if gpu_present:
    torch.backends.cudnn.deterministic = True

# Seed torch's global RNG. This stays the last expression of the cell, so the
# returned Generator object is what the notebook displays.
torch.manual_seed(1)

<torch._C.Generator at 0x7f1707d7f450>

In [2]:
# choose experiment
size_train = 86744  # number of training samples; part of the CelebA train CSV names below
experiments = ["FairFace", "CelebA", "CelebA only white", "CelebA augmented"]
exp = 2  # index into `experiments` (and into `learning_rates`)
assert 0 <= exp < len(experiments), f"exp must be in [0, {len(experiments) - 1}]"

# set celeb paths
celeb_attr_path = "datasets/celeba/list_attr_celeba.txt"
celeb_partitions_path = 'datasets/celeba/list_eval_partition.txt'
celeb_race_path = "CelebA/races/races_ff.csv"
celeb_label_dir = "CelebA/labels_split/"
celeb_img_dir = "CelebA/cropped/"
celeb_img_aug_dir = "CelebA/augmented/"
celeb_train_csv = f"train_{size_train}_samples_random.csv" # "train_total.csv"
celeb_train_only_white_csv = f"train_{size_train}_samples_random_white.csv"
celeb_train_aug_csv = f"train_aug_{size_train}_samples.csv"
celeb_val_csv = "val_total.csv"
celeb_test_csv = "test_total.csv"


# set fairface paths
ff_img_dir = "fairface/dataset/fairface-img-margin125-trainval"
ff_label_dir = "fairface/dataset/"
ff_train_csv = "fairface_label_train.csv"
ff_val_csv = "fairface_label_val.csv"


# set hyperparameters
learning_rates = [2e-5, 2e-5, 2e-5, 2e-5]  # one entry per experiment, same order
assert len(learning_rates) == len(experiments), "need one learning rate per experiment"
lr = learning_rates[exp]
num_epochs = 10

# Architecture
feat_size = (256, 256)  # size every image is resized to before being fed to the network
bs_train = 128
bs_val = 128
bs_test = 128

# Device selection: prefer the GPU index this notebook was run on, but fall
# back gracefully so "Restart & Run All" also works on hosts with fewer GPUs
# or without CUDA instead of failing on a non-existent 'cuda:4'.
preferred_gpu = 4
if not torch.cuda.is_available():
    device = 'cpu'
elif torch.cuda.device_count() > preferred_gpu:
    device = f'cuda:{preferred_gpu}'
else:
    device = 'cuda:0'


# Race names in the order used to index the per-race metric arrays
races = ["Black", "Indian", "Latino", "Middle Eastern", "Southeast Asian", "East Asian", "White"]
# Attribute columns handed to CelebaDataset as `ignored_attributes`
ignored_attributes = ["Black_Hair", "Blond_Hair", "Brown_Hair", "Pale_Skin"]

In [3]:
# define datasets
class CelebaDataset(Dataset):
    """Custom Dataset for loading CelebA face images.

    Every item is a tuple ``(img, label, gt_race)``: the (optionally
    transformed) image, a one-element array holding the ``Male`` attribute of
    that image, and the entry of the ``Race`` column for that image.
    """

    def __init__(self, csv_path, img_dir, transform=None, ignored_attributes=None):
        """
        Args:
            csv_path: CSV file with (at least) the columns ``Image_Name``,
                ``Race`` and ``Male``.
            img_dir: directory the ``Image_Name`` entries are relative to.
            transform: optional callable applied to each loaded PIL image.
            ignored_attributes: attribute columns to exclude from the target.
                Only stored for now: the target is the single ``Male`` column,
                so there is nothing to drop.
        """
        df = pd.read_csv(csv_path, index_col=None)
        self.img_dir = img_dir
        self.csv_path = csv_path
        self.img_names = df["Image_Name"].values
        self.races = df["Race"].values
        # `None` default instead of a shared mutable `[]`; copy so the caller's
        # list is never aliased.
        self.ignored_attributes = list(ignored_attributes) if ignored_attributes is not None else []
        # Single-attribute target with shape (N, 1). To predict all remaining
        # attributes instead, use:
        #   df.drop(["Image_Name", "Race"] + self.ignored_attributes, axis=1).values
        self.y = np.expand_dims(np.array(df["Male"].values), axis=1)
        self.transform = transform

    def __getitem__(self, index):
        """Load the image at ``index`` and return ``(img, label, gt_race)``."""
        img_path = os.path.join(self.img_dir, self.img_names[index])
        # Open inside a context manager so the file handle is released right
        # away; convert() returns a fully loaded copy. Forcing RGB keeps
        # grayscale / palette / RGBA files at three channels so every sample
        # has the same channel count.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        label = self.y[index]
        gt_race = self.races[index]
        return img, label, gt_race

    def __len__(self):
        """Number of rows read from the label CSV."""
        return self.y.shape[0]
    


class FairFaceDataset(Dataset):
    """Custom Dataset for loading FairFace images.

    Every item is a tuple ``(img, label, gt_race)``: the (optionally
    transformed) image, a one-element array with the gender encoded as
    1 (``Male``) / 0 (``Female``), and the race name of that image, with
    ``Latino_Hispanic`` renamed to ``Latino``.
    """

    def __init__(self, csv_path, img_dir, transform=None):
        """
        Args:
            csv_path: CSV file with (at least) the columns ``file``, ``race``
                and ``gender``.
            img_dir: directory the ``file`` entries are relative to.
            transform: optional callable applied to each loaded PIL image.

        Raises:
            ValueError: if the ``gender`` column holds anything other than
                ``Male`` / ``Female``.
        """
        df = pd.read_csv(csv_path, index_col=None)
        self.img_dir = img_dir
        self.csv_path = csv_path
        self.img_names = df["file"].values
        self.races = df["race"].replace("Latino_Hispanic", "Latino").values

        # Explicit mapping instead of chained .replace(): this does not rely on
        # pandas silently downcasting an object column to integers, and an
        # unexpected label is reported here rather than surfacing later as a
        # string inside the label array.
        gender_codes = df["gender"].map({"Male": 1, "Female": 0})
        unmapped = gender_codes.isna()
        if unmapped.any():
            unexpected = sorted(df.loc[unmapped, "gender"].astype(str).unique())
            raise ValueError(f"Unexpected gender labels in {csv_path}: {unexpected}")
        # integer target with shape (N, 1)
        self.y = np.expand_dims(gender_codes.to_numpy(dtype=np.int64), axis=1)
        self.transform = transform

    def __getitem__(self, index):
        """Load the image at ``index`` and return ``(img, label, gt_race)``."""
        img_path = os.path.join(self.img_dir, self.img_names[index])
        # Open inside a context manager so the file handle is released right
        # away; convert() returns a fully loaded copy. Forcing RGB keeps
        # grayscale / palette / RGBA files at three channels so every sample
        # has the same channel count.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")

        if self.transform is not None:
            img = self.transform(img)

        label = self.y[index]
        gt_race = self.races[index]
        return img, label, gt_race

    def __len__(self):
        """Number of rows read from the label CSV."""
        return self.y.shape[0]

In [4]:
# create datasets based on current experiment
num_workers = 6
# resize every image to `feat_size` and turn it into a tensor
custom_transform = transforms.Compose([transforms.Resize(feat_size),
                                       transforms.ToTensor()])

# training dataset
# Exactly one branch must match. An unknown experiment name raises instead of
# leaving `train_dataset` undefined (or, worse, silently reusing the one left
# over from a previous run of this cell).
experiment_name = experiments[exp]
if experiment_name.startswith("CelebA"):

    if experiment_name.endswith("augmented"):
        train_csv = celeb_train_aug_csv
        train_img_dir = celeb_img_aug_dir

    else:
        if "only white" in experiment_name:
            train_csv = celeb_train_only_white_csv
        else:
            train_csv = celeb_train_csv
        train_img_dir = celeb_img_dir

    train_dataset = CelebaDataset(csv_path=os.path.join(celeb_label_dir, train_csv),
                                  img_dir=train_img_dir,
                                  transform=custom_transform,
                                  ignored_attributes=ignored_attributes)

elif experiment_name.startswith("FairFace"):
    train_dataset = FairFaceDataset(csv_path=os.path.join(ff_label_dir, ff_train_csv),
                                    img_dir=ff_img_dir,
                                    transform=custom_transform)

else:
    raise ValueError(f"Unknown experiment '{experiment_name}': expected a name starting with 'CelebA' or 'FairFace'")


# validation dataset (always FairFace, whatever the training data is)
val_dataset = FairFaceDataset(csv_path=os.path.join(ff_label_dir, ff_val_csv),
                              img_dir=ff_img_dir,
                              transform=custom_transform)

# val_dataset = CelebaDataset(csv_path=celeb_label_dir + celeb_val_csv,
#                             img_dir=celeb_img_dir,
#                             transform=custom_transform,
#                             ignored_attributes=ignored_attributes)


# test datasets
test_dataset_celeb = CelebaDataset(csv_path=os.path.join(celeb_label_dir, celeb_test_csv),
                                   img_dir=celeb_img_dir,
                                   transform=custom_transform,
                                   ignored_attributes=ignored_attributes)

# NOTE(review): the FairFace "test" set is the very same object as the
# validation set, so FairFace test metrics repeat the validation metrics of the
# final model rather than measuring performance on held-out data.
test_dataset_ff = val_dataset


# create dataloaders on these datasets (only the training data is shuffled)
train_loader = DataLoader(dataset=train_dataset,
                          batch_size=bs_train,
                          shuffle=True,
                          num_workers=num_workers)

val_loader = DataLoader(dataset=val_dataset,
                        batch_size=bs_val,
                        shuffle=False,
                        num_workers=num_workers)

test_loader_celeb = DataLoader(dataset=test_dataset_celeb,
                               batch_size=bs_test,
                               shuffle=False,
                               num_workers=num_workers)

test_loader_ff = DataLoader(dataset=test_dataset_ff,
                            batch_size=bs_test,
                            shuffle=False,
                            num_workers=num_workers)

In [5]:
# build model, define loss and create optimizer

# ResNet-34 backbone initialised from `ResNet34_Weights.DEFAULT`; Module.to()
# returns the module itself, so construction and device transfer can be chained.
model = resnet34(weights=ResNet34_Weights.DEFAULT).to(device)

# One output unit per predicted attribute (second dimension of the label array).
num_attr_predicted = train_dataset.y.shape[1]

# Linear head mapping the backbone's 1000 outputs to the attribute scores,
# created directly on the target device.
fc_layer = nn.Linear(in_features=1000, out_features=num_attr_predicted, device=device)

# The head's scores go through the sigmoid before they are handed to BCELoss.
sigmoid, bin_ce = nn.Sigmoid(), nn.BCELoss()

# A single Adam optimizer updates backbone and head together.
params = [*model.parameters(), *fc_layer.parameters()]
optimizer = torch.optim.Adam(params, lr=lr)

In [6]:
# define evaluation procedure
def evaluate_metrics(model, data_loader, device, show_tqdm=False):
    """Compute accuracy, precision and recall, overall and split by annotated race.

    Uses the notebook-level globals ``fc_layer`` and ``sigmoid`` (the head
    applied on top of ``model``) and ``races`` (ordered race names; their list
    positions index the per-race results).

    Args:
        model: backbone whose output is fed to ``fc_layer``.
        data_loader: yields ``(features, targets, gt_races)`` batches, where
            ``targets`` is a 2-D ``(batch, n_attributes)`` tensor with 1 marking
            the positive class and ``gt_races`` holds names from ``races``.
        device: device the features are moved to.
        show_tqdm: show a progress bar while iterating.

    Returns:
        Tuple ``(total_accuracy, accs_out, max_acc_disparity, total_precision,
        precisions, total_recall, recalls)``. The ``total_*`` values are floats,
        the per-race lists hold percentage strings in ``races`` order, and
        ``max_acc_disparity`` is ``log(max accuracy / min accuracy)`` over the
        races that actually occur in the data.

    Raises:
        ValueError: if a batch contains a race name that is not in ``races``.
    """
    n_races = len(races)
    correct_predictions = np.zeros(n_races)
    true_pos = np.zeros(n_races)
    true_neg = np.zeros(n_races)  # accumulated for completeness; not part of the returned metrics
    positive_preds = np.zeros(n_races)
    positive_targets = np.zeros(n_races)
    num_examples = np.zeros(n_races)
    total_examples = len(data_loader.dataset)
    # Taken from the batches below. Tracking it explicitly (instead of reading
    # the loop variable after the loop) keeps the summary valid for an empty loader.
    num_attributes = 1

    # Disable gradient tracking here as well, so the function does not depend
    # on the caller having done it.
    with torch.no_grad():
        for features, targets, gt_races in tqdm(data_loader, total=len(data_loader), desc="Evaluating", disable=not show_tqdm):

            features = features.to(device)
            probas = sigmoid(fc_layer(model(features)))
            prediction = (probas >= 0.5).cpu().numpy()
            targets = targets.numpy()
            num_attributes = targets.shape[1]

            # prepare annotated races for metric split afterwards: names -> indices,
            # broadcast to the (batch, n_attributes) shape of the predictions
            gt_races = np.array([races.index(race) for race in gt_races])
            gt_races = np.expand_dims(gt_races, axis=1)
            gt_races = np.broadcast_to(gt_races, prediction.shape)

            # collect the necessary data split by annotated race
            hits = prediction == targets
            for j in range(n_races):
                in_race = gt_races == j
                correct_preds = in_race & hits
                true_pos[j] += (correct_preds & (prediction == 1)).sum()
                true_neg[j] += (correct_preds & (prediction == 0)).sum()
                correct_predictions[j] += correct_preds.sum()
                positive_targets[j] += (in_race & (targets == 1)).sum()
                positive_preds[j] += np.where(in_race, prediction, 0).sum()
                num_examples[j] += in_race.sum()

    # calculate and return metrics
    zero = 1e-10  # guards the per-race divisions against empty groups
    # counts were accumulated per attribute, so divide to get samples per race
    print("Race distribution:", num_examples/num_attributes, "Total:", total_examples)

    total_seen = num_examples.sum()
    total_accuracy = correct_predictions.sum() / total_seen if total_seen > 0 else float("nan")
    accuracies = correct_predictions / (num_examples + zero)
    accs_out = [f"{a:.2%}" for a in accuracies]

    # Disparity only over races that are present: an absent race has a
    # placeholder accuracy of 0 and would otherwise force a division by zero.
    observed = accuracies[num_examples > 0]
    if observed.size == 0:
        max_acc_disparity = float("nan")
    else:
        worst, best = observed.min(), observed.max()
        if best == worst:
            max_acc_disparity = 0.0
        elif worst > 0:
            max_acc_disparity = np.log(best / worst)
        else:
            # some race is classified entirely wrong: the ratio is unbounded
            max_acc_disparity = float("inf")

    total_precision = true_pos.sum() / (positive_preds.sum() + zero)
    precisions = [f"{p:.2%}" for p in true_pos / (positive_preds + zero)]

    total_recall = true_pos.sum() / (positive_targets.sum() + zero)
    recalls = [f"{r:.2%}" for r in true_pos / (positive_targets + zero)]
    return total_accuracy, accs_out, max_acc_disparity, total_precision, precisions, total_recall, recalls


def get_elapsed_time(start_time):
    """Format the whole seconds elapsed since ``start_time`` as ``H:MM:SS``.

    ``start_time`` is a timestamp as returned by ``time.time()``.
    """
    total_seconds = int(time.time() - start_time)
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours}:{minutes:02d}:{seconds:02d}"

In [7]:
# Training loop
start_time = time.time()

print(f"Initiating experiment '{experiments[exp]}' with a lr of {lr} and {size_train} samples on device {device}")


for epoch in range(num_epochs):

    # ---- one optimisation pass over the training data ----
    model.train()
    pbar = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {(epoch+1):02d}/{num_epochs:02d}")
    for batch_features, batch_targets, _ in pbar:

        batch_features = batch_features.to(device)
        batch_targets = batch_targets.float().to(device)

        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        # forward pass: backbone -> linear head -> sigmoid, then BCE on the probabilities
        probas = sigmoid(fc_layer(model(batch_features)))
        loss = bin_ce(probas, batch_targets)
        pbar.set_postfix({"loss": f"{loss.item():.4f}"})

        # backward pass and parameter update
        loss.backward()
        optimizer.step()

    # ---- validation after every epoch ----
    model.eval()
    with torch.no_grad(): # save memory during inference
        epoch_metrics = evaluate_metrics(model, val_loader, device)
        acc_total, accs, max_acc_disp, prec_total, precs, rec_total, recs = epoch_metrics
        print(f"Evaluation epoch {(epoch+1):02d}/{num_epochs:02d}:")
        print(f"Total accuracy: {acc_total:.2%}\t| Accuracies:\t{accs} | Max disparity: {max_acc_disp:.4f}")
        print(f"Total precision: {prec_total:.2%}\t| Precisions:\t{precs}")
        print(f"Total recall: {rec_total:.2%}\t| Recalls:\t{recs}\n")

print(f"Total Training Time: {get_elapsed_time(start_time)}")

Initiating experiment 'CelebA only white' with a lr of 2e-05 and 86744 samples on device cuda:4


Epoch 01/10: 100%|██████████| 678/678 [04:01<00:00,  2.81it/s, loss=0.0381]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 01/10:
Total accuracy: 81.50%	| Accuracies:	['74.23%', '80.80%', '83.92%', '87.18%', '79.86%', '79.61%', '84.75%'] | Max disparity: 0.1608
Total precision: 81.22%	| Precisions:	['72.36%', '77.63%', '80.86%', '89.73%', '82.80%', '82.15%', '83.22%']
Total recall: 84.55%	| Recalls:	['80.60%', '86.19%', '87.89%', '91.39%', '77.28%', '75.80%', '89.75%']



Epoch 02/10: 100%|██████████| 678/678 [03:56<00:00,  2.87it/s, loss=0.0272]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 02/10:
Total accuracy: 79.45%	| Accuracies:	['73.07%', '77.37%', '81.82%', '86.19%', '77.46%', '78.65%', '81.92%'] | Max disparity: 0.1651
Total precision: 76.45%	| Precisions:	['68.85%', '71.05%', '76.27%', '86.96%', '76.33%', '77.87%', '78.68%']
Total recall: 88.35%	| Recalls:	['86.86%', '91.90%', '91.17%', '93.48%', '82.04%', '80.18%', '91.09%']



Epoch 03/10: 100%|██████████| 678/678 [03:53<00:00,  2.90it/s, loss=0.0021]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 03/10:
Total accuracy: 83.11%	| Accuracies:	['76.03%', '82.65%', '84.90%', '88.01%', '81.98%', '82.26%', '85.90%'] | Max disparity: 0.1463
Total precision: 82.25%	| Precisions:	['71.73%', '79.10%', '81.57%', '91.03%', '84.58%', '84.29%', '85.03%']
Total recall: 86.79%	| Recalls:	['87.98%', '88.45%', '89.28%', '91.14%', '79.86%', '79.41%', '89.57%']



Epoch 04/10: 100%|██████████| 678/678 [03:50<00:00,  2.94it/s, loss=0.0001]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 04/10:
Total accuracy: 81.66%	| Accuracies:	['76.16%', '81.93%', '84.53%', '87.34%', '78.66%', '77.35%', '85.28%'] | Max disparity: 0.1371
Total precision: 83.83%	| Precisions:	['74.43%', '80.67%', '83.79%', '91.77%', '86.39%', '83.70%', '86.55%']
Total recall: 80.92%	| Recalls:	['81.60%', '83.67%', '84.74%', '89.18%', '69.93%', '68.08%', '86.01%']



Epoch 05/10: 100%|██████████| 678/678 [03:51<00:00,  2.93it/s, loss=0.0001]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 05/10:
Total accuracy: 82.32%	| Accuracies:	['75.96%', '81.20%', '85.09%', '87.68%', '80.42%', '79.55%', '85.95%'] | Max disparity: 0.1434
Total precision: 81.57%	| Precisions:	['71.62%', '77.92%', '82.15%', '89.90%', '82.62%', '81.86%', '85.40%']
Total recall: 85.98%	| Recalls:	['88.11%', '86.72%', '88.78%', '92.00%', '78.91%', '76.06%', '89.13%']



Epoch 06/10: 100%|██████████| 678/678 [03:53<00:00,  2.90it/s, loss=0.0737]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 06/10:
Total accuracy: 82.44%	| Accuracies:	['76.74%', '82.26%', '84.84%', '88.25%', '78.73%', '80.06%', '85.85%'] | Max disparity: 0.1399
Total precision: 83.92%	| Precisions:	['75.62%', '80.48%', '83.64%', '91.68%', '84.55%', '84.93%', '86.30%']
Total recall: 82.61%	| Recalls:	['80.73%', '84.86%', '85.75%', '90.77%', '72.24%', '73.23%', '87.61%']



Epoch 07/10: 100%|██████████| 678/678 [03:50<00:00,  2.94it/s, loss=0.0001]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 07/10:
Total accuracy: 82.10%	| Accuracies:	['75.13%', '80.61%', '84.60%', '87.76%', '81.41%', '79.81%', '85.32%'] | Max disparity: 0.1554
Total precision: 81.44%	| Precisions:	['72.94%', '76.84%', '80.96%', '90.30%', '83.52%', '80.77%', '84.58%']
Total recall: 85.67%	| Recalls:	['81.98%', '87.25%', '89.53%', '91.64%', '80.00%', '78.38%', '88.95%']



Epoch 08/10: 100%|██████████| 678/678 [03:51<00:00,  2.93it/s, loss=0.0028]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 08/10:
Total accuracy: 82.13%	| Accuracies:	['75.58%', '80.47%', '84.17%', '88.59%', '79.58%', '80.52%', '85.85%'] | Max disparity: 0.1588
Total precision: 81.55%	| Precisions:	['71.44%', '76.60%', '81.38%', '90.91%', '83.09%', '84.17%', '84.66%']
Total recall: 85.57%	| Recalls:	['87.36%', '87.38%', '87.64%', '92.25%', '76.19%', '75.29%', '90.02%']



Epoch 09/10: 100%|██████████| 678/678 [03:51<00:00,  2.93it/s, loss=0.0056]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 09/10:
Total accuracy: 81.97%	| Accuracies:	['75.19%', '82.26%', '83.24%', '87.59%', '79.58%', '80.26%', '85.47%'] | Max disparity: 0.1526
Total precision: 81.90%	| Precisions:	['70.96%', '79.95%', '81.12%', '91.18%', '82.70%', '83.50%', '85.21%']
Total recall: 84.60%	| Recalls:	['87.48%', '85.79%', '85.62%', '90.28%', '76.73%', '75.55%', '88.32%']



Epoch 10/10: 100%|██████████| 678/678 [03:51<00:00,  2.93it/s, loss=0.0064]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954
Evaluation epoch 10/10:
Total accuracy: 82.76%	| Accuracies:	['76.35%', '83.44%', '85.03%', '87.26%', '80.85%', '80.26%', '85.85%'] | Max disparity: 0.1336
Total precision: 84.66%	| Precisions:	['75.75%', '81.93%', '83.70%', '92.41%', '86.36%', '85.10%', '87.35%']
Total recall: 82.32%	| Recalls:	['79.35%', '85.52%', '86.13%', '88.31%', '74.97%', '73.49%', '86.19%']

Total Training Time: 0:41:58


In [8]:
# evaluate experiment on test sets

# Switch to eval mode explicitly: this cell must not depend on the training
# cell having finished its last epoch (which is where model.eval() is otherwise
# called), e.g. when training was interrupted part-way through an epoch.
model.eval()

# (display name, loader) pairs; both test sets get the same evaluation and report
test_sets = [("CelebA", test_loader_celeb), ("FairFace", test_loader_ff)]

with torch.no_grad(): # save memory during inference
    for set_name, test_loader in test_sets:
        acc_total, accs, max_acc_disp, prec_total, precs, rec_total, recs = evaluate_metrics(model, test_loader, device, show_tqdm=True)
        print(f"\nEvaluation {set_name} test set ({experiments[exp]}):")
        print(f"Total accuracy: {acc_total:.2%}\t| Accuracies:\t{accs}")
        print(f"Maximum accuracy disparity: {max_acc_disp:.4f}")
        print(f"Total precision: {prec_total:.2%}\t| Precisions:\t{precs}")
        print(f"Total recall: {rec_total:.2%}\t| Recalls:\t{recs}\n")


Evaluating: 100%|██████████| 156/156 [00:28<00:00,  5.41it/s]

Race distribution: [ 1461.   553.  1269.  1538.   311.  1777. 13053.] Total: 19962

Evaluation CelebA test set (CelebA only white):
Total accuracy: 97.33%	| Accuracies:	['93.84%', '97.29%', '98.74%', '97.53%', '91.64%', '95.78%', '97.91%']
Maximum accuracy disparity: 0.0746
Total precision: 98.23%	| Precisions:	['97.80%', '97.55%', '97.97%', '98.37%', '94.70%', '96.64%', '98.60%']
Total recall: 94.80%	| Recalls:	['91.20%', '95.22%', '97.40%', '96.52%', '86.81%', '90.07%', '95.74%']




Evaluating: 100%|██████████| 86/86 [00:15<00:00,  5.71it/s]


Race distribution: [1556. 1516. 1623. 1209. 1415. 1550. 2085.] Total: 10954

Evaluation FairFace test set (CelebA only white):
Total accuracy: 82.76%	| Accuracies:	['76.35%', '83.44%', '85.03%', '87.26%', '80.85%', '80.26%', '85.85%']
Maximum accuracy disparity: 0.1336
Total precision: 84.66%	| Precisions:	['75.75%', '81.93%', '83.70%', '92.41%', '86.36%', '85.10%', '87.35%']
Total recall: 82.32%	| Recalls:	['79.35%', '85.52%', '86.13%', '88.31%', '74.97%', '73.49%', '86.19%']

