In [None]:
## UTKFace Dataset

In [None]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import random

from torch.utils.data import Dataset, DataLoader
import matplotlib.image as img
from sklearn.model_selection import train_test_split

import torchvision.models as models



In [None]:
#Fix random seed
# A single shared seed is handed to every random number generator used in
# this notebook: Python's `random`, NumPy, and PyTorch (CPU and, if present, CUDA).
sd = 0
random.seed(sd)
np.random.seed(sd)
torch.manual_seed(sd)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(sd)
# Request deterministic cuDNN behaviour.
torch.backends.cudnn.deterministic = True

In [None]:
# dict to map given label to a number
# Age-bucket names are listed from youngest to oldest; each bucket's class id
# is simply its position in that ordering (0 .. 8).
dict_age_to_number = dict(zip(
    ('0-2', '3-9', '10-19', '20-29', '30-39',
     '40-49', '50-59', '60-69', 'more than 70'),
    range(9),
))

In [None]:
data_path = r'./UTKFace/'

# os.listdir returns entries in arbitrary order. Later cells drop and slice
# rows by position, so the listing is sorted to make those steps reproducible
# from one machine / run to the next.
data_labels = sorted(os.listdir(data_path))

# Parallel lists: entry i of each list describes the same kept file.
clean_labels = []   # file names that passed the checks below

age = []            # first '_'-separated field, as a string
age2 = []           # same field converted to int
gender = []         # second field (string code)
race = []           # third field (string code)
for f in data_labels:
    temp = f.split('_')
    # Skip anything that cannot be parsed as "<age>_<gender>_<race>_...":
    #  - fewer than three fields (e.g. stray non-image files in the folder),
    #  - a third field that is not exactly one character (the race code is
    #    missing or malformed),
    #  - a first field that int() could not convert.
    if len(temp) < 3 or len(temp[2]) != 1 or not temp[0].isdecimal():
        continue
    age.append(temp[0])
    age2.append(int(temp[0]))
    gender.append(temp[1])
    race.append(temp[2])
    clean_labels.append(f)


In [None]:
#Assign labels to samples
# Each section below turns one list of raw values into a list of class names.
# A value that matches no class is reported with print() and nothing is
# appended for it, so the output list is then shorter than its input.

# (inclusive upper bound, bucket name) pairs in ascending order; any age above
# the last bound belongs to the open-ended 'more than 70' bucket.
_age_bounds = [
    (2, '0-2'), (9, '3-9'), (19, '10-19'), (29, '20-29'),
    (39, '30-39'), (49, '40-49'), (59, '50-59'), (69, '60-69'),
]

age_class = []
for years in age2:
    if years < 0:
        # Negative ages fall into no bucket.
        print('ErrorA')
        print(years)
        continue
    bucket = next((name for bound, name in _age_bounds if years <= bound),
                  'more than 70')
    age_class.append(bucket)

# Gender code -> class name.
_gender_names = {'0': 'Male', '1': 'Female'}

gender_class = []
for code in gender:
    if code in _gender_names:
        gender_class.append(_gender_names[code])
    else:
        print('ErrorG')
        print(code)

# Race code -> class name.
_race_names = {'0': 'White', '1': 'Black', '2': 'Asian', '3': 'Indian', '4': 'Other'}

race_class = []
for code in race:
    if code in _race_names:
        race_class.append(_race_names[code])
    else:
        print('ErrorR')
        print(code)


In [None]:
#create pandas dataframe
# One row per kept image: its file name followed by the age, gender and race
# class names derived for it.
df = pd.DataFrame({
    'file': clean_labels,
    'age': age_class,
    'gender': gender_class,
    'race': race_class,
})

In [None]:
# categories
# Age buckets, youngest first.
# NOTE(review): '0-2' is a key of dict_age_to_number but is not listed here —
# confirm that leaving it out is intentional.
age_list = [
    '3-9',
    '10-19',
    '20-29',
    '30-39',
    '40-49',
    '50-59',
    '60-69',
    'more than 70',
]

In [None]:
#equalize proportions of ages
# For every age group in age_list, trim the larger gender so that both genders
# end up with the same number of rows. The surplus is removed from the front
# of the larger group (in the frame's current row order).
# The loop variable is deliberately not called `age`, so the `age` list built
# while parsing the file names is left untouched.
for age_group in age_list:
    print(age_group)
    male_idx = df[(df.gender == 'Male') & (df.age == age_group)].index
    female_idx = df[(df.gender == 'Female') & (df.age == age_group)].index

    surplus = abs(len(male_idx) - len(female_idx))

    if len(male_idx) < len(female_idx):
        df = df.drop(female_idx[:surplus])
    elif len(female_idx) < len(male_idx):
        df = df.drop(male_idx[:surplus])

# The balanced frame is what the split cell consumes.
train_labels = df



In [None]:
#Dataset (wrapped by the DataLoaders created further down)
class UTKFaceDataset(Dataset):
    """Dataset that yields ``(image, age_label)`` pairs.

    Parameters
    ----------
    data : object exposing ``.values``
        Table of samples. In each row, position 0 is read as the image file
        name and position 1 as the age-bucket name.
    path : str
        Directory that is joined with the file name to locate the image.
    transform : callable, optional
        Applied to the loaded image when given.
    """

    def __init__(self, data, path , transform = None):
        super().__init__()
        self.transform = transform
        self.path = path
        self.data = data.values

    def __len__(self):
        """Number of rows in the sample table."""
        return len(self.data)

    def __getitem__(self,index):
        """Load the image of row ``index`` and return it with its label tensor."""
        row = self.data[index]
        img_name = row[0]
        # index 3 for race, need as tensor -> convert to number from str first
        label = torch.tensor(dict_age_to_number[row[1]])
        image = img.imread(os.path.join(self.path, img_name))

        if self.transform is None:
            return image, label
        return self.transform(image), label

In [None]:
#Transforms go here
# All three splits currently share the same pipeline: the image is only
# converted to a tensor. Each split still gets its own Compose object so the
# pipelines can be changed independently.
train_transform, test_transform, valid_transform = (
    transforms.Compose([transforms.ToTensor()])
    for split in ('train', 'test', 'valid')
)

In [None]:
#Split datasets
# Batch size shared by every DataLoader in this notebook.
batch_size = 25

def resample_dataset_gender(data,frac):
    """Return a shuffled subset of ``data`` with a chosen male/female mix.

    The subset has as many rows as the smaller of the two gender groups in
    ``data``. A share ``frac`` of those rows is taken from the front of the
    male rows and the remainder from the front of the female rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Must have a ``'gender'`` column holding ``'Male'`` / ``'Female'``.
    frac : float
        Share of the subset drawn from male rows, in ``[0, 1]``.

    Returns
    -------
    pandas.DataFrame
        The selected rows in shuffled order.

    Raises
    ------
    ValueError
        If ``frac`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= frac <= 1.0:
        raise ValueError('frac must be within [0, 1], got {}'.format(frac))

    data_M = data[data['gender']=='Male']
    data_F = data[data['gender']=='Female']
    baseline = min(len(data_M),len(data_F))

    # Round the male count and give the females the remainder. Truncating both
    # products independently loses rows to floating-point error, e.g.
    # (1 - 0.9) * 10 evaluates to 0.9999999999999998 and would truncate to 0.
    n_male = int(round(float(frac) * baseline))
    n_female = baseline - n_male

    final_split = pd.concat([data_M.iloc[:n_male], data_F.iloc[:n_female]])
    return final_split.sample(frac=1)


# 80 % train / 10 % validation / 10 % test, each split stratified by gender.
# random_state pins the splits so that re-running this cell gives the same
# partition instead of depending on the current state of the global RNG.
train_split_t, val_split_t_dash = train_test_split(
    train_labels, stratify=train_labels.gender, test_size=0.2, random_state=sd)
val_split_t, test_labels = train_test_split(
    val_split_t_dash, stratify=val_split_t_dash.gender, test_size=0.5, random_state=sd)

#Ensure val set has equal representation
# Keep the same number of male and female rows, then shuffle them together.
data_M = val_split_t[val_split_t['gender']=='Male']
data_F = val_split_t[val_split_t['gender']=='Female']
n_per_gender = min(len(data_M), len(data_F))
final_split = pd.concat([data_M.iloc[:n_per_gender], data_F.iloc[:n_per_gender]])
val_split = final_split.sample(frac=1, random_state=sd)

# The test set is kept as two separate per-gender subsets.
flagsM = test_labels['gender']=='Male'
flagsF = test_labels['gender']=='Female'
test_split_M = test_labels[flagsM]
test_split_F = test_labels[flagsF]

print(test_split_M['gender'].value_counts())
print(test_split_M['age'].value_counts())

print(test_split_F['gender'].value_counts())
print(test_split_F['age'].value_counts())


valid_data = UTKFaceDataset(val_split, data_path, valid_transform )
test_data_M = UTKFaceDataset(test_split_M, data_path, test_transform )
test_data_F = UTKFaceDataset(test_split_F, data_path, test_transform )

# Evaluation loaders iterate in a fixed order (shuffle=False).
valid_loader = DataLoader(dataset = valid_data, batch_size = batch_size, shuffle=False, num_workers=0)
test_loader_M = DataLoader(dataset = test_data_M, batch_size = batch_size, shuffle=False, num_workers=0)
test_loader_F = DataLoader(dataset = test_data_F, batch_size = batch_size, shuffle=False, num_workers=0)

In [None]:
# CPU or GPU
# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

In [None]:
def test_performance(model,dataL,criterion):
    """Evaluate ``model`` on every batch of ``dataL``.

    The model is switched to eval mode and moved to the module-level
    ``device``. Loss and accuracy are averaged over ``len(dataL.sampler)``
    samples.

    Parameters
    ----------
    model : torch.nn.Module
        Network to evaluate.
    dataL : DataLoader
        Yields ``(data, target)`` batches.
    criterion : callable
        Loss called as ``criterion(output, target)``.

    Returns
    -------
    tuple
        ``(test_print, ttacc)``: the formatted summary line (which is also
        printed) and the accuracy.
    """
    model.eval()
    model.to(device)

    test_loss = 0
    test_acc = 0

    # Evaluation only: no gradients are needed, so autograd bookkeeping is
    # switched off for the forward passes.
    with torch.no_grad():
        for data, target in dataL:

            data = data.to(device)
            target = target.to(device)

            output = model(data)

            loss = criterion(output, target)
            # loss.item() is a per-batch value; weight it by the batch size so
            # the final division yields a per-sample average.
            test_loss += loss.item() * data.size(0)

            predicted = np.argmax(output.detach().cpu().numpy(), axis=1)

            # Per-batch accuracy times batch size; summed over all batches.
            test_acc += np.mean(predicted==target.detach().cpu().numpy())*data.size(0)

    ttacc  = test_acc/len(dataL.sampler)
    test_loss_M = test_loss/len(dataL.sampler)

    test_print = 'Test Loss: {:.3f} \tTest Acc: {:.3f}'.format(
        test_loss_M, ttacc)

    print(test_print)
    return test_print,ttacc

In [None]:
#Train model
def write_file(fname,string,act):
    """Write ``string`` followed by a newline to the file ``fname``.

    ``act`` is the mode passed to ``open`` (for example ``'a'`` to append or
    ``'w'`` to overwrite).
    """
    line = string + '\n'
    with open(fname, act) as handle:
        handle.write(line)

###Combined iteration
def train_model(male_frac,train_split_t,valid_loader,test_loader_M,test_loader_F):
    """Train one age classifier on a training set with a given male share.

    The training rows are drawn with ``resample_dataset_gender(train_split_t,
    male_frac)``. A ResNet-34 with a 9-way output layer is initialised from
    ``model_init_9class.pt`` and trained for a fixed number of epochs. After
    every epoch the model is scored on the validation loader and on the male
    and female test loaders.

    Everything is written below ``checkpoints/male<male_frac>/``:
    ``log.txt`` (recreated on each call), ``model_best.pt`` (saved whenever the
    validation accuracy strictly improves) and the per-epoch validation /
    male-test / female-test accuracies, stored with ``np.save``.

    Parameters
    ----------
    male_frac : float
        Male share of the training set; also used in the output folder name.
    train_split_t : pandas.DataFrame
        Training rows handed to ``resample_dataset_gender``.
    valid_loader, test_loader_M, test_loader_F : DataLoader
        Validation loader and the male / female test loaders.

    Returns
    -------
    None
    """

    num_epochs = 65
    num_classes = 9  # for age
    learning_rate = 0.0006

    check_point_dir = 'male'+str(male_frac)
    out_dir = "checkpoints/"+check_point_dir

    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
        print("Output directory is created")

    #make logger text file
    text_path = out_dir+"/"+"log.txt"
    # Start from an empty log; a log that does not exist yet is fine.
    try:
        os.remove(text_path)
    except OSError:
        pass

    write_file(text_path,'********* Male fraction: {} *********'.format(male_frac),'a')

    train_split = resample_dataset_gender(train_split_t,male_frac)

    write_file(text_path,str(train_split['age'].value_counts()),'a')

    write_file(text_path,str(train_split['gender'].value_counts()),'a')

    #Dataloaders
    train_data = UTKFaceDataset(train_split, data_path, train_transform )

    train_loader = DataLoader(dataset = train_data, batch_size = batch_size, shuffle=True, num_workers=0)

    model = models.resnet34(pretrained=False)
    model.fc = nn.Linear(512, num_classes)
    # Every run starts from the same stored initial weights. map_location lets
    # the file load on the current device regardless of where it was saved.
    model.load_state_dict(torch.load("model_init_9class.pt", map_location=device))
    model.to(device)
    criterion = nn.CrossEntropyLoss()

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        betas=(0.5, 0.999),
        weight_decay=0.05
        )

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=num_epochs,
        eta_min=0.01 * learning_rate, verbose=True
        )

    # Per-epoch accuracies that are saved to disk at the end.
    valid_accuracy = []
    test_accuracy_M = []
    test_accuracy_F = []

    print("Training model...")

    best_val_acc = 0

    for epoch in range(1, num_epochs+1):
        # keep track of train/val loss
        train_loss = 0.0
        valid_loss = 0.0

        # training the model
        model.train()
        temp_train_acc = 0.0
        for data, target in train_loader:

            data = data.to(device)
            target = target.to(device)

            optimizer.zero_grad()                   # init gradients to zeros
            output = model(data)                    # forward pass
            loss = criterion(output, target)        # compute loss
            loss.backward()                         # loss backwards
            optimizer.step()                        # update model params

            # Weight per-batch values by the batch size; divided by the
            # number of samples after the loop.
            train_loss += loss.item() * data.size(0)

            predicted = np.argmax(output.detach().cpu().numpy(), axis=1)

            temp_train_acc += np.mean(predicted==target.detach().cpu().numpy())*data.size(0)

        # validate-the-model (no gradients needed)
        model.eval()
        temp_val_acc = 0.0
        with torch.no_grad():
            for data, target in valid_loader:

                data = data.to(device)
                target = target.to(device)

                output = model(data)

                loss = criterion(output, target)

                # update-average-validation-loss
                valid_loss += loss.item() * data.size(0)

                predicted = np.argmax(output.detach().cpu().numpy(), axis=1)

                temp_val_acc += np.mean(predicted==target.detach().cpu().numpy())*data.size(0)

        # calculate-average-losses and accuracies
        train_loss = train_loss/len(train_loader.sampler)
        valid_loss = valid_loss/len(valid_loader.sampler)

        ttacc  = temp_train_acc/len(train_loader.sampler)
        tvacc  = temp_val_acc/len(valid_loader.sampler)

        # Keep the weights of the epoch with the best validation accuracy.
        if tvacc>best_val_acc:
            best_val_acc = tvacc
            torch.save(model.state_dict(), out_dir+"/model_best.pt")
            print('Model saved')
            write_file(text_path,'Model saved','a')

        scheduler.step()

        # print-training/validation-statistics
        train_print = 'Epoch: {} \tTr Loss: {:.3f} \tTr Acc: {:.3f} \tVal Loss: {:.3f} \tVal Acc: {:.3f}'.format(
            epoch, train_loss, ttacc, valid_loss, tvacc)
        print(train_print)

        test_print_M,ttacc_M = test_performance(model,test_loader_M,criterion)
        test_print_F,ttacc_F = test_performance(model,test_loader_F,criterion)

        valid_accuracy.append(tvacc)
        test_accuracy_M.append(ttacc_M)
        test_accuracy_F.append(ttacc_F)

        write_file(text_path,train_print,'a')

        write_file(text_path,test_print_M,'a')

        write_file(text_path,test_print_F,'a')

    path_val = out_dir+"/"+"validation_accuracy"
    path_M = out_dir+"/"+"test_accuracy_M"
    path_F = out_dir+"/"+"test_accuracy_F"
    # Each curve goes to its own file; the female file holds the female
    # accuracies and the male file the male accuracies.
    np.save(path_val, np.array(valid_accuracy))
    np.save(path_M, np.array(test_accuracy_M))
    np.save(path_F, np.array(test_accuracy_F))
    


In [None]:
# Sweep the male share of the training set over 11 evenly spaced values from
# 0.0 to 1.0 and train one model per value.
male_fracs = np.linspace(start=0.0, stop=1.0, num=11)
for male_frac in male_fracs:
    banner = '********* Male fraction: {} *********'.format(male_frac)
    print(banner)
    train_model(
        male_frac,
        train_split_t,
        valid_loader,
        test_loader_M,
        test_loader_F,
    )