# Unify datasets

In [None]:
import os
import csv
import shutil
import numpy as np
from tqdm import tqdm
from decord import VideoReader
from decord import cpu
import cv2
import xml.etree.ElementTree as ET
import pandas as pd
import glob

# Root folder of the unified dataset, plus the sub-folders for label CSVs and images.
output_root = 'C:/DATASETS/AGE-FER'
output_labels_path = os.path.join(output_root, 'datasets-labels')
output_imgs_path = os.path.join(output_root, 'images')

# Unified expression vocabulary.
global_labels = [
    'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise', 'neutral',
    'contempt', 'pleased', 'curiosity', 'uncertainty', 'excitement', 'frustration',
]

# Column order used by the csv.DictWriter of every per-dataset label file.
csv_columns = [
    'dataset', 'user_id', 'name', 'class', 'age', 'gender', 'race',
    'perspective', 'age_group', 'subset',
    'auto_age', 'auto_gender', 'auto_perspective',
]

# Indexed by the numeric gender code used in this notebook (0 = female, 1 = male).
gender_names = ['female', 'male']

## Common functions

### Frame sampling

In [None]:
def get_frames(video_path, skip=.5, mode='auto', first_frame=0, max_frame=-1):
    """Decode a sampled subset of the frames of a video using decord.

    The indices to decode are chosen by ``frame_selection`` from the video's
    average FPS and its number of frames; ``skip``, ``mode``, ``first_frame``
    and ``max_frame`` are forwarded to it unchanged.

    Returns the decoded batch as a numpy array with its last axis reversed
    (presumably RGB -> BGR for OpenCV consumers; decord's channel order is not
    shown in this file - confirm).
    """
    with open(video_path, 'rb') as video_file:
        reader = VideoReader(video_file, ctx=cpu(0))
        fps = reader.get_avg_fps()
        n_frames = len(reader)
        indices = frame_selection(fps, n_frames, skip, mode, first_frame, max_frame)
        batch = reader.get_batch(indices).asnumpy()
        # Flip the last (channel) axis.
        return batch[:, :, :, ::-1]
    
def _part_step(total_frames, parts):
    """Return the stride that spreads about ``parts`` samples over ``total_frames`` frames."""
    intervals = parts - 1
    if intervals <= 0:
        raise ValueError(
            "skip must be greater than 1 when it is used as a number of parts, got {!r}".format(parts)
        )
    step = total_frames // intervals
    # When the division is exact the last sample would fall on index
    # ``total_frames`` (out of range), so shorten the stride by one.
    if (total_frames % intervals) == 0:
        step -= 1
    return step


def frame_selection(fps, total_frames, skip=.5, mode='auto', first_frame=0, max_frame=-1):
    """Choose which frame indices of a video to sample.

    Parameters
    ----------
    fps : average frames per second of the video.
    total_frames : number of frames in the video.
    skip : meaning depends on ``mode``:
        * ``'frame'``  - stride in frames.
        * ``'second'`` - stride in seconds.
        * ``'part'``   - approximate number of samples to spread over the video
          (must be greater than 1).
        * ``'auto'``   - videos of at most 2.5 s are sampled every 0.5 s
          (``skip`` is ignored); longer videos use ``skip`` as in ``'part'``.
    first_frame : index of the first sampled frame.
    max_frame : highest frame index allowed for the *last* sample; negative
        values count from the end as in Python indexing (``-2`` is the
        second-to-last frame). ``-1`` disables the limit. Only the final
        index is adjusted, so the number of samples is preserved.

    Returns
    -------
    list of int frame indices (empty when ``first_frame >= total_frames``).

    Raises
    ------
    ValueError : unknown ``mode``, or ``skip <= 1`` where it is a part count.
    """
    if mode == 'frame':
        frame_skip = skip
    elif mode == 'second':
        frame_skip = fps * skip
    elif mode == 'part':
        frame_skip = _part_step(total_frames, skip)
    elif mode == 'auto':
        if (total_frames / fps) <= 2.5:
            # Short clip: one sample every 0.5 s.
            frame_skip = fps * .5
        else:
            # Long clip: spread the samples over the whole video.
            frame_skip = _part_step(total_frames, skip)
    else:
        raise ValueError(
            "Unknown mode {!r}; expected 'frame', 'second', 'part' or 'auto'".format(mode)
        )

    # range() needs a positive integer stride; a stride below one frame means
    # "every frame".
    frame_skip = max(1, int(frame_skip))

    frames = list(range(first_frame, total_frames, frame_skip))

    # Cap the last sample at max_frame (useful to avoid picking the very last
    # frame, for example).
    if max_frame != -1 and frames:
        allowed = range(total_frames)[:max_frame + 1]
        last_allowed = allowed[-1] if len(allowed) > 0 else 0
        if frames[-1] > last_allowed:
            frames[-1] = last_allowed

    return frames
    
def get_frames_opencv(video_path, skip=.5, mode='auto', first_frame=0, max_frame=-1):
    """Decode a sampled subset of the frames of a video using OpenCV.

    decord is only used to read the average FPS and the frame count that
    ``frame_selection`` needs; the pixels are decoded sequentially with
    ``cv2.VideoCapture``. ``skip``, ``mode``, ``first_frame`` and ``max_frame``
    are forwarded to ``frame_selection``.

    Returns a ``uint8`` array of shape ``(n, height, width, 3)`` holding the
    frames as returned by ``cap.read()``. ``n`` can be smaller than the number
    of selected indices when the stream ends early or cannot be opened.
    Indices are matched while reading forward, so they are expected to be in
    non-decreasing order.
    """
    vr = VideoReader(video_path, ctx=cpu(0))
    frames = frame_selection(vr.get_avg_fps(), len(vr), skip, mode, first_frame, max_frame)

    cap = cv2.VideoCapture(video_path, cv2.CAP_ANY)
    # cap.set(cv2.CAP_PROP_FOURCC, cv2.VideoWriter.fourcc('m','p','g','2'))
    frame_selection_i = 0
    try:
        height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
        width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        selected_frames = np.zeros(shape=[len(frames), height, width, 3], dtype='uint8')

        frame_i = 0
        while cap.isOpened() and frame_selection_i < len(frames):
            ret, frame = cap.read()

            # End of stream
            if not ret:
                break

            # Store the frame for every selected index equal to the current
            # position (the last index may repeat the previous one after
            # max_frame clamping).
            while frame_selection_i < len(frames) and frames[frame_selection_i] == frame_i:
                selected_frames[frame_selection_i, ...] = frame
                frame_selection_i += 1

            frame_i += 1
    finally:
        # Always free the capture, even if decoding raised.
        cap.release()

    # Drop the slots that were never filled.
    return selected_frames[:frame_selection_i, ...]

### Age and Gender Estimation

In [None]:
def get_age_gender_videos_user(video_paths, step=1, age_mode='median', gender_mode='mode', gender_th=.3, age_max_std=None, reader='decord', mode=1):
    """Get average age and gender of a set of videos of the same user.

    Per-frame estimates of every video are pooled and reduced with
    ``estimate_age_gender``.

    Parameters
    ----------
    video_paths : iterable of video file paths belonging to one user.
    step : frame stride forwarded to the per-video function.
    age_mode, gender_mode, gender_th, age_max_std : forwarded to
        ``estimate_age_gender``.
    reader : ``'decord'`` or ``'opencv'`` - which per-video function is used.
    mode : ``1`` uses ``get_age_gender``, anything else ``get_age_gender2``
        (forwarded to the per-video function for both readers).

    Returns
    -------
    ``(age, gender)`` as returned by ``estimate_age_gender``.

    Raises
    ------
    ValueError : ``reader`` is not one of the supported values.
    """

    ages = []
    genders = []

    for video_path in video_paths:
        if reader == 'decord':
            video_ages, video_genders = get_ages_genders_video(video_path, gender_name=False, step=step, mode=mode)
        elif reader == 'opencv':
            video_ages, video_genders = get_ages_genders_video_opencv(video_path, gender_name=False, step=step, mode=mode)
        else:
            raise ValueError("Unknown reader {!r}; expected 'decord' or 'opencv'".format(reader))
        ages.extend(video_ages)
        genders.extend(video_genders)

    return estimate_age_gender(ages, genders, age_mode, gender_mode, gender_th, age_max_std)

def get_ages_genders_video(video_path, gender_name=False, step=1, mode=1):
    """Get list of ages and genders of a video, each element corresponding to one frame.

    The video is read with decord and every ``step``-th frame is evaluated.

    Parameters
    ----------
    video_path : path of the video file.
    gender_name : forwarded to the estimator; ``False`` yields numeric genders
        (1 = male, 0 = female), ``True`` yields the gender name.
    step : frame stride.
    mode : ``1`` uses ``get_age_gender``, anything else ``get_age_gender2``.

    Returns
    -------
    ``(ages, genders)`` - two parallel lists; frames for which the estimator
    returns no age are left out.
    """

    ages = []
    genders = []

    with open(video_path, 'rb') as f:
        vr = VideoReader(f, ctx=cpu(0))

        for i in range(0, len(vr), step):
            frame = vr[i].asnumpy()
            # The estimators receive BGR images, as the image-based pipeline
            # feeds them cv2.imread output.
            frame = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)

            if mode == 1:
                age, gender = get_age_gender(frame, gender_name=gender_name)
            else:
                age, gender = get_age_gender2(frame, gender_name=gender_name)

            if age is not None:
                ages.append(age)
                genders.append(gender)

    return ages, genders

def get_ages_genders_video_opencv(video_path, gender_name=False, step=1, mode=1):
    """Get list of ages and genders of a video, each element corresponding to one frame.

    The video is read sequentially with OpenCV and every ``step``-th frame is
    evaluated, mirroring ``get_ages_genders_video``.

    Parameters
    ----------
    video_path : path of the video file.
    gender_name : forwarded to the estimator; ``False`` yields numeric genders
        (1 = male, 0 = female), ``True`` yields the gender name.
    step : frame stride.
    mode : ``1`` uses ``get_age_gender``, anything else ``get_age_gender2``.

    Returns
    -------
    ``(ages, genders)`` - two parallel lists; frames for which the estimator
    returns no age are left out.
    """

    ages = []
    genders = []
    cap = cv2.VideoCapture(video_path, cv2.CAP_ANY)

    try:
        frame_i = 0
        while cap.isOpened():

            ret, frame = cap.read()

            # Finish
            if not ret:
                break

            # Only evaluate every step-th frame.
            if frame_i % step == 0:
                # cap.read() already returns BGR, the same layout cv2.imread
                # feeds to the estimators, so no colour conversion is applied.
                if mode == 1:
                    age, gender = get_age_gender(frame, gender_name=gender_name)
                else:
                    age, gender = get_age_gender2(frame, gender_name=gender_name)

                if age is not None:
                    ages.append(age)
                    genders.append(gender)

            frame_i += 1
    finally:
        # Always free the capture, even if an estimator raised.
        cap.release()

    return ages, genders

def get_age_gender_imgs_user(img_paths, age_mode='median', gender_mode='mode', gender_th=.3, age_max_std=None, mode=1):
    """Estimate one age and one gender for a set of images of the same user.

    Each image is loaded with ``cv2.imread`` and evaluated with
    ``get_age_gender`` (``mode == 1``) or ``get_age_gender2`` (any other
    ``mode``). Unreadable images are reported and skipped, as are images for
    which no age is returned. The pooled estimates are reduced by
    ``estimate_age_gender`` with the remaining arguments.
    """
    estimates = []

    for path in img_paths:
        image = cv2.imread(path)
        if image is None:
            print('Error al abrir la imagen:', path)
            continue

        predict = get_age_gender if mode == 1 else get_age_gender2
        age, gender = predict(image, gender_name=False)
        if age is None:
            continue
        estimates.append((age, gender))

    ages = [age for age, _ in estimates]
    genders = [gender for _, gender in estimates]
    return estimate_age_gender(ages, genders, age_mode, gender_mode, gender_th, age_max_std)

def estimate_age_gender(ages, genders, age_mode='median', gender_mode='mode', gender_th=.3, age_max_std=None):
    """Reduce per-sample age and gender estimates to a single age and gender.

    Parameters
    ----------
    ages : sequence of numeric age estimates.
    genders : sequence of numeric gender votes (0 = female, 1 = male);
        ``None`` entries are ignored.
    age_mode : ``'mean'`` or ``'median'``.
    gender_mode : only ``'mode'`` is supported. The mean vote must be within
        ``gender_th`` of 0 or of 1 to be accepted.
    gender_th : tolerance of the gender vote, see above.
    age_max_std : when given, the age is discarded if the standard deviation
        of ``ages`` exceeds it.

    Returns
    -------
    ``(age, gender)``: ``age`` is an int or ``None``; ``gender`` is an entry
    of ``gender_names`` or ``None``. An unsupported mode prints a message and
    yields ``None`` for that value.
    """

    # Samples without a gender vote carry no information.
    genders = [g for g in genders if g is not None]

    # Gender
    if len(genders) < 1:
        gender = None
    elif gender_mode == 'mode':
        genders_mean = float(np.mean(genders))
        if genders_mean <= gender_th or genders_mean >= (1 - gender_th):
            gender = gender_names[int(round(genders_mean))]
        else:
            # Votes too mixed to decide.
            gender = None
    else:
        print('Wrong gender mode')
        gender = None

    # Age
    if len(ages) < 1:
        age = None
    elif age_mode == 'mean':
        age = int(round(float(np.mean(ages))))
    elif age_mode == 'median':
        age = int(round(float(np.median(ages))))
    else:
        print('Wrong age mode')
        age = None

    # Reject the age when the individual estimates disagree too much.
    if age_max_std is not None and len(ages) > 0:
        if np.std(ages) > age_max_std:
            age = None

    return age, gender

Using MiVOLO:

In [None]:
from mivolo.predictor import Predictor
from mivolo.model.mi_volo import MiVOLO
from mivolo.data.misc import prepare_classification_images
import torch

class Config():
    """Plain attribute holder for the settings passed to the predictor."""

    def __init__(self, detector_weights, checkpoint, device, with_persons, disable_faces, draw):
        # Model files
        self.detector_weights, self.checkpoint = detector_weights, checkpoint
        # Device string
        self.device = device
        # Boolean switches, stored as given
        self.with_persons, self.disable_faces, self.draw = with_persons, disable_faces, draw
        
# Settings for the MiVOLO predictor: detector and age/gender checkpoints are
# read from ../weights and inference is placed on the first CUDA device.
config = Config(
    detector_weights="../weights/yolov8x_person_face.pt",
    checkpoint="../weights/mivolo_imbd.pth.tar",
    device="cuda:0",
    with_persons=True,
    disable_faces=False,
    draw = False
)
# `predictor` is used by get_age_gender (detection + recognition).
predictor = Predictor(config, verbose=False)
# `recognizer` is the predictor's age/gender model, used directly by
# get_age_gender2 (no detection step).
recognizer = predictor.age_gender_model

def get_age_gender(img, gender_name=True):
    """Estimate age and gender of an image with the detector-based predictor.

    Returns the age and gender of the first detected object, or
    ``(None, None)`` when the predictor reports no ages and no genders.
    With ``gender_name=False`` the gender is encoded as 1 (male) / 0 (female).
    """
    detections, _ = predictor.recognize(img)

    # Nothing was detected at all
    nothing_found = len(detections.ages) == 0 and len(detections.genders) == 0
    if nothing_found:
        print('No faces nor persons found.')
        return None, None

    age = detections.ages[0]
    gender = detections.genders[0]
    if gender_name:
        return age, gender

    # Numeric encoding; any other value is passed through untouched.
    gender = {'male': 1, 'female': 0}.get(gender, gender)
    return age, gender

def get_age_gender2(img, gender_name=True):
    """Get age and gender of an image without detection.

    The whole image is fed to the age/gender model (``recognizer``).

    Parameters
    ----------
    img : image array; it is cast to ``uint8`` before preprocessing.
    gender_name : ``True`` returns ``'male'`` / ``'female'``; ``False`` returns
        1 (male) / 0 (female).

    Returns
    -------
    ``(age, gender)``: ``age`` is a rounded int; ``gender`` is ``None`` when
    the model only predicts age.
    """

    # Prepare image for classification
    img = prepare_classification_images([img.astype(np.uint8)], recognizer.input_size, recognizer.data_config["mean"], recognizer.data_config["std"], device=recognizer.device)
    # The tensor is concatenated with itself along dim 1 (presumably to fill
    # both the face and the person input of the model - confirm).
    img = torch.cat((img, img), dim=1)

    # Run the age/gender model
    output = recognizer.inference(img)

    if recognizer.meta.only_age:
        age_output = output
        gender_indx = None
    else:
        age_output = output[:, 2]
        gender_output = output[:, :2].softmax(-1)
        _, gender_indx = gender_output.topk(1)

    # get age: map the model output back to years
    age = age_output[0].item()
    age = age * (recognizer.meta.max_age - recognizer.meta.min_age) + recognizer.meta.avg_age
    age = round(age)

    # get gender (stays None for age-only models)
    gender = None
    if gender_indx is not None:
        gender = "male" if gender_indx[0].item() == 0 else "female"

    if not gender_name:
        if gender == 'male':
            gender = 1
        elif gender == 'female':
            gender = 0

    return age, gender

### Head Pose and Facial Landmarks Estimation

In [None]:
def pose_to_text(pose):
    """Map a head pose to a coarse perspective label.

    Only ``pose[1]`` is inspected. Its magnitude selects between ``'front'``
    (< 22.5), ``'half_*'`` (< 67.5), ``'full_*'`` (< 112.5) and ``'back'``;
    its sign selects ``right`` (positive) or ``left`` (negative).
    ``None`` is passed through.
    """
    if pose is None:
        return None

    angle = pose[1]
    magnitude = abs(angle)

    if magnitude < 22.5:
        return 'front'

    side = 'right' if angle > 0 else 'left'
    if magnitude < 67.5:
        return 'half_' + side
    if magnitude < 112.5:
        return 'full_' + side
    return 'back'

Using SPIGA:

In [None]:
from ultralytics.yolo.engine.model import YOLO
from spiga.inference.config import ModelConfig
from spiga.inference.framework import SPIGAFramework

# YOLO model used by get_bbox_hw to locate faces.
weights = '../weights/yolov8x_person_face.pt'
yolo = YOLO(weights)
yolo.fuse()

# SPIGA model configured for 'wflw' and loaded from a local weights file
# (load_model_url is cleared, so the path below is what gets used -
# presumably this disables downloading; confirm against SPIGA).
dataset = 'wflw'
cfg = ModelConfig(dataset)
cfg.load_model_url = None
cfg.model_weights_path = '../weights'
cfg.model_weights = 'spiga_wflw.pt'
# `processor` is used by get_spiga_feature for landmarks and head pose.
processor = SPIGAFramework(cfg)

def get_bbox_hw(img):
    """Locate a face with YOLO and return its box as ``[x, y, width, height]``.

    The first detection whose class id is 1 is used (treated as the face
    class here). Returns ``None`` when YOLO yields no result or no detection
    of that class.
    """
    results = yolo(img, conf=.4, iou=.7, half=True, verbose=False)
    if len(results) < 1:
        return None

    boxes = results[0].boxes
    class_ids = boxes.cls.numpy(force=True)
    corners = boxes.xyxy.numpy(force=True)

    if 1 not in class_ids:
        return None

    # First class-1 detection, corner format (x1, y1, x2, y2) as integers.
    face_rows = np.where(class_ids == 1)[0]
    x1, y1, x2, y2 = corners[face_rows][0].astype('int')
    return np.array([x1, y1, x2 - x1, y2 - y1])

def get_spiga_feature(img, use_detector=True, feature='landmarks'):
    """Run SPIGA on one face of ``img`` and return the requested feature.

    With ``use_detector`` the face box comes from ``get_bbox_hw`` and ``None``
    is returned when no face is found; otherwise the whole image is used as
    the box. ``feature`` is the key read from the SPIGA output; the entry of
    the first (only) box is returned as a numpy array.
    """
    if use_detector:
        bbox = get_bbox_hw(img)
        if bbox is None:
            return None
    else:
        # Whole image as [x, y, width, height]
        bbox = [0, 0, img.shape[1], img.shape[0]]

    result = processor.inference(img, [bbox])
    first_face = result[feature][0]
    return np.array(first_face)

def get_keypoints(img, use_detector=True):
    """Return five keypoints derived from the SPIGA landmarks of ``img``.

    The points are the mean of landmarks 60-67, the mean of landmarks 68-75
    and landmarks 53, 88 and 92 (presumably the two eye centres, the nose and
    the mouth corners in the WFLW layout - confirm).

    Returns ``None`` when no landmarks are available (no face detected).
    """
    landmarks = get_spiga_feature(img, use_detector, 'landmarks')
    if landmarks is None:
        return None
    return [np.mean(landmarks[60:68], axis=0), np.mean(landmarks[68:76], axis=0), landmarks[53], landmarks[88], landmarks[92]]

def get_pose(img, use_detector=True):
    """Return the SPIGA 'headpose' feature of ``img`` (``None`` if no face is found)."""
    headpose = get_spiga_feature(img, use_detector, feature='headpose')
    return headpose

# Label the datasets

## AffectNet

In [None]:
# Build the label CSV of AffectNet: one row per non-duplicated image, with
# age, gender and perspective estimated automatically. A gender annotated in
# the duplicates CSV takes precedence over the estimated one.
dataset_name = 'AffectNet'
input_path = 'C:/DATASETS/AffectNetUniques'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
duplicates_csv = os.path.join(input_path, 'annotations_IJIMAI.csv')
# Index in this list == name of the class folder
labels = ['neutral','happiness','sadness','surprise','fear','disgust','anger','contempt']
id_counter = 1
users = {}

# Build dict with duplicates. Use the upper-cased "<folder>_<name>" as key
duplicates = {}
with open(duplicates_csv, 'r') as f:

    # Read csv as dict
    reader = csv.DictReader(f)
    for row in reader:
        duplicates[(row['folder'] + '_' + row['name']).upper()] = {'gender': row['gender'], 'duplicated': row['duplicated'] == 'True'}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    for class_i, label in enumerate(labels):
        for img_name in os.listdir(os.path.join(input_path, str(class_i))):

            # Check if duplicate or if gender is annotated. The keys are
            # upper-cased, so the lookup must use the upper-cased name too.
            gender = None
            annotation = duplicates.get(img_name.upper())
            if annotation is not None:

                # If duplicate, skip
                if annotation['duplicated']:
                    continue
                # Anything other than 'M' / 'F' counts as not annotated
                gender = {'M': 'male', 'F': 'female'}.get(annotation['gender'])

            # User ID: identities are unknown, so every image is its own user
            row_id = id_counter
            user_id = dataset_name + '-' + str(id_counter)
            id_counter += 1

            users[row_id] = [{
                'dataset': dataset_name,
                'user_id': None, #user_id,
                'name': img_name,
                'class': label,
                'age': None,
                'gender': gender,
                'race': None,
                'perspective': None,
                'age_group': None,
                'subset': None,
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                'img_path': os.path.join(input_path, str(class_i), img_name)}]

    # Add computed age and gender for the images of a user
    for user in tqdm(users.values()):
        age, gender = get_age_gender_imgs_user([row['img_path'] for row in user], mode=2)
        for row in user:

            img_path = row.pop('img_path')
            img = cv2.imread(img_path)
            # An unreadable image gets no perspective instead of crashing
            perspective = pose_to_text(get_pose(img, use_detector=False)) if img is not None else None

            row['auto_age'] = 1 if age is not None else 0
            row['age'] = age
            # Keep a manually annotated gender; only fill in missing ones
            if row['gender'] is None:
                row['auto_gender'] = 1 if gender is not None else 0
                row['gender'] = gender
            row['auto_perspective'] = 1 if perspective is not None else 0
            row['perspective'] = perspective
            spamwriter.writerow(row)

## NHFI

Some images share the same file name across different label folders, so each image is renamed to "label_img-name.png" (the label folder name is prepended).

In [None]:
# Build the label CSV of NHFI: one row per image, with age, gender and
# perspective estimated automatically.
dataset_name = 'NHFI'
input_path = 'C:/DATASETS/NHFI'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
# Folder names and, position by position, the unified class names
labels_in =  ['sadness', 'neutrality', 'anger', 'disgust','surprise','fear', 'happiness', 'contempt']
labels_out = ['sadness', 'neutral', 'anger', 'disgust','surprise','fear', 'happiness', 'contempt']
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    for label, unified_label in zip(labels_in, labels_out):
        label_dir = os.path.join(input_path, label)
        for img_name in os.listdir(label_dir):

            # Every image is registered as its own single-image user
            row_id = id_counter
            user_id = dataset_name + '-' + str(id_counter)
            id_counter += 1

            users[row_id] = [{
                'dataset': dataset_name,
                'user_id': None, # user_id,
                'name': label + '_' + img_name,
                'class': unified_label,
                'age': None,
                'gender': None,
                'race': None,
                'perspective': None,
                'age_group': None,
                'subset': None,
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                'img_path': os.path.join(label_dir, img_name)}]

    # Estimate age, gender and perspective, then write the rows
    for user in tqdm(users.values()):
        user_paths = [row['img_path'] for row in user]
        age, gender = get_age_gender_imgs_user(user_paths, mode=2)
        for row in user:

            img_path = row.pop('img_path')
            image = cv2.imread(img_path)
            perspective = pose_to_text(get_pose(image, use_detector=False))

            # A value is flagged as automatic only if it was missing before
            row['auto_age'] = int(row['age'] is None and age is not None)
            row['auto_gender'] = int(row['gender'] is None and gender is not None)
            row['auto_perspective'] = int(row['perspective'] is None and perspective is not None)
            row['age'] = age
            row['gender'] = gender
            row['perspective'] = perspective
            spamwriter.writerow(row)

## RAF-DB

In [None]:
# Build the label CSV of RAF-DB: one row per image of the train and test
# splits, with age, gender and perspective estimated automatically.
dataset_name = 'RAF-DB'
input_path = 'C:/DATASETS/RAF-DB/DATASET'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
# Folder names and, position by position, the unified class names
labels_in =  ['5', '7', '6', '3', '1', '2', '4']
labels_out = ['sadness', 'neutral', 'anger', 'disgust','surprise','fear', 'happiness']
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    for split in ['train', 'test']:
        for label, unified_label in zip(labels_in, labels_out):
            split_dir = os.path.join(input_path, split, label)
            for img_name in os.listdir(split_dir):

                # Every image is registered as its own single-image user
                row_id = id_counter
                user_id = dataset_name + '-' + str(id_counter)
                id_counter += 1

                users[row_id] = [{
                    'dataset': dataset_name,
                    'user_id': None, # user_id,
                    'name': img_name,
                    'class': unified_label,
                    'age': None,
                    'gender': None,
                    'race': None,
                    'perspective': None,
                    'age_group': None,
                    'subset': split,
                    'auto_age': 0,
                    'auto_gender': 0,
                    'auto_perspective': 0,
                    'img_path': os.path.join(split_dir, img_name)}]

    # Estimate age, gender and perspective, then write the rows
    for user in tqdm(users.values()):
        user_paths = [row['img_path'] for row in user]
        age, gender = get_age_gender_imgs_user(user_paths, mode=2)
        for row in user:

            img_path = row.pop('img_path')
            image = cv2.imread(img_path)
            perspective = pose_to_text(get_pose(image, use_detector=False))

            # A value is flagged as automatic only if it was missing before
            row['auto_age'] = int(row['age'] is None and age is not None)
            row['auto_gender'] = int(row['gender'] is None and gender is not None)
            row['auto_perspective'] = int(row['perspective'] is None and perspective is not None)
            row['age'] = age
            row['gender'] = gender
            row['perspective'] = perspective
            spamwriter.writerow(row)

## DDCF

- First merge the folders 40males and 40females
- Delete folders "edited" for some subjects
- Pleased considered as "happiness" expression
- Race: caucasian, according to article

In [None]:
# Build the label CSV of DDCF. Age, gender and perspective are parsed from the
# file names; the subject is identified by the folder that holds the image.
dataset_name = 'DDCF'
input_path = 'C:/DATASETS/DDCF'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
# Folder names and, position by position, the unified class names
labels_in =  ['Afraid','Angry','Disgusted','Happy','Neutral','Pleased','Sad','Surprised']
labels_out = ['fear','anger','disgust','happiness','neutral','pleased','sadness','surprise']
id_counter = 1
users = {}

# Gender letter found in the file name -> gender name
gender_by_code = {'F': 'female', 'f': 'female', 'M': 'male', 'm': 'male'}
# Ordered rules: the first rule with a substring present in the name wins
perspective_rules = [
    (('far right',), 'full_right'),
    (('far left',), 'full_left'),
    (('front', 'Front'), 'front'),
    (('right',), 'half_right'),
    (('left',), 'half_left'),
]

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    for label, unified_label in zip(labels_in, labels_out):
        label_dir = os.path.join(input_path, label)
        for dir_name in os.listdir(label_dir):

            for img_name in os.listdir(os.path.join(label_dir, dir_name)):

                # User ID: one per subject folder, created on its first image
                user_id = users.get(dir_name)
                if user_id is None:
                    user_id = dataset_name + '-' + str(id_counter)
                    users[dir_name] = user_id
                    id_counter += 1

                # The second '_' field reads "<age>yo<gender letter>"
                age_gender = img_name.split('_')[1].split('yo')

                # Gender
                gender = gender_by_code.get(age_gender[1])
                if gender is None:
                    print('Gender error:', img_name)

                # Perspective
                perspective = next(
                    (name for needles, name in perspective_rules
                     if any(needle in img_name for needle in needles)),
                    None)
                if perspective is None:
                    print(img_name)

                spamwriter.writerow({
                    'dataset': dataset_name,
                    'user_id': user_id,
                    'name': img_name,
                    'class': unified_label,
                    'age': age_gender[0],
                    'gender': gender,
                    'race': 'caucasian',
                    'perspective': perspective,
                    'age_group': None,
                    'subset': None,
                    'auto_age': 0,
                    'auto_gender': 0,
                    'auto_perspective': 0})

## DEFSS

- First convert "Model Descriptives.xlsx" to CSV

In [None]:
# Build the label CSV of DEFSS from its descriptives CSV: one output row per
# input row. The image file name is reconstructed from the row fields.
dataset_name = 'DEFSS'
input_path = 'C:/DATASETS/DEFSS'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
# Emotion names in the input CSV and, position by position, the unified class names
labels_in =  ['Happy','Sad','Fear','Angry','Neutral']
labels_out = ['happiness','sadness','fear','anger','neutral']
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile, open(os.path.join(input_path, 'Photo Descriptives.csv'), 'r') as csvfile_input:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    reader = csv.DictReader(csvfile_input, delimiter=';')
    for row in reader:

        # User ID
        if row['ID'] in users:
            user_id = users[row['ID']]
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row['ID']] = user_id
            id_counter += 1

        # File name, fixing some naming errors: try the known naming variants
        # in order and keep the first one that exists on disk
        candidates = [
            row['ID']+'_'+row['Sex']+row['Age']+'_'+row['Emotion']+'.jpg',
            row['ID']+'_'+row['Age']+row['Sex']+'_'+row['Emotion']+'.jpg',
            row['ID']+'_'+row['Sex']+row['Age']+'_'+row['Emotion']+'_s.jpg',
        ]
        file_name = next(
            (name for name in candidates
             if os.path.exists(os.path.join(input_path, 'images', name))),
            None)
        if file_name is None:
            # The row is still written, with the last variant as its name
            file_name = candidates[-1]
            print('Not found:', file_name)

        # Gender
        gender = row['Sex']
        if gender == 'F' or gender == 'f':
            gender = 'female'
        elif gender == 'M' or gender == 'm':
            gender = 'male'
        else:
            gender = None
            print('Gender error:', file_name)

        # Race: mixed, 'other' and unidentified values are left empty;
        # unexpected values are reported but kept
        race = row['Race'].lower()
        if race == 'black/white':
            race = None
        elif race == 'native/white':
            race = 'white'
        elif race == 'other':
            race = None
        elif race == 'not identified':
            race = None
        elif race not in ['black', 'white', 'hispanic', 'asian']:
            print(file_name, race)

        spamwriter.writerow({
            'dataset': dataset_name,
            'user_id': user_id,
            'name': file_name,
            'class': labels_out[labels_in.index(row['Emotion'])],
            'age': row['Age'],
            'gender': gender,
            'race': race,
            'perspective': 'front',
            'age_group': None,
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0})

## FACES

- Race: caucasian, according to article

In [None]:
# Build the label CSV of FACES. Subject, age group, gender and emotion are the
# first four '_'-separated fields of the file name; the age is estimated per
# subject from all of that subject's images.
dataset_name = 'FACES'
input_path = 'C:/DATASETS/FACES/FACES'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
# Emotion letters and, position by position, the unified class names
labels_in =  ['a','d','f','h','n','s',]
labels_out = ['anger','disgust','fear','happiness','neutral','sadness']
id_counter = 1
users = {}

gender_by_code = {'F': 'female', 'f': 'female', 'M': 'male', 'm': 'male'}
age_group_by_code = {'o': 'elderly', 'y': 'young', 'm': 'middle-age'}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    for img_name in os.listdir(input_path):
        img_name_split = img_name.split('_')

        # Gender
        gender = gender_by_code.get(img_name_split[2])
        if gender is None:
            print('Gender error:', img_name)

        # Age group
        age_group = age_group_by_code.get(img_name_split[1])
        if age_group is None:
            print('Age group error')

        # User ID: images sharing the first name field belong to one subject
        row_id = img_name_split[0]
        user_rows = users.get(row_id)
        if user_rows is None:
            user_id = dataset_name + '-' + str(id_counter)
            user_rows = users[row_id] = []
            id_counter += 1
        else:
            user_id = user_rows[0]['user_id']

        user_rows.append({
            'dataset': dataset_name,
            'user_id': user_id,
            'name': img_name,
            'class': labels_out[labels_in.index(img_name_split[3])],
            'age': None,
            'gender': gender,
            'race': 'caucasian',
            'perspective': 'front',
            'age_group': age_group,
            'subset': None,
            'auto_age': 1,
            'auto_gender': 0,
            'auto_perspective': 0})

    # Estimate one age per subject and write that subject's rows
    for user in tqdm(users.values()):
        user_paths = [os.path.join(input_path, row['name']) for row in user]
        age, _ = get_age_gender_imgs_user(user_paths)
        auto_age = 1 if age is not None else 0
        for row in user:
            row.update(age=age, auto_age=auto_age)
            spamwriter.writerow(row)

## NIMH-ChEFS

- Afraid_averted/DSC_4644.JPG deleted, since it is repeated with another name (M2FA_4644.jpg)
- Naming error: F6_2FA_5449.jpg changed to M6_2FA_5449.jpg, M6FA__5449.jpg changed to M6_3FA_5449.jpg

In [None]:
# Build the label CSV of NIMH-ChEFS. Emotion and gaze come from the folder
# name ("<Emotion>_<direct|averted>"), gender and subject from the file name;
# the age is estimated per subject and clipped to the 10-17 range.
dataset_name = 'NIMH-ChEFS'
input_path = 'C:/DATASETS/NIMH-ChEFS'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
# Emotion names in the folder names and, position by position, the unified class names
labels_in =  ['Angry','Afraid','Happy','Neutral','Sad',]
labels_out = ['anger','fear','happiness','neutral','sadness']
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    for dir_name in os.listdir(os.path.join(input_path)):
        for img_name in os.listdir(os.path.join(input_path, dir_name)):

            # Gender: first character of the file name
            gender = img_name[0]
            if gender == 'F' or gender == 'f':
                gender = 'female'
            elif gender == 'M' or gender == 'm':
                gender = 'male'
            else:
                gender = None
                print('Gender error:', img_name)

            # Perspective
            perspective = dir_name.split('_')[1]
            if perspective == 'direct':
                perspective = 'front'
            elif perspective == 'averted':
                perspective = 'averted_gaze'
            else:
                perspective = None
                print('Perspective error')

            # User ID: the subject code is the first 2 or 3 characters of the
            # name, depending on the length of the text before the first '_'
            prefix_len = len(img_name.split('_')[0])
            if prefix_len in (2, 4):
                row_id = img_name[0:2]
            elif prefix_len in (3, 5):
                row_id = img_name[0:3]
            else:
                # Unknown naming scheme: skip the image instead of silently
                # attributing it to the previously processed subject
                print('User ID error:', img_name)
                continue

            if row_id in users:
                user_id = users[row_id][0]['user_id']
            else:
                user_id = dataset_name + '-' + str(id_counter)
                users[row_id] = []
                id_counter += 1

            users[row_id].append({
                'dataset': dataset_name,
                'user_id': user_id,
                'name': img_name,
                'class': labels_out[labels_in.index(dir_name.split('_')[0])],
                'age': None,
                'gender': gender,
                'race': None,
                'perspective': perspective,
                'age_group': '10-17',
                'subset': None,
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                'img_path': os.path.join(input_path, dir_name, img_name)})

    # Add computed age for the images of a user
    for user in tqdm(users.values()):
        age, _ = get_age_gender_imgs_user([row.pop('img_path') for row in user])
        for row in user:
            # Keep the estimate inside the age range of the dataset
            row['age'] = int(np.clip(age, 10, 17)) if age is not None else None
            row['auto_age'] = 1 if row['age'] is not None else 0
            spamwriter.writerow(row)

## RaFD

- age_group = kid or adult.

In [None]:
# RaFD: every attribute is read from the '_'-separated image file name
# (angle, subject id, subject group, gender, emotion, gaze direction).
dataset_name = 'RaFD'
input_path = 'C:/DATASETS/RaFD'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_in =  ['sad', 'neutral', 'angry', 'contemptuous', 'disgusted', 'surprised', 'fearful', 'happy']
labels_out = ['sadness', 'neutral', 'anger', 'contempt', 'disgust','surprise','fear', 'happiness']
id_counter = 1
users = {}

# First file-name token -> head orientation text
rafd_angles = {
    'Rafd000': 'full_left',
    'Rafd045': 'half_left',
    'Rafd090': 'front',
    'Rafd135': 'half_right',
    'Rafd180': 'full_right',
}
# Last file-name token (extension removed) -> gaze suffix appended to the orientation
rafd_gazes = {
    'left': '_gaze_left',
    'right': '_gaze_right',
    'frontal': '',
}

with open(output_path, 'w', newline='') as csvfile:
    label_writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    label_writer.writeheader()

    for img_name in os.listdir(input_path):
        img_name_split = img_name.split('_')

        # Perspective = head orientation + gaze suffix; an unknown part is
        # reported and contributes an empty string.
        perspective_1 = rafd_angles.get(img_name_split[0])
        if perspective_1 is None:
            print('Perspective error:', img_name)

        perspective_2 = rafd_gazes.get(img_name_split[5].split('.')[0])
        if perspective_2 is None:
            print('Perspective error:', img_name)

        perspective = (perspective_1 or '') + (perspective_2 or '')

        # Race and age group both come from the third token; 'kid' subjects
        # are recorded as caucasian.
        subject_group = img_name_split[2].lower()
        is_kid = subject_group == 'kid'
        race = 'caucasian' if is_kid else subject_group
        if race not in ('caucasian', 'moroccan'):
            print('Error in race:', img_name)
        age_group = 'kid' if is_kid else 'adult'

        # User ID: one sequential id per distinct subject token
        row_id = img_name_split[1]
        if row_id not in users:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1
        else:
            user_id = users[row_id][0]['user_id']

        users[row_id].append({
            'dataset': dataset_name,
            'user_id': user_id,
            'name': img_name,
            'class': labels_out[labels_in.index(img_name_split[4])],
            'age': None,
            'gender': img_name_split[3],
            'race': race,
            'perspective': perspective,
            'age_group': age_group,
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0,
            'img_path': os.path.join(input_path, img_name)})

    # Estimate one age per user from all of that user's images, then write the rows.
    # 'img_path' is removed because it is not one of the CSV columns.
    for user in tqdm(users.values()):
        img_paths = [row.pop('img_path') for row in user]
        age, _ = get_age_gender_imgs_user(img_paths)
        age_flag = 1 if age is not None else 0
        for row in user:
            row.update({'age': age, 'auto_age': age_flag})
            label_writer.writerow(row)

## FER2013

In [None]:
# FER2013: images are stored in one folder per emotion. The dataset has no
# identity, age, gender or pose annotations, so those are all estimated.
dataset_name = 'FER2013'
input_path = 'C:/DATASETS/FER2013'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_in =  ['Sad', 'Neutral', 'Angry', 'Disgust','Surprise','Fear', 'Happy']
labels_out = ['sadness', 'neutral', 'anger', 'disgust','surprise','fear', 'happiness']
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    for label in labels_in:
        for img_name in os.listdir(os.path.join(input_path, label)):
            
            # No subject information is available: every image gets its own
            # single-row group, and 'user_id' is written as None.
            row_id = id_counter
            id_counter += 1

            users[row_id] = [{
                'dataset': dataset_name, 
                'user_id': None,
                'name': img_name,
                'class': labels_out[labels_in.index(label)],
                # Must start as None: the auto_* flags below are only set when
                # the ground-truth value is missing. Reading `age`/`gender`
                # here would pick up stale values left by a previous cell.
                'age': None,
                'gender': None,
                'race': None,
                'perspective': None, 
                'age_group': None,
                'subset': None,
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                'img_path': os.path.join(input_path, label, img_name)}]
    
    # Estimate age, gender and perspective for each image and flag them as automatic
    for user in tqdm(users.values()):
        age, gender = get_age_gender_imgs_user([row['img_path'] for row in user], mode=2)
        for row in user:

            # 'img_path' is not a CSV column, so it is removed before writing
            img_path = row.pop('img_path')
            perspective = pose_to_text(get_pose(cv2.imread(img_path), use_detector=False))

            # Each flag is computed before the field is overwritten
            row['auto_age'] = 1 if row['age'] is None and age is not None else 0
            row['age'] = age
            row['auto_gender'] = 1 if row['gender'] is None and gender is not None else 0
            row['gender'] = gender
            row['auto_perspective'] = 1 if row['perspective'] is None and perspective is not None else 0
            row['perspective'] = perspective
            spamwriter.writerow(row)

## ExpW

In [None]:
# ExpW: each line of label.lst describes one face box inside an original image.
# The face is cropped to a square, saved under a new unique name, and age,
# gender and perspective are estimated from the crop.
dataset_name = 'ExpW'
input_path = 'C:/DATASETS/ExpW'
imgs_path = 'C:/DATASETS/ExpW/origin'
cropped_imgs_path = 'C:/DATASETS/ExpW/cropped'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_in =  ['4', '6', '0', '1','5','2', '3']
labels_out = ['sadness', 'neutral', 'anger', 'disgust','surprise','fear', 'happiness']
id_counter = 1
users = {}

# cv2.imwrite does not create missing folders (it just reports failure),
# so make sure the output folder exists before writing any crop.
os.makedirs(cropped_imgs_path, exist_ok=True)

with open(output_path, 'w', newline='') as csvfile, open(os.path.join(input_path, 'label.lst'), 'r') as csvfile_input:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    # NOTE(review): DictReader takes the column names from the first line of
    # label.lst, so this assumes a header line is present there - confirm.
    reader = csv.DictReader(csvfile_input, delimiter=' ')
    
    for row in tqdm(reader):
        
        # One image can contain several faces, so the running counter is
        # prepended to keep crop names unique. It is advanced for every input
        # row (even skipped ones) so names stay tied to the row position.
        img_name = row['image_name']
        new_img_name = str(id_counter) + '_' + img_name
        id_counter += 1

        # cv2.imread returns None instead of raising when the file is missing or unreadable
        img = cv2.imread(os.path.join(imgs_path, img_name))
        if img is None:
            print('Could not read image:', img_name)
            continue
        
        # Square crop img: grow the shorter side of the box around its centre,
        # then clamp the box to the image borders
        top = int(row['face_box_top'])
        left = int(row['face_box_left'])
        bottom = int(row['face_box_bottom'])
        right = int(row['face_box_right'])

        crop_size = max(right - left, bottom - top)
        top = max(0, top - (crop_size - (bottom - top)) // 2)
        left = max(0, left - (crop_size - (right - left)) // 2)
        bottom = min(img.shape[0], top + crop_size)
        right = min(img.shape[1], left + crop_size)

        img = img[top:bottom, left:right]

        # A box lying outside the image (or with zero area) gives an empty crop
        if img.size == 0:
            print('Empty face crop:', img_name)
            continue

        # Save cropped image
        if not cv2.imwrite(os.path.join(cropped_imgs_path, new_img_name), img):
            print('Could not save crop:', new_img_name)

        # Estimate age and gender
        age, gender = get_age_gender2(img)
        perspective = pose_to_text(get_pose(img, use_detector=False))

        spamwriter.writerow({
            'dataset': dataset_name, 
            'user_id': None, # no identity information in this dataset
            'name': new_img_name,
            'class': labels_out[labels_in.index(row['expression_label'])],
            'age': age,
            'gender': gender,
            'race': None,
            'perspective': perspective, 
            'age_group': None,
            'subset': None,
            'auto_age': 1 if age is not None else 0,
            'auto_gender': 1 if gender is not None else 0,
            'auto_perspective': 1 if perspective is not None else 0})

## WSEFEP

- First convert "WSEFEP - norms & FACS.xlsx" to CSV
- Added "MG_0850.jpg" to the CSV, because the image existed but the label did not
- Changed "MG_1317.jpg" to "MG_1317.JPG" in CSV
- Changed "AD_8286.jpg" to "AD_8268.jpg" in CSV
- Changed "JS_0491.jpg" to "JS_0449.jpg" in CSV
- Changed "SO_0052.jpg" to "SO_0053.jpg" in CSV
- race = 'polish', according to article

In [None]:
# WSEFEP: labels come from the CSV export of "WSEFEP - norms & FACS.xlsx".
dataset_name = 'WSEFEP'
input_path = 'C:/DATASETS/WSEFEP'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'joy': 'happiness',
    'sadness': 'sadness',
    'fear': 'fear',
    'anger': 'anger',
    'neutral': 'neutral', 
    'surprise': 'surprise', 
    'disgust': 'disgust'
}
# Accepted values of the 'Male/ Female' column
gender_codes = {'F': 'female', 'f': 'female', 'M': 'male', 'm': 'male'}
id_counter = 1
users = {}

labels_csv_path = os.path.join(input_path, 'WSEFEP - norms & FACS.csv')
images_dir = os.path.join(input_path, 'images')

with open(output_path, 'w', newline='') as csvfile, open(labels_csv_path, 'r') as csvfile_input:
    label_writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    label_writer.writeheader()

    for row in csv.DictReader(csvfile_input, delimiter=';'):

        # File name: a missing image is only reported, the row is still kept
        file_name = row['Picture ID']
        img_path = os.path.join(images_dir, file_name)
        if not os.path.exists(img_path):
            print('Not found:', file_name)

        # Gender: anything outside the accepted codes is reported and stored as None
        gender = gender_codes.get(row['Male/ Female'])
        if gender is None:
            print('Gender error:', row)

        # User ID: one sequential id per distinct 'Displayer ID'
        row_id = row['Displayer ID']
        if row_id not in users:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1
        else:
            user_id = users[row_id][0]['user_id']

        users[row_id].append({
            'dataset': dataset_name,
            'user_id': user_id,
            'name': file_name,
            'class': labels_dict[row['Display']],
            'age': None,
            'gender': gender,
            'race': 'polish',
            'perspective': 'front',
            'age_group': '20-30',
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0,
            'img_path': img_path})

    # Estimate one age per user and force it into the 20-30 age group range.
    # 'img_path' is removed because it is not one of the CSV columns.
    for user in tqdm(users.values()):
        img_paths = [row.pop('img_path') for row in user]
        age, _ = get_age_gender_imgs_user(img_paths)
        user_age = None if age is None else np.clip(age, 20, 30)
        for row in user:
            row['age'] = user_age
            row['auto_age'] = 0 if row['age'] is None else 1
            label_writer.writerow(row)

## KDEF

- Deleted black images: AF01SUFR, AF10AFFR, AF11NEHL, AF20DIHL, AM25DIFL, AM34DIFR, BF13NEHR, BM21DIFL, BM22DIHL, BM24DIFL
- Deleted too bright images: AM17DIHR, BM17NES
- Deleted corrupted images: AM02HAFR
- Renamed: "AF31V.JPG" to "AF31SAHL.JPG", "AM31H.JPG" to "AM31SUHR.JPG"
- race = swedish

In [None]:
# KDEF: one folder per session/subject (e.g. "AF01"); gender, emotion and
# perspective are encoded at fixed positions of the folder and file names.
dataset_name = 'KDEF'
input_path = 'C:/DATASETS/KDEF/images'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'HA': 'happiness',
    'SA': 'sadness',
    'AF': 'fear',
    'AN': 'anger',
    'NE': 'neutral', 
    'SU': 'surprise', 
    'DI': 'disgust'
}
# Two-letter perspective code (file name characters 6-7) -> perspective text
perspective_dict = {
    'FL': 'full_left',
    'FR': 'full_right',
    'HL': 'half_left',
    'HR': 'half_right'
}
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    for dir_name in os.listdir(os.path.join(input_path)):
        for img_name in os.listdir(os.path.join(input_path, dir_name)):

            # Gender: second character of the folder name
            gender = dir_name[1]
            if gender == 'F' or gender == 'f':
                gender = 'female'
            elif gender == 'M' or gender == 'm':
                gender = 'male'
            else:
                gender = None
                print('Gender error:', img_name)
            
            # Perspective: frontal images use a code starting with 'S'.
            # startswith() also copes with names too short to hold a code.
            perspective_code = img_name[6:8]
            if perspective_code in perspective_dict:
                perspective = perspective_dict[perspective_code]
            elif perspective_code.startswith('S'):
                perspective = 'front'
            else:
                perspective = None
                print('Perspective error:', img_name)
                
            # Label: skip images with an unknown emotion code. Carrying on
            # would reuse the label of the previously processed image.
            label = labels_dict.get(img_name[4:6])
            if label is None:
                print('Label error:', img_name)
                continue
        
            # User ID: the folder name without its first character, so that
            # folders differing only in that character share one user
            row_id = dir_name[1:]
            if row_id in users:
                user_id = users[row_id][0]['user_id']
            else:
                user_id = dataset_name + '-' + str(id_counter)
                users[row_id] = []
                id_counter += 1

            users[row_id].append({
                'dataset': dataset_name, 
                'user_id': user_id,
                'name': img_name,
                'class': label,
                'age': None,
                'gender': gender,
                'race': 'swedish',
                'perspective': perspective, 
                'age_group': '20-30',
                'subset': None,
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                'img_path': os.path.join(input_path, dir_name, img_name)})
    
    # Estimate one age per user from all of their images, clipped to the
    # 20-30 age group. 'img_path' is removed because it is not a CSV column.
    for user in tqdm(users.values()):
        age, _ = get_age_gender_imgs_user([row.pop('img_path') for row in user])
        for row in user:
            row['age'] = np.clip(age, 20, 30) if age is not None else None
            row['auto_age'] = 1 if row['age'] is not None else 0
            spamwriter.writerow(row)

## JAFFE

- race = japanese, according to article
- age_group = young

In [None]:
# JAFFE: subject and emotion are encoded in the file name
# (characters 0-1 = subject, characters 3-4 = emotion code).
dataset_name = 'JAFFE'
input_path = 'C:/DATASETS/JAFFE'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'HA': 'happiness',
    'SA': 'sadness',
    'FE': 'fear',
    'AN': 'anger',
    'NE': 'neutral', 
    'SU': 'surprise', 
    'DI': 'disgust'
}
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    for img_name in os.listdir(os.path.join(input_path)):

        # Label: skip files with an unknown emotion code (this also covers
        # non-image files in the folder). Carrying on would reuse the label
        # of the previously processed image.
        label = labels_dict.get(img_name[3:5])
        if label is None:
            print('Label error:', img_name)
            continue
        
        # User ID: one sequential id per distinct two-letter subject prefix
        row_id = img_name[:2]
        if row_id in users:
            user_id = users[row_id][0]['user_id']
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1

        users[row_id].append({
            'dataset': dataset_name, 
            'user_id': user_id,
            'name': img_name,
            'class': label,
            'age': None,
            'gender': 'female',
            'race': 'japanese',
            'perspective': 'front', 
            'age_group': 'young',
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0,
            'img_path': os.path.join(input_path, img_name)})
    
    # Estimate one age per user from all of their images.
    # 'img_path' is removed because it is not a CSV column.
    for user in tqdm(users.values()):
        age, _ = get_age_gender_imgs_user([row.pop('img_path') for row in user])
        for row in user:
            row['age'] = age
            row['auto_age'] = 1 if row['age'] is not None else 0
            spamwriter.writerow(row)

## FEGA

- race = caucasian

In [None]:
# FEGA: gender, age and emotion are '_'-separated tokens of the file name
# (token 1 = gender, token 2 = age, token 4 = emotion).
dataset_name = 'FEGA'
input_path = 'C:/DATASETS/FEGA'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'Allegria': 'happiness',
    'Tristezza': 'sadness',
    'Paura': 'fear',
    'Arrabbiato': 'anger',
    'Neutra': 'neutral', 
    'Sorpresa': 'surprise', 
    'Disgusto': 'disgust'
}
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    for img_name in os.listdir(os.path.join(input_path)):
        img_name_split = img_name.split('_')

        # Label: skip files whose name has no emotion token or an unknown one.
        # Carrying on would reuse the label of the previously processed image.
        try:
            label = labels_dict[img_name_split[4]]
        except (KeyError, IndexError):
            print('Label error:', img_name)
            continue
                  
        # Gender
        gender = img_name_split[1]
        if gender == 'F' or gender == 'f':
            gender = 'female'
        elif gender == 'M' or gender == 'm':
            gender = 'male'
        else:
            gender = None
            print('Gender error:', img_name)
        
        # User ID: a subject is identified by name tokens 1 to 3 together
        row_id = img_name_split[1] + '_' + img_name_split[2] + '_' + img_name_split[3]
        if row_id in users:
            user_id = users[row_id][0]['user_id']
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1

        users[row_id].append({
            'dataset': dataset_name, 
            'user_id': user_id,
            'name': img_name,
            'class': label,
            'age': img_name_split[2],
            'gender': gender,
            'race': 'caucasian',
            'perspective': 'front', 
            'age_group': None,
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0,
            'img_path': os.path.join(input_path, img_name)})
    
    # Age and gender come from the file names, so nothing is estimated here:
    # just drop 'img_path' (not a CSV column) and write the rows grouped by user.
    for user in tqdm(users.values()):
        for row in user:
            row.pop('img_path')
            spamwriter.writerow(row)

## LIFESPAN

- 'Annoyed' and 'Grumpy' considered as anger

In [None]:
# LIFESPAN: one folder per expression; race, gender and age are encoded in
# the file name as "<race code><gender><age>...".
dataset_name = 'LIFESPAN'
input_path = 'C:/DATASETS/LIFESPAN/Expressions'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
id_counter = 1
users = {}
labels_dict = {
    'happy': 'happiness',
    'sad': 'sadness',
    'annoyed': 'anger',
    'angry': 'anger',
    'grumpy': 'anger',
    'neutral': 'neutral', 
    'profile': 'neutral', 
    'surprised': 'surprise',
    'disgusted': 'disgust'
}

# Strings that may precede the one-letter race code in a file name.
# 'EM' and 'TSF' are separate entries (a missing comma would merge them).
prefixes = ['', 'EM', 'TSF', 'J', 'W', 'TM']
# One-letter race code (last character before the gender word) -> race
race_dict = {
    'W': 'white',
    'B': 'black',
    'A': 'asian',
    'I': 'indian',
    'H': 'hispanic'
}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    for dir1 in os.listdir(os.path.join(input_path)):
        for img_name in os.listdir(os.path.join(input_path, dir1)):

            # Gender: test 'female' first because it contains 'male'
            gender = 'female' if 'female' in img_name else 'male'

            # Race: everything before the gender word must be a known prefix
            # followed by a known race letter
            race = img_name.split(gender)[0]
            if race[:-1] in prefixes and race[-1:] in race_dict:
                race = race_dict[race[-1:]]
            else:
                # Unknown code: report the file; the raw code is kept as race
                print(img_name)
            
            # Age: the two characters after the gender word, skipping an
            # optional '_' separator
            after_gender = img_name.split(gender)[1]
            age = after_gender[0:2]
            if age.startswith('_'):
                age = after_gender[1:3]

            # Expression: first word of the folder name
            expression = dir1.split()[0].lower()
    
            # User ID: the part of the file name before the expression word
            row_id = img_name.split(expression)[0]
            
            if row_id in users:
                user_id = users[row_id][0]['user_id']
            else:
                user_id = dataset_name + '-' + str(id_counter)
                users[row_id] = []
                id_counter += 1

            users[row_id].append({
                'dataset': dataset_name, 
                'user_id': user_id,
                'name': img_name,
                'class': labels_dict[expression],
                'age': age,
                'gender': gender,
                'race': race,
                'perspective': 'front' if expression != 'profile' else 'full_right', 
                'age_group': None,
                'subset': None,
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                # The image lives in its expression folder (dir1)
                'img_path': os.path.join(input_path, dir1, img_name)})
    
    # Age and gender come from the file names, so nothing is estimated here:
    # just drop 'img_path' (not a CSV column) and write the rows grouped by user.
    for user in tqdm(users.values()):
        for row in user:
            row.pop('img_path')
            spamwriter.writerow(row)

## Google-FE-Test

In [None]:
# Google-FE-Test: images are stored in folders named by class index (0..6),
# in the same order as `labels`. No subject information is available.
dataset_name = 'Google-FE-Test'
input_path = 'C:/DATASETS/Google-FE-Test'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels = ['anger','disgust','fear','happiness','neutral','sadness','surprise']
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    label_writer = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    label_writer.writeheader()

    for class_i, label in enumerate(labels):
        class_dir = os.path.join(input_path, str(class_i))
        for img_name in os.listdir(class_dir):

            # Every image forms its own single-row group keyed by the running
            # counter; 'user_id' is written as None.
            row_id = id_counter
            id_counter += 1

            users[row_id] = [{
                'dataset': dataset_name,
                'user_id': None,
                'name': img_name,
                'class': label,
                'age': None,
                'gender': None,
                'race': None,
                'perspective': 'front',
                'age_group': None,
                'subset': None,
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                'img_path': os.path.join(class_dir, img_name)}]

    # Estimate age and gender per group and flag whichever values were obtained.
    # 'img_path' is removed because it is not one of the CSV columns.
    for user in tqdm(users.values()):
        img_paths = [row.pop('img_path') for row in user]
        age, gender = get_age_gender_imgs_user(img_paths, mode=2)
        estimated = {
            'age': age,
            'auto_age': 0 if age is None else 1,
            'gender': gender,
            'auto_gender': 0 if gender is None else 1,
        }
        for row in user:
            row.update(estimated)
            label_writer.writerow(row)

## BU-4DFE

- Version with frames already sampled. Some images are very similar.

In [None]:
# BU-4DFE (pre-sampled frames): gender, subject and emotion are encoded in
# the file name (first character = gender, first four characters = subject,
# second '_'-separated token = emotion).
dataset_name = 'BU-4DFE'
input_path = 'C:/DATASETS/BU-4DFE'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'Happy': 'happiness',
    'Sad': 'sadness',
    'Fear': 'fear',
    'Angry': 'anger',
    'Neutral': 'neutral', 
    'Surprise': 'surprise', 
    'Disgust': 'disgust'
}
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    for img_name in os.listdir(os.path.join(input_path)):
                  
        # Gender
        gender = img_name[0]
        if gender == 'F' or gender == 'f':
            gender = 'female'
        elif gender == 'M' or gender == 'm':
            gender = 'male'
        else:
            gender = None
            print('Gender error:', img_name)

        # Label: skip files whose name has no emotion token or an unknown one.
        # Carrying on would reuse the label of the previously processed image.
        try:
            label = labels_dict[img_name.split('_')[1]]
        except (KeyError, IndexError):
            print('Label error:', img_name)
            continue
        
        # User ID: one sequential id per distinct four-character subject prefix
        row_id = img_name[:4]
        if row_id in users:
            user_id = users[row_id][0]['user_id']
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1

        users[row_id].append({
            'dataset': dataset_name, 
            'user_id': user_id,
            'name': img_name,
            'class': label,
            'age': None,
            'gender': gender,
            'race': None,
            'perspective': 'front', 
            'age_group': '18-45',
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0,
            'img_path': os.path.join(input_path, img_name)})
    
    # Estimate one age per user from all of their images, clipped to the
    # 18-45 age group. 'img_path' is removed because it is not a CSV column.
    for user in tqdm(users.values()):
        age, _ = get_age_gender_imgs_user([row.pop('img_path') for row in user])
        for row in user:
            row['age'] = np.clip(age, 18, 45) if age is not None else None
            row['auto_age'] = 1 if row['age'] is not None else 0
            spamwriter.writerow(row)

## CK+

- Frame sampling applied.

In [None]:
# CK+: images are organised as <subject>/<sequence>/<frames>, with the emotion
# code of each sequence stored in a text file under Emotion_labels. Only the
# first, middle and last frame of every labelled sequence are used.
dataset_name = 'CK+'
input_path = 'C:/DATASETS/CK+'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    0: 'neutral', 
    1: 'anger',
    2: 'contempt',
    3: 'disgust',
    4: 'fear',
    5: 'happiness',
    6: 'sadness',
    7: 'surprise', 
}
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    for dir_name1 in os.listdir(os.path.join(input_path, 'extended-cohn-kanade-images')):
        for dir_name2 in os.listdir(os.path.join(input_path, 'extended-cohn-kanade-images', dir_name1)):

            # Label: sequences without a label file are skipped
            label_path = os.path.join(input_path, 'Emotion_labels', dir_name1, dir_name2)
            if not os.path.exists(label_path):
                print('No label:', label_path)
                continue

            # Sorted so the chosen label file does not depend on listing order
            label_path_dirs = sorted(os.listdir(label_path))
            if len(label_path_dirs) < 1:
                print('No label:', label_path)
                continue

            # The file holds the class index written as a float, hence int(float(...))
            with open(os.path.join(label_path, label_path_dirs[0]), 'r', newline='') as f:
                label = int(float(f.read()))
            
            # Use only first, middle and last images. os.listdir returns names
            # in arbitrary order, so sort them to get the real temporal order;
            # hidden files (names starting with '.') are not frames.
            sequence_path = os.path.join(input_path, 'extended-cohn-kanade-images', dir_name1, dir_name2)
            imgs_list = sorted(name for name in os.listdir(sequence_path) if not name.startswith('.'))
            if not imgs_list:
                print('No images:', sequence_path)
                continue

            # The first frame is labelled neutral (class 0); the middle and
            # last frames get the label of the sequence.
            selected_imgs = [imgs_list[0], imgs_list[len(imgs_list)//2], imgs_list[-1]]
            for img_name, frame_label in zip(selected_imgs, [0, label, label]):
                
                # User ID: one sequential id per subject folder
                row_id = dir_name1
                if row_id in users:
                    user_id = users[row_id][0]['user_id']
                else:
                    user_id = dataset_name + '-' + str(id_counter)
                    users[row_id] = []
                    id_counter += 1

                users[row_id].append({
                    'dataset': dataset_name, 
                    'user_id': user_id,
                    'name': img_name,
                    'class': labels_dict[frame_label],
                    'age': None,
                    'gender': None,
                    'race': None,
                    'perspective': 'front', 
                    'age_group': None,
                    'subset': None,
                    'auto_age': 0,
                    'auto_gender': 0,
                    'auto_perspective': 0,
                    'img_path': os.path.join(sequence_path, img_name)})
            
    # Estimate age and gender once per user, and perspective for every image
    for user in tqdm(users.values()):
        age, gender = get_age_gender_imgs_user([row['img_path'] for row in user])
        for row in user:

            # 'img_path' is not a CSV column, so it is removed before writing
            img_path = row.pop('img_path')
            perspective = pose_to_text(get_pose(cv2.imread(img_path), use_detector=True))

            # Each flag is computed before the field is overwritten
            row['auto_age'] = 1 if row['age'] is None and age is not None else 0
            row['age'] = age
            row['auto_gender'] = 1 if row['gender'] is None and gender is not None else 0
            row['gender'] = gender
            # NOTE(review): 'perspective' starts as 'front', so auto_perspective
            # stays 0 even though the value is replaced by the estimated pose -
            # confirm whether that is intended.
            row['auto_perspective'] = 1 if row['perspective'] is None and perspective is not None else 0
            row['perspective'] = perspective
            spamwriter.writerow(row)

## MMI

- Bad annotations for race: only two labels when there should be more, and there are errors. For example subjects 34, 39, 5, 53 and 54 are annotated with 1. Subject 50 should not be labelled with 0 either.
- Frame sampling applied.

In [None]:
dataset_name = 'MMI'
input_path = 'C:/DATASETS/MMI'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    1: 'anger',
    2: 'disgust',
    3: 'fear',
    4: 'happiness', 
    5: 'sadness', 
    6: 'surprise',
    7: 'scream',
    8: 'boredom',
    9: 'sleepy',
}
gender_dict = {
    0: 'female',
    1: 'male'
}
race_dict = {
    0: 'white',
    1: 'hispanic',
    2: 'asian'
}
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    # Get subjects info
    subjects_dict = {}
    for subject_xml in os.listdir(os.path.join(input_path, 'Subjects')):
        subject_xml_path = os.path.join(input_path, 'Subjects', subject_xml)
        subject_xml_file = ET.parse(subject_xml_path).getroot()

        id = int(subject_xml.split('subject')[-1].split('.')[0])
        subjects_dict[id] = {
            'id': id,
            'age_of_birth': int(subject_xml_file.attrib['dob'].split('-')[0]),
            'gender': gender_dict[int(subject_xml_file.attrib['gender'])],
            'race': race_dict[int(subject_xml_file.attrib['ethnicity'])]
        }

    # Get session info
    for session_num in os.listdir(os.path.join(input_path, 'Sessions')):

        # Get xml file and video
        aux = glob.glob(os.path.join(input_path, 'Sessions', session_num, '*.avi'))
        if len(aux) == 0:
            aux = glob.glob(os.path.join(input_path, 'Sessions', session_num, '*.jpg'))
        if len(aux) == 0:
            continue
        video_path = aux[0]
        xml_file = ET.parse(video_path[:-3]+'xml').getroot()
        xml_session_file = ET.parse(os.path.join(input_path, 'Sessions', session_num, 'session.xml')).getroot()

        # Label
        aux = xml_file.find(".//*[@Name='Emotion']")
        if aux is None:
            continue
        
        label = int(aux.attrib['Value'])

        # Case of no emotion label
        if label not in labels_dict:
            continue

        label = labels_dict[label]
        
        # User ID
        row_id = int(os.path.basename(video_path)[1:].split('-')[0])
        if row_id in users:
            user_id = users[row_id][0]['user_id']
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1

        info_subject = row_id in subjects_dict
        
        # Gender
        gender = subjects_dict[row_id]['gender'] if info_subject else None

        # Age
        age = int(xml_session_file.attrib['recDate'].split('-')[0]) - subjects_dict[row_id]['age_of_birth'] if info_subject else None

        # Perspective
        perspective = int(xml_session_file.find('track').attrib['view'])

        # Race
        race = subjects_dict[row_id]['race'] if info_subject else None
        race = None # Labels are not reliable, so we remove them

        if video_path.endswith('.jpg'):
            print(video_path)

        users[row_id].append({
            'dataset': dataset_name, 
            'user_id': user_id,
            'video_name': os.path.basename(video_path),
            'class': label,
            'age': age,
            'gender': gender,
            'race': race,
            'perspective_type': perspective, 
            'age_group': None,
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0,
            'video_path': video_path})
    
    # Add computed age and gender for the images of a user
    for user in tqdm(users.values()):
        if user[0]['age'] is None or user[0]['gender'] is None:
            age, gender = get_age_gender_videos_user(list(set([row['video_path'] for row in user])))
        for row in user:
            
            if row['age'] is None:
                row['age'] = age
                row['auto_age'] = 1 if row['age'] is not None else 0
            
            if row['gender'] is None:
                row['gender'] = gender
                row['auto_gender'] = 1 if row['gender'] is not None else 0
            
            video_path = row.pop('video_path')
            video_name = row.pop('video_name')
            perspective_type = row.pop('perspective_type')
            subject = int(video_name[1:].split('-')[0])
            label = row['class']
            
            # Sample video
            try:
                frames = get_frames(video_path, skip=5, mode='part', first_frame=0, max_frame=-1)
            except:
                print('Error:', video_path)
                continue

            for frame_i, frame in enumerate(frames):

                if frame_i not in [0, 2, 3]:
                    continue

                if frame_i == 0:
                    row['class'] = 'neutral'
                else:
                    row['class'] = label

                # Save as it is ignoring perspective field
                if subject >= 53:

                    frame_name = video_name.split('.')[0] + '_f' + str(frame_i+1) + '.png'
                    # cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), frame)
                    row['name'] = frame_name
                    row['perspective'] = 'front'
                    spamwriter.writerow(row)

                # Divide frames in 2
                elif perspective_type == 2:

                    if subject in [3, 5, 6, 15, 16]:

                        # full_right
                        frame_name = video_name.split('.')[0] + '_right_f' + str(frame_i+1) + '.png'
                        # cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), frame[:, frame.shape[1]//2:,...])
                        row['name'] = frame_name
                        row['perspective'] = 'full_right'
                        spamwriter.writerow(row)

                        # front
                        frame_name = video_name.split('.')[0] + '_front_f' + str(frame_i+1) + '.png'
                        # cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), frame[:, :frame.shape[1]//2,...])
                        row['name'] = frame_name
                        row['perspective'] = 'front'
                        spamwriter.writerow(row)
                    
                    else:

                        # full_left
                        frame_name = video_name.split('.')[0] + '_left_f' + str(frame_i+1) + '.png'
                        # cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), frame[:, :frame.shape[1]//2,...])
                        row['name'] = frame_name
                        row['perspective'] = 'full_left'
                        spamwriter.writerow(row)

                        # front
                        frame_name = video_name.split('.')[0] + '_front_f' + str(frame_i+1) + '.png'
                        # cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), frame[:, frame.shape[1]//2:,...])
                        row['name'] = frame_name
                        row['perspective'] = 'front'
                        spamwriter.writerow(row)
                
                # Rotate clockwise
                elif perspective_type == 0:

                    frame_name = video_name.split('.')[0] + '_f' + str(frame_i+1) + '.png'
                    # cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE))
                    row['name'] = frame_name
                    row['perspective'] = 'front'
                    spamwriter.writerow(row)
                
                # Rotate counterclockwise
                elif perspective_type == 1:

                    frame_name = video_name.split('.')[0] + '_f' + str(frame_i+1) + '.png'
                    # cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), cv2.rotate(frame, cv2.ROTATE_90_COUNTERCLOCKWISE))
                    row['name'] = frame_name
                    row['perspective'] = 'front' if subject != 21 else 'full_right'
                    spamwriter.writerow(row)
        

## BioVidEmo

- Frame sampling applied.

In [None]:
# BioVidEmo: sample frames from every video, save them as PNG files and write
# one CSV row per saved frame. Label, subject, gender and age are parsed from
# the video file name; the perspective comes from get_pose / pose_to_text.
dataset_name = 'BioVidEmo'
input_path = 'C:/DATASETS/BioVidEmo'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'anger': 'anger',
    'disgust': 'disgust',
    'fear': 'fear',
    'amusement': 'happiness', 
    'sad': 'sadness',
}
gender_dict = {
    'w': 'female',
    'm': 'male'
}
id_counter = 1
users = {}

# cv2.imwrite does not create missing folders, so make sure the target exists
frames_dir = os.path.join(input_path, 'Frames')
os.makedirs(frames_dir, exist_ok=True)

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    # Get session info: one record per video, grouped by subject
    for video_name in os.listdir(os.path.join(input_path, 'Videos')):
        name_parts = video_name.split('-')

        # Label: last '-' separated token, without the file extension
        label = labels_dict[name_parts[-1].split('.')[0]]
        
        # User ID: the first two '-' separated tokens identify the subject
        row_id = name_parts[0] + name_parts[1]
        if row_id in users:
            user_id = users[row_id][0]['user_id']
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1
        
        # Gender: second '_' separated token of the file name
        gender = gender_dict[video_name.split('_')[1]]

        # Age: third '_' separated token of the second '-' separated token
        age = int(name_parts[1].split('_')[2])

        users[row_id].append({
            'dataset': dataset_name, 
            'user_id': user_id,
            'video_name': video_name,
            'class': label,
            'age': age,
            'gender': gender,
            'race': None,
            'perspective': None, 
            'age_group': None,
            'subset': None,
            'auto_age': 0,
            'auto_gender': 0,
            'auto_perspective': 0,
            'video_path': os.path.join(input_path, 'Videos', video_name)})
    
    for user in tqdm(users.values()):
        for row in user:
            
            # These helper fields are not CSV columns, so drop them from the row
            video_path = row.pop('video_path')
            video_name = row.pop('video_name')
            
            # Sample video. Catch Exception (not a bare except) so that
            # Ctrl+C / SystemExit still stop the run.
            try:
                frames = get_frames(video_path, skip=5, mode='auto', first_frame=0, max_frame=-1)
            except Exception:
                print('Error:', video_path)
                continue

            frame_prefix = video_name.split('.')[0]
            for frame_i, frame in enumerate(frames):
                    
                # Frame files are numbered from 1
                frame_name = frame_prefix + '_f' + str(frame_i+1) + '.png'
                cv2.imwrite(os.path.join(frames_dir, frame_name), frame)
                row['name'] = frame_name
                
                # The same row dict is reused for every frame of the video,
                # so the per-frame fields are overwritten before each write
                perspective = get_pose(frame, use_detector=True)
                perspective = pose_to_text(perspective) if perspective is not None else None
                row['auto_perspective'] = 1 if perspective is not None else 0
                row['perspective'] = perspective
                spamwriter.writerow(row)

## ElderReact

- all_labels.txt obtained by merging train_labels.txt, dev_labels.txt and test_labels.txt. Header also added.
- dev, train and test folders were merged into one
- Multiple labels for the same video.
- Users with wrong labels: the same identifier is used for multiple users (approx. 20%)!

In [None]:
# ElderReact: read the merged annotation file, create one record per
# (video, active emotion label) pair, then sample frames from every video and
# write one CSV row per saved frame. Age is taken from
# get_age_gender_videos_user; gender comes from the annotations.
dataset_name = 'ElderReact'
input_path = 'C:/DATASETS/ElderReact'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'Happiness': 'happiness',
    'Sadness': 'sadness',
    'Fear': 'fear',
    'Anger': 'anger',
    'Surprise': 'surprise', 
    'Disgust': 'disgust'
}
id_counter = 1
users = {}

# cv2.imwrite does not create missing folders, so make sure the target exists
frames_dir = os.path.join(input_path, 'Frames')
os.makedirs(frames_dir, exist_ok=True)

with open(output_path, 'w', newline='') as csvfile, open(os.path.join(input_path, 'Annotations', 'all_labels.txt'), 'r') as csvfile_input:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    reader = csv.DictReader(csvfile_input, delimiter=' ')
    for row in tqdm(reader):
        
        # File name. Annotated videos that are missing on disk are reported
        # and skipped: queuing them would abort the whole run later, when the
        # video is opened for frame sampling.
        video_name = row['filename']
        video_path = os.path.join(input_path, 'Videos', video_name)
        if not os.path.exists(video_path):
            print('Not found:', video_name)
            continue
                  
        # Gender
        gender = row['Gender']
        if gender == 'F' or gender == 'f':
            gender = 'female'
        elif gender == 'M' or gender == 'm':
            gender = 'male'
        else:
            gender = None
            print('Gender error:', row)
        
        # User ID: file name up to the first occurrence of '_<last token>'
        row_id = video_name.split('_'+video_name.split('_')[-1])[0]
        if row_id in users:
            user_id = users[row_id][0]['user_id']
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1

        # Multiple labels: one record per label column flagged with '1'
        some_label = False
        for label, label_name in labels_dict.items():
            if row[label] == '1':
                some_label = True
                users[row_id].append({
                    'dataset': dataset_name, 
                    'user_id': user_id,
                    'video_name': video_name,
                    'class': label_name,
                    'age': None,
                    'gender': gender,
                    'race': None,
                    'perspective': None, 
                    'age_group': 'elderly',
                    'subset': row['subset'],
                    'auto_age': 0,
                    'auto_gender': 0,
                    'auto_perspective': 0,
                    'video_path': video_path})
        
        # In case there are not any labels for the video: an empty list means
        # the user was created for this very row, so undo the registration
        # and give the identifier back
        if not some_label and len(users[row_id]) == 0:
            print('No label:', video_name)
            id_counter -= 1
            users.pop(row_id)
    
    # Add computed age for the images of a user (gender comes from the annotations)
    for user in tqdm(users.values()):
        age, _ = get_age_gender_videos_user(list(set([row['video_path'] for row in user])))
        for row in user:
            row['age'] = age
            row['auto_age'] = 1 if row['age'] is not None else 0
            
            # These helper fields are not CSV columns, so drop them from the row
            video_path = row.pop('video_path')
            video_name = row.pop('video_name')
            
            # Sample video
            frames = get_frames(video_path, skip=5, mode='auto', first_frame=0, max_frame=-1)
            frame_prefix = video_name.split('.')[0]
            for frame_i, frame in enumerate(frames):

                # Frame files are numbered from 1
                frame_name = frame_prefix + '_f' + str(frame_i+1) + '.png'

                # Save frame
                cv2.imwrite(os.path.join(frames_dir, frame_name), frame)
                row['name'] = frame_name
                
                perspective = get_pose(frame, use_detector=True)
                perspective = pose_to_text(perspective) if perspective is not None else None
                row['auto_perspective'] = 1 if perspective is not None else 0
                row['perspective'] = perspective
                spamwriter.writerow(row)

## LIRIS

- summaryParticipansts.xlsx converted to summaryParticipansts.csv
- There are 17 videos with two labels
- Naming errors: S8_suprise.mp4, S7_Happy_surprise.mp4. 

In [None]:
# LIRIS: read the participants summary, create one record per
# (video, label found in the file name) pair, then sample frames from every
# video and write one CSV row per saved frame. The first sampled frame of each
# video is written with class 'neutral'.
dataset_name = 'LIRIS'
input_path = 'C:/DATASETS/LIRIS'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'Happy': 'happiness',
    'happy': 'happiness',
    'sad': 'sadness',
    'fear': 'fear',
    'anger': 'anger',
    'surprise': 'surprise',
    'suprise': 'surprise',
    'disgust': 'disgust'
}
id_counter = 1
users_aux = {}
users = {}

# cv2.imwrite does not create missing folders, so make sure the target exists
frames_dir = os.path.join(input_path, 'Frames')
os.makedirs(frames_dir, exist_ok=True)

with open(output_path, 'w', newline='') as csvfile, open(os.path.join(input_path, 'summaryParticipansts.csv'), 'r') as csvfile_input:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    # Get info of participants
    reader = csv.DictReader(csvfile_input, delimiter=';')
    for row in reader:
        user_id = dataset_name + '-' + str(id_counter)
        users_aux[row['Sr. No']] = {'id': user_id, 'sex': row['Sex'], 'age': row['Age']}
        id_counter += 1

    # Go through all videos
    for video_name in tqdm(os.listdir(os.path.join(input_path, 'videos_208'))):
        # The first '_' separated token of the file name is the participant key
        participant = users_aux[video_name.split('_')[0]]
                  
        # Gender
        gender = participant['sex']
        if gender == 'F' or gender == 'f':
            gender = 'female'
        elif gender == 'M' or gender == 'm':
            gender = 'male'
        else:
            gender = None
            # Report the offending video; `row` here would be the stale last
            # line of the participants file
            print('Gender error:', video_name)
            
        # User ID
        # NOTE: id_counter keeps counting after the participants loop above,
        # so the identifiers written to the CSV start after the last
        # participant number.
        row_id = participant['id']
        if row_id in users:
            user_id = users[row_id][0]['user_id']
        else:
            user_id = dataset_name + '-' + str(id_counter)
            users[row_id] = []
            id_counter += 1

        # Multiple labels: one record per label text contained in the file name
        some_label = False
        for label, label_name in labels_dict.items(): 
            if label in video_name:
                some_label = True
                users[row_id].append({
                    'dataset': dataset_name, 
                    'user_id': user_id,
                    'video_name': video_name,
                    'class': label_name,
                    'age': participant['age'],
                    'gender': gender,
                    'race': None,
                    'perspective': None, 
                    'age_group': None,
                    'subset': None,
                    'auto_age': 0,
                    'auto_gender': 0,
                    'auto_perspective': 0,
                    'video_path': os.path.join(input_path, 'videos_208', video_name)})
        
        # In case there are not any labels for the video: an empty list means
        # the user was created for this very video, so undo the registration
        if not some_label and len(users[row_id]) == 0:
            print('No label:', video_name)
            id_counter -= 1
            users.pop(row_id)
    
    # Sample the videos of every user and write one row per frame
    for user in tqdm(users.values()):
        for row in user:
            
            # These helper fields are not CSV columns, so drop them from the row
            video_path = row.pop('video_path')
            video_name = row.pop('video_name')
            
            # Sample video
            frames = get_frames(video_path, skip=5, mode='auto', first_frame=0)
            label_aux = row['class']
            frame_prefix = video_name.split('.')[0]
            for frame_i, frame in enumerate(frames):
                
                # The first sampled frame is labelled as neutral
                if frame_i == 0:
                    row['class'] = 'neutral'
                else:
                    row['class'] = label_aux

                # Frame files are numbered from 1
                frame_name = frame_prefix + '_f' + str(frame_i+1) + '.png'

                # Save frame
                cv2.imwrite(os.path.join(frames_dir, frame_name), frame)
                row['name'] = frame_name
                
                perspective = get_pose(frame, use_detector=True)
                perspective = pose_to_text(perspective) if perspective is not None else None
                row['auto_perspective'] = 1 if perspective is not None else 0
                row['perspective'] = perspective
                spamwriter.writerow(row)

## EMOREACT

- labels and names from all subsets merged into all_labels.text and all_names.txt
- videos merged into one folder
- there are videos with more than one label

In [None]:
# EMOREACT: every annotated video is handled as its own user. One record is
# created per (video, active label) pair; frames are then sampled from each
# video and one CSV row is written per saved frame. Age and gender come from
# get_age_gender_videos_user, with the age clipped to the 4-14 range.
dataset_name = 'EMOREACT'
input_path = 'C:/DATASETS/EMOREACT'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'Happiness': 'happiness',
    'Fear': 'fear',
    'Surprise': 'surprise', 
    'Disgust': 'disgust',
    'Curiosity': 'curiosity',
    'Uncertainty': 'uncertainty',
    'Excitement': 'excitement',
    'Frustration': 'frustration',
}
id_counter = 1
users = {}

with open(output_path, 'w', newline='') as csvfile, open(os.path.join(input_path, 'Labels', 'all_labels.text'), 'r') as csv_input:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    reader = csv.DictReader(csv_input, delimiter=',', quotechar='"')
    for row in tqdm(reader):

        # File name
        video_name = row['Name']

        # Multiple labels: one record per label column flagged with '1'.
        # The user_id column is intentionally left empty for this dataset.
        records = [
            {
                'dataset': dataset_name,
                'user_id': None,
                'video_name': video_name,
                'class': target_label,
                'age': None,
                'gender': None,
                'race': None,
                'perspective': None,
                'age_group': '4-14',
                'subset': row['Subset'],
                'auto_age': 0,
                'auto_gender': 0,
                'auto_perspective': 0,
                'video_path': os.path.join(input_path, 'Data', video_name),
            }
            for source_label, target_label in labels_dict.items()
            if row[source_label] == '1'
        ]

        # Videos without any label are reported and do not consume a key
        if not records:
            print('No label:', video_name)
            continue

        # Each labelled video is stored under its own running counter value
        users[id_counter] = records
        id_counter += 1

    # Add computed age and gender for the images of a user
    for user in tqdm(users.values()):
        unique_paths = list({row['video_path'] for row in user})
        age, gender = get_age_gender_videos_user(unique_paths)
        for row in user:
            if age is not None:
                row['age'] = np.clip(age, 4, 14)
            else:
                row['age'] = None
            row['auto_age'] = 0 if row['age'] is None else 1
            row['gender'] = gender
            row['auto_gender'] = 0 if row['gender'] is None else 1

            # These helper fields are not CSV columns, so drop them from the row
            video_path = row.pop('video_path')
            video_name = row.pop('video_name')
            frame_prefix = video_name.split('.')[0]

            # Sample video
            frames = get_frames(video_path, skip=5, mode='auto', first_frame=0, max_frame=-1)
            for frame_i, frame in enumerate(frames):

                # Save frame (files are numbered from 1)
                frame_name = frame_prefix + '_f' + str(frame_i + 1) + '.png'
                cv2.imwrite(os.path.join(input_path, 'Frames', frame_name), frame)
                row['name'] = frame_name

                perspective = get_pose(frame, use_detector=True)
                if perspective is not None:
                    perspective = pose_to_text(perspective)
                row['auto_perspective'] = 0 if perspective is None else 1
                row['perspective'] = perspective
                spamwriter.writerow(row)

## SFEW/AFEW

- Test subset not used because it is not labeled!
- Actors' age used.
- Errors in Train_6.xml: video 00:37:01,120 has age and name inverted

### AFEW

In [None]:
# AFEW: read actor metadata from the Train/Val XML files, create one record
# per video found in the label folders, then sample frames from every video
# and write one CSV row per saved frame. Missing age/gender values are filled
# with the output of get_age_gender_videos_user.
dataset_name = 'AFEW'
input_path = 'C:/DATASETS/AFEW-SFEW/AFEW'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'Happy': 'happiness',
    'Sad': 'sadness',
    'Fear': 'fear',
    'Angry': 'anger',
    'Neutral': 'neutral', 
    'Surprise': 'surprise', 
    'Disgust': 'disgust'
}
id_counter = 1
users = {}
users_aux = {}

# cv2.imwrite does not create missing folders, so make sure the target exists
frames_dir = os.path.join(input_path, 'Frames')
os.makedirs(frames_dir, exist_ok=True)

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    # For Train and Val subsets
    for subset in ['Train', 'Val']:

        xml_file = ET.parse(os.path.join(input_path, 'Labels', subset+'_6.xml')).getroot()

        # Create xml dict
        dict_movie = {}
        for xml_movie in xml_file:

            # The video file name is built from fixed character positions of
            # the first child's text, dropping the characters at positions
            # 2, 5 and 8 (presumably the separators of a '00:37:01,120'-style
            # timestamp)
            n_temp = xml_movie[0].text
            video_name = n_temp[:2] + n_temp[3:5] + n_temp[6:8] + n_temp[9:12] + '.avi'
            
            # User ID: one identifier per actor name, shared across subsets
            actor = xml_movie[2].attrib
            temp_id = actor['NameOfActor']
            if temp_id in users_aux:
                user_id = users_aux[temp_id]
            else:
                user_id = dataset_name + '-' + str(id_counter)
                users_aux[temp_id] = user_id
                id_counter += 1

            # Gender
            gender = actor['Gender'].lower()
            if gender != 'female' and gender != 'male':
                print('Gender error:', video_name)
            
            # Perspective
            perspective = actor['Pose']
            if perspective == "Frontal":
                perspective = 'front'

            dict_movie[video_name] = {
                'user_id': user_id,
                'gender': gender,
                'age': actor['AgeOfActor'],
                'perspective': perspective
            }
        
        # Go through all labels
        for label in os.listdir(os.path.join(input_path, 'Videos', subset+'_AFEW')):

            # Go through all videos
            for video_name in os.listdir(os.path.join(input_path, 'Videos', subset+'_AFEW', label)):
                
                movie = dict_movie[video_name]
                if movie['user_id'] not in users:
                    users[movie['user_id']] = []

                users[movie['user_id']].append({
                    'dataset': dataset_name, 
                    'user_id': movie['user_id'],
                    'video_name': video_name,
                    'class': labels_dict[label],
                    # Empty XML attributes count as missing values
                    'age': movie['age'] or None,
                    'gender': movie['gender'] or None,
                    'race': None,
                    'perspective': None, #dict_movie[video_name]['perspective'],
                    'age_group': None,
                    'subset': subset.lower(),
                    'auto_age': 0,
                    'auto_gender': 0,
                    'auto_perspective': 0,
                    'video_path': os.path.join(input_path, 'Videos', subset+'_AFEW', label, video_name)})
                     
    # Add computed age and gender for the images of a user
    for user in tqdm(users.values()):
        # Estimate whenever ANY row of the user lacks a value, and reset the
        # estimates otherwise, so a row never inherits values computed for a
        # previous user (or hits an undefined name)
        if any(row['age'] is None or row['gender'] is None for row in user):
            age, gender = get_age_gender_videos_user(list(set([row['video_path'] for row in user])), reader='opencv')
        else:
            age, gender = None, None
        for row in user:
            
            # Age
            if row['age'] is None:
                row['age'] = age
                row['auto_age'] = 1 if row['age'] is not None else 0
            
            # Gender
            if row['gender'] is None:
                row['gender'] = gender
                row['auto_gender'] = 1 if row['gender'] is not None else 0
            
            # These helper fields are not CSV columns, so drop them from the row
            video_path = row.pop('video_path')
            video_name = row.pop('video_name')
            
            # Sample video
            frames = get_frames_opencv(video_path, skip=5, mode='auto', first_frame=0)
            frame_prefix = video_name.split('.')[0]
            for frame_i, frame in enumerate(frames):

                # Frame files are numbered from 1
                frame_name = frame_prefix + '_f' + str(frame_i+1) + '.png'

                # Save frame
                cv2.imwrite(os.path.join(frames_dir, frame_name), frame)
                row['name'] = frame_name

                perspective = get_pose(frame, use_detector=True)
                perspective = pose_to_text(perspective) if perspective is not None else None
                row['auto_perspective'] = 1 if perspective is not None else 0
                row['perspective'] = perspective
                spamwriter.writerow(row)

### SFEW

In [None]:
# SFEW: still images whose metadata is looked up in the AFEW XML files.
# One record is created per image; images whose video is not listed in the
# XML are kept as single-image users without metadata. Missing age, gender and
# perspective values are filled with the output of get_age_gender_imgs_user
# and get_pose / pose_to_text.
dataset_name = 'SFEW'
input_path = 'C:/DATASETS/AFEW-SFEW/SFEW'
xml_input_path =  'C:/DATASETS/AFEW-SFEW/AFEW'
output_path = os.path.join(output_labels_path, 'labels_' + dataset_name + '.csv')
labels_dict = {
    'Happy': 'happiness',
    'Sad': 'sadness',
    'Fear': 'fear',
    'Angry': 'anger',
    'Neutral': 'neutral', 
    'Surprise': 'surprise', 
    'Disgust': 'disgust'
}
id_counter = 1
users_aux = {}
users = {}

with open(output_path, 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()

    # For Train and Val subsets
    for subset in ['Train', 'Val']:

        xml_file = ET.parse(os.path.join(xml_input_path, 'Labels', subset+'_6.xml')).getroot()

        # Create xml dict
        dict_movie = {}
        for xml_movie in xml_file:

            # Key: movie title plus the first child's text with the characters
            # at positions 2, 5 and 8 dropped (presumably timestamp separators)
            n_temp = xml_movie[0].text
            video_name = xml_movie.attrib['MovieTitle'] + '_' + n_temp[:2] + n_temp[3:5] + n_temp[6:8] + n_temp[9:12]
            
            # User ID: one identifier per actor name, shared across subsets
            actor = xml_movie[2].attrib
            temp_id = actor['NameOfActor']
            if temp_id in users_aux:
                user_id = users_aux[temp_id]
            else:
                user_id = dataset_name + '-' + str(id_counter)
                users_aux[temp_id] = user_id
                id_counter += 1

            # Gender
            gender = actor['Gender'].lower()
            if gender != 'female' and gender != 'male':
                print('Gender error:', video_name)
            
            # Perspective
            perspective = actor['Pose']
            if perspective == "Frontal":
                perspective = 'front'

            dict_movie[video_name] = {
                'user_id': user_id,
                'gender': gender,
                'age': actor['AgeOfActor'],
                'perspective': perspective
            }
        
        # Go through all labels
        for label in os.listdir(os.path.join(input_path, subset)):

            # Go through all images
            for img_name in os.listdir(os.path.join(input_path, subset, label, label)):

                # The video key is the image name without its last '_' token
                # (the whole name when it contains no '_')
                video_name = img_name.rsplit('_', 1)[0]
                img_path = os.path.join(input_path, subset, label, label, img_name)

                if video_name in dict_movie:
                    
                    movie = dict_movie[video_name]
                    row_id = movie['user_id']
                    if row_id not in users:
                        users[row_id] = []
                    users[row_id].append({
                        'dataset': dataset_name, 
                        'user_id': movie['user_id'],
                        'name': img_name,
                        'class': labels_dict[label],
                        # Empty XML attributes count as missing values, as in
                        # the AFEW cell, so they get estimated below
                        'age': movie['age'] or None,
                        'gender': movie['gender'] or None,
                        'race': None,
                        'perspective': movie['perspective'],
                        'age_group': None,
                        'subset': subset.lower(),
                        'auto_age': 0,
                        'auto_gender': 0,
                        'auto_perspective': 0,
                        'img_path': img_path})
                else:
                    print('Video not found in xml:', video_name)
                    
                    # Unknown subject: stored under a fresh key of its own,
                    # with the user_id column left empty
                    user_id = dataset_name + '-' + str(id_counter)
                    id_counter += 1
                    users[user_id] = [{
                        'dataset': dataset_name, 
                        'user_id': None,
                        'name': img_name,
                        'class': labels_dict[label],
                        'age': None,
                        'gender': None,
                        'race': None,
                        'perspective': None,
                        'age_group': None,
                        'subset': subset.lower(),
                        'auto_age': 0,
                        'auto_gender': 0,
                        'auto_perspective': 0,
                        'img_path': img_path}]
        
    # Add computed age and gender for the images of a user
    for user in tqdm(users.values()):
        # Estimate whenever ANY row of the user lacks a value
        if any(row['age'] is None or row['gender'] is None for row in user):
            age, gender = get_age_gender_imgs_user([row['img_path'] for row in user])
        else:
            age = None
            gender = None
        for row in user:
            # Helper field, not a CSV column
            img_path = row.pop('img_path')

            # Only rows without an annotated value take the estimate, and only
            # those rows are flagged as automatic
            if row['age'] is None:
                row['age'] = age
                row['auto_age'] = 1 if age is not None else 0

            if row['gender'] is None:
                row['gender'] = gender
                row['auto_gender'] = 1 if gender is not None else 0

            # Perspective is decided per row: estimate it when the annotation
            # is missing or 'Non-Frontal', otherwise keep the annotated value.
            # (Writing a shared variable back unconditionally would overwrite
            # annotated rows with a value left over from another image.)
            if row['perspective'] is None or row['perspective'] == 'Non-Frontal':
                perspective = get_pose(cv2.imread(img_path), use_detector=True)
                perspective = pose_to_text(perspective) if perspective is not None else None
                row['auto_perspective'] = 1 if perspective is not None else 0
                row['perspective'] = perspective
            
            spamwriter.writerow(row)

# UNIFIED DATASET

## Merge dataset labels

In [None]:
# Merge the per-dataset label files ('labels_<dataset>.csv') found in
# output_labels_path into a single labels.csv under output_root.
skip = ['AFEW']
labels_prefix = 'labels_'

with open(os.path.join(output_root, 'labels.csv'), 'w', newline='') as csvfile:
    spamwriter = csv.DictWriter(csvfile, fieldnames=csv_columns, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    spamwriter.writeheader()
    
    # Sorted so the merged file does not depend on the directory listing order
    for csv_file_name in sorted(os.listdir(output_labels_path)):
        
        # Ignore anything that is not a 'labels_<dataset>.csv' file; stray
        # entries would otherwise break the name parsing or be merged as data
        stem, extension = os.path.splitext(csv_file_name)
        if extension.lower() != '.csv' or not stem.startswith(labels_prefix):
            continue

        # Skip datasets. The name is everything after the prefix, so dataset
        # names containing '_' are kept whole.
        csv_dname = stem[len(labels_prefix):]
        if csv_dname in skip:
            continue
        
        with open(os.path.join(output_labels_path, csv_file_name), 'r', newline='') as csvfile_input: 
            reader = csv.DictReader(csvfile_input, delimiter=',', quotechar='"')
            spamwriter.writerows(reader)

## Copy all images

In [None]:
# Copy the images of every dataset into the single output_imgs_path folder.
# exist_ok avoids the separate check-then-create step.
os.makedirs(output_imgs_path, exist_ok=True)


def _copy_images(src_dir, progress=False):
    # Copy every entry of src_dir into output_imgs_path, optionally with a
    # per-file progress bar.
    names = os.listdir(src_dir)
    for img_name in (tqdm(names) if progress else names):
        shutil.copy(os.path.join(src_dir, img_name), output_imgs_path)


# AffectNet: one folder per class, named after the class index
dataset_name = 'AffectNet'
input_path = 'C:/DATASETS/AffectNetUniques'
labels = ['neutral','happiness','sadness','surprise','fear','disgust','anger','contempt']
print(dataset_name)
# Iterating a range (instead of enumerate) lets tqdm show the total
for class_i in tqdm(range(len(labels))):
    _copy_images(os.path.join(input_path, str(class_i)))

# DDCF: label folder -> sub-folder -> images
dataset_name = 'DDCF'
input_path = 'C:/DATASETS/DDCF'
labels_in =  ['Afraid','Angry','Disgusted','Happy','Neutral','Pleased','Sad','Surprised']
print(dataset_name)
for label in tqdm(labels_in):
    for dir_name in os.listdir(os.path.join(input_path, label)):
        _copy_images(os.path.join(input_path, label, dir_name))

# DEFSS
dataset_name = 'DEFSS'
input_path = 'C:/DATASETS/DEFSS/Images'
print(dataset_name)
_copy_images(input_path, progress=True)

# FACES
dataset_name = 'FACES'
input_path = 'C:/DATASETS/FACES/FACES'
print(dataset_name)
_copy_images(input_path, progress=True)

# NIMH-ChEFS: one level of sub-folders
dataset_name = 'NIMH-ChEFS'
input_path = 'C:/DATASETS/NIMH-ChEFS'
print(dataset_name)
for dir_name in tqdm(os.listdir(input_path)):
    _copy_images(os.path.join(input_path, dir_name))

# RaFD
dataset_name = 'RaFD'
input_path = 'C:/DATASETS/RaFD'
print(dataset_name)
_copy_images(input_path, progress=True)

# FER2013: one folder per label
dataset_name = 'FER2013'
input_path = 'C:/DATASETS/FER2013'
labels_in =  ['Sad', 'Neutral', 'Angry', 'Disgust','Surprise','Fear', 'Happy']
print(dataset_name)
for label in tqdm(labels_in):
    _copy_images(os.path.join(input_path, label))

# ExpW
dataset_name = 'ExpW'
input_path = 'C:/DATASETS/ExpW/cropped'
print(dataset_name)
_copy_images(input_path, progress=True)

# RAF-DB: split folder -> label folder -> images
dataset_name = 'RAF-DB'
input_path = 'C:/DATASETS/RAF-DB/DATASET'
labels_in =  ['5', '7', '6', '3', '1', '2', '4']
print(dataset_name)
for split in ['train', 'test']:
    for label in tqdm(labels_in):
        _copy_images(os.path.join(input_path, split, label))

# NHFI
dataset_name = 'NHFI'
input_path = 'C:/DATASETS/NHFI'
labels_in = ['sadness', 'neutrality', 'anger', 'disgust','surprise','fear', 'happiness', 'contempt']
print(dataset_name)
for label in tqdm(labels_in):
    for img_name in os.listdir(os.path.join(input_path, label)):
        shutil.copy(os.path.join(input_path, label, img_name), os.path.join(output_imgs_path, label + '_' + img_name)) # Modified destiny name to avoid repeated names

# WSEFEP
dataset_name = 'WSEFEP'
input_path = 'C:/DATASETS/WSEFEP/images'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# KDEF
dataset_name = 'KDEF'
input_path = 'C:/DATASETS/KDEF/images'
print(dataset_name)
for dir_name in tqdm(os.listdir(os.path.join(input_path))):
    for img_name in os.listdir(os.path.join(input_path, dir_name)):
        shutil.copy(os.path.join(input_path, dir_name, img_name), output_imgs_path)

# JAFFE
dataset_name = 'JAFFE'
input_path = 'C:/DATASETS/JAFFE'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# FEGA
dataset_name = 'FEGA'
input_path = 'C:/DATASETS/FEGA'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# LIFESPAN
dataset_name = 'LIFESPAN'
input_path = 'C:/DATASETS/LIFESPAN/Expressions'
print(dataset_name)
for dir1 in os.listdir(os.path.join(input_path)):
    for img_name in os.listdir(os.path.join(input_path, dir1)):
        shutil.copy(os.path.join(input_path, dir1, img_name), output_imgs_path)

# Google-FE-Test
dataset_name = 'Google-FE-Test'
input_path = 'C:/DATASETS/Google-FE-Test'
labels = ['anger','disgust','fear','happiness','neutral','sadness','surprise']
print(dataset_name)
for class_i, label in enumerate(labels):
    for img_name in os.listdir(os.path.join(input_path, str(class_i))):
        shutil.copy(os.path.join(input_path, str(class_i), img_name), output_imgs_path)

# BU-4DFE
dataset_name = 'BU-4DFE'
input_path = 'C:/DATASETS/BU-4DFE'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# CK+
dataset_name = 'CK+'
input_path = 'C:/DATASETS/CK+'
print(dataset_name)
for dir_name1 in os.listdir(os.path.join(input_path, 'extended-cohn-kanade-images')):
    for dir_name2 in os.listdir(os.path.join(input_path, 'extended-cohn-kanade-images', dir_name1)):
        for img_name in os.listdir(os.path.join(input_path, 'extended-cohn-kanade-images', dir_name1, dir_name2)):
            shutil.copy(os.path.join(input_path, 'extended-cohn-kanade-images', dir_name1, dir_name2, img_name), output_imgs_path)

# MMI
dataset_name = 'MMI'
input_path = 'C:/DATASETS/MMI/Frames'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# BioVidEmo
dataset_name = 'BioVidEmo'
input_path = 'C:/DATASETS/BioVidEmo/Frames'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# ElderReact
dataset_name = 'ElderReact'
input_path = 'C:/DATASETS/ElderReact/Frames'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# LIRIS
dataset_name = 'LIRIS'
input_path = 'C:/DATASETS/LIRIS/Frames'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# EMOREACT
dataset_name = 'EMOREACT'
input_path = 'C:/DATASETS/EMOREACT/Frames'
print(dataset_name)
for img_name in tqdm(os.listdir(input_path)):
    shutil.copy(os.path.join(input_path, img_name), output_imgs_path)

# AFEW/SFEW
choice = 'SFEW'

if choice == 'AFEW':
    dataset_name = 'AFEW'
    input_path = 'C:/DATASETS/AFEW-SFEW/AFEW/Frames'
    print(dataset_name)
    for img_name in tqdm(os.listdir(input_path)):
        shutil.copy(os.path.join(input_path, img_name), output_imgs_path)
elif choice == 'SFEW':
    dataset_name = 'SFEW'
    input_path = 'C:/DATASETS/AFEW-SFEW/SFEW'
    print(dataset_name)
    for subset in ['Train', 'Val']:
        for label in os.listdir(os.path.join(input_path, subset)):
            for img_name in tqdm(os.listdir(os.path.join(input_path, subset, label, label))):
                shutil.copy(os.path.join(input_path, subset, label, label, img_name), output_imgs_path)

## Check that all images listed in a CSV file exist

In [None]:
target_csv_file = os.path.join(output_root, 'labels.csv')
target_imgs_folder = output_imgs_path

# Report every CSV row whose image file is missing from the images folder
with open(target_csv_file, 'r', newline='') as labels_file:
    for record in tqdm(csv.DictReader(labels_file, delimiter=',', quotechar='"')):
        img_path = os.path.join(target_imgs_folder, record['name'])
        if os.path.exists(img_path):
            continue
        print('Image:', record['name'], 'does no exist!')

## Check that all images in the directory are listed in the target CSV file

In [None]:
target_csv_file = os.path.join(output_root, 'labels.csv')
target_imgs_folder = output_imgs_path
action = 'list' # 'remove'
counter = 0

# Gather every image name referenced by the CSV
with open(target_csv_file, 'r', newline='') as labels_file:
    names = {record['name'] for record in tqdm(csv.DictReader(labels_file, delimiter=',', quotechar='"'))}

# Walk the images folder and report (or delete) files the CSV does not reference
for img_name in tqdm(os.listdir(target_imgs_folder)):
    if img_name in names:
        continue
    counter += 1
    if action != 'remove':
        print('Image:', img_name, 'does no exist in CSV file!')
        continue
    os.remove(os.path.join(target_imgs_folder, img_name))
    print('Image:', img_name, 'does no exist in CSV file. DELETED')

print('Total images removed:' if action == 'remove' else 'Total images not in CSV:', counter)

# Fix some fields

In [None]:
# Map a raw age_group label to its age_group_clean value
def map_age_group(age_group):
    """Return 'child', 'middle-age' or 'elder' for a known raw label, else None."""
    clean_groups = (
        (('kid', '10-17', '4-14'), 'child'),
        (('young', 'middle-age', '18-45', '20-30'), 'middle-age'),
        (('elderly',), 'elder'),
    )
    for raw_labels, clean_label in clean_groups:
        if age_group in raw_labels:
            return clean_label
    # Unknown labels and missing values (NaN) have no clean group
    return None

# Map a numeric age to its age_group_clean value
def map_age_to_clean(age):
    """Return the clean age group for `age`.

    Returns None when `age` is missing (None / NaN / pd.NA) or negative,
    'child' for ages below 18, 'middle-age' for ages from 18 up to (but not
    including) 60, and 'elder' from 60 on.

    The thresholds are contiguous, so fractional ages such as 17.5 or 59.5
    get a group instead of silently falling between two integer ranges.
    """
    if pd.isna(age) or age < 0:
        return None
    if age < 18:
        return 'child'
    if age < 60:
        return 'middle-age'
    return 'elder'

# Derive the gaze direction from a raw perspective value
def map_gaze(perspective):
    """Return the gaze direction encoded in `perspective`.

    Returns None when `perspective` is missing (None or NaN), 'left' /
    'right' when it contains 'gaze_left' / 'gaze_right', 'averted' when it
    contains 'averted_gaze', and 'front' for any other value.
    """
    # NaN is a float, so the substring tests below would raise TypeError on it
    if perspective is None or pd.isna(perspective):
        return None
    if 'gaze_left' in perspective:
        return 'left'
    if 'gaze_right' in perspective:
        return 'right'
    if 'averted_gaze' in perspective:
        return 'averted'
    return 'front'

# Normalise a raw perspective value, dropping any gaze information
def map_perspective(perspective):
    """Return the perspective with gaze suffixes removed.

    Returns None when `perspective` is missing (None or NaN), 'non-frontal'
    for 'Non-Frontal', 'front' for 'averted_gaze', the text before
    '_gaze_left' / '_gaze_right' when the value contains 'gaze_left' /
    'gaze_right', and the value unchanged otherwise.
    """
    # NaN is a float, so the substring tests below would raise TypeError on it
    if perspective is None or pd.isna(perspective):
        return None
    if perspective == 'Non-Frontal':
        return 'non-frontal'
    if perspective == 'averted_gaze':
        return 'front'
    if 'gaze_left' in perspective:
        return perspective.split('_gaze_left')[0]
    if 'gaze_right' in perspective:
        return perspective.split('_gaze_right')[0]
    return perspective

# Map a raw race value to its unified name
def map_race(race):
    """Return the unified race name; print the value and return None when it is not recognised."""
    unified_names = {
        'caucasian': 'caucasian',
        'black': 'black',
        'asian': 'asian',
        'indian': 'desi',
        'hispanic': 'hispanic',
        'arab': 'arab',
        'white': 'caucasian',
        'polish': 'caucasian',
        'swedish': 'caucasian',
        'moroccan': 'arab',
        'japanese': 'asian',
    }
    unified = unified_names.get(race)
    if unified is None:
        # Show the unmapped value so it can be spotted in the output
        print(race)
    return unified

In [None]:
# Clean up the label fields of a CSV file: add 'age_group_clean' and 'gaze',
# normalise 'perspective' and 'race', and store the 'auto_*' flags as integers.
# NOTE(review): csv_file_in and csv_file_out are not defined in this cell —
# presumably set elsewhere in the notebook; confirm before running.

# Column dtypes used when reading the CSV ('Int8' is pandas' nullable integer dtype)
dtypes = {
    'dataset': 'category',
    'user_id': 'category',
    'name': str,
    'class': 'category',
    'age': 'Int8',
    'gender':'category' ,
    'race': 'category',
    'perspective': 'category',
    'age_group': 'category',
    'subset': 'category',
    'auto_age': bool,
    'auto_gender': bool,
    'auto_perspective': bool}
df = pd.read_csv(csv_file_in, dtype=dtypes, sep=',', quotechar='"')

# Apply the mapping function to create the 'age_group_clean' column
df['age_group_clean'] = df['age_group'].apply(map_age_group)

# Where 'age_group_clean' is still null, fill it from the numeric 'age' instead
df['age_group_clean'] = df.apply(lambda row: map_age_to_clean(row['age']) if pd.isnull(row['age_group_clean']) else row['age_group_clean'], axis=1).astype('category')

# Apply the mapping function to create the 'gaze' column
# (done before 'perspective' is overwritten below, since map_gaze looks for the gaze markers that map_perspective strips)
df['gaze'] = df['perspective'].apply(map_gaze).astype('category')

# Apply the mapping function to update the 'perspective'
df['perspective'] = df['perspective'].apply(map_perspective).astype('category') 

# Apply the mapping function to update the race
df['race'] = df['race'].apply(map_race).astype('category')

# Change 'auto_age', 'auto_gender' and 'auto_perspective' from bool to int
df['auto_age'] = df['auto_age'].astype(int)
df['auto_gender'] = df['auto_gender'].astype(int)
df['auto_perspective'] = df['auto_perspective'].astype(int)

# Save dataframe to csv
df.to_csv(csv_file_out, index=False, sep=',', quotechar='"')