# K-fold cross-validation with 24 datasets

In [None]:
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedGroupKFold

# Root folder of the AGE-FER collection; the labels CSV and the images folder sit directly beneath it
dataset_root = 'C:/DATASETS/AGE-FER'
dataset_imgs_path = os.path.join(dataset_root, 'images')
dataset_labels_path = os.path.join(dataset_root, '24-datasets.csv')

# Column -> dtype mapping handed to pandas when the labels CSV is parsed
dtypes = dict([
    ('dataset', 'category'),
    ('user_id', 'category'),
    ('name', str),
    ('class', 'category'),
    ('age', 'Int8'),
    ('gender', 'category'),
    ('race', 'category'),
    ('perspective', 'category'),
    ('age_group', 'category'),
    ('subset', 'category'),
    ('auto_age', bool),
    ('auto_gender', bool),
    ('age_group_clean', 'category'),
    ('gaze', 'category'),
    ('auto_perspective', bool),
])

In [None]:
# Filename stem and destination folder for the per-fold label files
base_name = '24-datasets'
output_dir = os.path.join(dataset_root, 'cv-labels')

In [3]:
# Parse the labels CSV using the explicit per-column dtypes
df_global = pd.read_csv(dataset_labels_path, sep=',', quotechar='"', dtype=dtypes)

# Re-encode the three auto-annotation flags (auto_age, auto_gender, auto_perspective) from bool to int
df_global = df_global.astype(
    {flag_column: int for flag_column in ('auto_age', 'auto_gender', 'auto_perspective')}
)

# Keep the rendered preview short
pd.set_option('display.max_rows', 10)
df_global

Unnamed: 0,dataset,user_id,name,class,age,gender,race,perspective,age_group,subset,auto_age,auto_gender,auto_perspective,age_group_clean,gaze
0,AffectNet,,977_934736BB31B161BBDCCEF70484DDD76A62683731CA...,anger,45,male,,front,,,1,1,True,middle-age,front
1,AffectNet,,681_790C9D5516D0F3CF9EB560940312610E78DFEBF447...,anger,48,female,,front,,,1,1,True,middle-age,front
2,AffectNet,,814_EF32741B5DD39DE5A2EBC9D88FDA045B14B1C6BF2C...,anger,37,male,,front,,,1,1,True,middle-age,front
3,AffectNet,,891_F5FED292C640581211BC28466E9C105F0DEF3D1F8E...,anger,7,female,,front,,,1,1,True,child,front
4,AffectNet,,134_64A7E0A32E8D8BD8F15307FCF49DEE6D47A619C153...,anger,55,male,,front,,,1,1,True,middle-age,front
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
446359,WSEFEP,WSEFEP-24,PB_0001.jpg,neutral,21,male,polish,front,20-30,,1,1,True,middle-age,front
446360,WSEFEP,WSEFEP-7,KA_0003.jpg,neutral,29,male,polish,front,20-30,,1,1,True,middle-age,front
446361,WSEFEP,WSEFEP-28,RB_0006.jpg,neutral,25,male,polish,front,20-30,,1,1,True,middle-age,front
446362,WSEFEP,WSEFEP-16,MK_0001.jpg,neutral,26,male,polish,front,20-30,,1,1,True,middle-age,front


In [4]:
# Build k stratified, group-aware subsets of the dataframe
def make_k_subsets(df, k, output_dir, base_name, seed=777):
    """Split ``df`` into k folds and write one train/test CSV pair per fold.

    Rows with a missing ``age_group_clean`` are discarded before splitting.
    Folds are produced by ``StratifiedGroupKFold``:

    * groups: ``user_id`` when present, otherwise the sample ``name``, so all
      rows sharing a group key land on the same side of every split;
    * strata: the combination ``class`` + ``age_group_clean`` + ``dataset``.

    Parameters
    ----------
    df : pandas.DataFrame
        Label table containing at least the columns 'user_id', 'name',
        'class', 'age_group_clean' and 'dataset'. The caller's frame is not
        modified.
    k : int
        Number of folds.
    output_dir : str
        Directory receiving the CSV files; created if it does not exist.
    base_name : str
        Filename prefix. Fold ``i`` (numbered from 1) is written to
        ``<base_name>_train_<i>.csv`` and ``<base_name>_test_<i>.csv``.
    seed : int, default 777
        ``random_state`` passed to the splitter.

    Returns
    -------
    None
        The result is the set of CSV files written to ``output_dir``.

    Notes
    -----
    The helper columns 'key' and 'strat' added here are saved in the CSV
    files together with the original columns.
    """

    # Drop the 'NaN' age group. Take an explicit copy so that adding the helper
    # columns below writes to an independent frame and does not trigger
    # pandas' SettingWithCopyWarning.
    df = df.dropna(subset=['age_group_clean']).copy()

    # Create the output directory if it does not exist
    os.makedirs(output_dir, exist_ok=True)

    # Use sklearn's StratifiedGroupKFold to split the data
    skf = StratifiedGroupKFold(n_splits=k, shuffle=True, random_state=seed)

    # Group key, of str type: the user_id when there is one, otherwise the sample name
    missing_user = df['user_id'].isna()
    df['key'] = df['user_id'].astype(str)
    df.loc[missing_user, 'key'] = df.loc[missing_user, 'name']

    # Stratification key, of str type: class + age_group + dataset
    df['strat'] = df['class'].astype(str) + '_' + df['age_group_clean'].astype(str) + '_' + df['dataset'].astype(str)

    # Iterate over the splits
    for i, (train_index, test_index) in enumerate(skf.split(df, df['strat'], groups=df['key'].astype(str))):

        # The splitter yields positional indices, hence iloc
        df_train = df.iloc[train_index]
        df_test = df.iloc[test_index]

        # Save the dataframes to CSV files (folds are numbered from 1)
        train_file = os.path.join(output_dir, base_name + '_train_' + str(i+1) + '.csv')
        test_file = os.path.join(output_dir, base_name + '_test_' + str(i+1) + '.csv')
        df_train.to_csv(train_file, index=False)
        df_test.to_csv(test_file, index=False)

In [None]:
# Write the 5-fold train/test label files for the loaded label table
make_k_subsets(
    df_global,
    k=5,
    output_dir=output_dir,
    base_name=base_name,
    seed=777,
)

## Make sure all datasets are present in all splits

In [6]:
# Distinct dataset names found in the global label table
datasets = df_global.loc[:, 'dataset'].unique()

### By subset and dataset

In [None]:
# Count, for every fold and subset, how many samples each dataset contributes.
# The dataset list is the same on every iteration, so compute it once.
dataset_names = list(df_global['dataset'].unique())

rows = []
for k in range(1, 6):  # fold files are numbered from 1
    for subset in ['train', 'test']:
        file = os.path.join(output_dir, base_name + '_' + subset + '_' + str(k) + '.csv')
        df_split = pd.read_csv(file, sep=',', quotechar='"')

        # Group by 'dataset' and count the occurrences
        counts = df_split.groupby('dataset').size()

        # One row per (subset, fold); datasets absent from the split count as 0
        rows.append([subset, k] + [int(counts.get(dataset, 0)) for dataset in dataset_names])

# Build the summary table once instead of growing it row by row
df = pd.DataFrame(rows, columns=['subset', 'fold'] + dataset_names)

pd.set_option('display.max_columns', 30)

In [8]:
# Per-dataset sample counts of the training part of each fold
df.loc[df['subset'] == 'train']

Unnamed: 0,subset,fold,AffectNet,BioVidEmo,BU-4DFE,CK+,DDCF,DEFSS,ElderReact,EMOREACT,ExpW,FACES,FEGA,FER2013,Google-FE-Test,JAFFE,KDEF,LIFESPAN,LIRIS,MMI,NHFI,NIMH-ChEFS,RAF-DB,RaFD,SFEW,WSEFEP
0,train,1,205787,1739,1181,292,2663,335,8839,4309,73324,1632,1316,28736,167,171,2348,837,888,472,4286,418,12268,3465,1077,162
2,train,2,205739,1717,1162,296,2680,309,9337,4338,73387,1620,1385,28708,168,127,2390,835,922,514,4275,432,12267,3528,1115,169
4,train,3,205768,1763,1159,302,2672,324,8597,4353,73426,1656,1282,28638,176,173,2304,828,898,505,4279,421,12269,3402,1105,183
6,train,4,205635,1718,1145,266,2682,320,9503,4309,73517,1644,1262,28694,166,190,2388,838,803,525,4289,439,12260,3213,1099,175
8,train,5,205743,1615,1177,268,2699,328,9984,4307,73378,1656,1427,28764,163,191,2306,846,873,496,4271,422,12280,3276,1136,155


In [9]:
# Per-dataset sample counts of the test part of each fold
df.loc[df['subset'] == 'test']

Unnamed: 0,subset,fold,AffectNet,BioVidEmo,BU-4DFE,CK+,DDCF,DEFSS,ElderReact,EMOREACT,ExpW,FACES,FEGA,FER2013,Google-FE-Test,JAFFE,KDEF,LIFESPAN,LIRIS,MMI,NHFI,NIMH-ChEFS,RAF-DB,RaFD,SFEW,WSEFEP
1,test,1,51381,399,275,64,686,69,2726,1095,18434,420,352,7149,43,42,586,209,208,156,1064,115,3068,756,306,49
3,test,2,51429,421,294,60,669,95,2228,1066,18371,432,283,7177,42,86,544,211,174,114,1075,101,3069,693,268,42
5,test,3,51400,375,297,54,677,80,2968,1051,18332,396,386,7247,34,40,630,218,198,123,1071,112,3067,819,278,28
7,test,4,51533,420,311,90,667,84,2062,1095,18241,408,406,7191,44,23,546,208,293,103,1061,94,3076,1008,284,36
9,test,5,51425,523,279,88,650,76,1581,1097,18380,396,241,7121,47,22,628,200,223,132,1079,111,3056,945,247,56


### By subset, dataset and specific class

In [None]:
# Count, for every fold, subset and class, how many samples each dataset contributes.
# The dataset list is the same on every iteration, so compute it once.
dataset_names = list(df_global['dataset'].unique())

rows = []
for k in range(1, 6):  # fold files are numbered from 1
    for subset in ['train', 'test']:
        file = os.path.join(output_dir, base_name + '_' + subset + '_' + str(k) + '.csv')
        df_split = pd.read_csv(file, sep=',', quotechar='"')

        # Group by 'class' and 'dataset' and count the occurrences
        counts = df_split.groupby(['class', 'dataset']).size()

        # One row per class present in this split; missing (class, dataset) pairs count as 0
        for class_label in df_split['class'].unique():
            rows.append(
                [subset, k, class_label]
                + [int(counts.get((class_label, dataset), 0)) for dataset in dataset_names]
            )

# Build the summary table once instead of growing it row by row
df = pd.DataFrame(rows, columns=['subset', 'fold', 'class'] + dataset_names)

pd.set_option('display.max_columns', 30)

In [18]:
# Select which class and which side of the split to inspect
target_class = 'surprise'
split = 'train'
df.loc[(df['subset'] == split) & (df['class'] == target_class)]

Unnamed: 0,subset,fold,class,AffectNet,BioVidEmo,BU-4DFE,CK+,DDCF,DEFSS,ElderReact,EMOREACT,ExpW,FACES,FEGA,FER2013,Google-FE-Test,JAFFE,KDEF,LIFESPAN,LIRIS,MMI,NHFI,NIMH-ChEFS,RAF-DB,RaFD,SFEW,WSEFEP
5,train,1,surprise,10462,0,194,102,378,0,2139,1176,5654,0,139,3184,20,24,336,63,211,58,623,0,1270,495,107,23
19,train,2,surprise,10369,0,189,96,368,0,2164,1183,5672,0,136,3215,26,18,342,62,211,68,619,0,1318,504,131,24
33,train,3,surprise,10325,0,184,96,368,0,2184,1195,5598,0,141,3211,25,24,330,62,212,66,622,0,1280,486,127,26
47,train,4,surprise,10424,0,197,96,374,0,2306,1168,5637,0,132,3205,23,27,342,63,187,70,623,0,1291,459,116,25
61,train,5,surprise,10408,0,192,98,416,0,2427,1174,5663,0,156,3193,26,27,330,62,227,66,613,0,1309,468,131,22


### By subset, dataset and specific age group

In [None]:
# Count, for every fold, subset and age group, how many samples each dataset contributes.
# The dataset list is the same on every iteration, so compute it once.
dataset_names = list(df_global['dataset'].unique())

rows = []
for k in range(1, 6):  # fold files are numbered from 1
    for subset in ['train', 'test']:
        file = os.path.join(output_dir, base_name + '_' + subset + '_' + str(k) + '.csv')
        df_split = pd.read_csv(file, sep=',', quotechar='"')

        # Group by 'age_group_clean' and 'dataset' and count the occurrences
        counts = df_split.groupby(['age_group_clean', 'dataset']).size()

        # One row per age group present in this split; missing (age group, dataset) pairs count as 0
        for age_group in df_split['age_group_clean'].unique():
            rows.append(
                [subset, k, age_group]
                + [int(counts.get((age_group, dataset), 0)) for dataset in dataset_names]
            )

# Build the summary table once instead of growing it row by row
df = pd.DataFrame(rows, columns=['subset', 'fold', 'age_group_clean'] + dataset_names)

pd.set_option('display.max_columns', 30)

In [25]:
# Select which age group and which side of the split to inspect
target_group = 'child'
split = 'test'
df.loc[(df['subset'] == split) & (df['age_group_clean'] == target_group)]

Unnamed: 0,subset,fold,age_group_clean,AffectNet,BioVidEmo,BU-4DFE,CK+,DDCF,DEFSS,ElderReact,EMOREACT,ExpW,FACES,FEGA,FER2013,Google-FE-Test,JAFFE,KDEF,LIFESPAN,LIRIS,MMI,NHFI,NIMH-ChEFS,RAF-DB,RaFD,SFEW,WSEFEP
3,test,1,child,7401,399,0,9,686,39,0,1095,2387,0,0,1319,2,0,0,0,208,0,117,115,901,63,10,0
11,test,2,child,7428,421,0,3,669,70,0,1066,2316,0,0,1308,2,0,0,0,174,0,140,101,936,63,11,0
17,test,3,child,7236,375,0,1,677,51,0,1051,2430,0,0,1318,2,0,0,0,198,0,112,112,955,63,26,0
23,test,4,child,7339,420,0,9,667,57,0,1095,2457,0,0,1310,0,0,0,0,293,0,132,94,944,252,25,0
29,test,5,child,7529,523,0,28,650,53,0,1097,2314,0,0,1285,2,0,0,0,223,0,115,111,858,189,16,0
