In [108]:
import pandas as pd
import numpy as np

In [109]:
# Load a CSV file for FairFace dataset
fairface_df = pd.read_csv('FairFaceFiltered.csv')

# Display the first few rows of the DataFrame
fairface_df.head()

Unnamed: 0,image,age,gender,race,age_range
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,1,1,30-39
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,1,20-29
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,1,20-29
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,1,1,30-39
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2,0,1,10-19


In [110]:
# Load a CSV file for UTKFacae dataset
utk_df = pd.read_csv('UTKFaceFilteredDataset.csv')

# Display the first few rows of the DataFrame
utk_df.head()

Unnamed: 0,image,age,gender,race
0,<PIL.Image.Image image mode=RGB size=224x224 a...,18,1,3
1,<PIL.Image.Image image mode=RGB size=224x224 a...,34,0,3
2,<PIL.Image.Image image mode=RGB size=224x224 a...,50,0,3
3,<PIL.Image.Image image mode=RGB size=224x224 a...,23,0,3
4,<PIL.Image.Image image mode=RGB size=224x224 a...,32,0,3


In [111]:
# Define the age mapping
age_mapping = {
    0: "0-2",
    1: "3-9",
    2: "10-19",
    3: "20-29",
    4: "30-39",
    5: "40-49",
    6: "50-59",
    7: "60-69",
    8: "70+"
}

In [112]:
# function to map age_range 
def map_age_to_group(age):
    if 3 <= age <= 9:
        return age_mapping[1]
    elif 10 <= age <= 19:
        return age_mapping[2]
    elif 20 <= age <= 29:
        return age_mapping[3]
    elif 30 <= age <= 39:
        return age_mapping[4]
    elif 40 <= age <= 49:
        return age_mapping[5]
    elif 50 <= age <= 59:
        return age_mapping[6]
    elif 60 <= age <= 69:
        return age_mapping[7]
    else:
        return None  
    
# Define the function to map exact ages to the cluster numbers
def map_age_to_cluster(age):
    if 3 <= age <= 9:
        return 1
    elif 10 <= age <= 19:
        return 2
    elif 20 <= age <= 29:
        return 3
    elif 30 <= age <= 39:
        return 4
    elif 40 <= age <= 49:
        return 5
    elif 50 <= age <= 59:
        return 6
    elif 60 <= age <= 70:
        return 7
    else:
        return None  # For ages outside the clusters we are interested in



In [113]:
# Apply the mapping to create a new 'age_range' column
utk_df['age_range'] = utk_df['age'].apply(map_age_to_group)
utk_df['age_group'] = utk_df['age'].apply(map_age_to_cluster)


In [114]:
utk_df['age_group'] = utk_df['age_group'].astype('int64')

In [115]:
utk_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3612 entries, 0 to 3611
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   image      3612 non-null   object
 1   age        3612 non-null   int64 
 2   gender     3612 non-null   int64 
 3   race       3612 non-null   int64 
 4   age_range  3593 non-null   object
 5   age_group  3612 non-null   int64 
dtypes: int64(4), object(2)
memory usage: 169.4+ KB


In [116]:
utk_df.head()

Unnamed: 0,image,age,gender,race,age_range,age_group
0,<PIL.Image.Image image mode=RGB size=224x224 a...,18,1,3,10-19,2
1,<PIL.Image.Image image mode=RGB size=224x224 a...,34,0,3,30-39,4
2,<PIL.Image.Image image mode=RGB size=224x224 a...,50,0,3,50-59,6
3,<PIL.Image.Image image mode=RGB size=224x224 a...,23,0,3,20-29,3
4,<PIL.Image.Image image mode=RGB size=224x224 a...,32,0,3,30-39,4


In [117]:
# remove the original 'age' column
utk_df = utk_df.drop(columns=['age'])

# rename 
utk_df = utk_df.rename(columns={'age_group': 'age'})

# reorder
utk_df = utk_df[['image', 'age', 'gender', 'race', 'age_range']]

In [118]:
# since both the dataset contain only indian race (utk = 3 and fairface = 1) we drop the race column 
utk_df = utk_df.drop(columns=['race'])
fairface_df = fairface_df.drop(columns=['race'])

In [119]:
utk_df.head()

Unnamed: 0,image,age,gender,age_range
0,<PIL.Image.Image image mode=RGB size=224x224 a...,2,1,10-19
1,<PIL.Image.Image image mode=RGB size=224x224 a...,4,0,30-39
2,<PIL.Image.Image image mode=RGB size=224x224 a...,6,0,50-59
3,<PIL.Image.Image image mode=RGB size=224x224 a...,3,0,20-29
4,<PIL.Image.Image image mode=RGB size=224x224 a...,4,0,30-39


In [120]:
fairface_df.head()

Unnamed: 0,image,age,gender,age_range
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,1,30-39
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,20-29
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,20-29
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,1,30-39
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2,0,10-19


In [121]:
combined_df = pd.concat([fairface_df, utk_df], ignore_index=True)

In [122]:
combined_df

Unnamed: 0,image,age,gender,age_range
0,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,1,30-39
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,20-29
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,20-29
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,1,30-39
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,2,0,10-19
...,...,...,...,...
15590,<PIL.Image.Image image mode=RGB size=224x224 a...,6,1,50-59
15591,<PIL.Image.Image image mode=RGB size=224x224 a...,1,0,3-9
15592,<PIL.Image.Image image mode=RGB size=224x224 a...,3,1,20-29
15593,<PIL.Image.Image image mode=RGB size=224x224 a...,4,1,30-39


In [123]:
# Shuffle the combined DataFrame
shuffled_df = combined_df.sample(frac=1).reset_index(drop=True)
shuffled_df.head(25)

Unnamed: 0,image,age,gender,age_range
0,<PIL.Image.Image image mode=RGB size=224x224 a...,3,1,20-29
1,<PIL.JpegImagePlugin.JpegImageFile image mode=...,7,0,60-69
2,<PIL.JpegImagePlugin.JpegImageFile image mode=...,5,1,40-49
3,<PIL.JpegImagePlugin.JpegImageFile image mode=...,5,0,40-49
4,<PIL.JpegImagePlugin.JpegImageFile image mode=...,5,1,40-49
5,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,0,30-39
6,<PIL.JpegImagePlugin.JpegImageFile image mode=...,3,1,20-29
7,<PIL.Image.Image image mode=RGB size=224x224 a...,4,0,30-39
8,<PIL.JpegImagePlugin.JpegImageFile image mode=...,1,1,3-9
9,<PIL.JpegImagePlugin.JpegImageFile image mode=...,4,0,30-39


In [124]:
# Save the filtered dataframe to a CSV file
shuffled_df.to_csv('UTKFace+FairFace.csv', index=False)