In this notebook we'll be combining two publicly available dataset **UTKFace** and **FairFace** to create a bigger dataset and do some preprocessing to train the model

# Importing necessary libraries

In [2]:
import shutil
import pandas as pd
import os
import numpy as np
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

## UTKFace

In [2]:
image_names=os.listdir("./UTKFace")
len(image_names)

23702

In [3]:
image_names[100:110]

['10_0_4_20161221192738446.jpg.chip.jpg',
 '10_0_4_20170103200335831.jpg.chip.jpg',
 '10_0_4_20170103200409638.jpg.chip.jpg',
 '10_0_4_20170103200443015.jpg.chip.jpg',
 '10_0_4_20170103200501766.jpg.chip.jpg',
 '10_0_4_20170103201924664.jpg.chip.jpg',
 '10_0_4_20170103202338152.jpg.chip.jpg',
 '10_0_4_20170103212521420.jpg.chip.jpg',
 '10_0_4_20170103223451479.jpg.chip.jpg',
 '10_0_4_20170104010810728.jpg.chip.jpg']

**UTKFace** image filenames contains information about the subject. For example '10_0_4_20161221192738446.jpg.chip.jpg' here 10 is the Age, then 0 means Man, here 1 would mean woman. 4 represents ethnic class, we don't need that here. 

### Copying images of men and women into two different folders. if age is less than 3 categorize them as women regardless of the given gender.

In [6]:
for i in tqdm(image_names):
    if i.split("_")[1]=="0":
        source="UTKFace/"+i
        if int(i.split("_")[0])>3:
            target="Final_folder_gender/Men/"+i
        else:
            target="Final_folder_gender/Women/"+i
        shutil.copy(source,target)
    elif i.split("_")[1]=="1":
        source="UTKFace/"+i
        target="Final_folder_gender/Women/"+i
        shutil.copy(source,target)

100%|██████████| 23702/23702 [04:19<00:00, 91.50it/s] 


## Fairface

### Read the provided csv file

In [8]:
Fairface_train_images_df=pd.read_csv("Fairface/train_labels.csv")
Fairface_train_images_df.head()

Unnamed: 0,file,age,gender,race,service_test
0,train/1.jpg,50-59,Male,East Asian,True
1,train/2.jpg,30-39,Female,Indian,False
2,train/3.jpg,3-9,Female,Black,False
3,train/4.jpg,20-29,Female,Indian,True
4,train/5.jpg,20-29,Female,Indian,True


In [9]:
Fairface_train_images_names=Fairface_train_images_df.file.values.tolist()
Fairface_train_images_gender=Fairface_train_images_df.gender.values.tolist()
Fairface_train_images_age=Fairface_train_images_df.age.values.tolist()


# images go to separate folders based on genders 

In [10]:
for i in range(len(Fairface_train_images_names)):
    if Fairface_train_images_gender[i]=="Male":
        source="Fairface/"+Fairface_train_images_names[i]
        if Fairface_train_images_age[i]=="0-2":
            target="Final_folder_gender/Women/"+Fairface_train_images_names[i].split("/")[1]
        else:  
            target="Final_folder_gender/Men/"+Fairface_train_images_names[i].split("/")[1]
        shutil.copy(source,target)
    elif Fairface_train_images_gender[i]=="Female":
        source="Fairface/"+Fairface_train_images_names[i]
        target="Final_folder_gender/Women/"+Fairface_train_images_names[i].split("/")[1]
        shutil.copy(source,target)
    else:
        continue


# Main dataset 

create a combined dataset

In [9]:
male_images=os.listdir("C:/Users/shahr/Downloads/Compressed/Final_folder_gender/Men")
male_df=pd.DataFrame()
male_df["file"]=male_images
male_df["gender"]="M"
male_df["gender_class"]=0
# image path is added to the filename
male_df["file"]=male_df["file"].map(lambda x: "/content/Final_folder_gender/Men/"+x)
male_df.head()

Unnamed: 0,file,gender,gender_class
0,/content/Final_folder_gender/Men/1.jpg,M,0
1,/content/Final_folder_gender/Men/10.jpg,M,0
2,/content/Final_folder_gender/Men/1000.jpg,M,0
3,/content/Final_folder_gender/Men/10002.jpg,M,0
4,/content/Final_folder_gender/Men/10003.jpg,M,0


In [10]:
male_df=shuffle(male_df,random_state=42).reset_index(drop=True)

In [11]:
female_images=os.listdir("C:/Users/shahr/Downloads/Compressed/Final_folder_gender/Women/")
female_df=pd.DataFrame()
female_df["file"]=female_images
female_df["gender"]="F"
female_df["gender_class"]=1
# image path is added to the filename
female_df["file"]=female_df["file"].map(lambda x: "/content/Final_folder_gender/Women/"+x)
female_df.head()

Unnamed: 0,file,gender,gender_class
0,/content/Final_folder_gender/Women/100.jpg,F,1
1,/content/Final_folder_gender/Women/10001.jpg,F,1
2,/content/Final_folder_gender/Women/10008.jpg,F,1
3,/content/Final_folder_gender/Women/10009.jpg,F,1
4,/content/Final_folder_gender/Women/10013.jpg,F,1


In [12]:
female_df=shuffle(female_df,random_state=42).reset_index(drop=True)

In [13]:
print(male_df.shape)
print(female_df.shape)

(56262, 3)
(52193, 3)


## Create balanced dataset by ensuring equal number of male and female images

In [14]:
male_df=male_df.sample(n=female_df.shape[0],random_state=42)

In [15]:
male_df.shape

(52193, 3)

In [16]:
main_df=pd.concat([male_df,female_df])
main_df=shuffle(main_df,random_state=42).reset_index(drop=True)

print(main_df.shape)
main_df.head(10)


(104386, 3)


Unnamed: 0,file,gender,gender_class
0,/content/Final_folder_gender/Women/54512.jpg,F,1
1,/content/Final_folder_gender/Men/28571.jpg,M,0
2,/content/Final_folder_gender/Men/25747.jpg,M,0
3,/content/Final_folder_gender/Women/35642.jpg,F,1
4,/content/Final_folder_gender/Women/17038.jpg,F,1
5,/content/Final_folder_gender/Men/58_0_1_201701...,M,0
6,/content/Final_folder_gender/Women/55085.jpg,F,1
7,/content/Final_folder_gender/Men/11181.jpg,M,0
8,/content/Final_folder_gender/Men/76848.jpg,M,0
9,/content/Final_folder_gender/Men/82144.jpg,M,0


In [17]:
x=main_df[["file","gender"]]
y=main_df["gender_class"]

# Divide train and validation data

In [18]:
# Splitting the dataset into training and testing datasets with test_size=0.3 and stratify=y. 

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, stratify=y, random_state=42)

In [19]:
temp_train = X_train.copy()
temp_train['gender_class'] = y_train

temp_test = X_test.copy()
temp_test['gender_class'] = y_test

In [20]:
temp_train["gender"].value_counts()

M    36535
F    36535
Name: gender, dtype: int64

In [21]:
temp_train.head()

Unnamed: 0,file,gender,gender_class
90141,/content/Final_folder_gender/Men/13054.jpg,M,0
15971,/content/Final_folder_gender/Women/85053.jpg,F,1
96870,/content/Final_folder_gender/Women/13285.jpg,F,1
45515,/content/Final_folder_gender/Women/42733.jpg,F,1
75297,/content/Final_folder_gender/Men/78390.jpg,M,0


In [26]:
temp_train.to_csv("images_filenames_labels_train_gender_new.csv", index=False)
temp_test.to_csv("images_filenames_labels_test_gender_new.csv", index=False)

In [27]:
temp_train["gender"].value_counts()

M    36535
F    36535
Name: gender, dtype: int64

In [28]:
temp_test["gender"].value_counts()

M    15658
F    15658
Name: gender, dtype: int64

In [29]:
temp_test.shape

(31316, 3)