In [1]:
#test run with Google Colab T4 GPU
!pip install datasets

Collecting datasets
  Downloading datasets-3.3.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.1-py3-none-any.whl (484 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.9/484.9 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading 

In [None]:
#upload kaggle.json
from google.colab import files
files.upload()

In [3]:
!pip install -q kaggle

In [4]:
#move to dir
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [5]:
#download dataset
!kaggle datasets download -d aibloy/fairface/

Dataset URL: https://www.kaggle.com/datasets/aibloy/fairface/versions/
License(s): unknown
Downloading fairface.zip to /content
 97% 532M/550M [00:05<00:00, 113MB/s]
100% 550M/550M [00:05<00:00, 105MB/s]


In [None]:
!unzip fairface.zip -d /content/dataset/

In [7]:
import pandas as pd

# Load train and validation datasets separately
train_df = pd.read_csv('/content/dataset/FairFace/train_labels.csv')
val_df = pd.read_csv('/content/dataset/FairFace/val_labels.csv')

print("Train Data Columns:", train_df.columns)  # Check if 'file' and 'race_encoded' exist
print("Validation Data Columns:", val_df.columns)



Train Data Columns: Index(['file', 'age', 'gender', 'race', 'service_test'], dtype='object')
Validation Data Columns: Index(['file', 'age', 'gender', 'race', 'service_test'], dtype='object')


In [8]:
# Convert race labels to numbers

from sklearn.preprocessing import LabelEncoder

# Encode race labels as numerical values
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
train_df['race_encoded'] = label_encoder.fit_transform(train_df['race'])
val_df['race_encoded'] = label_encoder.transform(val_df['race'])  # Use same encoding


# Mapping of race categories
# zip() pairs each category with its corresponding number and dict() creates a dictionary from the pairs fir easy reference
race_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print("Race Mapping:", race_mapping)


Race Mapping: {'Black': 0, 'East Asian': 1, 'Indian': 2, 'Latino_Hispanic': 3, 'Middle Eastern': 4, 'Southeast Asian': 5, 'White': 6}


In [9]:
import os

# Print the first few file names from the DataFrame
print("Sample filenames from train_df:")
print(train_df['file'].head())

# List some actual image files from the directory
TRAIN_IMG_DIR = r"/content/dataset/FairFace/train"
print("\nFiles in training image directory:")
print(os.listdir(TRAIN_IMG_DIR)[:5])  # List the first 5 files


Sample filenames from train_df:
0    train/1.jpg
1    train/2.jpg
2    train/3.jpg
3    train/4.jpg
4    train/5.jpg
Name: file, dtype: object

Files in training image directory:
['69615.jpg', '8261.jpg', '21967.jpg', '41679.jpg', '11406.jpg']


In [10]:
import cv2
import numpy as np
import os

# Define paths to image directories
TRAIN_IMG_DIR = r"/content/dataset/FairFace/train"
VAL_IMG_DIR = r"/content/dataset/FairFace/val"


# Function to load images in batches
def image_generator(df, img_dir, batch_size=64):
    while True:  # Infinite loop for batch generation
        for i in range(0, len(df), batch_size):
            batch_df = df.iloc[i:i+batch_size]  # Get batch
            images = []
            labels = []

            for _, row in batch_df.iterrows():
                filename = row['file'].replace("train/", "").replace("val/", "")
                img_path = os.path.join(img_dir, filename)

                img = cv2.imread(img_path)
                if img is None:
                    continue  # Skip missing images

                img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # Convert BGR to RGB
                img = img.astype('float32') / 255.0  # Normalize

                images.append(img)
                labels.append(row['race_encoded'])

            yield np.array(images), np.array(labels)  # Return batch

# Create generators for training and validation
train_gen = image_generator(train_df, TRAIN_IMG_DIR, batch_size=32)
val_gen = image_generator(val_df, VAL_IMG_DIR, batch_size=32)




In [11]:
# Get the first batch of images from the training generator
X_batch, y_batch = next(train_gen)

# Print shape of the batch to verify
print("Batch Shape:", X_batch.shape, y_batch.shape)


Batch Shape: (32, 224, 224, 3) (32,)


In [12]:
import tensorflow as tf
print("Num GPUs Available:", len(tf.config.list_physical_devices('GPU')))


Num GPUs Available: 1


In [13]:
# CNN ARCHITECTURE

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Define the CNN model
model = Sequential([
    # First Convolutional Block
    Conv2D(32, (3,3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D(pool_size=(2,2)),

    # Second Convolutional Block
    Conv2D(64, (3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),

    # Third Convolutional Block
    Conv2D(128, (3,3), activation='relu'),
    MaxPooling2D(pool_size=(2,2)),

    # Fully Connected Layers
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),  # Prevent overfitting
    Dense(7, activation='softmax')  # 7 classes (one per race category)
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)


# Use below if your PC is fast
# model.fit(
#     train_gen,
#     validation_data=val_gen,
#     steps_per_epoch=len(train_df) // 64,  # Batches per epoch
#     validation_steps=len(val_df) // 64,
#     epochs=10
# )

model.fit(
    train_gen,
    validation_data=val_gen,
    steps_per_epoch=500,  # Lower total steps per epoch
    validation_steps=100,  # Lower validation steps
    epochs=10
)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 56ms/step - accuracy: 0.1883 - loss: 1.9878 - val_accuracy: 0.2912 - val_loss: 1.8192
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 80ms/step - accuracy: 0.2863 - loss: 1.8160 - val_accuracy: 0.3475 - val_loss: 1.6766
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 53ms/step - accuracy: 0.3429 - loss: 1.6823 - val_accuracy: 0.3916 - val_loss: 1.6189
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 54ms/step - accuracy: 0.3530 - loss: 1.6563 - val_accuracy: 0.4047 - val_loss: 1.5303
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 79ms/step - accuracy: 0.3642 - loss: 1.6257 - val_accuracy: 0.4172 - val_loss: 1.5414
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 60ms/step - accuracy: 0.3848 - loss: 1.5872 - val_accuracy: 0.4047 - val_loss: 1.5187
Epoch 7/10
[1m5

<keras.src.callbacks.history.History at 0x7a2e9cd1ae50>

In [14]:
model.save("race_classifier_model_v1.h5")




In [15]:
model.save("race_classifier_model_v1.keras")

In [16]:
!git clone https://github.com/Oxford-EDI-Initiative/Face.git

Cloning into 'Face'...
remote: Enumerating objects: 34, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 34 (delta 14), reused 3 (delta 1), pack-reused 0 (from 0)[K
Receiving objects: 100% (34/34), 494.92 KiB | 1.45 MiB/s, done.
Resolving deltas: 100% (14/14), done.


In [17]:
!mv /content/race_classifier_model_v1.h5 /content/Face
!mv /content/race_classifier_model_v1.keras /content/Face