In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

import math
import os
import pandas as pd
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from joblib import dump, load

# Number of items handled per batch during data processing.
# NOTE(review): presumably consumed by the batch image generator defined
# later in this notebook — confirm against that cell.
batch_size = 64

### read the data

In [ ]:
# Load the three dataset splits from ../../data/<split>.csv.
train_data, val_data, test_data = (
    pd.read_csv(os.path.join('..', '..', 'data', split_name + '.csv'))
    for split_name in ('train', 'val', 'test')
)

### add the path of the images

In [ ]:
def construct_img_path(row):
    """Build the relative path of the aligned face image described by ``row``.

    ``row`` is indexed by ``'user_id'``, ``'face_id'`` and
    ``'original_image'``. The result has the form
    ``../../data/faces/<user_id>/coarse_tilt_aligned_face.<face_id>.<original_image>``
    (joined with the platform's path separator).
    """
    owner_dir = row['user_id']
    # face_id is converted with str(); original_image must already be a string.
    file_name = ".".join(
        ("coarse_tilt_aligned_face", str(row['face_id']), row['original_image'])
    )
    return os.path.join("..", "..", "data", "faces", owner_dir, file_name)


# Attach the image path to every row of each split (row-wise apply).
for _split_df in (train_data, val_data, test_data):
    _split_df['img_path'] = _split_df.apply(construct_img_path, axis=1)

# Preview the first training rows with the new column.
train_data.head(5)


#### Add a column indicating whether each image exists
This helps us detect missing images as well as bugs in the path construction.

In [ ]:
# Flag, for each split, whether the file at img_path is present on disk.
for _split_df in (train_data, val_data, test_data):
    _split_df['img_exists'] = _split_df['img_path'].apply(os.path.exists)

# Preview the first training rows with the new flag.
train_data.head(5)

In [ ]:
def encode_gender_age(train_data, val_data, test_data):
    """Encode each gender/age combination as a single integer class label.

    A ``LabelEncoder`` is fitted on the training split only. Rows of the
    validation and test splits whose gender/age combination does not occur
    in the training split receive the label ``-1``.

    The three frames are modified in place (a ``gender_age_label`` column is
    added) and the same objects are returned.

    Args:
        train_data: DataFrame with ``gender`` and ``age`` columns; used to
            fit the encoder.
        val_data: DataFrame with ``gender`` and ``age`` columns.
        test_data: DataFrame with ``gender`` and ``age`` columns.

    Returns:
        Tuple ``(train_data, val_data, test_data, num_classes,
        gender_age_encoder)`` where ``num_classes`` is the number of classes
        learned from the training split and ``gender_age_encoder`` is the
        fitted ``LabelEncoder``.
    """

    def combined_key(data):
        # "<gender>_<age>" string; computed on the fly so no scratch column
        # is written into (and later dropped from) the caller's frame.
        return data['gender'].astype(str) + '_' + data['age'].astype(str)

    gender_age_encoder = LabelEncoder()
    train_data['gender_age_label'] = gender_age_encoder.fit_transform(
        combined_key(train_data)
    )

    # LabelEncoder labels are the positions within classes_, so a dict built
    # once replaces a per-row encoder.transform() call.
    label_of = {
        name: index for index, name in enumerate(gender_age_encoder.classes_)
    }

    for data in (val_data, test_data):
        # Keys missing from the dict map to NaN; those are the combinations
        # unseen in training and become -1.
        data['gender_age_label'] = (
            combined_key(data).map(label_of).fillna(-1).astype('int64')
        )

    num_classes = len(gender_age_encoder.classes_)

    return train_data, val_data, test_data, num_classes, gender_age_encoder

# Add the integer gender/age label to every split and keep the fitted encoder.
(
    train_data,
    val_data,
    test_data,
    num_classes,
    gender_age_encoder,
) = encode_gender_age(train_data, val_data, test_data)

# Class count taken directly from the fitted encoder.
num_classes = len(gender_age_encoder.classes_)
print("gender_age classes:", gender_age_encoder.classes_)

# Preview the first training rows with the new label column.
train_data.head(5)

### Filter out any rows where the image doesn't exist

In [ ]:
# Keep only the rows whose img_exists flag equals True.
train_data_filtered = train_data.loc[train_data['img_exists'].eq(True)]
val_data_filtered = val_data.loc[val_data['img_exists'].eq(True)]
test_data_filtered = test_data.loc[test_data['img_exists'].eq(True)]

### Define a generator function to process images in batches