In [1]:
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import pickle
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from keras.regularizers import l2
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def calculate_and_log_training_time(modelname, start_time, end_time):
    training_duration = end_time - start_time
    hours, rem = divmod(training_duration, 3600)
    minutes, seconds = divmod(rem, 60)
    
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    filepath = os.path.join(model_dir, "trainingtime.txt")
    with open(filepath, "w") as f:
        f.write(f"Training took {int(hours):02d}:{int(minutes):02d}:{seconds:02f} (hh:mm:ss).")

def save_model_config_with_optimizer(model, modelname):
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    filepath = os.path.join(model_dir, "model_config.txt")
    with open(filepath, 'w') as f:
        for layer in model.layers:
            f.write(f"Layer: {layer.name}\n")
            f.write(f"Config: {layer.get_config()}\n\n")
        
        optimizer_config = model.optimizer.get_config()
        f.write("Optimizer Config:\n")
        f.write(str(optimizer_config))

def save_model(model, modelname):
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    model_path = os.path.join(model_dir, modelname + ".keras")
    model.save(model_path)

def save_performance_metrics(history, modelname):
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    filepath = os.path.join(model_dir, 'performance_metrics.csv')
    pd.DataFrame(history.history).to_csv(filepath)
    print("Performance metrics saved.")


def plot_loss_and_metrics(history, metrics=['accuracy'], model_name='model'):
    model_dir = model_name
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    # Plot Training & Validation Loss
    plt.figure(figsize=(14, 6))
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.savefig(os.path.join(model_dir, f'{model_name}_loss_plot.jpg'))
    plt.close()

    # Plot each metric
    for metric in metrics:
        plt.figure(figsize=(14, 6))
        plt.plot(history.history[metric], label=f'Train {metric.capitalize()}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric.capitalize()}')
        plt.title(f'Model {metric.capitalize()}')
        plt.ylabel(metric.capitalize())
        plt.xlabel('Epoch')
        plt.legend(loc='upper left')
        plt.grid(True)
        plt.savefig(os.path.join(model_dir, f'{model_name}_{metric}_plot.jpg'))
        plt.close()
        print(f"Plot for {metric} saved.")

def save_model_and_config_and_metrics(model, history, modelname = "model", metrics=['accuracy']):
    save_model_config_with_optimizer(model, modelname = modelname)
    save_model(model, modelname = modelname)
    save_performance_metrics(history, modelname = modelname)
    plot_loss_and_metrics(history, metrics=metrics, model_name=modelname)
    print("All model components and metrics have been saved.")

In [2]:
path = "C:/Users/marij/Documents/Universiteit_local/Master_Year1/DeepLearning/Part1_Processed_RGB.pkl"
# Open the pickle file in binary mode
with open(path, 'rb') as file:
    # Load the content of the file into a variable
    RGB_data = pickle.load(file)

In [14]:
sampled_RGB_data = RGB_data.sample(n=4000, random_state = 2001)
# Assuming your DataFrame is named sampled_RGB_data
one_hot_encoded_races = pd.get_dummies(sampled_RGB_data['Race'], prefix='Race')
# Concatenate the original DataFrame with the new one-hot encoded columns
sampled_RGB_data = pd.concat([sampled_RGB_data, one_hot_encoded_races], axis=1)
y = sampled_RGB_data[['Race_0', 'Race_1', 'Race_2',
       'Race_3', 'Race_4']].values
X = sampled_RGB_data['Image'].values
X = np.stack(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Race

In [15]:
# Define the number of unique races
num_classes = len(sampled_RGB_data["Race"].unique())

# Define the model for binary classification (gender prediction)
model = tf.keras.Sequential([
    tf.keras.Input(shape=(256, 256, 3)),  # Define the input shape here
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(num_classes, activation='softmax')  # Change activation to 'softmax'
])

# Compile the model with categorical crossentropy loss and a suitable optimizer
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Capture start time
start_time = time.time()
# fit the model
history = model.fit(X, y, epochs=12, validation_split=0.2)
# Capture end time and calculate duration
end_time = time.time()
modelname ="RaceAllRBGCNN3convdropoutregu"
calculate_and_log_training_time(modelname = modelname, start_time = start_time, end_time = end_time)
save_model_and_config_and_metrics(model = model, history = history,  modelname = modelname)

Epoch 1/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m203s[0m 2s/step - accuracy: 0.4942 - loss: 2.6874 - val_accuracy: 0.5050 - val_loss: 1.6383
Epoch 2/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 1s/step - accuracy: 0.5306 - loss: 1.3718 - val_accuracy: 0.5050 - val_loss: 1.5156
Epoch 3/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 1s/step - accuracy: 0.5393 - loss: 1.3129 - val_accuracy: 0.5100 - val_loss: 1.3217
Epoch 4/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 1s/step - accuracy: 0.5513 - loss: 1.2431 - val_accuracy: 0.5462 - val_loss: 1.2809
Epoch 5/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m140s[0m 1s/step - accuracy: 0.5734 - loss: 1.2194 - val_accuracy: 0.5325 - val_loss: 1.3216
Epoch 6/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m130s[0m 1s/step - accuracy: 0.5773 - loss: 1.1924 - val_accuracy: 0.5500 - val_loss: 1.1834
Epoch 7/12
[1m100/100

In [13]:
sampled_RGB_data.columns

Index(['Age', 'Gender', 'Race', 'Image', 'Race_0', 'Race_1', 'Race_2',
       'Race_3', 'Race_4'],
      dtype='object')

## Checking for bias

In [None]:
# Load your model
model = tf.keras.models.load_model('RaceAllRBGCNN3convdropoutregu/RaceAllRBGCNN3convdropoutregu.keras')

In [35]:
sampled_RGB_data = RGB_data.sample(n=4000, random_state = 2001)
# Assuming your DataFrame is named sampled_RGB_data
one_hot_encoded_races = pd.get_dummies(sampled_RGB_data['Race'], prefix='Race')
# Concatenate the original DataFrame with the new one-hot encoded columns
sampled_RGB_data = pd.concat([sampled_RGB_data, one_hot_encoded_races], axis=1)
y = sampled_RGB_data[['Race_0', 'Race_1', 'Race_2',
       'Race_3', 'Race_4']].values
X = sampled_RGB_data['Image'].values
X = np.stack(X)

In [36]:
# Generate predictions
predicted_race = model.predict(X)
# Create a new column 'actual_Race' in the DataFrame
sampled_RGB_data['actual_Race'] = np.argmax(predicted_race, axis=1)


[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 261ms/step


In [37]:
# Define age bins and labels
bins = list(range(0, 110 + 10, 11))  # This goes from 0 to 100, changing this range as needed.
labels = [f'{i}-{i+9}' for i in range(0, 100, 10)]

# Create a new column for age categories
sampled_RGB_data['Age_Category'] = pd.cut(sampled_RGB_data['Age'], bins=bins, labels=labels, right=False)

# Display the new DataFrame to verify the categories
print(sampled_RGB_data[['Age', 'Age_Category']].head())

      Age Age_Category
3614   80        70-79
5198    1          0-9
8301    5          0-9
3966    2          0-9
9565   49        40-49


In [38]:
sampled_RGB_data.columns

Index(['Age', 'Gender', 'Race', 'Image', 'Race_0', 'Race_1', 'Race_2',
       'Race_3', 'Race_4', 'actual_Race', 'Age_Category'],
      dtype='object')

In [39]:
# Calculate the match between 'Race' and 'actual_Race'
sampled_RGB_data['Race_Match'] = sampled_RGB_data['Race'] == sampled_RGB_data['actual_Race']

# Group by 'Age_Category' and calculate the percentage of matches
match_percentage_by_age = sampled_RGB_data.groupby('Age_Category')['Race_Match'].mean() * 100

# Print the result
print(match_percentage_by_age)

Age_Category
0-9      62.946429
10-19    71.899225
20-29    67.542504
30-39    73.684211
40-49    77.837838
50-59    82.424242
60-69    82.710280
70-79    81.944444
80-89    67.307692
90-99    66.666667
Name: Race_Match, dtype: float64


  match_percentage_by_age = sampled_RGB_data.groupby('Age_Category')['Race_Match'].mean() * 100


In [40]:
# Calculate the match between 'Race' and 'actual_Race'
sampled_RGB_data['Race_Match'] = sampled_RGB_data['Race'] == sampled_RGB_data['actual_Race']

# Group by 'Gender' and calculate the percentage of matches
match_percentage_by_gender = sampled_RGB_data.groupby('Gender')['Race_Match'].mean() * 100

# Print the result
print(match_percentage_by_gender)

Gender
0    70.218579
1    71.013825
Name: Race_Match, dtype: float64
