In [14]:
import os
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
import pickle
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from keras.regularizers import l2
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def calculate_and_log_training_time(modelname, start_time, end_time):
    training_duration = end_time - start_time
    hours, rem = divmod(training_duration, 3600)
    minutes, seconds = divmod(rem, 60)
    
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    filepath = os.path.join(model_dir, "trainingtime.txt")
    with open(filepath, "w") as f:
        f.write(f"Training took {int(hours):02d}:{int(minutes):02d}:{seconds:02f} (hh:mm:ss).")

def save_model_config_with_optimizer(model, modelname):
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    filepath = os.path.join(model_dir, "model_config.txt")
    with open(filepath, 'w') as f:
        for layer in model.layers:
            f.write(f"Layer: {layer.name}\n")
            f.write(f"Config: {layer.get_config()}\n\n")
        
        optimizer_config = model.optimizer.get_config()
        f.write("Optimizer Config:\n")
        f.write(str(optimizer_config))

def save_model(model, modelname):
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    model_path = os.path.join(model_dir, modelname + ".keras")
    model.save(model_path)

def save_performance_metrics(history, modelname):
    model_dir = modelname
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    filepath = os.path.join(model_dir, 'performance_metrics.csv')
    pd.DataFrame(history.history).to_csv(filepath)
    print("Performance metrics saved.")


def plot_loss_and_metrics(history, metrics=['accuracy'], model_name='model'):
    model_dir = model_name
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    
    # Plot Training & Validation Loss
    plt.figure(figsize=(14, 6))
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.legend(loc='upper right')
    plt.grid(True)
    plt.savefig(os.path.join(model_dir, f'{model_name}_loss_plot.jpg'))
    plt.close()

    # Plot each metric
    for metric in metrics:
        plt.figure(figsize=(14, 6))
        plt.plot(history.history[metric], label=f'Train {metric.capitalize()}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {metric.capitalize()}')
        plt.title(f'Model {metric.capitalize()}')
        plt.ylabel(metric.capitalize())
        plt.xlabel('Epoch')
        plt.legend(loc='upper left')
        plt.grid(True)
        plt.savefig(os.path.join(model_dir, f'{model_name}_{metric}_plot.jpg'))
        plt.close()
        print(f"Plot for {metric} saved.")

def save_model_and_config_and_metrics(model, history, modelname = "model", metrics=['accuracy']):
    save_model_config_with_optimizer(model, modelname = modelname)
    save_model(model, modelname = modelname)
    save_performance_metrics(history, modelname = modelname)
    plot_loss_and_metrics(history, metrics=metrics, model_name=modelname)
    print("All model components and metrics have been saved.")


In [3]:
path = "C:/Users/marij/Documents/Universiteit_local/Master_Year1/DeepLearning/Part1_Processed_RGB.pkl"
# Open the pickle file in binary mode
with open(path, 'rb') as file:
    # Load the content of the file into a variable
    RGB_data = pickle.load(file)

In [4]:
sampled_RGB_data = RGB_data.sample(n=4000, random_state = 2001)
y = sampled_RGB_data['Gender'].values
X = sampled_RGB_data['Image'].values
X = np.stack(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
label_counts = sampled_RGB_data['Gender'].value_counts()
print(label_counts)

Gender
1    2170
0    1830
Name: count, dtype: int64


## MOdel Gender

In [6]:
# Define the model for binary classification (gender prediction)
model = tf.keras.Sequential([
    tf.keras.Input(shape=(256, 256, 3)),  # Define the input shape here
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Conv2D(64, (3, 3), activation='relu', kernel_regularizer=l2(0.001)),
    tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
    tf.keras.layers.Dropout(0.25),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')  # Change activation to 'sigmoid' for binary classification
])

# Compile the model with binary crossentropy loss and a suitable optimizer
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Capture start time
start_time = time.time()
# fit the model
history = model.fit(X, y, epochs=12, validation_split=0.2)
# Capture end time and calculate duration
end_time = time.time()
modelname ="GenderAllRBGCNN3convdropoutregu"
calculate_and_log_training_time(modelname = modelname, start_time = start_time, end_time = end_time)
save_model_and_config_and_metrics(model = model, history = history,  modelname = modelname)

Epoch 1/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 1s/step - accuracy: 0.5293 - loss: 1.8240 - val_accuracy: 0.6212 - val_loss: 0.7201
Epoch 2/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 1s/step - accuracy: 0.6336 - loss: 0.7267 - val_accuracy: 0.5650 - val_loss: 0.8287
Epoch 3/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 1s/step - accuracy: 0.6121 - loss: 0.7196 - val_accuracy: 0.6800 - val_loss: 0.6807
Epoch 4/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m113s[0m 1s/step - accuracy: 0.6732 - loss: 0.6759 - val_accuracy: 0.6600 - val_loss: 0.6742
Epoch 5/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m120s[0m 1s/step - accuracy: 0.6604 - loss: 0.6708 - val_accuracy: 0.6800 - val_loss: 0.6615
Epoch 6/12
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 1s/step - accuracy: 0.6951 - loss: 0.6338 - val_accuracy: 0.6662 - val_loss: 0.6384
Epoch 7/12
[1m100/100

## Checking Bias

In [8]:
# Load your model
model = tf.keras.models.load_model('GenderAllRBGCNN3convdropoutregu/GenderAllRBGCNN3convdropoutregu.keras')

In [9]:
sampled_RGB_data = RGB_data.sample(n=4000, random_state = 2009)
y = sampled_RGB_data['Gender'].values
X = sampled_RGB_data['Image'].values
X = np.stack(X)

In [21]:
# Generate predictions
predicted_genders = model.predict(X)
# Add predictions back to DataFrame
sampled_RGB_data['predicted_gender'] = np.round(predicted_genders).astype(int)

[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 239ms/step


In [22]:
# Define age bins and labels
bins = list(range(0, 110 + 10, 11))  # This goes from 0 to 100, changing this range as needed.
labels = [f'{i}-{i+9}' for i in range(0, 100, 10)]

# Create a new column for age categories
sampled_RGB_data['Age_Category'] = pd.cut(sampled_RGB_data['Age'], bins=bins, labels=labels, right=False)

# Display the new DataFrame to verify the categories
print(sampled_RGB_data[['Age', 'Age_Category']].head())

      Age Age_Category
900    29        20-29
4606    1          0-9
6880   65        50-59
9897    3          0-9
1784   21        10-19


In [23]:
# Calculate performance metrics by Race for gender prediction
grouped_race = sampled_RGB_data.groupby('Race')[['Gender', 'predicted_gender']].apply(
    lambda x: pd.Series({
        'Accuracy': accuracy_score(x['Gender'], x['predicted_gender']),
        'Precision': precision_score(x['Gender'], x['predicted_gender'], zero_division=0),
        'Recall': recall_score(x['Gender'], x['predicted_gender'], zero_division=0),
        'F1 Score': f1_score(x['Gender'], x['predicted_gender'], zero_division=0)
    })
)

print("Performance by Race for Gender Prediction:\n", grouped_race)

Performance by Race for Gender Prediction:
       Accuracy  Precision    Recall  F1 Score
Race                                         
0     0.772727   0.783027  0.790636  0.786813
1     0.718563   0.708738  0.811111  0.756477
2     0.671053   0.662371  0.732194  0.695535
3     0.781250   0.806122  0.863388  0.833773
4     0.758542   0.797753  0.803774  0.800752


In [25]:
# Group by Age Category and calculate error metrics
grouped_age_category = sampled_RGB_data.groupby('Age_Category')[['Gender', 'predicted_gender']].apply(
    lambda x: pd.Series({
        'Accuracy': accuracy_score(x['Gender'], x['predicted_gender']),
        'Precision': precision_score(x['Gender'], x['predicted_gender'], zero_division=0),
        'Recall': recall_score(x['Gender'], x['predicted_gender'], zero_division=0),
        'F1 Score': f1_score(x['Gender'], x['predicted_gender'], zero_division=0)
    })
)

print("Performance by Age Category:\n", grouped_age_category)

Performance by Age Category:
               Accuracy  Precision    Recall  F1 Score
Age_Category                                         
0-9           0.633888   0.637549  0.703597  0.668947
10-19         0.752381   0.771084  0.825806  0.797508
20-29         0.866559   0.898455  0.916667  0.907469
30-39         0.843434   0.814286  0.881443  0.846535
40-49         0.841026   0.790055  0.856287  0.821839
50-59         0.844311   0.830189  0.840764  0.835443
60-69         0.799065   0.823009  0.801724  0.812227
70-79         0.681481   0.870370  0.566265  0.686131
80-89         0.543860   0.652174  0.454545  0.535714
90-99         0.333333   1.000000  0.333333  0.500000


  grouped_age_category = sampled_RGB_data.groupby('Age_Category')[['Gender', 'predicted_gender']].apply(


In [19]:
print(sampled_RGB_data['Gender'].unique())
print(sampled_RGB_data['predicted_gender'].unique())


[1 0]
[0.8913752  0.519364   0.2582147  ... 0.9834776  0.70008427 0.11546353]
