# Mixfeat functional experiments

In [1]:
import os
import sys
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np

from typing import Tuple
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from pathlib import Path
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix

# update the path so we can directly import code from the DVlog
sys.path.append(os.path.dirname(os.path.abspath(os.path.join(os.getcwd(), os.pardir, "DVlog"))))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, "DVlog")))

from DVlog.utils.bias_mitigations import apply_oversampling, apply_mixfeat_oversampling
from DVlog.utils.metrics import calculate_performance_measures, calculate_gender_performance_measures, calculate_fairness_measures

In [2]:
annotations_file = Path(r"../DVlog/dataset/dvlog_labels_v2.csv")
embeddings_path = Path("../DVlog/dataset/sent-embeddings-dataset")
feature_name = "sent_mpnet_keyw"

seed = 42
random_seeds = [0, 1, 42, 1123, 3107]

In [3]:
# load in the annotation labels
df_annotations = pd.read_csv(annotations_file)
df_annotations.reset_index(drop=True, inplace=True)
df_annotations.head()

Unnamed: 0,video_id,label,gender,dataset
0,0,1,f,train
1,1,1,f,test
2,2,1,m,train
3,3,1,m,train
4,4,1,f,test


In [4]:
# loop over each row and compute the average embeddings
df_annotations["avg_embed"] = None
df_annotations["std_embed"] = None

# loop over each row and retrieve the embeddings
seq_length = 104

for idx, row in df_annotations.iterrows():
    # get the texts
    video_id = row.video_id
    
    # setup the path to the file
    embedding_path = os.path.join(embeddings_path, str(video_id), f"{feature_name}.npy")
    embedding = np.load(embedding_path).astype(np.float32)

    # apply the padding
    padded_embedding = embedding[:seq_length]

    # get the average over the whole embedding
    avg_embedding = np.mean(padded_embedding, axis=0)
    std_embedding = np.std(padded_embedding, axis=0)

    # put the embedding back
    df_annotations.at[idx, "avg_embed"] = avg_embedding
    df_annotations.at[idx, "std_embed"] = std_embedding

df_annotations.head()

Unnamed: 0,video_id,label,gender,dataset,avg_embed,std_embed
0,0,1,f,train,"[-0.0043204557, 0.0025047027, -0.022133984, -0...","[0.032979805, 0.044009496, 0.021632787, 0.0345..."
1,1,1,f,test,"[0.014702894, 0.017551864, -0.01323786, -0.016...","[0.035327107, 0.045324475, 0.016451813, 0.0293..."
2,2,1,m,train,"[-0.0020621587, -0.002233186, -0.009010282, -0...","[0.028251473, 0.059699543, 0.019755766, 0.0281..."
3,3,1,m,train,"[0.013287175, 0.005526411, -0.010409681, -0.02...","[0.031882487, 0.042796258, 0.020934831, 0.0321..."
4,4,1,f,test,"[-0.008224284, 0.02129893, -0.0096479375, -0.0...","[0.03333977, 0.042923436, 0.018166319, 0.03391..."


In [5]:
# setup the train and validation datasets
train_indices = df_annotations[df_annotations["dataset"] == "train"].index
val_indices = df_annotations[df_annotations["dataset"] == "val"].index

# prepare the features and labels
avg_features = np.stack(df_annotations["avg_embed"].values)
std_features = np.stack(df_annotations["std_embed"].values)
labels = df_annotations["label"].values
genders = df_annotations["gender"].values

# create the train and validation sets
X_train_avg, X_train_std = avg_features[train_indices], std_features[train_indices]
y_train = labels[train_indices]

X_val_avg, X_val_std = avg_features[val_indices], std_features[val_indices]
y_val = labels[val_indices]

# combine the train and validation sets
X_avg = np.vstack((X_train_avg, X_val_avg))
X_std = np.vstack((X_train_std, X_val_std))
y = np.hstack((y_train, y_val))

# Create a test_fold array: -1 for training set, 0 for validation set
test_fold = np.concatenate([
    -1 * np.ones(len(X_train_avg), dtype=int),
    np.zeros(len(X_val_avg), dtype=int)
])

print(X_avg.shape, y.shape, test_fold.shape)

# Create PredefinedSplit object
ps = PredefinedSplit(test_fold)

(662, 768) (662,) (662,)


## setup the gridsearch with the parameters
- C (Regularization Parameter): Controls the trade-off between achieving a low error on the training data and minimizing the norm of the weights. A small value for C makes the decision surface smooth, while a large value of C aims to classify all training examples correctly.

- Gamma (Kernel Coefficient): Defines how far the influence of a single training example reaches, with low values meaning 'far' and high values meaning 'close'. It is applicable for 'rbf', 'poly', and 'sigmoid' kernels.

- Kernel: Specifies the kernel type to be used in the algorithm. Common kernels are 'linear', 'poly' (polynomial), 'rbf' (radial basis function), and 'sigmoid'.

In [6]:
# Define the SVM and parameter grid
svm = SVC(random_state=seed)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# Set up and run GridSearchCV
grid_search_avg = GridSearchCV(estimator=svm, param_grid=param_grid, cv=ps, verbose=2, n_jobs=-1)
grid_search_avg.fit(X_avg, y)

Fitting 1 folds for each of 64 candidates, totalling 64 fits


In [7]:
# Output best parameters and score
best_params = grid_search_avg.best_params_
print("Best parameters found: ", best_params)
print("Best validation score: ", grid_search_avg.best_score_)

Best parameters found:  {'C': 10, 'gamma': 1, 'kernel': 'poly'}
Best validation score:  0.9523809523809523


In [8]:
# Define the SVM and parameter grid
svm = SVC(random_state=seed)
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid']
}

# Set up and run GridSearchCV
grid_search_std = GridSearchCV(estimator=svm, param_grid=param_grid, cv=ps, verbose=2, n_jobs=-1)
grid_search_std.fit(X_std, y)

Fitting 1 folds for each of 64 candidates, totalling 64 fits


In [9]:
# Output best parameters and score
best_params = grid_search_std.best_params_
print("Best parameters found: ", best_params)
print("Best validation score: ", grid_search_std.best_score_)

Best parameters found:  {'C': 100, 'gamma': 1, 'kernel': 'sigmoid'}
Best validation score:  0.9285714285714286


In [10]:
# build the function for automatically retrieve all metrics
def evaluate_model(y_true, y_pred, protected):

    # calculate the performance metrics
    w_precision, w_recall, w_fscore, _ = precision_recall_fscore_support(y_true, y_pred, average="weighted")

    # calculate the fairness metrics
    eq_oppor, eq_acc, pred_equal, _, _ = calculate_fairness_measures(y_true, y_pred, protected, unprivileged='m')
    
    # eq_oppor, eq_acc, fairl_eq_odds, unpriv_stats, priv_stats = calculate_fairness_measures(y_true, y_pred, protected, 'm')
    gender_metrics = calculate_gender_performance_measures(y_true, y_pred, protected)

    measure_dict = {
        "precision": w_precision,
        "recall": w_recall,
        "fscore": w_fscore,
        f"{gender_metrics[0][0]}_fscore": gender_metrics[0][3],
        f"{gender_metrics[1][0]}_fscore": gender_metrics[1][3],
        "eq_oppor": eq_oppor,
        "eq_acc": eq_acc,
        "pred_eq": pred_equal}
    return measure_dict

In [11]:
# evaulate this model on the test set
test_indices = df_annotations[df_annotations["dataset"] == "test"].index
X_test_avg, X_test_std, y_test, protec_test = avg_features[test_indices], std_features[test_indices], labels[test_indices], genders[test_indices]

# Evaluate the best model (avg)
best_svm = grid_search_avg.best_estimator_
y_pred = best_svm.predict(X_test_avg)
base_eval_dict_avg = evaluate_model(y_test, y_pred, protec_test)

# Evaluate the best model (std)
best_svm = grid_search_std.best_estimator_
y_pred = best_svm.predict(X_test_std)
base_eval_dict_std = evaluate_model(y_test, y_pred, protec_test)

## Setup the bias mitigations

In [12]:
mixfeat_options = ['oversample', 'group_upsample', 'mixgender_upsample', 'subgroup_upsample', 'synthetic', 'synthetic_mixgendered']
results = [("base_model_avg", base_eval_dict_avg), ("base_model_std", base_eval_dict_std)]

# get the training section
df_train = df_annotations[df_annotations["dataset"] == "train"]

# take the training_df and do the oversampling for each option
for seed in random_seeds:
    for option in mixfeat_options:
        print(f"Processing: {option} with seed: {seed}")

        # get the training section
        df_copy = df_train.copy()
        if option == 'oversample':
            training_df = apply_oversampling(df_copy, seed)
            X = np.stack(training_df["avg_embed"].values)

        else:
            training_df = apply_mixfeat_oversampling(df_copy, option, 1, seed)

            # extract the training data and apply the mixfeat operation whenever possible
            X = []
            for _, row in training_df.iterrows():
                if row.mixfeat:
                    idx1, idx2 = row.mixfeat
                    prob = row.mixfeat_probs[0]

                    # get the embeddings from the dataframe
                    embedding1 = df_train.loc[df_train['video_id'] == idx1]["avg_embed"].values[0]
                    embedding2 = df_train.loc[df_train['video_id'] == idx2]["avg_embed"].values[0]

                    final_embedding = (embedding1 * prob) + (embedding2 * (1 - prob))
                    X.append(final_embedding)
                else:
                    X.append(row.avg_embed)

            # get all the information and train the model
            X = np.array(X)

        # retrieve the label information
        y = training_df["label"].values

        # train an SVM model
        svm = SVC(**best_params, random_state=seed)
        svm.fit(X, y)

        # evaluate the model
        y_pred = svm.predict(X_test_avg)
        eval_dict = evaluate_model(y_test, y_pred, protec_test)
        results.append((option, eval_dict))

Processing: oversample with seed: 0
Processing: group_upsample with seed: 0
Processing: mixgender_upsample with seed: 0
Processing: subgroup_upsample with seed: 0
Processing: synthetic with seed: 0
Processing: synthetic_mixgendered with seed: 0
Processing: oversample with seed: 1
Processing: group_upsample with seed: 1
Processing: mixgender_upsample with seed: 1
Processing: subgroup_upsample with seed: 1
Processing: synthetic with seed: 1
Processing: synthetic_mixgendered with seed: 1
Processing: oversample with seed: 42
Processing: group_upsample with seed: 42
Processing: mixgender_upsample with seed: 42
Processing: subgroup_upsample with seed: 42
Processing: synthetic with seed: 42
Processing: synthetic_mixgendered with seed: 42
Processing: oversample with seed: 1123
Processing: group_upsample with seed: 1123
Processing: mixgender_upsample with seed: 1123
Processing: subgroup_upsample with seed: 1123
Processing: synthetic with seed: 1123
Processing: synthetic_mixgendered with seed: 1

In [13]:
# Extract data into a structured format
extracted_data = []
for name, result in results:
    data = {
        "name": name,
        "Precision": np.round(result["precision"], 3),
        "Recall": np.round(result["recall"], 3),
        "F-score": np.round(result["fscore"], 3),
        "Male F-score": np.round(result["m_fscore"], 3),
        "Female F-score": np.round(result["f_fscore"], 3),
        "eq_oppor": np.round(result["eq_oppor"], 2),
        "eq_acc": np.round(result["eq_acc"], 2),
        "pred_eq": np.round(result["pred_eq"], 2)
    }
    extracted_data.append(data)

# Convert the list of dictionaries to a pandas DataFrame and display it
df = pd.DataFrame(extracted_data)
df.groupby("name").mean()

Unnamed: 0_level_0,Precision,Recall,F-score,Male F-score,Female F-score,eq_oppor,eq_acc,pred_eq
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
base_model_avg,0.922,0.921,0.921,0.895,0.935,0.88,0.96,0.72
base_model_std,0.874,0.873,0.872,0.807,0.907,0.88,0.89,1.48
group_upsample,0.9054,0.9042,0.9042,0.8522,0.9314,0.918,0.916,1.614
mixgender_upsample,0.913,0.9126,0.9126,0.8522,0.9442,0.918,0.902,2.164
oversample,0.9084,0.9078,0.9078,0.8454,0.9406,0.906,0.898,2.048
subgroup_upsample,0.909,0.9078,0.9078,0.859,0.9332,0.93,0.922,1.642
synthetic,0.9128,0.9126,0.9126,0.8562,0.9424,0.846,0.908,1.202
synthetic_mixgendered,0.909,0.9078,0.9078,0.859,0.9332,0.93,0.922,1.642


In [14]:
df.groupby("name").std()

Unnamed: 0_level_0,Precision,Recall,F-score,Male F-score,Female F-score,eq_oppor,eq_acc,pred_eq
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
base_model_avg,,,,,,,,
base_model_std,,,,,,,,
group_upsample,0.004722,0.00502,0.00502,0.009311,0.00805,0.016432,0.016733,0.254716
mixgender_upsample,0.007842,0.00805,0.00805,0.009311,0.011432,0.016432,0.016432,0.7007
oversample,0.008503,0.008899,0.008899,0.007603,0.014046,0.013416,0.019235,0.772735
subgroup_upsample,0.002236,0.002683,0.002683,0.0,0.004025,0.0,0.004472,0.107331
synthetic,0.007155,0.006841,0.006841,0.00795,0.007893,0.015166,0.008367,0.102567
synthetic_mixgendered,0.002236,0.002683,0.002683,0.0,0.004025,0.0,0.004472,0.107331
