# SVM baseline on averaged sentence embeddings (DVlog)

In [27]:
import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.svm import SVC

# update the path so we can directly import code from the DVlog
# The first entry is the directory that CONTAINS "DVlog" (the parent of the
# current working directory), which is what lets `DVlog.<module>` resolve below.
# NOTE: both entries are derived from os.getcwd(), so this only works when the
# kernel is started from a directory that sits next to the DVlog folder.
sys.path.append(os.path.dirname(os.path.abspath(os.path.join(os.getcwd(), os.pardir, "DVlog"))))
# The second entry is the "DVlog" directory itself; presumably needed so that
# modules inside DVlog can import their siblings as top-level names -- TODO confirm.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, "DVlog")))

# Project-local bias-mitigation helpers; must come after the sys.path changes above.
from DVlog.utils.bias_mitigations import apply_oversampling, apply_mixfeat_oversampling

In [2]:
# name (without the .npy extension) of the per-video feature file to load
feature_name = "sent_mpnet_keyw"

# directory holding one sub-folder of embeddings per video id
embeddings_path = Path("../DVlog/dataset/sent-embeddings-dataset")

# csv file with the label / gender / dataset-split annotation of every video
annotations_file = Path("../DVlog/dataset/dvlog_labels_v2.csv")

In [3]:
# read the annotation labels and make sure the rows carry a fresh 0..n-1 index
# (later cells use the index labels as positions into numpy arrays)
df_annotations = pd.read_csv(annotations_file).reset_index(drop=True)
df_annotations.head()

Unnamed: 0,video_id,label,gender,dataset
0,0,1,f,train
1,1,1,f,test
2,2,1,m,train
3,3,1,m,train
4,4,1,f,test


In [4]:
# maximum number of leading embedding rows that are taken into account per video
seq_length = 104


def _average_embedding(video_id):
    """Load the stored embedding matrix of one video and average it over axis 0.

    The file ``<embeddings_path>/<video_id>/<feature_name>.npy`` is loaded, cast
    to float32 and truncated to its first ``seq_length`` rows (shorter inputs are
    NOT padded, they are simply used as-is) before the mean is taken.
    """
    npy_file = os.path.join(embeddings_path, str(video_id), f"{feature_name}.npy")
    stored = np.load(npy_file).astype(np.float32)
    return np.mean(stored[:seq_length], axis=0)


# start with an empty (object) column so each cell can hold a whole vector
df_annotations["avg_embed"] = None

# fill the column one video at a time
for row_label, vid in df_annotations["video_id"].items():
    df_annotations.at[row_label, "avg_embed"] = _average_embedding(vid)

df_annotations.head()

Unnamed: 0,video_id,label,gender,dataset,avg_embed
0,0,1,f,train,"[-0.0043204557, 0.0025047027, -0.022133984, -0..."
1,1,1,f,test,"[0.014702894, 0.017551864, -0.01323786, -0.016..."
2,2,1,m,train,"[-0.0020621587, -0.002233186, -0.009010282, -0..."
3,3,1,m,train,"[0.013287175, 0.005526411, -0.010409681, -0.02..."
4,4,1,f,test,"[-0.008224284, 0.02129893, -0.0096479375, -0.0..."


In [23]:
# row labels of the samples that belong to the train and the validation split
is_train = (df_annotations["dataset"] == "train").to_numpy()
is_val = (df_annotations["dataset"] == "val").to_numpy()
train_indices = df_annotations.index[is_train]
val_indices = df_annotations.index[is_val]

# one feature matrix / label vector covering every annotated video
features = np.stack(df_annotations["avg_embed"].to_numpy())
labels = df_annotations["label"].to_numpy()

# slice out the two splits (the index labels double as array positions)
X_train = features[train_indices]
y_train = labels[train_indices]
X_val = features[val_indices]
y_val = labels[val_indices]

# stack train first, validation second, so the fold markers below line up
X = np.concatenate((X_train, X_val), axis=0)
y = np.concatenate((y_train, y_val))

# fold marker per sample: -1 = always in training, 0 = the single validation fold
test_fold = np.repeat([-1, 0], [len(X_train), len(X_val)])

print(X.shape, y.shape, test_fold.shape)

# splitter that hands GridSearchCV exactly this one train/validation split
ps = PredefinedSplit(test_fold)

(662, 768) (662,) (662,)


## Set up the grid search over the SVM hyperparameters
- C (Regularization Parameter): Controls the trade-off between achieving a low error on the training data and minimizing the norm of the weights. A small value for C makes the decision surface smooth, while a large value of C aims to classify all training examples correctly.

- Gamma (Kernel Coefficient): Defines how far the influence of a single training example reaches, with low values meaning 'far' and high values meaning 'close'. It applies to the 'rbf', 'poly', and 'sigmoid' kernels and is ignored by the 'linear' kernel.

- Kernel: Specifies the kernel type to be used in the algorithm. Common kernels are 'linear', 'poly' (polynomial), 'rbf' (radial basis function), and 'sigmoid'.

In [24]:
# Hyper-parameter values to try; every combination of the three lists is fitted.
# NOTE: gamma has no effect for kernel='linear', so those candidates are repeated.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=["linear", "rbf", "poly", "sigmoid"],
)

# Base estimator that GridSearchCV configures for each candidate
svm = SVC()

# Exhaustive search scored on the single predefined validation fold, all cores
grid_search = GridSearchCV(svm, param_grid, cv=ps, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

Fitting 1 folds for each of 64 candidates, totalling 64 fits


In [25]:
# Report the winning configuration and its score on the predefined validation fold
for caption, value in (
    ("Best parameters found: ", grid_search.best_params_),
    ("Best cross-validation score: ", grid_search.best_score_),
):
    print(caption, value)

Best parameters found:  {'C': 10, 'gamma': 1, 'kernel': 'poly'}
Best cross-validation score:  0.9523809523809523


In [29]:
# evaluate this model on the held-out test set
test_indices = df_annotations[df_annotations["dataset"] == "test"].index
X_test, y_test = features[test_indices], labels[test_indices]

# best_estimator_ is the model GridSearchCV refit with the best parameters on
# everything that was passed to fit(), i.e. the combined train + validation data
best_svm = grid_search.best_estimator_
y_pred = best_svm.predict(X_test)

# run the evaluation (previously the predictions were computed but never scored)
print("Test Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


## Set up the bias mitigations

In [None]:
# Evaluate the selected hyper-parameters on the validation split.
# best_svm stays bound to the refit model; it is NOT used for the validation
# score below because GridSearchCV refit it on X, which already contains X_val,
# so scoring it on X_val would be an in-sample (inflated) evaluation.
best_svm = grid_search.best_estimator_

# Fit a fresh SVC with the selected parameters on the training split only.
# The grid search started from a default SVC(), so SVC(**best_params_) is the
# same configuration as the winning candidate.
val_svm = SVC(**grid_search.best_params_).fit(X_train, y_train)
y_pred = val_svm.predict(X_val)

print("Validation Accuracy: ", accuracy_score(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))