# Notebook for PDEM embeddings

In [1]:
import warnings
warnings.filterwarnings('ignore')

import json
import os
import string
import time

import pandas as pd
import numpy as np

from pathlib import Path

In [2]:
# Input/output locations for the transcript-formatting step (relative to the notebook's working directory).
json_dir = Path(r"../data/dvlog_text")  # directory with one transcript JSON file per video
annotations_file = Path(r"../DVlog/dataset/dvlog_labels_v2.csv")  # CSV with video_id, label, gender and dataset (split) columns
save_dir = Path(r"../DVlog/dataset/PDEM/")  # root folder; transcripts are written into one sub-folder per split

# Switch for the transcript-formatting cell: it only converts the JSON files to CSV when this is True.
run_formatting = False

In [3]:
# read the per-video annotation table from disk and preview its first rows
df_annotations = pd.read_csv(filepath_or_buffer=annotations_file)
df_annotations.head(n=5)

Unnamed: 0,video_id,label,gender,dataset
0,0,1,f,train
1,1,1,f,test
2,2,1,m,train
3,3,1,m,train
4,4,1,f,test


In [4]:
if run_formatting:
    # maps the split names used in the annotation file to the output sub-folder names
    save_dict = {
        "train": "training",
        "test": "test",
        "val": "validation"
    }

    # resolve the split of every video once up front instead of filtering the
    # annotations for each file; on duplicated video ids the first row wins
    split_per_video = (
        df_annotations.drop_duplicates(subset="video_id")
        .set_index("video_id")["dataset"]
        .to_dict()
    )

    # (file name, reason) for every input that could not be converted
    skipped = []

    # loop over each text file and build the transcript files
    # (sorted, because os.listdir returns entries in arbitrary order)
    for json_file in sorted(os.listdir(json_dir)):

        # the video id is the part of the file name before the first underscore
        filename, _ = os.path.splitext(json_file)
        try:
            video_id = int(filename.split("_")[0])
        except ValueError:
            skipped.append((json_file, "file name does not start with a numeric video id"))
            continue

        # check in which folder to save; None when the video or its split is unknown
        save_folder = save_dict.get(split_per_video.get(video_id))
        if save_folder is None:
            skipped.append((json_file, f"no known dataset split for video_id {video_id}"))
            continue

        json_path = os.path.join(json_dir, json_file)
        # explicit encoding so the result does not depend on the platform default
        with open(json_path, encoding="utf-8") as current_file:
            json_dict = json.load(current_file)

        segments = json_dict.get("segments")
        if segments is None:
            skipped.append((json_file, "JSON has no 'segments' entry"))
            continue

        # extract the sentences with their begin and end timestamp
        video_sents = [
            (sent_dict.get("start"), sent_dict.get("end"), sent_dict.get("text"))
            for sent_dict in segments
        ]

        # save as dataframe (columns stay unnamed, so the CSV header is 0;1;2)
        df = pd.DataFrame(video_sents)

        # make sure the split folder exists before writing into it
        target_dir = os.path.join(save_dir, save_folder)
        os.makedirs(target_dir, exist_ok=True)
        save_file = os.path.join(target_dir, f"{filename}.csv")

        df.to_csv(save_file, index=False, sep=';')

    # report everything that was left out so missing transcripts do not go unnoticed
    for json_file, reason in skipped:
        print(f"skipped {json_file}: {reason}")

In [7]:
# NOTE: these are machine-specific absolute paths; adjust them when running elsewhere
pdem_features_path = r"E:/master/data/PDEM/features"
pdem_indexfiles_path = r"E:/master/data/PDEM/index_files"
pdem_features_output_path = r"E:/master/data/PDEM/pdem-dataset"


def _load_pdem_subset(index_file):
    """Join one PDEM index file with the feature pickle of the same subset.

    The subset name is the part of the index file name that follows
    "index_file_" (extension removed); the features are read from
    "vad_features_<subset>.pkl" inside ``pdem_features_path``. The index of the
    feature frame is moved into regular columns and the two frames are placed
    side by side, so rows are matched purely by position.

    Parameters
    ----------
    index_file : str
        File name (not a path) of a ";"-separated index file inside
        ``pdem_indexfiles_path``.

    Returns
    -------
    pd.DataFrame
        The index-file columns followed by the feature columns.

    Raises
    ------
    ValueError
        If the index file and the feature file have a different number of rows.
    """
    filename, _ = os.path.splitext(index_file)
    subset_name = filename.split("index_file_")[-1]

    # select the appropriate feature file
    # NOTE: read_pickle can execute arbitrary code; only load pickles from a trusted source
    features_file_path = os.path.join(pdem_features_path, f"vad_features_{subset_name}.pkl")
    features_df = pd.read_pickle(features_file_path).reset_index()
    index_df = pd.read_csv(os.path.join(pdem_indexfiles_path, index_file), sep=";")

    # concat(axis=1) would silently pad the shorter frame with NaN rows
    if len(index_df) != len(features_df):
        raise ValueError(
            f"row count mismatch for subset '{subset_name}': "
            f"{len(index_df)} index rows vs {len(features_df)} feature rows"
        )

    return pd.concat([index_df, features_df], axis=1)


# get the test and validation dataframes
df_test = _load_pdem_subset("pdem_index_file_test.csv")
df_validation = _load_pdem_subset("pdem_index_file_validation.csv")

# get the training dataframe from all the subsets
# (sorted, because os.listdir returns entries in arbitrary order)
training_index_files = sorted(x for x in os.listdir(pdem_indexfiles_path) if 'training' in x)
if not training_index_files:
    raise FileNotFoundError(f"no training index files found in {pdem_indexfiles_path}")

frames = [_load_pdem_subset(index_file) for index_file in training_index_files]

# combine all the training frames
df_training = pd.concat(frames, ignore_index=True)
df_training.head()

Unnamed: 0,Participant_ID,saved_file_name,file_path,text,duration,file,start,end,hidden_states-0,hidden_states-1,...,hidden_states-1017,hidden_states-1018,hidden_states-1019,hidden_states-1020,hidden_states-1021,hidden_states-1022,hidden_states-1023,arousal,dominance,valence
0,0,0_AUDIO_0000_0.6_6.02.wav,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,So I wanted to come on here and sit down with...,5420,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,0 days,0 days 00:00:05.420000,-0.007392,0.006028,...,0.008151,-0.093368,-0.027413,0.160171,0.007032,0.005502,0.004331,-0.119866,0.04141,0.325436
1,0,0_AUDIO_0001_7.12_12.2.wav,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,Really struggling lately with this whole TTC ...,5080,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,0 days,0 days 00:00:05.080000,-0.007417,0.006814,...,0.008103,-0.048084,-0.066491,0.135287,0.0069,0.005879,0.004231,0.007269,0.183045,0.428093
2,0,0_AUDIO_0002_13.16_17.7.wav,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,Have gotten to a point in this journey where ...,4540,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,0 days,0 days 00:00:04.540000,-0.007424,0.005932,...,0.008105,-0.049269,-0.1207,0.169499,0.007048,0.005482,0.003954,0.019353,0.185052,0.314137
3,0,0_AUDIO_0003_18.69_24.01.wav,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,"Honestly, I don't even really know who I am a...",5320,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,0 days,0 days 00:00:05.320000,-0.007253,0.006085,...,0.007954,-0.008859,-0.007376,0.077006,0.007087,0.005913,0.00421,-0.001093,0.148365,0.336719
4,0,0_AUDIO_0004_25.26_29.78.wav,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,But I think it's important that I share how I...,4520,C:\Users\Stan\Desktop\PDEM\outputs\0\0_AUDIO_0...,0 days,0 days 00:00:04.520000,-0.007231,0.006,...,0.008155,-0.070882,-0.027365,0.074493,0.006968,0.005486,0.004779,-0.082072,0.078991,0.286873


In [13]:
# save all the features to their corresponding output files
vad_labels = ["arousal", "dominance", "valence"]

# every column that is not a PDEM hidden state gets dropped for the plain embeddings
drop_pdem_labels = [x for x in df_training.columns if "hidden_states" not in x]
# for the concatenated variant the VAD scores are kept; they are excluded by name so the
# result does not depend on them being the last three columns
drop_pdemvad_labels = [x for x in drop_pdem_labels if x not in vad_labels]

for df in [df_training, df_test, df_validation]:

    # one pass per split: groupby hands over the rows of each subject directly
    # (sort=False keeps the subjects in order of first appearance)
    for subject_id, subject_rows in df.groupby("Participant_ID", sort=False):

        # setup the directory
        subject_output_path = os.path.join(pdem_features_output_path, str(subject_id))
        os.makedirs(subject_output_path, exist_ok=True)

        # order the features based on time: the segment number is the field that
        # follows "AUDIO_" in the saved file name
        timestep = subject_rows["saved_file_name"].apply(lambda x: int(x.split("AUDIO_")[1].split("_")[0]))
        # positional reordering returns a new frame, so the source frame is never modified
        subject_df = subject_rows.iloc[np.argsort(timestep.to_numpy(), kind="stable")]

        # select only the pdem embeddings and save them
        df_pdem = subject_df.drop(labels=drop_pdem_labels, axis=1)
        np.save(os.path.join(subject_output_path, "pdem.npy"), df_pdem.to_numpy())

        # select the concatenated pdem and VAD embeddings and save them
        df_pdemvad = subject_df.drop(labels=drop_pdemvad_labels, axis=1)
        np.save(os.path.join(subject_output_path, "pdemvad.npy"), df_pdemvad.to_numpy())

        # z-normalise the VAD scores per subject (arousal dominance valence) and save them
        df_pdemvad_func = df_pdemvad.copy()
        for vad_label in vad_labels:
            values = df_pdemvad_func[vad_label]
            spread = values.std()
            # the sample std is NaN for a single segment and 0 for constant scores; dividing
            # by 1 instead yields zeros rather than a column full of NaN
            if not spread > 0:
                spread = 1.0
            df_pdemvad_func[vad_label] = (values - values.mean()) / spread
        np.save(os.path.join(subject_output_path, "pdemvad_func.npy"), df_pdemvad_func.to_numpy())