# Expand the main active speech modelling table

For the revision of the journal publication, the table of "best performing regression models for the prediction of the self-reported mental wellbeing measures" is to be expanded. Since there are several different model types and individual speech tasks, the best compromise seems to be to focus on the "Features" column: here, we now distinguish between "eGeMAPS" features and "wav2vec 2.0"-based embeddings.

This notebook iterates over all targets, reads each target's "raw" modelling results .csv file, and collects the best-performing model (the one with the highest `ccc_conf_mean`). This is done separately for the eGeMAPS- and the wav2vec 2.0-based models, and for both the denoised and the noisy audio condition.

Previously, this was done manually, which resulted in `compiled-merged_denoised_noisy-paper.ods`.

In [1]:
import os
import pandas as pd
import re

# Root directory of the composed results; the compiled table is written here too
path_results_base = "../../results/mwas/composed"

# Sub-directory holding the per-target results tables that are read below
dir_composed = os.path.join(path_results_base, "outer_cv_loso_fixed-continuous-surveys_all")

# File stem (without extension) of the compiled table, saved as .csv
file_out = "compiled-merged_denoised_noisy-paper-proper_loso-expanded"

# Publication display name -> (results file name, normalization range).
# The insertion order determines the order in which the targets are processed.
dct_targets = {
    display_name: (file_name, normalization_range)
    for display_name, file_name, normalization_range in [
        ("WHO-5", "who_5_percentage_score_corrected", "raw_0_100-normalized_0_1"),
        ("PSS-10", "pss_10_total_score", "raw_0_40-normalized_0_1"),
        ("PHQ-8", "phq_8_total_score", "raw_0_24-normalized_0_1"),
        ("Stress-now", "stress_current", "raw_0_100-normalized_0_1"),
        ("Stress-work", "stress_work_tasks", "raw_0_100-normalized_0_1"),
    ]
}

In [2]:
# Options that are plugged into the filter criteria string.
# An asterisk is a placeholder: speech task and model type are left unconstrained.
cur_speech_task = cur_model = "*"

# Display name of the audio condition -> its path fragment (cur_denoising)
dct_denoising = {
    "Denoised": (
        "filter-snr_tuckey_cutoff_7-clipped_default/"
        "denoising-facebook_denoiser-master64-converted_int16_dithering-"
        "loudness_normalization-no_loudness_normalization"
    ),
    "Noisy": (
        "filter-no_audio_quality_blacklist/"
        "denoising-no_denoising-"
        "loudness_normalization-no_loudness_normalization"
    ),
}

# Display name of the feature set -> its path fragment (may contain a placeholder)
dct_features = dict(eGeMAPS="eGeMAPSv02", W2V2="wav2vec2-variant*")

In [3]:
# Iterate over all saved tables for each target and condense them to the entries
# to be included in the paper: for every (target, denoising, features)
# combination, keep the single row with the highest `ccc_conf_mean`.

# Best performing model rows, one single-row DataFrame per filter iteration.
# They are concatenated once after the loop instead of growing a DataFrame
# row by row (avoids repeated copying and concatenation with an empty frame).
lst_best_rows = []

for cur_target, (cur_target_val, cur_normalization) in dct_targets.items():
    # One results table per target, named after the target's file name
    cur_df = pd.read_csv(
        os.path.join(dir_composed, "results-" + cur_target_val + "-conf.csv")
    )

    for cur_denoise, cur_denoise_val in dct_denoising.items():
        for cur_features, cur_features_val in dct_features.items():
            str_filter = (
                f"/{cur_target_val}/{cur_normalization}/{cur_speech_task}/cohort-all/{cur_denoise_val}/"
                f"devaice_vad-min_segment_length-0.76-max_segment_length-6.0-segment_start_delay-0.15-segment_end_delay-0.25/"
                f"{cur_features_val}/fixed_test_speakers/type-no_feature_selection/"
                f"sklearn_standard_scaler/personalisation-none/loso/no_inner_cv/no_inner_cv/{cur_model}"
            )

            # Convert into a regex pattern: escape all special characters, then
            # turn the (escaped) placeholder asterisks into ".*"
            cur_regex_pattern = re.escape(str_filter).replace(r"\*", ".*")
            # na=False: rows with a missing path count as "no match" instead of
            # producing a NaN entry that cannot be used as a boolean mask
            cur_df_filter = cur_df[
                cur_df["path"].str.match(f"^{cur_regex_pattern}$", na=False)
            ]

            # Skip and notify if no matching row was returned
            if cur_df_filter.empty:
                print(f"WARNING: No matching rows for {cur_target} ({cur_target_val}); {cur_denoise}; {cur_features}")
                continue

            # Fetch the row with the best performing model (the highest
            # `ccc_conf_mean`) and prepare the columns for the paper table
            # processing --> needs the raw labels instead of the nice ones.
            # rename/assign return new frames, so nothing is written into a slice
            # of `cur_df` (no SettingWithCopyWarning).
            cur_best_row = (
                cur_df_filter.sort_values(by="ccc_conf_mean", ascending=False)
                .iloc[[0]]
                # Just capitalise the "features" column
                .rename(columns={"features": "Features"})
                .assign(
                    Target=cur_target_val,
                    # The path starts with "/", so element 3 is the speech task
                    Task=lambda best: best["path"].str.split("/").str[3],
                    # For the survey: have hard coded
                    # `outer_cv_loso_fixed-continuous-surveys_all` and add
                    # "-noisy" if no denoising was done
                    Survey="outer_cv_loso_fixed-continuous-surveys_all" + ("-noisy" if cur_denoise == "Noisy" else ""),
                )
            )

            lst_best_rows.append(cur_best_row)

# Single concatenation; fall back to an empty DataFrame if nothing matched at all
df_best = (
    pd.concat(lst_best_rows, ignore_index=True) if lst_best_rows else pd.DataFrame()
)

In [4]:
# Display the compiled table of best performing models for a visual check
df_best

Unnamed: 0,ccc_conf_mean,ccc_conf_low,ccc_conf_high,lower_bound_larger_null,task,Features,pearson_cc-test,concordance_cc-test,mean_squared_error-test,mean_absolute_error-test,...,mean_squared_error-train-agg-average,mean_absolute_error-train-agg-average,pearson_cc-train-agg-median,concordance_cc-train-agg-median,mean_squared_error-train-agg-median,mean_absolute_error-train-agg-median,path,Target,Task,Survey
0,0.354083,0.027664,0.567943,True,speechtasks-nilago-happy,eGeMAPSv02,,,0.025808,0.133634,...,0.008271797,0.07507299,0.80807,0.783139,0.008541957,0.07674245,/who_5_percentage_score_corrected/raw_0_100-no...,who_5_percentage_score_corrected,speechtasks-nilago-happy,outer_cv_loso_fixed-continuous-surveys_all
1,0.348391,0.012134,0.539457,True,speechtasks-sustained_utterance-a,wav2vec2-variant-wav2vec2-large-robust-ft-libr...,,,0.024404,0.127393,...,5.746305e-08,0.0001738971,0.999999,0.999999,5.103562e-08,0.0001619295,/who_5_percentage_score_corrected/raw_0_100-no...,who_5_percentage_score_corrected,speechtasks-sustained_utterance-a,outer_cv_loso_fixed-continuous-surveys_all
2,0.361475,0.03144,0.514476,True,speechtasks-nilago-happy,eGeMAPSv02,,,0.019839,0.116789,...,0.005532489,0.06798505,0.892274,0.838539,0.005686809,0.0688229,/who_5_percentage_score_corrected/raw_0_100-no...,who_5_percentage_score_corrected,speechtasks-nilago-happy,outer_cv_loso_fixed-continuous-surveys_all-noisy
3,0.288002,0.063689,0.481959,True,speechtasks-nilago-neutral,wav2vec2-variant-wav2vec2-large-robust-ft-libr...,,,0.030445,0.139364,...,1.16948e-13,2.62874e-07,1.0,1.0,5.467561e-14,1.769513e-07,/who_5_percentage_score_corrected/raw_0_100-no...,who_5_percentage_score_corrected,speechtasks-nilago-neutral,outer_cv_loso_fixed-continuous-surveys_all-noisy
4,0.082212,-0.099834,0.276044,False,speechtasks-spontaneous-work_tasks,eGeMAPSv02,,,0.029347,0.140539,...,0.007570402,0.07292284,0.798526,0.743616,0.007806128,0.07473995,/pss_10_total_score/raw_0_40-normalized_0_1/sp...,pss_10_total_score,speechtasks-spontaneous-work_tasks,outer_cv_loso_fixed-continuous-surveys_all
5,0.162778,-0.160369,0.336582,False,speechtasks-nilago-neutral,wav2vec2-variant-wav2vec2-large-robust-ft-libr...,,,0.030705,0.141387,...,2.020112e-14,1.091152e-07,1.0,1.0,1.14667e-14,8.360411e-08,/pss_10_total_score/raw_0_40-normalized_0_1/sp...,pss_10_total_score,speechtasks-nilago-neutral,outer_cv_loso_fixed-continuous-surveys_all
6,0.098678,-0.082959,0.261394,False,speechtasks-spontaneous-work_tasks,eGeMAPSv02,,,0.028052,0.1349,...,0.008130248,0.0751081,0.770091,0.693969,0.008619898,0.07791728,/pss_10_total_score/raw_0_40-normalized_0_1/sp...,pss_10_total_score,speechtasks-spontaneous-work_tasks,outer_cv_loso_fixed-continuous-surveys_all-noisy
7,0.117312,0.020579,0.236165,True,speechtasks-spontaneous-work_tasks,wav2vec2-variant-wav2vec2-large-robust-12-ft-e...,,,0.0465,0.175782,...,0.0003043259,0.01345229,0.992341,0.991276,0.0003293464,0.0136287,/pss_10_total_score/raw_0_40-normalized_0_1/sp...,pss_10_total_score,speechtasks-spontaneous-work_tasks,outer_cv_loso_fixed-continuous-surveys_all-noisy
8,0.06447,-0.16422,0.318883,False,speechtasks-sustained_utterance-a,eGeMAPSv02,,,0.040698,0.166915,...,0.004713939,0.06075994,0.841036,0.832387,0.005517887,0.06604461,/phq_8_total_score/raw_0_24-normalized_0_1/spe...,phq_8_total_score,speechtasks-sustained_utterance-a,outer_cv_loso_fixed-continuous-surveys_all
9,0.194951,-0.037036,0.38324,False,speechtasks-nilago-happy,wav2vec2-variant-wav2vec2-large-robust-ft-libr...,,,0.031299,0.142139,...,1.039512e-13,2.514588e-07,1.0,1.0,7.120281e-14,2.041125e-07,/phq_8_total_score/raw_0_24-normalized_0_1/spe...,phq_8_total_score,speechtasks-nilago-happy,outer_cv_loso_fixed-continuous-surveys_all


In [5]:
# Write the compiled table as .csv into the results base directory (no index column)
path_out = os.path.join(path_results_base, f"{file_out}.csv")
df_best.to_csv(path_out, index=False)