# Compose main modelling table for publication

Use the semi-manually compiled table `compiled-merged_denoised_noisy-paper-proper_loso.ods`, which lists the best-performing model for each target and survey, for noisy and denoised data respectively.  
Take that table and convert it to LaTeX source. In the process, change the raw, source-like strings that describe, e.g., the models into nice, presentable labels. The resulting table can be pasted directly into the LaTeX project of the publication.

In [30]:
import os
import pandas as pd

# Location of the composed results and file name of the curated table
path_results_base = "../../results/mwas/composed"
table_paper = "compiled-merged_denoised_noisy-paper-proper_loso.ods"
table_paper_path = os.path.join(path_results_base, table_paper)

# Load the spreadsheet with the ODF engine. The frame as read is kept in
# df_paper_raw; df_paper is an independent copy that the following cells edit.
df_paper_raw = pd.read_excel(table_paper_path, engine="odf")
df_paper = df_paper_raw.copy()

In [31]:
# Add a "Quality" column directly after "Target".
# A row is labelled "Noisy" when its raw "Survey" string contains "noisy",
# otherwise "Denoised".
quality_labels = [
    "Noisy" if "noisy" in survey else "Denoised"
    for survey in df_paper["Survey"]
]
target_index = df_paper.columns.get_loc("Target")
df_paper.insert(target_index + 1, "Quality", quality_labels)

In [32]:
# The model identifier is the directory name directly in front of
# "/models/results-compiled.yaml" at the end of each result path.
model_dirs = df_paper["path"].str.extract(
    r'/([^/]+)/models/results-compiled\.yaml$', expand=False
)

# Substring -> presentable label; the first matching entry wins, identifiers
# without a match are kept as they are.
model_labels = {"XGBRegressor": "XGBr", "LinearRegression": "LR", "SVR": "SVR"}
df_paper["Model"] = model_dirs.apply(
    lambda raw: next(
        (label for key, label in model_labels.items() if key in raw), raw
    )
)

# Move "Model" so that it sits right behind the "Task" column.
# NOTE: quality_index holds the position of "Task" here, despite its name.
quality_index = df_paper.columns.get_loc("Task")
model_column = df_paper.pop("Model")
df_paper.insert(quality_index + 1, "Model", model_column)

In [33]:
# Map raw survey identifiers to presentable labels. The entries are checked
# in order and the first substring match wins; unmatched values stay as is.
survey_labels = {"surveys_all": "All", "daily": "Daily", "weekly": "Weekly"}
df_paper["Survey"] = df_paper["Survey"].apply(
    lambda raw: next(
        (label for key, label in survey_labels.items() if key in raw), raw
    )
)

In [34]:
# Raw target identifier (substring) -> label used in the paper, checked in
# the listed order.
_TARGET_LABELS = (
    ("who_5_percentage_score_corrected", "WHO-5"),
    ("pss_10_total_score", "PSS-10"),
    ("stress_current", "Stress-now"),
    ("phq_8_total_score", "PHQ-8"),
    ("stress_work_tasks", "Stress-work"),
)


def _target_label(raw):
    """Return the label of the first identifier contained in *raw*, else *raw*."""
    for identifier, label in _TARGET_LABELS:
        if identifier in raw:
            return label
    return raw


df_paper["Target"] = df_paper["Target"].apply(_target_label)

In [35]:
# Raw speech-task identifier (substring) -> label used in the paper, checked
# in the listed order.
_TASK_LABELS = (
    ("nilago-neutral", "Nilago neutral"),
    ("speechtasks-nilago-happy", "Nilago happy"),
    ("speechtasks-standardized_tasks", "All"),
    ("speechtasks-counting", "Counting"),
    ("speechtasks-spontaneous-work_tasks", "Spontaneous"),
    ("speechtasks-sustained_utterance-a", "Sustained /a/"),
)


def _task_label(raw):
    """Return the label of the first identifier contained in *raw*, else *raw*."""
    for identifier, label in _TASK_LABELS:
        if identifier in raw:
            return label
    return raw


df_paper["Task"] = df_paper["Task"].apply(_task_label)

In [36]:
# Raw feature-set identifier (substring) -> label used in the paper, checked
# in the listed order.
_FEATURE_LABELS = (
    ("wav2vec2-large-robust-ft-libri-960h-num_hidden_layers-0", "W2V2-LR-LIBRI"),
    ("wav2vec2-large-robust-12-ft-emotion-msp-dim-num_hidden_layers-0", "W2V2-LR-MSP"),
    ("wav2vec2-large-xlsr-53-num_hidden_layers-0", "W2V2-L-XLSR"),
    ("eGeMAPSv02", "eGeMAPS"),
)


def _feature_label(raw):
    """Return the label of the first identifier contained in *raw*, else *raw*."""
    for identifier, label in _FEATURE_LABELS:
        if identifier in raw:
            return label
    return raw


df_paper["Features"] = df_paper["Features"].apply(_feature_label)

In [37]:
# Presentation order of the labels within each sort key
survey_order = ["All", "Daily", "Weekly"]
target_order = ["WHO-5", "PSS-10", "PHQ-8", "Stress-now", "Stress-work"]
quality_order = ["Denoised", "Noisy"]

# Turn the sort keys into ordered categoricals so that sorting follows the
# orders above instead of the alphabet. Labels that are missing from an order
# list become NaN in the categorical column.
sort_keys = {"Survey": survey_order, "Target": target_order, "Quality": quality_order}
for column, order in sort_keys.items():
    df_paper[column] = pd.Categorical(df_paper[column], categories=order, ordered=True)

# Sort by survey first, then target, then quality
df_paper = df_paper.sort_values(by=list(sort_keys))

In [38]:
# Collapse the three CCC columns into one display string per row,
# formatted as "mean (low - high)" with three decimals each.
df_paper["CCC"] = [
    f"{mean:.3f} ({low:.3f} - {high:.3f})"
    for mean, low, high in zip(
        df_paper["ccc_conf_mean"], df_paper["ccc_conf_low"], df_paper["ccc_conf_high"]
    )
]

# Mark every row whose "lower_bound_larger_null" flag is True with a trailing "*"
flagged = df_paper["lower_bound_larger_null"].eq(True)
df_paper.loc[flagged, "CCC"] += "*"

# Place "CCC" directly behind the "Quality" column
quality_index = df_paper.columns.get_loc("Quality")
ccc_column = df_paper.pop("CCC")
df_paper.insert(quality_index + 1, "CCC", ccc_column)

In [39]:
# Columns that are no longer needed: the CCC parts are already merged into
# "CCC", and "Survey" is dropped because only the all-surveys rows are shown.
# NOTE: without "Survey", highlight_best_ccc and add_horizontal_lines cannot
# be applied to df_paper, since both read that column.
obsolete_columns = [
    "Survey",
    "ccc_conf_mean",
    "ccc_conf_low",
    "ccc_conf_high",
    "lower_bound_larger_null",
]
df_paper = df_paper.drop(columns=obsolete_columns)

# Keep everything up to and including "Features"; all later columns are cut
features_index = df_paper.columns.get_loc("Features")
last_kept = features_index + 1
df_paper = df_paper.iloc[:, :last_kept]

In [40]:
def highlight_best_ccc(df):
    """Return a copy of *df* with the best "CCC" entry per survey in bold.

    For every distinct value in the "Survey" column, the row with the largest
    CCC point estimate gets its "CCC" string wrapped in ``\\textbf{...}``.
    The point estimate is the first whitespace-separated token of the "CCC"
    string (e.g. ``0.354`` in ``"0.354 (0.028 - 0.568)*"``).

    The input frame is left untouched: no cell is rewritten and no helper
    column is added to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a "Survey" column and a string column "CCC".

    Returns
    -------
    pandas.DataFrame
        Copy of *df* with the highlighted "CCC" entries.

    Raises
    ------
    KeyError
        If "Survey" or "CCC" is missing from *df*.
    ValueError
        If the leading token of a "CCC" string is not a number.
    """
    result = df.copy()

    # Numeric point estimates, kept outside the frame so that no temporary
    # column has to be added and removed again.
    ccc_values = result["CCC"].str.split().str[0].astype(float)

    for survey in result["Survey"].unique():
        group_values = ccc_values[result["Survey"] == survey].dropna()
        # A missing survey label matches no row, and a group may consist of
        # missing CCC values only; there is nothing to highlight in either case.
        if group_values.empty:
            continue
        best_index = group_values.idxmax()
        result.at[best_index, "CCC"] = f"\\textbf{{{result.at[best_index, 'CCC']}}}"

    return result

# Highlight the best performing model's "CCC" value per "Survey" section
# df_paper = highlight_best_ccc(df_paper)

In [41]:
# Show the formatted table
df_paper

Unnamed: 0,Target,Quality,CCC,Task,Model,Features
0,WHO-5,Denoised,0.354 (0.028 - 0.568)*,Nilago happy,SVR,eGeMAPS
1,WHO-5,Noisy,0.361 (0.031 - 0.514)*,Nilago happy,SVR,eGeMAPS
2,PSS-10,Denoised,0.163 (-0.160 - 0.337),Nilago neutral,LR,W2V2-LR-LIBRI
3,PSS-10,Noisy,0.117 (0.021 - 0.236)*,Spontaneous,LR,W2V2-LR-MSP
4,PHQ-8,Denoised,0.195 (-0.037 - 0.383),Nilago happy,LR,W2V2-LR-LIBRI
5,PHQ-8,Noisy,0.213 (-0.003 - 0.377),Nilago happy,LR,W2V2-LR-LIBRI
6,Stress-now,Denoised,0.178 (0.033 - 0.313)*,Nilago happy,LR,W2V2-LR-MSP
7,Stress-now,Noisy,0.194 (0.042 - 0.320)*,Nilago happy,LR,W2V2-LR-MSP
8,Stress-work,Denoised,0.238 (0.040 - 0.373)*,Sustained /a/,SVR,eGeMAPS
9,Stress-work,Noisy,0.227 (0.005 - 0.373)*,Sustained /a/,SVR,eGeMAPS


In [42]:
# Add horizontal lines at the transitions between "Survey" categories
def add_horizontal_lines(df):
    latex_str = df.to_latex(index=False)
    lines = latex_str.splitlines()
    new_lines = []
    df_index = 0  # Initialize DataFrame row index
    for i, line in enumerate(lines):
        new_lines.append(line)
        # Check if the current line is a data line (not a header or formatting line)
        if i > 3 and i < len(lines) - 1:  # Skip the first three lines (header and \midrule)
            if df_index < len(df) - 1:  # Ensure df_index is within bounds
                prev_survey = df.iloc[df_index]["Survey"]
                curr_survey = df.iloc[df_index + 1]["Survey"]
                if prev_survey != curr_survey:
                    new_lines.append("\\hline")
            df_index += 1  # Increment DataFrame row index only for data lines
    return "\n".join(new_lines)

# Export the DataFrame to LaTeX format with horizontal lines
# latex_code = add_horizontal_lines(df_paper)


In [43]:
# Export the table to LaTeX without the index column and print the source
# so that it can be copied.
latex_code = df_paper.to_latex(index=False)
print(latex_code)

\begin{tabular}{llllll}
\toprule
Target & Quality & CCC & Task & Model & Features \\
\midrule
WHO-5 & Denoised & 0.354 (0.028 - 0.568)* & Nilago happy & SVR & eGeMAPS \\
WHO-5 & Noisy & 0.361 (0.031 - 0.514)* & Nilago happy & SVR & eGeMAPS \\
PSS-10 & Denoised & 0.163 (-0.160 - 0.337) & Nilago neutral & LR & W2V2-LR-LIBRI \\
PSS-10 & Noisy & 0.117 (0.021 - 0.236)* & Spontaneous & LR & W2V2-LR-MSP \\
PHQ-8 & Denoised & 0.195 (-0.037 - 0.383) & Nilago happy & LR & W2V2-LR-LIBRI \\
PHQ-8 & Noisy & 0.213 (-0.003 - 0.377) & Nilago happy & LR & W2V2-LR-LIBRI \\
Stress-now & Denoised & 0.178 (0.033 - 0.313)* & Nilago happy & LR & W2V2-LR-MSP \\
Stress-now & Noisy & 0.194 (0.042 - 0.320)* & Nilago happy & LR & W2V2-LR-MSP \\
Stress-work & Denoised & 0.238 (0.040 - 0.373)* & Sustained /a/ & SVR & eGeMAPS \\
Stress-work & Noisy & 0.227 (0.005 - 0.373)* & Sustained /a/ & SVR & eGeMAPS \\
\bottomrule
\end{tabular}



In [44]:
# Show the formatted table once more for reference
df_paper

Unnamed: 0,Target,Quality,CCC,Task,Model,Features
0,WHO-5,Denoised,0.354 (0.028 - 0.568)*,Nilago happy,SVR,eGeMAPS
1,WHO-5,Noisy,0.361 (0.031 - 0.514)*,Nilago happy,SVR,eGeMAPS
2,PSS-10,Denoised,0.163 (-0.160 - 0.337),Nilago neutral,LR,W2V2-LR-LIBRI
3,PSS-10,Noisy,0.117 (0.021 - 0.236)*,Spontaneous,LR,W2V2-LR-MSP
4,PHQ-8,Denoised,0.195 (-0.037 - 0.383),Nilago happy,LR,W2V2-LR-LIBRI
5,PHQ-8,Noisy,0.213 (-0.003 - 0.377),Nilago happy,LR,W2V2-LR-LIBRI
6,Stress-now,Denoised,0.178 (0.033 - 0.313)*,Nilago happy,LR,W2V2-LR-MSP
7,Stress-now,Noisy,0.194 (0.042 - 0.320)*,Nilago happy,LR,W2V2-LR-MSP
8,Stress-work,Denoised,0.238 (0.040 - 0.373)*,Sustained /a/,SVR,eGeMAPS
9,Stress-work,Noisy,0.227 (0.005 - 0.373)*,Sustained /a/,SVR,eGeMAPS


In [45]:
# Inspect the full result path of the first raw row
df_paper_raw.iloc[0]["path"]

'/who_5_percentage_score_corrected/raw_0_100-normalized_0_1/speechtasks-nilago-happy/cohort-all/filter-snr_tuckey_cutoff_7-clipped_default/denoising-facebook_denoiser-master64-converted_int16_dithering-loudness_normalization-no_loudness_normalization/devaice_vad-min_segment_length-0.76-max_segment_length-6.0-segment_start_delay-0.15-segment_end_delay-0.25/eGeMAPSv02/fixed_test_speakers/type-no_feature_selection/sklearn_standard_scaler/personalisation-none/loso/no_inner_cv/no_inner_cv/SVR-default-linear-c_0.1/models/results-compiled.yaml'

In [46]:
# Show the table as it was read from the file
df_paper_raw

Unnamed: 0,Survey,Target,ccc_conf_mean,ccc_conf_low,ccc_conf_high,lower_bound_larger_null,Task,Features,pearson_cc-test,concordance_cc-test,...,mean_absolute_error-test-agg-median,pearson_cc-train-agg-average,concordance_cc-train-agg-average,mean_squared_error-train-agg-average,mean_absolute_error-train-agg-average,pearson_cc-train-agg-median,concordance_cc-train-agg-median,mean_squared_error-train-agg-median,mean_absolute_error-train-agg-median,path
0,outer_cv_loso_fixed-continuous-surveys_all,who_5_percentage_score_corrected,0.354083,0.027664,0.567943,True,speechtasks-nilago-happy,eGeMAPSv02,,,...,0.122992,0.8152,0.790248,0.008271797,0.07507299,0.80807,0.783139,0.008541957,0.07674245,/who_5_percentage_score_corrected/raw_0_100-no...
1,outer_cv_loso_fixed-continuous-surveys_all-noisy,who_5_percentage_score_corrected,0.361475,0.03144,0.514476,True,speechtasks-nilago-happy,eGeMAPSv02,,,...,0.114892,0.896905,0.843099,0.005532489,0.06798505,0.892274,0.838539,0.005686809,0.0688229,/who_5_percentage_score_corrected/raw_0_100-no...
2,outer_cv_loso_fixed-continuous-surveys_all,pss_10_total_score,0.162778,-0.160369,0.336582,False,speechtasks-nilago-neutral,wav2vec2-variant-wav2vec2-large-robust-ft-libr...,,,...,0.139807,1.0,1.0,2.020112e-14,1.091152e-07,1.0,1.0,1.14667e-14,8.360411e-08,/pss_10_total_score/raw_0_40-normalized_0_1/sp...
3,outer_cv_loso_fixed-continuous-surveys_all-noisy,pss_10_total_score,0.117312,0.020579,0.236165,True,speechtasks-spontaneous-work_tasks,wav2vec2-variant-wav2vec2-large-robust-12-ft-e...,,,...,0.136889,0.992969,0.991945,0.0003043259,0.01345229,0.992341,0.991276,0.0003293464,0.0136287,/pss_10_total_score/raw_0_40-normalized_0_1/sp...
4,outer_cv_loso_fixed-continuous-surveys_all,phq_8_total_score,0.194951,-0.037036,0.38324,False,speechtasks-nilago-happy,wav2vec2-variant-wav2vec2-large-robust-ft-libr...,,,...,0.132149,1.0,1.0,1.039512e-13,2.514588e-07,1.0,1.0,7.120281e-14,2.041125e-07,/phq_8_total_score/raw_0_24-normalized_0_1/spe...
5,outer_cv_loso_fixed-continuous-surveys_all-noisy,phq_8_total_score,0.212767,-0.002674,0.377122,False,speechtasks-nilago-happy,wav2vec2-variant-wav2vec2-large-robust-ft-libr...,,,...,0.125532,1.0,1.0,6.688269e-14,2.04853e-07,1.0,1.0,6.672254e-14,2.048222e-07,/phq_8_total_score/raw_0_24-normalized_0_1/spe...
6,outer_cv_loso_fixed-continuous-surveys_all,stress_current,0.178202,0.033416,0.313354,True,speechtasks-nilago-happy,wav2vec2-variant-wav2vec2-large-robust-12-ft-e...,,,...,0.187073,1.0,1.0,3.203202e-14,1.421325e-07,1.0,1.0,2.46886e-14,1.253007e-07,/stress_current/raw_0_100-normalized_0_1/speec...
7,outer_cv_loso_fixed-continuous-surveys_all-noisy,stress_current,0.193889,0.042006,0.32043,True,speechtasks-nilago-happy,wav2vec2-variant-wav2vec2-large-robust-12-ft-e...,0.125211,0.062868,...,0.185099,1.0,1.0,3.257203e-14,1.415972e-07,1.0,1.0,2.195323e-14,1.153022e-07,/stress_current/raw_0_100-normalized_0_1/speec...
8,outer_cv_loso_fixed-continuous-surveys_all,stress_work_tasks,0.237528,0.039963,0.372818,True,speechtasks-sustained_utterance-a,eGeMAPSv02,,,...,0.186152,0.828743,0.802166,0.01125145,0.07627749,0.81073,0.787398,0.012191,0.08300083,/stress_work_tasks/raw_0_100-normalized_0_1/sp...
9,outer_cv_loso_fixed-continuous-surveys_all-noisy,stress_work_tasks,0.227159,0.005209,0.373449,True,speechtasks-sustained_utterance-a,eGeMAPSv02,,,...,0.18011,0.753065,0.714283,0.01500424,0.08622516,0.732933,0.699354,0.01598927,0.09326472,/stress_work_tasks/raw_0_100-normalized_0_1/sp...


In [47]:
# Inspect the "Model" labels of the formatted table
df_paper["Model"]

0    SVR
1    SVR
2     LR
3     LR
4     LR
5     LR
6     LR
7     LR
8    SVR
9    SVR
Name: Model, dtype: object