In [62]:
import os
from pathlib import Path

# Dataset root. The machine-specific default below can be overridden without
# editing the notebook by exporting AVI_DATA_DIR (e.g. AVI_DATA_DIR=../data).
DEFAULT_DATA_DIR = (
    "/home/HDD12TB/datasets/images/emotions/ACMMM25/AVI/AVI_Challenge_dataset"
)
data_dir = os.environ.get("AVI_DATA_DIR", DEFAULT_DATA_DIR)

DATA_DIR_PATH = Path(data_dir)
# Sub-directories of the dataset root used for the train / validation splits.
PREPROCESSED_TRAIN_DIR_PATH = DATA_DIR_PATH / "preprocessed_train_data"
PREPROCESSED_VAL_DIR_PATH = DATA_DIR_PATH / "preprocessed_val_data"

In [63]:
import pandas as pd

# Load the train and validation tables from their preprocessed directories.
train_csv_path = PREPROCESSED_TRAIN_DIR_PATH / "train_data.csv"
val_csv_path = PREPROCESSED_VAL_DIR_PATH / "val_data.csv"

df_train, df_val = pd.read_csv(train_csv_path), pd.read_csv(val_csv_path)

# Show the training table as the cell output.
df_train

Unnamed: 0,id,age,work_experience,Honesty-Humility,Extraversion,Agreeableness,Conscientiousness,Integrity,Collegiality,Social_versatility,...,gender_2,gender_3,gender_4,education_1,education_2,education_3,education_4,education_5,education_6,education_7
0,60fccc84440f8e8c82ca0288,0.061224,0.000000,3.781250,3.806250,4.006250,3.668750,4.0,3.9,4.0,...,1,0,0,0,1,0,0,0,0,0
1,639881e5774909b88ee9389a,0.693878,0.666667,3.918750,3.475000,3.950000,4.100000,3.6,2.4,2.5,...,0,0,0,0,0,1,0,0,0,0
2,60e1d5f866e681d7e33fd01c,0.448980,0.529412,3.075000,2.912500,2.612500,3.562500,3.5,2.5,2.5,...,0,1,0,0,0,1,0,0,0,0
3,62ceba5fdab49dfc90278cbb,0.102041,0.098039,3.970000,2.725000,3.312500,3.381250,3.2,3.1,3.3,...,1,0,0,0,0,1,0,0,0,0
4,6134113b9f3f724a7cf9e390,0.204082,0.235294,3.037500,2.825000,3.062500,2.931250,3.5,3.5,3.5,...,1,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
445,5fbd371dc57ae80a38ba00c2,0.306122,0.196078,3.608333,3.416667,3.716667,3.558333,3.5,4.0,3.7,...,0,0,0,0,0,0,1,0,0,0
446,5a95fee189de8200013eddd4,0.448980,0.392157,3.368750,2.812500,2.912500,2.893750,3.5,2.9,2.9,...,1,0,0,0,0,1,0,0,0,0
447,629a6460a0232815dab01a01,0.265306,0.137255,3.368750,3.075000,3.443750,3.162500,2.9,3.1,2.9,...,1,0,0,0,0,0,1,0,0,0
448,5db5dacdb2d499000a8f4439,0.285714,0.254902,3.162500,3.118750,3.643750,3.062500,3.0,3.8,3.5,...,0,0,1,0,0,0,0,1,0,0


In [64]:
# Metadata feature columns: two numeric fields followed by the one-hot
# gender_1..gender_4 and education_1..education_7 indicator columns.
gender_columns = [f"gender_{level}" for level in range(1, 5)]
education_columns = [f"education_{level}" for level in range(1, 8)]

meta_columns = ["age", "work_experience"] + gender_columns + education_columns
meta_columns

['age',
 'work_experience',
 'gender_1',
 'gender_2',
 'gender_3',
 'gender_4',
 'education_1',
 'education_2',
 'education_3',
 'education_4',
 'education_5',
 'education_6',
 'education_7']

In [65]:
# Target columns, split into two groups of labels.
personality_labels = [
    "Honesty-Humility", "Extraversion", "Agreeableness", "Conscientiousness",
]
performance_labels = [
    "Integrity", "Collegiality", "Social_versatility",
    "Development_orientation", "Hireability",
]

# Suffixes appended to a row's `id` to build its six video ids:
# questions 1-2 carry the "generic" tag, questions 3-6 the "personality" tag.
generic_postfixes = [f"_q{number}_generic" for number in (1, 2)]
personality_postfixes = [f"_q{number}_personality" for number in (3, 4, 5, 6)]
postfixes = generic_postfixes + personality_postfixes

In [66]:
import os
import numpy as np
import torch


def load_tensor(path, dim):
    """Load the tensor stored at ``path`` onto the CPU, or zeros if it is absent.

    Args:
        path: Location of a file readable by ``torch.load``.
        dim: Size passed to ``torch.zeros`` to build the fallback tensor.

    Returns:
        The object returned by ``torch.load`` when ``path`` exists, otherwise
        ``torch.zeros(dim)``.
    """
    # A missing file is not an error: the caller receives an all-zero placeholder.
    if not os.path.exists(path):
        return torch.zeros(dim)
    return torch.load(f=path, map_location="cpu")


def _load_embedding(path, dim, pool=False):
    """Load one embedding through ``load_tensor`` as a NumPy vector of shape ``(dim,)``.

    Args:
        path: File handed to ``load_tensor``.
        dim: Expected vector length (also the size of the zero fallback).
        pool: When true, an array whose shape is not ``(dim,)`` is averaged
            over axis 0 before the shape check.

    Raises:
        AssertionError: If the resulting shape is not ``(dim,)``.
    """
    embedding = load_tensor(path, dim).numpy()
    if pool and embedding.shape != (dim,):
        embedding = np.mean(embedding, axis=0)
    assert embedding.shape == (dim,), f"{path}: {embedding.shape = }"
    return embedding


def _embedding_frame(vectors, prefix, index):
    """Stack equal-length vectors into a frame with ``{prefix}{i}`` columns."""
    matrix = np.stack(vectors)
    columns = [f"{prefix}{i}" for i in range(matrix.shape[1])]
    return pd.DataFrame(matrix, index=index, columns=columns)


def prepare(df: pd.DataFrame, preprocessed_dir_path: Path):
    """Expand every row of ``df`` into one feature row per question video.

    For each row and each suffix in ``postfixes`` the video, audio and text
    embeddings are read from ``preprocessed_dir_path/<modality>/<video_id>.pt``
    (``load_tensor`` substitutes zeros for missing files). Video and audio
    arrays that are not already 1280-long vectors are mean-pooled over axis 0;
    text embeddings must already be 768-long vectors.

    Args:
        df: Table containing ``id`` plus the columns listed in ``meta_columns``,
            ``personality_labels`` and ``performance_labels``.
        preprocessed_dir_path: Directory holding the ``video``, ``audio`` and
            ``text`` embedding sub-directories.

    Returns:
        A frame with ``video_id``, the metadata and label columns, followed by
        ``video_embedding_*``, ``audio_embedding_*`` and ``text_embedding_*``
        columns (one column per embedding component).

    Raises:
        AssertionError: If an embedding does not end up with the expected shape.
        ValueError: If ``df`` has no rows (there is nothing to stack).
    """
    records = []
    video_vectors, audio_vectors, text_vectors = [], [], []

    # The row index is unused; `_` avoids shadowing the builtin `id`.
    for _, row in df.iterrows():
        # Metadata and targets are the same for all of this row's videos.
        shared_fields = {
            **row[meta_columns],
            **row[personality_labels],
            **row[performance_labels],
        }

        for postfix in postfixes:
            video_id = f"{row['id']}{postfix}"
            file_name = f"{video_id}.pt"

            video_vectors.append(
                _load_embedding(
                    preprocessed_dir_path / "video" / file_name, 1280, pool=True
                )
            )
            audio_vectors.append(
                _load_embedding(
                    preprocessed_dir_path / "audio" / file_name, 1280, pool=True
                )
            )
            text_vectors.append(
                _load_embedding(preprocessed_dir_path / "text" / file_name, 768)
            )

            records.append({"video_id": video_id, **shared_fields})

    data = pd.DataFrame(records)

    # Build each embedding block from one stacked matrix instead of creating a
    # pandas Series per row, which is far slower for 1280-wide vectors.
    return pd.concat(
        [
            data,
            _embedding_frame(video_vectors, "video_embedding_", data.index),
            _embedding_frame(audio_vectors, "audio_embedding_", data.index),
            _embedding_frame(text_vectors, "text_embedding_", data.index),
        ],
        axis=1,
    )

In [67]:
# Build the per-video feature table for the training split and display it.
df_train_prepared = prepare(
    df=df_train,
    preprocessed_dir_path=PREPROCESSED_TRAIN_DIR_PATH,
)
df_train_prepared

Unnamed: 0,video_id,age,work_experience,gender_1,gender_2,gender_3,gender_4,education_1,education_2,education_3,...,text_embedding_758,text_embedding_759,text_embedding_760,text_embedding_761,text_embedding_762,text_embedding_763,text_embedding_764,text_embedding_765,text_embedding_766,text_embedding_767
0,60fccc84440f8e8c82ca0288_q1_generic,0.061224,0.000000,0,1,0,0,0,1,0,...,0.096142,-0.921477,0.825072,0.240941,1.982402,0.535274,-0.181860,-0.093772,-0.256280,-0.267936
1,60fccc84440f8e8c82ca0288_q2_generic,0.061224,0.000000,0,1,0,0,0,1,0,...,-1.296438,-0.641331,0.072822,0.253385,1.077477,0.854237,1.117200,1.119741,-0.308438,-0.471200
2,60fccc84440f8e8c82ca0288_q3_personality,0.061224,0.000000,0,1,0,0,0,1,0,...,-0.674353,-0.147880,-0.212935,0.250764,1.432331,0.031019,0.004382,0.113433,-0.294863,0.195114
3,60fccc84440f8e8c82ca0288_q4_personality,0.061224,0.000000,0,1,0,0,0,1,0,...,-0.290917,-0.733758,-0.208021,0.553123,1.474697,0.582965,0.721032,1.036157,-0.660484,-0.120091
4,60fccc84440f8e8c82ca0288_q5_personality,0.061224,0.000000,0,1,0,0,0,1,0,...,-0.575766,-0.226763,0.219687,0.863166,0.968806,0.784264,0.563985,1.324149,-0.501479,0.113265
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2695,5dcf4136cf8e7e086ea019e4_q2_generic,0.755102,0.607843,0,1,0,0,0,0,0,...,-0.915135,-0.087923,1.092396,0.426978,1.073431,0.755084,1.083986,1.379110,-0.298818,-0.355299
2696,5dcf4136cf8e7e086ea019e4_q3_personality,0.755102,0.607843,0,1,0,0,0,0,0,...,-0.815160,-0.356259,0.947747,0.440619,1.599348,0.764290,0.175387,1.099688,-0.355254,-0.446140
2697,5dcf4136cf8e7e086ea019e4_q4_personality,0.755102,0.607843,0,1,0,0,0,0,0,...,-0.573689,-0.697616,1.410202,0.445274,1.974736,0.791108,0.492092,0.267261,-0.178613,0.088313
2698,5dcf4136cf8e7e086ea019e4_q5_personality,0.755102,0.607843,0,1,0,0,0,0,0,...,-0.358174,-0.159602,0.697456,0.673996,1.208710,0.669012,-0.227742,0.287806,-0.572061,-0.094700


In [68]:
# Build the per-video feature table for the validation split and display it.
df_val_prepared = prepare(
    df=df_val,
    preprocessed_dir_path=PREPROCESSED_VAL_DIR_PATH,
)
df_val_prepared

Unnamed: 0,video_id,age,work_experience,gender_1,gender_2,gender_3,gender_4,education_1,education_2,education_3,...,text_embedding_758,text_embedding_759,text_embedding_760,text_embedding_761,text_embedding_762,text_embedding_763,text_embedding_764,text_embedding_765,text_embedding_766,text_embedding_767
0,586935b7e16d530001b34787_q1_generic,0.346939,0.352941,1,0,0,0,0,0,0,...,-0.173014,0.317974,0.692428,0.605857,1.050828,0.711249,-0.818504,0.356438,-0.046942,-0.461593
1,586935b7e16d530001b34787_q2_generic,0.346939,0.352941,1,0,0,0,0,0,0,...,-0.699817,-0.789402,0.013374,0.228430,1.108527,0.707975,1.146970,0.886414,-0.079292,-0.726086
2,586935b7e16d530001b34787_q3_personality,0.346939,0.352941,1,0,0,0,0,0,0,...,-0.496868,0.004400,-0.549262,0.550217,0.779507,0.357362,0.264824,0.733700,-0.524597,0.356702
3,586935b7e16d530001b34787_q4_personality,0.346939,0.352941,1,0,0,0,0,0,0,...,0.838556,-0.507689,-0.219423,-0.403355,-0.457249,-0.021687,0.045043,0.533603,-0.170943,0.460590
4,586935b7e16d530001b34787_q5_personality,0.346939,0.352941,1,0,0,0,0,0,0,...,-0.295093,-0.079473,0.447235,0.951361,1.036902,0.820830,1.116837,1.400838,-0.736479,0.249015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
379,62825aef9d9d5544867a6fc0_q2_generic,0.326531,0.254902,0,1,0,0,0,0,0,...,-0.245663,-0.212908,-0.128858,-0.008825,1.077369,0.691638,0.302160,0.988163,-0.518368,0.270336
380,62825aef9d9d5544867a6fc0_q3_personality,0.326531,0.254902,0,1,0,0,0,0,0,...,-0.554743,0.097804,0.273779,0.788736,1.160453,0.912091,0.713523,1.720036,-0.375534,0.226152
381,62825aef9d9d5544867a6fc0_q4_personality,0.326531,0.254902,0,1,0,0,0,0,0,...,-0.148410,-0.385089,-0.476643,0.972988,1.356813,0.575262,0.957502,0.891740,-0.421274,-0.237689
382,62825aef9d9d5544867a6fc0_q5_personality,0.326531,0.254902,0,1,0,0,0,0,0,...,-0.962064,0.049639,0.401081,0.408098,1.222078,0.308050,0.359009,1.403726,-0.323210,0.119454


In [69]:
from sklearn.metrics import mean_squared_error, r2_score


def get_metrics(model, modalities) -> pd.DataFrame:
    """Fit ``model`` once per target on the train split and score it on validation.

    Reads the module-level ``df_train_prepared`` and ``df_val_prepared`` frames.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is refit
            for every label.
        modalities: Container of modality names. Each of ``"meta"``,
            ``"video"``, ``"audio"`` and ``"text"`` found in it contributes its
            feature columns.

    Returns:
        One row per label with the columns ``label``, ``mse`` and ``r2``.
    """

    def columns_with_prefix(prefix):
        return [name for name in df_train_prepared.columns if name.startswith(prefix)]

    # Insertion order fixes the feature order: meta, video, audio, text.
    columns_by_modality = {
        "meta": meta_columns,
        "video": columns_with_prefix("video_embedding_"),
        "audio": columns_with_prefix("audio_embedding_"),
        "text": columns_with_prefix("text_embedding_"),
    }
    feature_columns = [
        column
        for modality, columns in columns_by_modality.items()
        if modality in modalities
        for column in columns
    ]

    X_train = df_train_prepared[feature_columns]
    X_val = df_val_prepared[feature_columns]

    def score(label):
        # Train on this label, then evaluate on the validation rows.
        model.fit(X_train, df_train_prepared[label])
        predictions = model.predict(X_val)
        targets = df_val_prepared[label]
        return {
            "label": label,
            "mse": mean_squared_error(targets, predictions),
            "r2": r2_score(targets, predictions),
        }

    return pd.DataFrame(
        [score(label) for label in personality_labels + performance_labels]
    )

---


In [70]:
from itertools import combinations

# Every non-empty subset of the modalities, ordered by subset size.
modalities = ["meta", "video", "audio", "text"]
modalities_combinations = [
    subset
    for size in range(1, len(modalities) + 1)
    for subset in combinations(modalities, size)
]

modalities_combinations

[('meta',),
 ('video',),
 ('audio',),
 ('text',),
 ('meta', 'video'),
 ('meta', 'audio'),
 ('meta', 'text'),
 ('video', 'audio'),
 ('video', 'text'),
 ('audio', 'text'),
 ('meta', 'video', 'audio'),
 ('meta', 'video', 'text'),
 ('meta', 'audio', 'text'),
 ('video', 'audio', 'text'),
 ('meta', 'video', 'audio', 'text')]

In [71]:
def min_mse(df: pd.DataFrame, tol: float = 1e-10) -> pd.DataFrame:
    """Keep, for every label, the rows whose MSE equals that label's minimum.

    The input frame is not modified: the per-label minimum is computed as a
    separate Series rather than written into ``df`` as a temporary column.

    Args:
        df: Frame with at least the columns ``label`` and ``mse``.
        tol: Absolute tolerance within which an MSE counts as equal to the
            minimum, so ties are all kept.

    Returns:
        A new frame with the same columns as ``df`` containing only the
        best-scoring rows, in their original order.
    """
    best_per_row = df.groupby("label")["mse"].transform("min")
    # Keep rows whose mse is close enough to the minimum for their label.
    is_best = (df["mse"] - best_per_row).abs().le(tol)
    return df[is_best]

## Ridge


In [72]:
from sklearn.linear_model import Ridge

# Evaluate Ridge regression on every modality combination.
ridge_frames = []
for combination in modalities_combinations:
    try:
        model = Ridge(random_state=42)
        metrics_df = get_metrics(model, combination)
        metrics_df["modalities"] = " ".join(combination)
        ridge_frames.append(metrics_df)
    except Exception as error:
        # Best-effort sweep: report the failure with its cause and move on.
        print(f"Failed on modalities = {combination}: {error!r}")

# Concatenate once instead of growing a frame inside the loop; each run's own
# row index is kept unchanged.
ridge_df = pd.concat(ridge_frames) if ridge_frames else pd.DataFrame()
ridge_df

Unnamed: 0,label,mse,r2,modalities
0,Honesty-Humility,0.185258,0.022837,meta
1,Extraversion,0.284540,-0.009561,meta
2,Agreeableness,0.215566,0.014734,meta
3,Conscientiousness,0.178307,0.053004,meta
4,Integrity,0.204263,0.063549,meta
...,...,...,...,...
4,Integrity,0.274934,-0.260442,meta video audio text
5,Collegiality,0.430524,-0.329733,meta video audio text
6,Social_versatility,0.458500,-0.445527,meta video audio text
7,Development_orientation,0.314081,-0.332085,meta video audio text


In [73]:
# Lowest-MSE rows per label among the Ridge results.
min_mse(df=ridge_df)

Unnamed: 0,label,mse,r2,modalities
0,Honesty-Humility,0.185258,0.022837,meta
2,Agreeableness,0.215566,0.014734,meta
3,Conscientiousness,0.178307,0.053004,meta
4,Integrity,0.204263,0.063549,meta
1,Extraversion,0.237275,0.158136,audio
7,Development_orientation,0.22417,0.049245,audio
5,Collegiality,0.281066,0.13189,meta audio
6,Social_versatility,0.277597,0.12481,meta audio
8,Hireability,0.297085,0.246022,meta audio


## LinearSVR


In [74]:
from sklearn.svm import LinearSVR

# Evaluate LinearSVR on every modality combination.
svr_frames = []
for combination in modalities_combinations:
    try:
        model = LinearSVR(random_state=42, max_iter=2000)
        metrics_df = get_metrics(model, combination)
        metrics_df["modalities"] = " ".join(combination)
        svr_frames.append(metrics_df)
    except Exception as error:
        # Best-effort sweep: report the failure with its cause and move on.
        print(f"Failed on modalities = {combination}: {error!r}")

# Concatenate once instead of growing a frame inside the loop; each run's own
# row index is kept unchanged.
svr_df = pd.concat(svr_frames) if svr_frames else pd.DataFrame()
svr_df









Unnamed: 0,label,mse,r2,modalities
0,Honesty-Humility,0.187095,0.013148,meta
1,Extraversion,0.291297,-0.033536,meta
2,Agreeableness,0.214895,0.017801,meta
3,Conscientiousness,0.175894,0.065822,meta
4,Integrity,0.208995,0.041857,meta
...,...,...,...,...
4,Integrity,0.336776,-0.543961,meta video audio text
5,Collegiality,0.525378,-0.622704,meta video audio text
6,Social_versatility,0.556898,-0.755751,meta video audio text
7,Development_orientation,0.402034,-0.705113,meta video audio text


In [75]:
# Lowest-MSE rows per label among the LinearSVR results.
min_mse(df=svr_df)

Unnamed: 0,label,mse,r2,modalities
0,Honesty-Humility,0.187095,0.013148,meta
2,Agreeableness,0.214895,0.017801,meta
3,Conscientiousness,0.175894,0.065822,meta
4,Integrity,0.208995,0.041857,meta
5,Collegiality,0.291806,0.098717,meta
6,Social_versatility,0.291036,0.08244,meta
7,Development_orientation,0.216431,0.082071,meta
1,Extraversion,0.227538,0.192686,audio
8,Hireability,0.319091,0.190172,meta audio


## CatBoost


In [76]:
from catboost import CatBoostRegressor

# Evaluate CatBoost regression on every modality combination.
catboost_frames = []
for combination in modalities_combinations:
    try:
        model = CatBoostRegressor(
            iterations=1500, random_seed=42, loss_function="RMSE", verbose=False
        )

        metrics_df = get_metrics(model, combination)
        metrics_df["modalities"] = " ".join(combination)
        catboost_frames.append(metrics_df)
    except Exception as error:
        # Best-effort sweep: report the failure with its cause and move on.
        print(f"Failed on modalities = {combination}: {error!r}")

# Concatenate once instead of growing a frame inside the loop; each run's own
# row index is kept unchanged.
catboost_df = pd.concat(catboost_frames) if catboost_frames else pd.DataFrame()
catboost_df

Unnamed: 0,label,mse,r2,modalities
0,Honesty-Humility,0.322959,-0.703476,meta
1,Extraversion,0.308306,-0.093885,meta
2,Agreeableness,0.345148,-0.577533,meta
3,Conscientiousness,0.299659,-0.591504,meta
4,Integrity,0.210316,0.035799,meta
...,...,...,...,...
4,Integrity,0.229383,-0.051614,meta video audio text
5,Collegiality,0.290783,0.101875,meta video audio text
6,Social_versatility,0.306197,0.034642,meta video audio text
7,Development_orientation,0.226050,0.041274,meta video audio text


In [77]:
# Lowest-MSE rows per label among the CatBoost results.
min_mse(df=catboost_df)

Unnamed: 0,label,mse,r2,modalities
7,Development_orientation,0.223181,0.05344,audio
4,Integrity,0.197862,0.092897,meta audio
5,Collegiality,0.27803,0.141266,meta audio
2,Agreeableness,0.206238,0.057366,meta text
1,Extraversion,0.226837,0.19517,audio text
0,Honesty-Humility,0.191104,-0.007997,meta video audio
3,Conscientiousness,0.176885,0.060554,meta audio text
6,Social_versatility,0.280602,0.115336,meta audio text
8,Hireability,0.313898,0.203351,meta audio text
