In [1]:
from constants import SDK_PATH, DATA_PATH, WORD_EMB_PATH, CACHE_PATH
import sys
import os
import requests
import numpy as np
from mmsdk import mmdatasdk as md

# Make the SDK importable from the configured location.
# NOTE(review): `mmsdk` is already imported at the top of this cell, i.e. before
# SDK_PATH is appended, so this append only helps imports that happen later —
# confirm whether the `mmsdk` import should move below this block.
if SDK_PATH is None:
    # Raise instead of calling exit(0): a missing path is an error, not a clean
    # exit, and the interactive `exit` helper is not meant for program code.
    raise RuntimeError("SDK path is not specified! Please specify first in constants/paths.py")
if SDK_PATH not in sys.path:
    # Guarded so that re-running this cell does not pile up duplicate entries.
    sys.path.append(SDK_PATH)

# Create folder for storing data if it doesn't exist.
# exist_ok=True already tolerates an existing directory, so no separate check is needed.
os.makedirs(DATA_PATH, exist_ok=True)

# Helper function to download a file from a URL
def download_file(url, dest, timeout=30):
    """Stream the resource at ``url`` into the file ``dest``.

    The body is written to ``<dest>.part`` first and only renamed to ``dest``
    once the whole response has been received. A failed or interrupted
    download therefore never leaves a truncated file at ``dest`` that a later
    "already downloaded" existence check would mistake for a complete one.

    Args:
        url: Address to fetch.
        dest: Path of the file to create (overwritten if it already exists).
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled server cannot hang the notebook forever.

    Returns:
        None. Request errors are reported with ``print`` rather than raised;
        other errors (e.g. ``OSError`` while writing) propagate after the
        partial file has been removed.
    """
    partial = f"{dest}.part"
    try:
        print(f"Downloading from {url}...")
        # The context manager releases the streamed connection even on error.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # skip empty keep-alive chunks
                        f.write(chunk)
        # Publish the file only after it is complete.
        os.replace(partial, dest)
        print(f"Downloaded: {dest}")
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
    finally:
        # Only still present when the download did not complete.
        if os.path.exists(partial):
            os.remove(partial)

# Function to get the filename from the URL (last segment)
def get_filename_from_url(url):
    """Return the file name encoded in ``url``.

    Only the path component of the URL is considered, so a query string or
    fragment (``.../file.csd?dl=1``) does not end up in the file name, and a
    trailing slash does not produce an empty name.

    Args:
        url: The URL (or plain slash-separated path) to inspect.

    Returns:
        The last non-empty segment of the URL path, or ``''`` when the URL
        has no path at all.
    """
    # Local import keeps this helper self-contained within the cell.
    from urllib.parse import urlsplit

    path = urlsplit(url).path
    return path.rstrip('/').rsplit('/', 1)[-1]

# Download function that iterates over dataset features
def download_dataset_features(feature_dict, feature_type):
    """Fetch every feature file listed in ``feature_dict`` into ``DATA_PATH``.

    Args:
        feature_dict: Mapping of feature name -> download URL.
        feature_type: Human-readable category label used only in the
            progress messages.

    Files are stored under the name taken from the URL (not the dictionary
    key). A file that already exists at the target path is reported and
    skipped instead of being downloaded again.
    """
    for name, source_url in feature_dict.items():
        print(f"\nProcessing {feature_type} feature: {name}")

        target = os.path.join(DATA_PATH, get_filename_from_url(source_url))

        if os.path.exists(target):
            print(f"{feature_type} feature '{name}' already downloaded at {target}")
            continue

        download_file(source_url, target)

# Dataset whose feature URLs are downloaded below
DATASET = md.cmu_mosei

# (attribute on DATASET, label passed to the downloader, wording used in messages)
_FEATURE_GROUPS = (
    ("highlevel", "High-level", "high-level features"),
    ("raw", "Raw", "raw features"),
    ("labels", "Label", "labels"),
)

# Each group gets its own try/except so a failure in one does not stop the others.
for _attr, _feature_type, _description in _FEATURE_GROUPS:
    try:
        print(f"\nStarting download of {_description}...")
        download_dataset_features(getattr(DATASET, _attr), _feature_type)
    except Exception as e:
        print(f"Error downloading {_description}: {e}")



Starting download of high-level features...

Processing High-level feature: glove_vectors
High-level feature 'glove_vectors' already downloaded at ./data/CMU_MOSEI_TimestampedWordVectors.csd

Processing High-level feature: COVAREP
High-level feature 'COVAREP' already downloaded at ./data/CMU_MOSEI_COVAREP.csd

Processing High-level feature: OpenFace_2
High-level feature 'OpenFace_2' already downloaded at ./data/CMU_MOSEI_VisualOpenFace2.csd

Processing High-level feature: FACET 4.2
High-level feature 'FACET 4.2' already downloaded at ./data/CMU_MOSEI_VisualFacet42.csd

Starting download of raw features...

Processing Raw feature: words
Raw feature 'words' already downloaded at ./data/CMU_MOSEI_TimestampedWords.csd

Processing Raw feature: phones
Raw feature 'phones' already downloaded at ./data/CMU_MOSEI_TimestampedPhones.csd

Starting download of labels...

Processing Label feature: All Labels
Label feature 'All Labels' already downloaded at ./data/CMU_MOSEI_Labels.csd


In [2]:
# Show what is in the data folder, one entry per line
data_files = os.listdir(DATA_PATH)
print(*data_files, sep='\n')

CMU_MOSEI_COVAREP.csd
CMU_MOSEI_Labels.csd
CMU_MOSEI_TimestampedPhones.csd
CMU_MOSEI_TimestampedWords.csd
CMU_MOSEI_TimestampedWordVectors.csd
CMU_MOSEI_VisualFacet42.csd
CMU_MOSEI_VisualOpenFace2.csd


We have multiple files, which can be broadly classified into three categories: highlevel, raw, and labels.

<strong>Highlevel</strong> contains the extracted features for each modality (e.g., OpenFace facial landmarks, openSMILE acoustic features), while <strong>raw</strong> contains the raw transcripts and phonemes.

We have multiple files with the .csd extension. This stands for <strong>computational sequences</strong>, which is the underlying data structure for all features in the SDK. 

<strong> Highlevel features: </strong>
- CMU_MOSEI_VisualFacet42.csd (Video modality)
- CMU_MOSEI_VisualOpenFace2.csd (Video Modality)
- CMU_MOSEI_COVAREP.csd (Audio Modality)
- CMU_MOSEI_TimestampedWordVectors.csd (Text Modality)

## Loading the data

In [3]:
# Names of the computational sequences used for each modality
text_field = 'CMU_MOSEI_TimestampedWordVectors'
visual_field = 'CMU_MOSEI_VisualFacet42'
acoustic_field = 'CMU_MOSEI_COVAREP'

features = [text_field, visual_field, acoustic_field]

# Map each feature name to the path of its .csd file and hand that to the SDK
recipe = {name: os.path.join(DATA_PATH, f"{name}.csd") for name in features}
dataset = md.mmdataset(recipe)

[92m[1m[2024-10-29 21:20:03.402] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_TimestampedWordVectors.csd ...
[94m[1m[2024-10-29 21:20:03.948] | Status  | [0mChecking the integrity of the <glove_vectors> computational sequence ...
[94m[1m[2024-10-29 21:20:03.948] | Status  | [0mChecking the format of the data in <glove_vectors> computational sequence ...


                                                                                                                       

[92m[1m[2024-10-29 21:20:06.218] | Success | [0m<glove_vectors> computational sequence data in correct format.
[94m[1m[2024-10-29 21:20:06.228] | Status  | [0mChecking the format of the metadata in <glove_vectors> computational sequence ...
[92m[1m[2024-10-29 21:20:06.228] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_VisualFacet42.csd ...
[94m[1m[2024-10-29 21:20:06.946] | Status  | [0mChecking the integrity of the <FACET 4.2> computational sequence ...
[94m[1m[2024-10-29 21:20:06.946] | Status  | [0mChecking the format of the data in <FACET 4.2> computational sequence ...


                                                                                                                       

[92m[1m[2024-10-29 21:20:09.403] | Success | [0m<FACET 4.2> computational sequence data in correct format.
[94m[1m[2024-10-29 21:20:09.403] | Status  | [0mChecking the format of the metadata in <FACET 4.2> computational sequence ...
[92m[1m[2024-10-29 21:20:09.411] | Success | [0mComputational sequence read from file ./data/CMU_MOSEI_COVAREP.csd ...
[94m[1m[2024-10-29 21:20:09.994] | Status  | [0mChecking the integrity of the <COVAREP> computational sequence ...
[94m[1m[2024-10-29 21:20:09.994] | Status  | [0mChecking the format of the data in <COVAREP> computational sequence ...


                                                                                                                       

[92m[1m[2024-10-29 21:20:11.981] | Success | [0m<COVAREP> computational sequence data in correct format.
[94m[1m[2024-10-29 21:20:11.981] | Status  | [0mChecking the format of the metadata in <COVAREP> computational sequence ...
[92m[1m[2024-10-29 21:20:11.981] | Success | [0mDataset initialized successfully ... 




To load the dataset, we need to tell the SDK which features we need and where they are stored. To do so, we construct a dictionary of the form `{feature_name: csd_path}` and pass it to the `mmdataset` object in the SDK.

From the highlevel features, VisualFacet is used for the video modality, since this file stores facial expression data extracted using the FACET tool. FACET analyzes microexpressions, including movements of facial muscles (like eyebrow raises or smiles) and emotional states (e.g., joy, anger). This makes it more suitable for emotion detection than OpenFace, which instead tracks facial landmarks, head poses, and eye gaze, offering detailed spatial and motion-related facial features across video frames.

COVAREP is used for audio-related features, and TimestampedWordVectors provides pre-trained GloVe embeddings, which capture semantic relationships and contextual meaning between words.

In [5]:
divider = "=" * 80

# Top-level keys of the loaded dataset
print(list(dataset.keys()))
print(divider)

# First ten entry ids of the visual sequence
video_ids = list(dataset[visual_field].keys())
print(video_ids[:10])
print(divider)

# Pick one entry and list what is stored for it
some_id = video_ids[15]
visual_entry = dataset[visual_field][some_id]
print(list(visual_entry.keys()))
print(divider)

print(list(visual_entry['intervals'].shape))
print(divider)

# Feature shapes of the same entry in each modality
for field in (visual_field, text_field, acoustic_field):
    print(list(dataset[field][some_id]['features'].shape))
print("Different modalities have different number of time steps!")

['CMU_MOSEI_TimestampedWordVectors', 'CMU_MOSEI_VisualFacet42', 'CMU_MOSEI_COVAREP']
['--qXJuDtHPw', '-3g5yACwYnA', '-3nNcZdcdvU', '-571d8cVauQ', '-6rXp3zJ3kc', '-9YyBTjo1zo', '-9y-fZ3swSY', '-AUZQgSxyPQ', '-Alixo7euuU', '-Eqdz5y4pEY']
['features', 'intervals']
[3658, 2]
[3658, 35]
[321, 300]
[12209, 74]
Different modalities have different number of time steps!


In [None]:
# we define a simple averaging function that does not depend on intervals
def avg(intervals: np.ndarray, features: np.ndarray) -> np.ndarray:
    """Collapse a block of feature rows into their mean along axis 0.

    Args:
        intervals: Accepted so the signature matches what is passed to
            ``collapse_functions``; it is not used by this function.
        features: Array of feature rows to average.

    Returns:
        ``np.average(features, axis=0)``, or ``features`` unchanged when the
        average cannot be computed (e.g. non-numeric data).
    """
    try:
        return np.average(features, axis=0)
    except Exception:
        # Best-effort fallback, but deliberately not a bare ``except:`` —
        # KeyboardInterrupt/SystemExit must still be able to stop the
        # long-running alignment that calls this function.
        return features

# Align the dataset to the text (word) sequence, collapsing with averaging.
# `collapse_functions` takes a list of functions; only `avg` is passed here.
# NOTE: this call is slow — in the recorded output below, about nine minutes pass
# between "Unify was called" and "Alignment starting", and the alignment itself
# then iterates over 3836 entries.
# NOTE(review): the return value is discarded, so `dataset` is presumably updated
# in place — confirm before re-running this cell on an already-aligned dataset.
dataset.align(text_field, collapse_functions=[avg])

[94m[1m[2024-10-29 21:20:17.275] | Status  | [0mUnify was called ...
[92m[1m[2024-10-29 21:20:17.280] | Success | [0mUnify completed ...
[94m[1m[2024-10-29 21:20:17.280] | Status  | [0mPre-alignment based on <CMU_MOSEI_TimestampedWordVectors> computational sequence started ...
[94m[1m[2024-10-29 21:26:04.796] | Status  | [0mPre-alignment done for <CMU_MOSEI_COVAREP> ...
[94m[1m[2024-10-29 21:28:13.500] | Status  | [0mPre-alignment done for <CMU_MOSEI_VisualFacet42> ...
[94m[1m[2024-10-29 21:29:23.355] | Status  | [0mAlignment starting ...


Overall Progress:   0%|                                          | 0/3836 [00:00<?, ? Computational Sequence Entries/s]
  0%|                                                                                   | 0/183 [00:00<?, ? Segments/s][A
Aligning --qXJuDtHPw:   0%|                                                             | 0/183 [00:00<?, ? Segments/s][A
Aligning --qXJuDtHPw:  19%|█████████▍                                         | 34/183 [00:00<00:00, 336.99 Segments/s][A
Aligning --qXJuDtHPw:  45%|███████████████████████▏                           | 83/183 [00:00<00:00, 421.87 Segments/s][A
Aligning --qXJuDtHPw:  72%|███████████████████████████████████▊              | 131/183 [00:00<00:00, 447.41 Segments/s][A
Aligning --qXJuDtHPw:  99%|█████████████████████████████████████████████████▋| 182/183 [00:00<00:00, 470.30 Segments/s][A
Overall Progress:   0%|                                  | 1/3836 [00:00<26:44,  2.39 Computational Sequence Entries/s][A
  0%|              