In [None]:
pip install opensmile

In [None]:
import os
import pandas as pd
import opensmile
import re
from tqdm import tqdm
import numpy as np

In [None]:
# Initialize the two OpenSMILE extractors used by process_audio_file.
# smile_lld: ComParE 2016 feature set at the LowLevelDescriptors level.
# process_audio_file summarises its output column-by-column via compute_lld_stats.
smile_lld = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.LowLevelDescriptors
)
# smile_func: eGeMAPS v02 feature set at the Functionals level.
# process_audio_file appends its values unchanged after the LLD statistics.
smile_func = opensmile.Smile(
    feature_set=opensmile.FeatureSet.eGeMAPSv02,
    feature_level=opensmile.FeatureLevel.Functionals
)

In [None]:
# Input/output locations for this preprocessing run.
output_path = '/content/drive/My Drive/BECU Capstone_Duress/Data_preprocessed/CREMAD DATA.csv'  # Preprocessed data: CSV written by the save step
input_dir = '/content/drive/My Drive/BECU Capstone_Duress/Data/CREMAD DATA'  # Original data: directory scanned for .wav files
# Create a DataFrame for storing the results.
# NOTE(review): `all_features` is not referenced by the other visible cells
# (they accumulate into `all_data`) - confirm whether it is still needed.
all_features = pd.DataFrame()

In [None]:
# Emotion code mapping: three-letter code found in the file name -> readable emotion name.
# NOTE(review): EMOTION_MAP is not referenced by the other visible cells; only
# DURESS_CODES is used when labelling.

EMOTION_MAP = {
    'ANG': 'anger',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
    'SAD': 'sad'
}

# Emotion codes labelled as duress (duress_label = 1): anger, sadness, fear.
# Every other code - including 'DIS' (disgust) and any code missing from
# EMOTION_MAP - is labelled 0 by process_audio_file.
DURESS_CODES = {'ANG', 'SAD', 'FEA'}  # sad, angry, fearful -> duress

In [None]:
# Optional actor metadata. Adjust `metadata_path` to where the demographics CSV
# actually lives; if the file cannot be read, the notebook continues with an
# empty lookup table instead of failing.
# NOTE(review): `actor_dict` is not referenced by the other visible cells - confirm
# whether this cell is still required.

metadata_path = '/content/drive/My Drive/Colab Notebooks/VideoDemographics.csv'
try:
    actor_metadata = pd.read_csv(metadata_path)
    # Make sure the ActorID column is of string type in order to match the ID in the file name
    actor_metadata['ActorID'] = actor_metadata['ActorID'].astype(str)
    # Create a dictionary indexed by ActorID for quick lookup
    # (one entry per ActorID, each value a dict of that row's remaining columns)
    actor_dict = actor_metadata.set_index('ActorID').to_dict('index')
    print(f"Successfully loaded {len(actor_metadata)} actors' metadata")
except Exception as e:
    # Best-effort: any failure (missing file, missing 'ActorID' column, ...) is
    # reported and replaced by an empty lookup. `actor_metadata` may be left
    # undefined on this path.
    print(f"Failed to load the metadata file: {str(e)}")
    actor_dict = {}

In [None]:
# CREMA-D file-name parser
# Expected pattern: 1001_DFA_ANG_XX.wav
def parse_filename(filename):
    """Pull the emotion code out of an underscore-delimited file name.

    The extension is dropped, the remainder is split on '_', and the third
    token is reported as the emotion code.

    Args:
        filename: File name such as '1001_DFA_ANG_XX.wav'.

    Returns:
        {'emotion_code': <third token>} on success, or None (after printing
        an error message) when the name cannot be parsed, e.g. when it has
        fewer than three tokens.
    """
    try:
        stem, _extension = os.path.splitext(filename)
        tokens = stem.split('_')
        emotion = tokens[2]
    except Exception as err:
        print(f"Error parsing {filename}: {str(err)}")
        return None
    return {'emotion_code': emotion}

In [None]:
def compute_lld_stats(X_lld):
    """Summarise each LLD column with its mean, std, max and min.

    Args:
        X_lld: DataFrame-like object exposing `.values`; statistics are taken
            along axis 0 (down the rows).

    Returns:
        A (1, 4 * n_columns) array laid out statistic-by-statistic: all column
        means first, then all stds, then all maxima, then all minima.
    """
    frames = X_lld.values
    reducers = (np.mean, np.std, np.max, np.min)
    # One (1, n_columns) row per statistic, joined side by side in reducer order.
    rows = [np.reshape(reduce(frames, axis=0), (1, -1)) for reduce in reducers]
    return np.hstack(rows)

In [None]:
def process_audio_file(filepath, filename):
    """Extract and combine acoustic features from a single audio file.

    Steps: parse the emotion code from `filename`, extract LLDs with
    `smile_lld`, summarise them with `compute_lld_stats`, extract functionals
    with `smile_func`, and merge everything into one flat record.

    Args:
        filepath: Path of the audio file handed to the OpenSMILE extractors.
        filename: Bare file name, used for label parsing and error messages.

    Returns:
        dict mapping feature name -> value, with the LLD statistics first
        ('<lld>_mean' ..., '<lld>_std' ..., '<lld>_max' ..., '<lld>_min' ...),
        then the functionals, then 'duress_label' (1 if the emotion code is in
        DURESS_CODES, else 0). Returns None when the file name cannot be
        parsed or when any step raises (the error is printed, not re-raised,
        so one bad file does not abort a directory run).
    """
    try:
        # Parse the file name to obtain the metadata
        meta = parse_filename(filename)
        if not meta:
            return None

        # 1. Extract Low-Level Descriptors (LLDs)
        X_lld = smile_lld.process_file(filepath)

        # 2. Compute LLD statistics -> shape (1, 4 * num_llds), laid out
        #    statistic-by-statistic: all means, all stds, all maxima, all minima.
        lld_stats = compute_lld_stats(X_lld)

        # 3. Extract Functionals (eGeMAPS)
        X_func = smile_func.process_file(filepath)

        # 4. Combine all acoustic features
        acoustic_features = np.hstack([lld_stats, X_func.values])

        # 5. Create feature names.
        #    The statistic loop must be the OUTER loop so the names follow the
        #    stat-major layout produced by compute_lld_stats; iterating columns
        #    first would attach nearly every value to the wrong name.
        lld_feature_names = [f"{col}_{stat}"
                             for stat in ('mean', 'std', 'max', 'min')
                             for col in X_lld.columns]
        func_feature_names = X_func.columns.tolist()
        all_feature_names = lld_feature_names + func_feature_names

        # 6. Convert to dictionary. zip() would silently truncate on a length
        #    mismatch, so fail loudly (handled by the except below) instead.
        values = acoustic_features[0]
        if len(all_feature_names) != len(values):
            raise ValueError(
                f"feature name/value count mismatch: "
                f"{len(all_feature_names)} names vs {len(values)} values"
            )
        features = dict(zip(all_feature_names, values))

        # Add the label derived from the file-name metadata
        features['duress_label'] = 1 if meta['emotion_code'] in DURESS_CODES else 0

        return features

    except Exception as e:
        print(f"Error handling {filename}: {str(e)}")
        return None

def process_directory(directory):
    """Process all .wav files directly inside `directory`.

    Args:
        directory: Folder whose immediate entries are scanned (no recursion).

    Returns:
        DataFrame with one row per successfully processed file, in sorted
        file-name order; an empty DataFrame when nothing was processed.
        Files for which process_audio_file returns a falsy result are skipped.
    """
    # os.listdir yields entries in arbitrary order, so sort to make the row
    # order of the resulting dataset reproducible across runs and machines.
    # The extension test is case-insensitive so '.WAV' files are not silently
    # dropped.
    audio_files = sorted(
        f for f in os.listdir(directory) if f.lower().endswith('.wav')
    )

    data_list = []
    for filename in tqdm(audio_files, desc=f"Processing directory: {directory}"):
        filepath = os.path.join(directory, filename)
        features = process_audio_file(filepath, filename)

        if features:
            data_list.append(features)

    return pd.DataFrame(data_list) if data_list else pd.DataFrame()

# Main processing flow
# `all_data` accumulates the extracted rows; it stays empty when processing
# fails or yields no rows, which the summary cell reports as a failure.
all_data = pd.DataFrame()

# Suppose the input directory directly contains all audio files
# NOTE(review): this repeats the `input_dir` value already assigned in the
# path-configuration cell - keep the two in sync or drop one of them.
input_dir = '/content/drive/My Drive/BECU Capstone_Duress/Data/CREMAD DATA'

# If there is a multi-level directory structure, this part needs to be modified
# (process_directory only looks at the directory's immediate entries).
try:
    print(f"Processing directory: {input_dir}")
    dir_data = process_directory(input_dir)
    if not dir_data.empty:
        all_data = pd.concat([all_data, dir_data], ignore_index=True)
except Exception as e:
    # Best-effort: a directory-level failure (e.g. the path does not exist) is
    # printed and leaves `all_data` empty rather than stopping the notebook.
    print(f"Error processing directory {input_dir}: {str(e)}")

In [None]:
# Print processing summary and write the dataset to `output_path`.
# NOTE: this cell rebinds `all_data` with its columns reordered (label first,
# remaining columns alphabetically).
if not all_data.empty:
    # Define column order for better readability
    column_order = [
        'duress_label'
    ]

    # Include all remaining feature columns (from OpenSMILE)
    remaining_columns = [col for col in all_data.columns if col not in column_order]

    # Final column ordering
    final_columns = column_order + sorted(remaining_columns)
    all_data = all_data[final_columns]

    # Print dataset statistics
    print("\n=== Dataset Summary ===")
    print(f"Total samples: {len(all_data)}")
    print(f"Features per sample: {len(remaining_columns)} acoustic features")

    # Label distribution analysis
    # NOTE(review): the printed legend says "0=neutral/positive", but label 0 is
    # assigned to every emotion code outside DURESS_CODES, which includes
    # 'DIS' (disgust) - consider rewording the legend.
    print("\n=== Label Distribution ===")
    print("\nDuress label (1=duress, 0=neutral/positive):")
    print(all_data['duress_label'].value_counts())


    # Save to CSV with verification
    try:
        all_data.to_csv(output_path, index=False)

        # Verify save operation by reading the file back and reporting its
        # row count and on-disk size.
        if os.path.exists(output_path):
            saved_data = pd.read_csv(output_path)
            print(f"\nSUCCESS: Saved {len(saved_data)} samples to:")
            print(output_path)
            print(f"File size: {os.path.getsize(output_path)/1024/1024:.2f} MB")
        else:
            # NOTE(review): this branch runs when the file does NOT exist after
            # to_csv returned, so "File created" in the message is misleading.
            print("\nWARNING: File created but cannot be verified")

    except Exception as e:
        # Covers both the write and the read-back (e.g. missing output directory).
        print(f"\nERROR saving file: {str(e)}")

else:
    # Nothing was extracted (empty directory, or every file failed upstream).
    print("\nPROCESSING FAILED: No valid data was processed")
