This notebook extracts prosody features (pitch, loudness, and probability of voicing) from `.wav` audio files, then merges them with the corresponding OpenFace features to produce a single file containing audio and visual features aligned over time.

In [2]:
# Convert .mp4 videos to .wav audio files using ffmpeg

import os
import subprocess
from tqdm import tqdm

# Source folder with the interview videos and destination folder for the
# extracted audio tracks.
video_dir = r"D:\RP2\RP2 Projects\AI Interview Coach\UnVid"
wav_dir = r"D:\RP2\RP2 Projects\AI Interview Coach\UnVid\wav"
os.makedirs(wav_dir, exist_ok=True)

# Match the extension case-insensitively (e.g. "CLIP.MP4"), consistent with
# how the .wav files are listed in the prosody-extraction step.
video_files = [f for f in os.listdir(video_dir) if f.lower().endswith(".mp4")]

# Names of videos whose conversion failed, kept for inspection after the loop.
failed_videos = []

for video in tqdm(video_files, desc="Extracting audio"):
    video_path = os.path.join(video_dir, video)
    wav_path = os.path.join(wav_dir, os.path.splitext(video)[0] + ".wav")

    # -y overwrites an existing output, -ac 1 downmixes to mono and
    # -ar 16000 resamples to 16 kHz. "-loglevel error" keeps stderr limited
    # to real error messages so it can be shown verbatim on failure.
    command = [
        "ffmpeg", "-y", "-loglevel", "error", "-i", video_path,
        "-ac", "1", "-ar", "16000", wav_path
    ]
    result = subprocess.run(command, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE)

    # Report failures instead of silently continuing; otherwise a missing or
    # truncated .wav only surfaces later as a confusing downstream error.
    if result.returncode != 0:
        failed_videos.append(video)
        print(f"❌ Failed: {video}")
        # errors="replace": ffmpeg may emit bytes that are not valid UTF-8.
        print(result.stderr.decode(errors="replace"))


Extracting audio: 100%|██████████| 1/1 [00:01<00:00,  1.11s/it]


In [3]:
# Extract prosody features using OpenSMILE
import os
import subprocess
from tqdm import tqdm

# ==== Setup Paths ====
# SMILExtract executable and the prosody configuration file passed to it.
opensmile_bin = r"D:\RP2\RP2 Projects\AI Interview Coach\opensmile-3.0.2\bin\SMILExtract.exe"
opensmile_conf = r"D:\RP2\RP2 Projects\AI Interview Coach\opensmile-3.0.2\config\prosody\prosodyShs.conf"


input_dir = r"D:\RP2\RP2 Projects\AI Interview Coach\UnVid\wav"  # .wav files
output_dir = r"D:\RP2\RP2 Projects\AI Interview Coach\UnVid\Testpros" # .csv output
os.makedirs(output_dir, exist_ok=True)

# ==== List of files ====
# Despite its name, this list holds the .wav files found in input_dir.
video_list = [f for f in os.listdir(input_dir) if f.lower().endswith(".wav")]

# Names of inputs for which SMILExtract returned a non-zero exit code.
failed_files = []

# ==== Loop through all audio files ====
for file in tqdm(video_list, desc="Extracting prosody features"):
    input_path = os.path.join(input_dir, file)
    output_path = os.path.join(output_dir, os.path.splitext(file)[0] + ".csv")

    # NOTE(review): confirm that the delimiter SMILExtract actually writes
    # with "-csvoutputdelimiter comma" matches what the downstream alignment
    # step expects when it reads these CSV files.
    command = [
        opensmile_bin,
        "-C", opensmile_conf,
        "-I", input_path,
        "-csvoutput", output_path,
        "-csvoutputdelimiter", "comma",
        "-nologfile"
    ]

    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    if result.returncode != 0:
        failed_files.append(file)
        print(f"❌ Failed: {file}")
        # errors="replace": console tools on Windows may emit bytes in the
        # local code page; a strict UTF-8 decode would raise here and hide
        # the real error while aborting the rest of the batch.
        print(result.stderr.decode(errors="replace"))


Extracting prosody features: 100%|██████████| 1/1 [00:00<00:00,  3.57it/s]


In [5]:
# ===========================================================
# Align OpenSMILE audio features with OpenFace video features
# ===========================================================

import os
import pandas as pd
import numpy as np

def _sniff_delimiter(path, default):
    """Return ';' or ',' depending on which occurs more often in the header line of `path`.

    Falls back to `default` when the header contains both equally often
    (including not at all).
    """
    with open(path, 'r', encoding='utf-8', errors='replace') as handle:
        header = handle.readline()
    semicolons, commas = header.count(';'), header.count(',')
    if semicolons == commas:
        return default
    return ';' if semicolons > commas else ','


def _match_column(columns, wanted):
    """Return the first column equal to `wanted` once surrounding whitespace is ignored, else None."""
    for col in columns:
        if str(col).strip() == wanted:
            return col
    return None


def align_audio_to_video_folder(audio_folder, video_folder, output_folder, tolerance=0.1):
    """Merge per-frame audio feature CSVs onto the matching video feature CSVs.

    For every ``*.csv`` in `audio_folder` that has a file of the same name in
    `video_folder`, each video row is paired with the audio row whose
    timestamp is nearest (``pd.merge_asof(..., direction='nearest')``) and the
    result is written to `output_folder` under the same file name. Video rows
    with no audio row within `tolerance` keep NaN in the audio columns.

    Parameters
    ----------
    audio_folder : str
        Folder with the audio feature CSVs. The delimiter (';' or ',') is
        detected from the header line; ';' is assumed when ambiguous.
    video_folder : str
        Folder with the video feature CSVs (same file names as the audio CSVs).
    output_folder : str
        Folder the merged CSVs are written to; created if missing.
    tolerance : float, optional
        Maximum allowed distance between a video timestamp and the matched
        audio timestamp, in the unit of the timestamp columns (default 0.1).

    Returns
    -------
    None
        Results are written to disk; progress and problems are printed.

    Notes
    -----
    * The timestamp column is looked up ignoring whitespace around header
      names, so a header written as ``"frame, face_id, timestamp"`` is
      handled. Only the timestamp column is renamed; every other column keeps
      its name exactly as written in the input file.
    * If the video CSV has no timestamp column, the frame column (or, failing
      that, the first column) is used as-is and a warning is printed, because
      those values are then compared directly against the audio timestamps.
    * Errors in one file are reported and do not stop the remaining files.
    """
    import traceback  # local import: only needed for error reporting

    os.makedirs(output_folder, exist_ok=True)
    audio_files = [f for f in os.listdir(audio_folder) if f.endswith(".csv")]

    for file in audio_files:
        audio_path = os.path.join(audio_folder, file)
        video_path = os.path.join(video_folder, file)
        output_path = os.path.join(output_folder, file)

        if not os.path.exists(video_path):
            print(f"⚠️ Skipping {file} (no matching video CSV found)")
            continue

        try:
            # --- Load the audio feature CSV (delimiter detected from header) ---
            audio_df = pd.read_csv(audio_path, sep=_sniff_delimiter(audio_path, ';'))

            # Drop any unnamed columns; .copy() so later edits never act on a view
            audio_df = audio_df.loc[:, ~audio_df.columns.str.contains('^Unnamed')].copy()

            # Detect timestamp column: exact names first, then any name containing 'time'
            audio_time_col = _match_column(audio_df.columns, 'frameTime')
            if audio_time_col is None:
                audio_time_col = _match_column(audio_df.columns, 'time')
            if audio_time_col is None:
                time_cols = [col for col in audio_df.columns if 'time' in str(col).lower()]
                if not time_cols:
                    print(f"❌ {file}: No timestamp column found in audio CSV")
                    print(f"   Available columns: {list(audio_df.columns)}")
                    continue
                audio_time_col = time_cols[0]
                print(f"ℹ️ {file}: Using '{audio_time_col}' as timestamp column")
            audio_df = audio_df.rename(columns={audio_time_col: 'timestamp'})

            # Drop 'name' if it exists (instance label, not a feature)
            audio_df = audio_df.drop(columns=['name'], errors='ignore')

            # --- Load the video feature CSV ---
            video_df = pd.read_csv(video_path, sep=_sniff_delimiter(video_path, ','))

            # Detect timestamp column in video, tolerating padded header names
            video_time_col = _match_column(video_df.columns, 'timestamp')
            if video_time_col is None:
                video_time_col = _match_column(video_df.columns, 'frame')
                if video_time_col is None:
                    video_time_col = _match_column(video_df.columns, 'Frame')
                if video_time_col is None:
                    video_time_col = video_df.columns[0]
                print(
                    f"⚠️ {file}: No 'timestamp' column in video CSV, using "
                    f"'{video_time_col}' as timestamp (values are used as-is)"
                )
            video_df = video_df.rename(columns={video_time_col: 'timestamp'})

            # Convert timestamps to float. The explicit cast matters:
            # to_numeric keeps integer columns as int64, and merge_asof
            # refuses to join an integer key against a float key.
            audio_df['timestamp'] = pd.to_numeric(audio_df['timestamp'], errors='coerce').astype('float64')
            video_df['timestamp'] = pd.to_numeric(video_df['timestamp'], errors='coerce').astype('float64')

            # Drop invalid timestamps
            audio_df = audio_df.dropna(subset=['timestamp'])
            video_df = video_df.dropna(subset=['timestamp'])

            # Ensure both sides still have data
            if audio_df.empty or video_df.empty:
                print(f"❌ {file}: Empty timestamp data after cleaning, skipping")
                continue

            # Round to 3 decimals
            audio_df['timestamp'] = audio_df['timestamp'].round(3)
            video_df['timestamp'] = video_df['timestamp'].round(3)

            # Remove duplicate timestamps in video (keep first)
            video_df = video_df.drop_duplicates(subset=['timestamp'], keep='first')

            # merge_asof requires both frames to be sorted by the key
            audio_df = audio_df.sort_values('timestamp').reset_index(drop=True)
            video_df = video_df.sort_values('timestamp').reset_index(drop=True)

            # --- Align audio → video: one output row per video row ---
            merged = pd.merge_asof(
                video_df,
                audio_df,
                on='timestamp',
                direction='nearest',
                tolerance=tolerance
            )

            # Save the result
            merged.to_csv(output_path, index=False)
            print(f"✅ Merged {file} ({len(merged)} rows)")

        except Exception as e:
            # Best-effort batch: report the failure and move on to the next file
            print(f"❌ Error processing {file}: {e}")
            traceback.print_exc()

# Example usage:
# align_audio_to_video_folder('audio_csvs', 'video_csvs', 'merged_output')


# Run the alignment for this project: prosody CSVs from the OpenSMILE step,
# OpenFace CSVs from the video step, merged CSVs written to the project folder.
alignment_folders = dict(
    audio_folder=r"D:\RP2\RP2 Projects\AI Interview Coach\UnVid\Testpros",
    video_folder=r"D:\RP2\RP2 Projects\AI Interview Coach\UnVid\openface_results\csv",
    output_folder=r"D:\RP2\RP2 Projects\AI Interview Coach\UnVid",
)
align_audio_to_video_folder(**alignment_folders)



✅ Merged testsub.csv (907 rows)
