### Validating the Extracted Expert-Referenced Vocal Cues Using a Simple Classifier

Select the CREMA-D dataset and the same instances that were selected for explanation. Apply the same data preprocessing that was applied to the training data.

In [1]:
from _validation import *

In [2]:
# Load the data and apply the preprocessing



# Extract Expert-Referenced Vocal cues


In [None]:
def Top_human_ref(data, label):
    """
    Extract expert-referenced vocal-cue intervals from a single audio clip.

    Preprocessing applied to the clip:
      - trim leading/trailing silence (keep the span from the start of the
        first non-silent region to the end of the last one),
      - convert to mono, 16-bit samples, 24 kHz,
      - pad/truncate at the end to exactly 2.0 s,
      - frame with a 100 ms window and a 30 ms hop (no centering).

    For every frame the function computes loudness (RMS), shrillness
    (spectral centroid, Hz) and Praat local jitter/shimmer via Parselmouth,
    then ranks the frames per feature.

    NOTE(review): the notebook's data-processing notes mention a
    480-sample window / 240-sample hop (20 ms / 10 ms at 24 kHz), and an
    earlier version of this docstring mentioned 16 kHz / 30 ms / 15 ms.
    The values documented here are the ones the code actually uses --
    confirm which configuration is intended.

    Args:
        data: Audio source passed straight to ``AudioSegment.from_file``
            (presumably a file path or file-like object).
        label: Emotion label. ``"angry"``, ``"fear"`` and ``"happy"`` are
            treated as high-arousal (highest-valued frames are returned);
            ``"sad"`` and ``"neutral"`` as low-arousal (lowest-valued
            frames are returned).

    Returns:
        A 4-tuple ``(loudness, shrillness, jitter, shimmer)``. Each element
        is a list of five ``(start_s, end_s)`` tuples in seconds, ordered
        from most to least extreme for the requested direction. Frames whose
        jitter/shimmer is undefined (unvoiced frames, or frames where Praat
        failed) are ranked last, so they only appear when fewer than five
        frames have a defined value.

    Raises:
        ValueError: If ``label`` is not one of the supported labels, or if
            the preprocessed signal is shorter than one analysis window.
    """
    high_arousal = {"angry", "fear", "happy"}
    low_arousal = {"sad", "neutral"}

    # Fail fast: reject a bad label before any decoding or feature extraction.
    if label not in high_arousal and label not in low_arousal:
        raise ValueError("Unsupported label. Use 'angry', 'fear', 'happy', 'sad', or 'neutral'.")

    # Load arbitrary audio file/bytes path with pydub
    audio = AudioSegment.from_file(data)

    # dBFS = decibels relative to full scale: 0 dBFS is the maximum digital
    # peak level, so every measured level is negative (50 % of full scale is
    # about -6 dBFS). The silence threshold sits 14 dB below the clip's
    # average loudness.
    silence_thresh = audio.dBFS - 14

    # min_silence_len guidance for speech-emotion data:
    #   200-300 ms -> keeps micro-pauses/hesitations, trims clear gaps;
    #   <=150 ms   -> too aggressive, may cut natural pauses;
    #   >=500 ms   -> safer for noisy data, may leave long edge silences.
    # 350 ms is used here (pydub works in milliseconds).
    nonsilent = detect_nonsilent(audio, min_silence_len=350, silence_thresh=silence_thresh)

    # Trim only the outer silence: from the first region's start to the last
    # region's end. Pauses inside the utterance are kept.
    if nonsilent:
        start_ms, end_ms = nonsilent[0][0], nonsilent[-1][1]
        audio = audio[start_ms:end_ms]

    # Mono, target sampling rate, and 16-bit samples. Forcing the sample
    # width guarantees that the int16 conversion below cannot overflow when
    # the source file is 8-, 24- or 32-bit.
    sr = 24000
    audio = audio.set_channels(1).set_frame_rate(sr).set_sample_width(2)

    # Convert to NumPy (int16 -> float32 in [-1, 1))
    y = np.array(audio.get_array_of_samples(), dtype=np.int16).astype(np.float32) / 32768.0

    # Ensure a fixed length of 2 seconds (pad/truncate at the end)
    target_len = int(sr * 2.0)
    y = librosa.util.fix_length(y, size=target_len)

    # Windowing parameters: 100 ms window, 30 ms hop.
    window_size = int(sr * 0.10)
    hop_length = int(sr * 0.03)
    overlap_length = window_size - hop_length
    print(f"Window Size: {window_size}, Hop Length: {hop_length}, Overlap Length: {overlap_length}")

    # Ensure y is at least window_size long
    if len(y) < window_size:
        raise ValueError(f"Audio length {len(y)} is shorter than the window size {window_size}.")

    # TODO: Apply audio segmentation before feature extraction

    # NOTE(review): when the trimmed clip is shorter than 2 s, the zero
    # padding added above yields frames with zero RMS; those frames are
    # likely to dominate the "lowest" loudness ranking for low-arousal
    # labels. Confirm whether padded frames should be excluded.

    # --- Features ---
    # center=False keeps frame i aligned with samples
    # [i * hop_length, i * hop_length + window_size), matching the manual
    # framing used for jitter/shimmer below.
    loudness = librosa.feature.rms(
        y=y, frame_length=window_size, hop_length=hop_length, center=False
    ).flatten()

    # Spectral centroid (Hz): magnitude-weighted mean frequency of the
    # spectrum, used here as a proxy for shrillness (higher = shriller).
    # librosa returns a 2D array, hence the flatten.
    spectral_centroid = librosa.feature.spectral_centroid(
        y=y, sr=sr, n_fft=window_size, win_length=window_size,
        hop_length=hop_length, center=False
    ).flatten()

    # Frame-wise Praat jitter/shimmer on the preprocessed waveform.
    framewise_jitter, framewise_shimmer = [], []
    for offset in range(0, len(y) - window_size + 1, hop_length):
        frame = y[offset:offset + window_size]
        frame_sound = parselmouth.Sound(frame, sampling_frequency=sr)
        try:
            frame_pp = call(frame_sound, "To PointProcess (periodic, cc)", 90, 500)
            fj = call(frame_pp, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3)
            fs = call([frame_sound, frame_pp], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6)
        except Exception:
            # Best effort: a frame Praat cannot analyse is marked undefined
            # (NaN) rather than 0.0, so it is never mistaken for a genuinely
            # low jitter/shimmer value.
            fj, fs = np.nan, np.nan
        framewise_jitter.append(fj)
        framewise_shimmer.append(fs)

    framewise_jitter = np.asarray(framewise_jitter)
    framewise_shimmer = np.asarray(framewise_shimmer)

    def frame_interval(idx):
        """Map a frame index to its (start, end) time in seconds."""
        start = (idx * hop_length) / sr
        end = ((idx * hop_length) + window_size) / sr
        return (start, end)

    def ranked_intervals(values, k, largest):
        """Return intervals of the k largest (or smallest) frames, most extreme first.

        np.argsort places NaN at the end, which would make undefined frames
        look like the *largest* values. Undefined entries are therefore
        replaced with the worst possible value for the requested direction
        so that they are always ranked last.
        """
        values = np.asarray(values)
        worst = -np.inf if largest else np.inf
        keyed = np.where(np.isnan(values), worst, values)
        order = np.argsort(keyed)
        picked = order[-k:][::-1] if largest else order[:k]
        return [frame_interval(idx) for idx in picked]

    # High-arousal emotions: angry, fear, happy
    if label in high_arousal:
        top_loudness_intervals = ranked_intervals(loudness, k=5, largest=True)
        top_shrillness_intervals = ranked_intervals(spectral_centroid, k=5, largest=True)
        top_jitter_intervals = ranked_intervals(framewise_jitter, k=5, largest=True)
        top_shimmer_intervals = ranked_intervals(framewise_shimmer, k=5, largest=True)

        print("Top 5 Time Intervals of Loudness above avg value:", top_loudness_intervals)
        print("Top 5 Time Intervals of Shrillness above avg value:", top_shrillness_intervals)
        print("Top 5 Time Intervals of Jitter Intervals:", top_jitter_intervals)
        print("Top 5 Time Intervals of Shimmer Intervals:", top_shimmer_intervals)

        print("Total human-referenced segments for Loudness:", len(loudness))
        print("Total human-referenced segments for Shrillness:", len(spectral_centroid))
        print("Total human-referenced segments for Jitter:", len(framewise_jitter))
        print("Total human-referenced segments for Shimmer:", len(framewise_shimmer))

        return top_loudness_intervals, top_shrillness_intervals, top_jitter_intervals, top_shimmer_intervals

    # Low-arousal emotions: sad, neutral (label was validated on entry)
    lowest_loudness = ranked_intervals(loudness, k=5, largest=False)
    lowest_shrillness = ranked_intervals(spectral_centroid, k=5, largest=False)
    lowest_jitter = ranked_intervals(framewise_jitter, k=5, largest=False)
    lowest_shimmer = ranked_intervals(framewise_shimmer, k=5, largest=False)

    print("lowest_loudness:", lowest_loudness)
    print("lowest_shrillness:", lowest_shrillness)
    print("Lowest Jitter:", lowest_jitter)
    print("Lowest Shimmer:", lowest_shimmer)

    return lowest_loudness, lowest_shrillness, lowest_jitter, lowest_shimmer

