tbd: 
- tune the parameters
    - fine-tune the pitch extraction
    - use extracted pitch for hnr (and jitter & shimmer?)  -- no we can't
- store jitter and shimmer as one vector?
- record the temporal pitches and hnrs
- post processing (scaling, zero-padding)
- determine which model to use (LSTM? CNN? )
- data augmentation?

for SSH

In [1]:
import parselmouth
import parselmouth.praat as praat
import numpy as np
import os
import glob
from tqdm import tqdm
import pandas as pd

In [2]:
def get_pitch(sound, pitch_floor, pitch_ceiling):
    pitch = sound.to_pitch(
            time_step=0.01,
            pitch_floor=pitch_floor,
            pitch_ceiling=pitch_ceiling,
        )
    pitch_values = pitch.selected_array['frequency']
    pitch_values[pitch_values == 0] = np.nan

    ave_pitch = np.nanmean(pitch_values)
    std_pitch = np.nanstd(pitch_values)

    return pitch_values, ave_pitch, std_pitch

def get_hnr(sound):
    harmonicity = sound.to_harmonicity(time_step=0.01)
    hnr_values = harmonicity.values.T

    ave_hnr = np.nanmean(hnr_values)
    std_hnr = np.nanstd(hnr_values)

    return hnr_values, ave_hnr, std_hnr

def get_gitter_shimmer(sound, pitch_floor, pitch_ceiling):
    pointProcess = praat.call(sound, "To PointProcess (periodic, cc)", pitch_floor, pitch_ceiling)

    # --- Get Jitter Measures ---
    lj = praat.call(pointProcess, "Get jitter (local)", 0, 0, 0.0001, 0.02, 1.3) # returns percentage
    laj = praat.call(pointProcess, "Get jitter (local, absolute)", 0, 0, 0.0001, 0.02, 1.3) # returns seconds
    rap = praat.call(pointProcess, "Get jitter (rap)", 0, 0, 0.0001, 0.02, 1.3) # returns percentage
    ppq5 = praat.call(pointProcess, "Get jitter (ppq5)", 0, 0, 0.0001, 0.02, 1.3) # returns percentage
    ddp = praat.call(pointProcess, "Get jitter (ddp)", 0, 0, 0.0001, 0.02, 1.3) # returns percentage

    jitter_values = np.array([
        lj, laj, rap, ppq5, ddp
    ], dtype=np.float32)

    # --- Get Shimmer Measures ---
    ls = praat.call([sound, pointProcess], "Get shimmer (local)", 0, 0, 0.0001, 0.02, 1.3, 1.6) # returns percentage
    lsdb = praat.call([sound, pointProcess], "Get shimmer (local_dB)", 0, 0, 0.0001, 0.02, 1.3, 1.6) # returns dB
    apq3 = praat.call([sound, pointProcess], "Get shimmer (apq3)", 0, 0, 0.0001, 0.02, 1.3, 1.6) # returns percentage
    apq5 = praat.call([sound, pointProcess], "Get shimmer (apq5)",0, 0, 0.0001, 0.02, 1.3, 1.6) # returns percentage
    apq11 = praat.call([sound, pointProcess], "Get shimmer (apq11)", 0, 0, 0.0001, 0.02, 1.3, 1.6) # returns percentage
    dda = praat.call([sound, pointProcess], "Get shimmer (dda)", 0, 0, 0.0001, 0.02, 1.3, 1.6) # returns percentage

    shimmer_values = np.array([
        ls, lsdb, apq3, apq5, apq11, dda
    ], dtype=np.float32)

    return jitter_values, shimmer_values

### Extracting feature

In [3]:
def process_audios(audio_files):

    file_ids = []
    pitches = []
    ave_pitches = []
    std_pitches = []
    hnrs = []
    ave_hnrs = []
    std_hnrs = []
    jitters = []
    shimmers =[]


    i = 0
    for audio_file in tqdm(audio_files, desc="Processing Audio Files"):
        sound = parselmouth.Sound(audio_file)
        #sound.pre_emphasize()   ## don't need this right?
        pitch_floor=75
        pitch_ceiling=600
        
        pitch_values, ave_pitch, std_pitch = get_pitch(sound, pitch_floor, pitch_ceiling)
        hnr_values, ave_hnr, std_hnr = get_hnr(sound)
        jitter, shimmer = get_gitter_shimmer(sound, pitch_floor, pitch_ceiling)

        file_ids.append(os.path.basename(audio_file).replace('.flac', ''))
        pitches.append(pitch_values)
        ave_pitches.append(ave_pitch)
        std_pitches.append(std_pitch)
        hnrs.append(hnr_values)
        ave_hnrs.append(ave_hnr)
        std_hnrs.append(std_hnr)
        jitters.append(jitter)
        shimmers.append(shimmer)


        data = {'AUDIO_ID':file_ids,
                'PITCH': pitches,
                'AVE_PITCH':ave_pitches,
                'STD_PITCH':std_pitches,
                'HNR':hnrs,
                'AVE_HNR':ave_hnrs,
                'STD_HNR':std_hnrs,
                'JITTER':jitters,
                'SHIMMER':shimmers}

        i += 1
        #if i == 10: break
        
    return data

### train_audios

In [None]:
train_audios = glob.glob('/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2025/data/ASVSpoof19/LA/ASVspoof2019_LA_train/flac/*.flac')
train_features = process_audios(train_audios)

df = pd.DataFrame(train_features)

df.to_pickle('/home/users1/liqe/TeamLab/prosody_features_train.pkl')

### validation_audios

In [4]:
dev_audios = glob.glob('/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2025/data/ASVSpoof19/LA/ASVspoof2019_LA_dev/flac/*.flac')
dev_features = process_audios(dev_audios)

df = pd.DataFrame(dev_features)

# to be changed
df.to_pickle('/home/users1/liqe/TeamLab/prosody_features_dev.pkl')

Processing Audio Files: 100%|██████████| 24986/24986 [13:32<00:00, 30.76it/s]


### evaluation_audios

In [5]:
eval_audios = glob.glob('/mount/studenten/arbeitsdaten-studenten1/team-lab-phonetics/2025/data/ASVSpoof19/LA/ASVspoof2019_LA_eval/flac/*.flac')
eval_features = process_audios(eval_audios)

df = pd.DataFrame(eval_features)

# to be changed
df.to_pickle('/home/users1/liqe/TeamLab/prosody_features_eval.pkl')

  ave_pitch = np.nanmean(pitch_values)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
Processing Audio Files: 100%|██████████| 71933/71933 [38:27<00:00, 31.17it/s]


### Transform pkl file into parquet file & flatten HNR

In [33]:
def flatten_list_of_scalar_lists(nested_item):
    """
    Converts a list of single-element lists/arrays (e.g., [[-200.0], [-100.0]])
    into a flat list of scalars (e.g., [-200.0, -100.0]).
    It can also handle items that are already scalars within the outer list.
    """
    if not isinstance(nested_item, np.ndarray):
        # If the input isn't a list at all, return np.nan or an empty list
        # depending on how you want to handle unexpected row data.
        return np.nan  # Or: return []

    flat_list = []
    for sub_item in nested_item:
        if isinstance(sub_item, (list, np.ndarray)) and len(sub_item) == 1:
            element = sub_item[0]
            if isinstance(element, (int, float, np.number)):
                flat_list.append(element)
            else:
                # The single element is not a number, append nan
                flat_list.append(np.nan)
        else:
            flat_list.append(np.nan) # Mark as unable to extract a scalar

    flat_array = np.array(flat_list)
    return flat_array

example_hnr_row = [[-200.0], [-200.0], [-200.0], [-200.0], [-200.0]]
flattened_result = flatten_list_of_scalar_lists(example_hnr_row)
print(f"Original: {example_hnr_row}")
print(f"Flattened: {flattened_result}")


Original: [[-200.0], [-200.0], [-200.0], [-200.0], [-200.0]]
Flattened: nan


In [34]:
df_train = pd.read_pickle('/home/users1/liqe/TeamLab/prosody_features_train.pkl')
df_train['HNR'] = df_train['HNR'].apply(flatten_list_of_scalar_lists)
df_train.to_parquet("prosody_features_train.parquet")

In [36]:
df_dev = pd.read_pickle('/home/users1/liqe/TeamLab/prosody_features_dev.pkl')
df_dev['HNR'] = df_dev['HNR'].apply(flatten_list_of_scalar_lists)
df_dev.to_parquet("prosody_features_dev.parquet")

In [37]:
df_eval = pd.read_pickle('/home/users1/liqe/TeamLab/prosody_features_eval.pkl')
df_eval['HNR'] = df_eval['HNR'].apply(flatten_list_of_scalar_lists)
df_eval.to_parquet("prosody_features_eval.parquet")

### Inspect saved features

In [7]:
# pkl file
#file_path = '/home/users1/liqe/TeamLab_phonetics/prosody_features_train.pkl'

# parquet file
file_path = '/home/users1/liqe/TeamLab_phonetics/prosody_features_train.parquet'

try:
    if not os.path.exists(file_path):
        print(f"Error: The file was not found at {file_path}")
    else:
        #loaded_df = pd.read_pickle(file_path)
        loaded_df = pd.read_parquet(file_path, engine='pyarrow')

        print(f"Successfully loaded DataFrame from: {file_path}")

        # --- Inspect the loaded DataFrame ---

        # 1. Display the first few rows
        print("\n--- First 5 rows (df.head()) ---")
        print(loaded_df.head())

        # 2. Display the last few rows
        print("\n--- Last 5 rows (df.tail()) ---")
        print(loaded_df.tail())

        # 3. Get concise summary of the DataFrame (index, columns, dtypes, non-null values, memory usage)
        print("\n--- DataFrame Info (df.info()) ---")
        loaded_df.info()

        # 4. Get descriptive statistics for numerical columns (count, mean, std, min, max, quartiles)
        print("\n--- Descriptive Statistics (df.describe()) ---")
        print(loaded_df.describe())

        # 5. Check the shape of the DataFrame (number of rows, number of columns)
        print("\n--- DataFrame Shape (df.shape) ---")
        print(f"Shape: {loaded_df.shape}")
        print(f"Number of files (rows): {loaded_df.shape[0]}")
        print(f"Number of columns (features + metadata): {loaded_df.shape[1]}")

        # 6. List the column names
        print("\n--- Column Names (df.columns) ---")
        print(loaded_df.columns.tolist())

        # 7. Check for missing values (NaNs)
        print("\n--- Missing Values (df.isnull().sum()) ---")
        print(loaded_df.isnull().sum()) # Sum of True (missing) values per column

        # 8. Access a specific column (e.g., 'AVE_PITCH')
        # print("\n--- Example: 'AVE_PITCH' column ---")
        # print(loaded_df['AVE_PITCH'].head())

        
        # check for the length of pitch and hnr
        first_pitch = loaded_df['PITCH'].iloc[0]
        first_hnr = loaded_df['HNR'].iloc[0]
        print(f'first pitch length is {len(first_pitch)}')
        print(f'first hnr length is {len(first_hnr)}')

        # if not loaded_df['HNR'].empty:

        #     sample_entry = 0.0
        #     for entry in loaded_df['HNR']:
        #         if entry is not None: # and (not isinstance(entry, float) or not np.isnan(entry)): # More robust check if NaNs are floats
        #             sample_entry = entry
        #             break
            
        #     if sample_entry is not None:
        #         print(f"Sample HNR entry: {sample_entry}")
        #         print(f"Type of the HNR entry itself: {type(sample_entry)}")

        #         # If it's a list or tuple, inspect its first element
        #         if isinstance(sample_entry, (list, tuple, np.ndarray)) and len(sample_entry) > 0:
        #             first_sub_item = sample_entry[0]
        #             print(f"First sub-item: {first_sub_item}")
        #             print(f"Type of the first sub-item: {type(first_sub_item)}")

        #             # If that sub-item is also a list/tuple, inspect its element
        #             print(f"Type of the element within the sub-item: {type(first_sub_item[0])}")
                    

except Exception as e:
    print(f"An error occurred while loading or inspecting the DataFrame: {e}")


Successfully loaded DataFrame from: /home/users1/liqe/TeamLab_phonetics/prosody_features_train.parquet

--- First 5 rows (df.head()) ---
       AUDIO_ID                                              PITCH  \
0  LA_T_1000137  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
1  LA_T_1000406  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
2  LA_T_1000648  [nan, nan, nan, nan, nan, 263.08241940297097, ...   
3  LA_T_1000824  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   
4  LA_T_1001074  [nan, nan, nan, nan, nan, nan, nan, nan, nan, ...   

    AVE_PITCH  STD_PITCH                                                HNR  \
0  129.703246  19.223900  [-200.0, -200.0, -200.0, -200.0, -200.0, -200....   
1  151.166201  18.754230  [-200.0, -200.0, -200.0, -200.0, -200.0, -200....   
2  240.929243  14.829958  [-200.0, -200.0, -200.0, -200.0, -200.0, -200....   
3  106.986026  14.755938  [-200.0, -200.0, -200.0, -200.0, -200.0, -200....   
4  194.306253  54.477061  [-200.0, -200.0, -200