In [3]:
import os
import pandas as pd
import soundfile as sf
import time
from maad import sound
from maad.util import power2dB, format_features
from maad.rois import create_mask, select_rois
import numpy as np
import librosa
import tensorflow as tf
from tqdm import tqdm

# Load the pre-trained model
# The SavedModel directory below is a hard-coded, machine-specific path.
# The loaded object's `infer_tf` method is what process_audio_in_memory calls
# to obtain embeddings.
# NOTE(review): presumably the SurfPerch embedding model, judging by the
# directory name — confirm.
model_path = '/home/os/aqoustics/Aqoustics-Surfperch/kaggle'
model = tf.saved_model.load(model_path)

# Helper functions
def resample_and_split_audio(segment, original_sr, target_sr=32000):
    """Return ``segment`` resampled from ``original_sr`` to ``target_sr``.

    Despite its name, this function performs no splitting: it only forwards
    the signal to ``librosa.resample`` and returns the result.

    Args:
        segment: Audio samples to resample.
        original_sr: Sampling rate of ``segment``, in Hz.
        target_sr: Desired sampling rate, in Hz (32 kHz by default).

    Returns:
        The resampled audio as returned by ``librosa.resample``.
    """
    return librosa.resample(
        segment,
        orig_sr=original_sr,
        target_sr=target_sr,
    )

def process_audio_in_memory(segments, model, fs):
    """Compute one embedding row per audio segment using ``model``.

    Every segment is resampled from ``fs`` to 32 kHz, trimmed or zero-padded
    to exactly five seconds (160000 samples), given a leading batch axis and
    passed to ``model.infer_tf``. The first entry of the returned
    ``'embedding'`` tensor is flattened into ``feature_<j>`` columns.

    Args:
        segments: Iterable of audio arrays sampled at ``fs``
            (presumably 1-D mono signals — confirm against the caller).
        model: Object exposing ``infer_tf(batch)`` whose result can be
            indexed with ``'embedding'`` and converted with ``.numpy()``.
        fs: Sampling rate of the incoming segments, in Hz.

    Returns:
        A ``pandas.DataFrame`` with a 1-based ``segment_index`` column and one
        ``feature_<j>`` column per embedding value. Segments that raise during
        processing are reported on stdout and skipped, so the frame may have
        fewer rows than ``segments`` (and is empty when nothing succeeded).
    """
    target_sr = 32000
    clip_seconds = 5
    # Derive the sample count from the rate so the two can never drift apart.
    target_length = target_sr * clip_seconds

    rows_list = []
    for segment_index, segment in enumerate(segments, start=1):
        try:
            # Pass target_sr explicitly instead of relying on the helper's
            # default happening to match the value used for padding/trimming.
            resampled = resample_and_split_audio(
                segment, original_sr=fs, target_sr=target_sr
            )

            # Force the clip to exactly target_length samples.
            n_samples = len(resampled)
            if n_samples > target_length:
                resampled = resampled[:target_length]
            elif n_samples < target_length:
                resampled = np.pad(
                    resampled, (0, target_length - n_samples), 'constant'
                )

            # The model expects a batch dimension in front.
            output = model.infer_tf(resampled[np.newaxis, :])
            embedding = output['embedding'].numpy()[0]

            row_data = {'segment_index': segment_index}
            row_data.update(
                (f'feature_{j}', feature) for j, feature in enumerate(embedding)
            )
            rows_list.append(row_data)
        except Exception as e:
            # Best-effort: one bad segment must not abort the whole file.
            # Report the same 1-based index that is stored in the output rows.
            print(f"An error occurred while processing segment {segment_index}: {e}")
            continue

    return pd.DataFrame(rows_list)


# Detect low-frequency ROIs in one audio file and embed 5-second clips around them (fs is forwarded to process_audio_in_memory)
def _clip_bounds(start, end, total_duration, clip_duration):
    """Return ``(clip_start, clip_end)`` in seconds for a clip centred on an ROI.

    The clip is centred on the midpoint of ``[start, end]`` and then shifted
    to stay inside ``[0, total_duration]``. If the recording is shorter than
    ``clip_duration`` the clip covers the whole recording instead of starting
    at a negative time.
    """
    mid_point = (start + end) / 2
    clip_start = max(0, mid_point - clip_duration / 2)
    clip_end = clip_start + clip_duration

    if clip_end > total_duration:
        clip_end = total_duration
        # Clamp at zero: a negative start would become a negative slice index
        # and wrap around to the end of the signal.
        clip_start = max(0, clip_end - clip_duration)

    return clip_start, clip_end


def process_audio(file_path, model):
    """Extract embeddings for low-frequency regions of interest in one file.

    Steps, in order:
      1. Load the audio and high-pass filter it at 100 Hz.
      2. Build a dB spectrogram, remove background, smooth it, and threshold
         it into a binary mask from which ROIs are selected.
      3. Keep only ROIs whose upper frequency bound (``max_f``) is at most
         2000 Hz.
      4. Cut a 5-second clip around the temporal midpoint of each kept ROI
         from the loaded (unfiltered) signal.
      5. Hand the clips to ``process_audio_in_memory`` for embedding.

    Args:
        file_path: Path of the audio file to analyse.
        model: Embedding model forwarded to ``process_audio_in_memory``.

    Returns:
        The DataFrame produced by ``process_audio_in_memory``, or an empty
        ``pandas.DataFrame`` when the file cannot be loaded, filtering raises
        ``ValueError``, or no (low-frequency) ROI is found.
    """
    print(f"Processing file: {file_path}")

    try:
        # Load the audio file and get the sampling rate (fs)
        s, fs = sound.load(file_path)
    except Exception as e:
        print(f"Error loading file {file_path}: {e}")
        return pd.DataFrame()

    try:
        # Attempt to filter the signal
        s_filt = sound.select_bandwidth(s, fs, fcut=100, forder=3, ftype='highpass')
    except ValueError as e:
        print(f"Skipping file {file_path} due to filtering error: {e}")
        return pd.DataFrame()

    # Spectrogram parameters
    db_max = 70
    Sxx, tn, fn, ext = sound.spectrogram(s_filt, fs, nperseg=1024, noverlap=512)
    Sxx_db = power2dB(Sxx, db_range=db_max) + db_max

    # Background removal and smoothing
    Sxx_db_rmbg, _, _ = sound.remove_background(Sxx_db)
    Sxx_db_smooth = sound.smooth(Sxx_db_rmbg, std=1.25)
    im_mask = create_mask(im=Sxx_db_smooth, mode_bin='relative', bin_std=1.3, bin_per=0.8)
    im_rois, df_rois = select_rois(im_mask, min_roi=50, max_roi=None)

    if df_rois.empty:
        print(f"No ROIs found in file: {file_path}")
        return pd.DataFrame()

    # Format ROIs (the result carries the min_t/max_t/max_f columns used below)
    df_rois = format_features(df_rois, tn, fn)

    # Keep ROIs whose upper frequency bound is at most 2000 Hz
    low_freq_rois = df_rois[df_rois['max_f'] <= 2000]
    print(low_freq_rois)

    if low_freq_rois.empty:
        print(f"No low frequency ROIs found in file: {file_path}")
        return pd.DataFrame()

    # Generate in-memory 5-second audio segments around each ROI
    clip_duration = 5.0  # seconds
    total_duration = len(s) / fs
    audio_segments = []
    for start, end in zip(low_freq_rois['min_t'], low_freq_rois['max_t']):
        clip_start, clip_end = _clip_bounds(start, end, total_duration, clip_duration)
        start_sample = int(clip_start * fs)
        end_sample = int(clip_end * fs)
        audio_segments.append(s[start_sample:end_sample])

    # Process the segments with the model in-memory
    df_features = process_audio_in_memory(audio_segments, model, fs)

    # Print the number of features (rows) extracted
    print(f"Number of features found in {file_path}: {len(df_features)}")
    print(f"Finished processing file: {file_path}")

    return df_features

def process_folder(input_folder, output_folder, model):
    """Embed every ``.wav`` file in a folder and save the result as one CSV.

    Files are visited in sorted name order so the row order of the CSV is
    reproducible. Each file is passed to ``process_audio``; non-empty results
    get a ``file`` column holding the file name and are concatenated.

    Args:
        input_folder: Directory scanned (non-recursively) for files whose
            name ends with ``.wav``, case-insensitively.
        output_folder: Directory receiving
            ``surfperch_feature_embeddings.csv``; created if missing.
        model: Embedding model forwarded to ``process_audio``.

    Returns:
        The concatenated DataFrame that was written to disk, or an empty
        ``pandas.DataFrame`` when no file produced any rows (in which case
        nothing is written).
    """
    per_file_features = []
    total_features = 0  # Row count across all files

    # sorted(): os.listdir order is arbitrary and differs between filesystems.
    for filename in sorted(os.listdir(input_folder)):
        if not filename.lower().endswith('.wav'):
            continue

        file_path = os.path.join(input_folder, filename)
        features = process_audio(file_path, model)
        if features.empty:
            continue

        features['file'] = filename
        per_file_features.append(features)
        total_features += len(features)

    if not per_file_features:
        print("No audio clips generated from any files.")
        return pd.DataFrame()

    # Do not rely on the caller having created the destination directory.
    os.makedirs(output_folder, exist_ok=True)

    all_features_df = pd.concat(per_file_features, ignore_index=True)
    csv_path = os.path.join(output_folder, 'surfperch_feature_embeddings.csv')
    all_features_df.to_csv(csv_path, index=False)
    print(f"CSV saved to {csv_path}")
    print(f"Total number of features extracted from all files: {total_features}")
    return all_features_df



In [4]:
# Example usage
# Embed every .wav file under input_folder and report the wall-clock time.
input_folder = '/mnt/d/Aqoustics/UMAP'
output_folder = '/mnt/d/Aqoustics/UMAP/output'
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(output_folder, exist_ok=True)

start_time = time.time()
all_timestamps = process_folder(input_folder, output_folder, model)
end_time = time.time()
print(f"Total processing time: {end_time - start_time:.2f} seconds")


Processing file: /mnt/d/Aqoustics/UMAP/ind_D1_20220829_120000.WAV
  labelID    label  min_y  min_x  max_y  max_x    min_f   min_t     max_f  \
1       2  unknown     14   1324     29   1328  218.750  42.400   453.125   
2       5  unknown     21   1671     33   1675  328.125  53.504   515.625   
3       6  unknown     22   1285     35   1290  343.750  41.152   546.875   
4       8  unknown     22   1471     38   1476  343.750  47.104   593.750   
5       9  unknown     24    485     47    491  375.000  15.552   734.375   
6      10  unknown     24   1097     35   1102  375.000  35.136   546.875   
7      11  unknown     25    513     34    520  390.625  16.448   531.250   
8      31  unknown     54   1285     66   1290  843.750  41.152  1031.250   
9      32  unknown     54   1471     68   1475  843.750  47.104  1062.500   

    max_t  
1  42.528  
2  53.632  
3  41.312  
4  47.264  
5  15.744  
6  35.296  
7  16.672  
8  41.312  
9  47.232  


W0000 00:00:1725452269.373877   56101 assert_op.cc:38] Ignoring Assert operator jax2tf_infer_fn_/assert_equal_1/Assert/AssertGuard/Assert


Number of features found in /mnt/d/Aqoustics/UMAP/ind_D1_20220829_120000.WAV: 9
Finished processing file: /mnt/d/Aqoustics/UMAP/ind_D1_20220829_120000.WAV
Processing file: /mnt/d/Aqoustics/UMAP/ind_D1_20220829_120200.WAV
   labelID    label  min_y  min_x  max_y  max_x     min_f   min_t     max_f  \
1        3  unknown     20    125     35    129   312.500   4.032   546.875   
2        6  unknown     23    138     35    142   359.375   4.448   546.875   
3        7  unknown     23    637     35    641   359.375  20.416   546.875   
4        8  unknown     23    801     37    805   359.375  25.664   578.125   
5        9  unknown     23   1003     35   1009   359.375  32.128   546.875   
6       10  unknown     23   1532     37   1536   359.375  49.056   578.125   
7       11  unknown     24    212     34    218   375.000   6.816   531.250   
8       13  unknown     25    967     35    972   390.625  30.976   546.875   
9       35  unknown     53    138     71    142   828.125   4.448  1

  SNR_est = (Sxx/noise_spectro)


  labelID    label  min_y  min_x  max_y  max_x    min_f   min_t    max_f  \
0       2  unknown      9    604     29    609  140.625  19.360  453.125   
1       3  unknown      9    741     31    746  140.625  23.744  484.375   
2       4  unknown     10    100     21    104  156.250   3.232  328.125   
3       8  unknown     18    359     35    365  281.250  11.520  546.875   
4       9  unknown     18   1816     29   1820  281.250  58.144  453.125   
5      11  unknown     20   1534     37   1539  312.500  49.120  578.125   

    max_t  
0  19.520  
1  23.904  
2   3.360  
3  11.712  
4  58.272  
5  49.280  
Number of features found in /mnt/d/Aqoustics/UMAP/ind_D5_20220908_132000.WAV: 6
Finished processing file: /mnt/d/Aqoustics/UMAP/ind_D5_20220908_132000.WAV
Processing file: /mnt/d/Aqoustics/UMAP/ind_D5_20220908_213800.WAV
   labelID    label  min_y  min_x  max_y  max_x    min_f   min_t    max_f  \
0        1  unknown     17     39     33     43  265.625   1.280  515.625   
1       

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


   labelID    label  min_y  min_x  max_y  max_x     min_f   min_t     max_f  \
0        1  unknown      6    443     75    449    93.750  14.208  1171.875   
2        3  unknown     17    989     52    995   265.625  31.680   812.500   
3        4  unknown     19    395     43    400   296.875  12.672   671.875   
4        5  unknown     21    665     41    669   328.125  21.312   640.625   
5        6  unknown     22    123     44    128   343.750   3.968   687.500   
6        7  unknown     22    326     46    339   343.750  10.464   718.750   
7        8  unknown     22    799     44    803   343.750  25.600   687.500   
8       10  unknown     23    599     44    603   359.375  19.200   687.500   
9       11  unknown     23   1469     44   1478   359.375  47.040   687.500   
10      12  unknown     23   1527     42   1539   359.375  48.896   656.250   
11      19  unknown     27    514     46    524   421.875  16.480   718.750   
12      21  unknown     29   1160     55   1164   45