Generalizing file reading with librosa (#122)

UCSD-E4E · Jul 19, 2022 · b58b13e · b58b13e
1 parent fa91052
commit b58b13e
Show file tree

Hide file tree

Showing 4 changed files with 32 additions and 23 deletions.
diff --git a/.gitignore b/.gitignore
@@ -8,7 +8,11 @@ TEST/*
 mixed_bird_manual.csv
 outputs/result.csv
 *.wav
+*.mp3
+*.flac
 *.pyc
 outputs/*
 *.ipynb
 !PyHa_Tutorial.ipynb
+*.csv
+!ScreamingPiha_Manual_Labels.csv
diff --git a/PyHa/IsoAutio.py b/PyHa/IsoAutio.py
@@ -1,15 +1,15 @@
-#from PyHa.tweetynet_package.tweetynet.network import TweetyNet
+from .birdnet_lite.analyze import analyze
 from .microfaune_package.microfaune.detection import RNNDetector
 from .microfaune_package.microfaune import audio
 from .tweetynet_package.tweetynet.TweetyNetModel import TweetyNetModel
 from .tweetynet_package.tweetynet.Load_data_functions import compute_features, predictions_to_kaleidoscope
+import os
 import torch
+import librosa
 import pandas as pd
 import scipy.signal as scipy_signal
 import numpy as np
-import math
-import os
-from .birdnet_lite.analyze import analyze
+from math import ceil
 from copy import deepcopy
 
 def build_isolation_parameters_microfaune(
@@ -60,7 +60,6 @@ def build_isolation_parameters_microfaune(
         "threshold_type": threshold_type,
         "threshold_const": threshold_const,
         "threshold_min": threshold_min,
-        "window_size": window_size,
         "chunk_size": chunk_size
     }
 
@@ -636,7 +635,7 @@ def chunk_isolate(
              'MANUAL ID': manual_id}
 
     # calculating the number of chunks that define an audio clip
-    chunk_count = math.ceil(
+    chunk_count = ceil(
         len(SIGNAL) / (isolation_parameters["chunk_size"] * SAMPLE_RATE))
     # calculating the number of local scores per second
     scores_per_second = len(local_scores) / old_duration
@@ -736,6 +735,7 @@ def generate_automated_labels_birdnet(audio_dir, isolation_parameters):
 def generate_automated_labels_microfaune(
         audio_dir,
         isolation_parameters,
+        ml_model = "microfaune",
         manual_id="bird",
         weight_path=None,
         normalized_sample_rate=44100,
@@ -777,6 +777,8 @@ def generate_automated_labels_microfaune(
     # Use Custom weights for Microfaune Detector
     else:
         detector = RNNDetector(weight_path)
+        # print("model \"{}\" does not exist".format(ml_model))
+        # return None
 
     # init labels dataframe
     annotations = pd.DataFrame()
@@ -786,13 +788,12 @@ def generate_automated_labels_microfaune(
         if os.path.isdir(audio_dir + audio_file):
             continue
 
-        # It is a bit awkward here to be relying on Microfaune's wave file
-        # reading when we want to expand to other frameworks,
-        # Likely want to change that in the future. Librosa had some troubles.
-
-        # Reading in the wave audio files
+        # Read the audio file with librosa as single-channel (mono) data at its original sample rate
+        # The reason for the scaling factor applied to the signal is explained here: https://stackoverflow.com/questions/53462062/pyaudio-bytes-data-to-librosa-floating-point-time-series
+        # Librosa scales samples down to [-1, 1], but the models require the range [-32768, 32767], so the multiplication is required
         try:
-            SAMPLE_RATE, SIGNAL = audio.load_wav(audio_dir + audio_file)
+            SIGNAL, SAMPLE_RATE = librosa.load(audio_dir + audio_file, sr=None, mono=True)
+            SIGNAL = SIGNAL * 32768
         except BaseException:
             print("Failed to load", audio_file)
             continue
@@ -905,13 +906,12 @@ def generate_automated_labels_tweetynet(
         if os.path.isdir(audio_dir + audio_file):
             continue
 
-        # It is a bit awkward here to be relying on Microfaune's wave file
-        # reading when we want to expand to other frameworks,
-        # Likely want to change that in the future. Librosa had some troubles.
-
-        # Reading in the wave audio files
+        # Read the audio file with librosa as single-channel (mono) data at its original sample rate
+        # The reason for the scaling factor applied to the signal is explained here: https://stackoverflow.com/questions/53462062/pyaudio-bytes-data-to-librosa-floating-point-time-series
+        # Librosa scales samples down to [-1, 1], but the models require the range [-32768, 32767], so the multiplication is required
         try:
-            SAMPLE_RATE, SIGNAL = audio.load_wav(audio_dir + audio_file)
+            SIGNAL, SAMPLE_RATE = librosa.load(audio_dir + audio_file, sr=None, mono=True)
+            SIGNAL = SIGNAL * 32768
         except BaseException:
             print("Failed to load", audio_file)
             continue

diff --git a/PyHa/statistics.py b/PyHa/statistics.py
@@ -68,7 +68,7 @@ def clip_general(automated_df, human_df):
     # print(SIGNAL.shape)
     human_arr = np.zeros((int(SAMPLE_RATE * duration),))
     bot_arr = np.zeros((int(SAMPLE_RATE * duration),))
-
+    
     folder_name = automated_df["FOLDER"].to_list()[0]
     clip_name = automated_df["IN FILE"].to_list()[0]
     # Placing 1s wherever the au
@@ -93,7 +93,7 @@ def clip_general(automated_df, human_df):
 
     human_arr_flipped = 1 - human_arr
     bot_arr_flipped = 1 - bot_arr
-
+    
     true_positive_arr = human_arr * bot_arr
     false_negative_arr = human_arr * bot_arr_flipped
     false_positive_arr = human_arr_flipped * bot_arr
@@ -207,7 +207,8 @@ def automated_labeling_statistics(
     for clip in clips:
         num_processed += 1
         clip_automated_df = automated_df[automated_df["IN FILE"] == clip]
-        clip_manual_df = manual_df[manual_df["IN FILE"] == clip]
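+        # The extension in manual_df may differ from the clip's extension, so match on the name before the extension (NOTE: this is a prefix match, so a clip stem that is a prefix of another file name also selects that file's rows)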
+        clip_manual_df = manual_df[manual_df["IN FILE"].str.startswith(".".join(clip.split(".")[:-1]))]
         try:
             if stats_type == "general":
                 clip_stats_df = clip_general(

diff --git a/PyHa/visualizations.py b/PyHa/visualizations.py
@@ -3,6 +3,7 @@
 from .tweetynet_package.tweetynet.TweetyNetModel import TweetyNetModel
 from .tweetynet_package.tweetynet.Load_data_functions import compute_features
 import torch
+import librosa
 import matplotlib.pyplot as plt
 import pandas as pd
 import scipy.signal as scipy_signal
@@ -279,9 +280,12 @@ def spectrogram_visualization(
         None
     """
 
-    # Loading in the clip with Microfaune's built-in loading function
+    # Read the audio file with librosa as single-channel (mono) data at its original sample rate
+    # The reason for the scaling factor applied to the signal is explained here: https://stackoverflow.com/questions/53462062/pyaudio-bytes-data-to-librosa-floating-point-time-series
+    # Librosa scales samples down to [-1, 1], but the models require the range [-32768, 32767], so the multiplication is required
     try:
-        SAMPLE_RATE, SIGNAL = audio.load_wav(clip_path)
+        SIGNAL, SAMPLE_RATE = librosa.load(clip_path, sr=None, mono=True)
+        SIGNAL = SIGNAL * 32768
     except BaseException:
         print("Failure in loading", clip_path)
         return