In [1]:
import pandas as pd
import librosa
from ketos.data_handling import selection_table as sl

2022-01-27 20:49:25.064886: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-27 20:49:25.519616: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1510] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 364 MB memory:  -> device: 0, name: NVIDIA TITAN V, pci bus id: 0000:a1:00.0, compute capability: 7.0


In [2]:
data_dir='/data/WCS/' # Path to the folder that contains all the audio files
test_annot_file='CB300_test'
annot_df=pd.read_csv('../../annotations/test/'+test_annot_file+'.csv')
segment_length, segment_step = 3.0, 3.0

In [3]:
def preprocess_annot(annot_df, delete_columns=['Selection', 'View']):
    """ Delete unwanted columns from the annotations dataframe and 
        apply necessary pre-processing on the annotation columns
     
        Args:
            annot_df: pandas DataFrame
                Annotation table.
            delete_columns: list
                List of columns to delete from the annotations dataframe
                default values ('Selection', 'View')

        Returns:
            annot_df: pandas DataFrame
                Annotation table after pre-processing

    """
    # Delete unnecessary columns
    for column in delete_columns:
        del annot_df[column]
        
    annot_df.rename({'Begin Path': 'filename',
                    'File Offset (s)': 'start',
                    'species': 'label'}, axis='columns', inplace =True)

    # Modify filepath (Discard the drive location (e.g., D:/))
    annot_df['filename']=annot_df['filename'].apply(lambda x: x[3:len(x)]) 
    # Modify filepath to replace \ with / in the filename
    annot_df['filename']=annot_df['filename'].apply(lambda x: x.replace("\\", "/")) 
    # Modify filepath to add the data root dir (/data/WCS/)
    annot_df['filename']=annot_df['filename'].apply(lambda x: data_dir+x)
    # Calculate End time
    annot_df['end']=annot_df['start']+annot_df['Delta Time (s)']
    return annot_df

def extract_annot_by_file(df_annot):
    """ Store the annotations of each file into a dictionary
     
        Args:
            annot_df: pandas DataFrame
                Annotation table.

        Returns:
            annot_by_file_dict: dict
                Dictionary of annotations stored for each file

    """
    annot_by_file_dict=dict()

    # Extract all distinct filename
    unique_filenames=df_annot['filename'].unique()

    # Separate annotations by each filename
    for filename in unique_filenames:
        target_matched_annot_df = df_annot[df_annot['filename'].str.find(filename) != -1]
        annot_by_file_dict[filename]=target_matched_annot_df
    return annot_by_file_dict


def check_overlap(start, end, annotation_df, search_class_label):
    """ Check if a selection has overlap in the annotation table of a specific file
     
        Args:
            start: float
                Selection start time.
            end: float
                Selection end time.
            annotation_df: pandas DataFrame
                Annotation table.
            search_class_label: int
                Class label to search in the annotation table.

        Returns:
            overlap_found: bool
                Returns True if there is overlap found, otherwise, returns False if the specifed selection does not match in the annotation dataframe.

    """
    overlap_found = False
    for annot_index, annot_row in annotation_df.iterrows():
        if((annot_row['start'] <= start <= annot_row['end']) or 
           (annot_row['start'] <= end <= annot_row['end']) or
           (start <= annot_row['start'] and end >= annot_row['end']) and
           annot_row['label']==search_class_label):
            overlap_found = True
            break
    return overlap_found

def validate_annot_detections_serial(annot_df, detection_df):
    """ Function to check if the annotation and detection serial is matching (filename and start time)
        It also prints the row index if any row of the annot_df and detection_df doesn't match.

        The serial of both dataframe is required to be same so that we can easily use the scikit-learn 
        packages for performance measurements.
     
        Args:
            annot_df: pandas DataFrame
                Annotation table.
            detection_df: pandas DataFrame
                Model detection dataframe.

        Returns:
            valid_flag: bool
                Returns True if both dataframes are in the same serial, Else returns False.

    """
    valid_flag=True
    for i in range(len(annot_df['filename'].values)):
        if(((annot_df['filename'].values)[i]!=(detection_df['filename'].values)[i]) or 
           ((annot_df['start'].values)[i]!=(detection_df['start'].values)[i])):
            print("Not equal at, ", i)
            print("Annot: ", annot_df.iloc[i], "Detection: ", detection_df.iloc[i])
            valid_flag=False
    return valid_flag

# Process annotation to add segmentation that matches with model detections

In [4]:
annot_df=preprocess_annot(annot_df)

# Get all files' duration in time
unique_filenames=annot_df['filename'].unique()
duration_list=[]
for filename in unique_filenames:
    duration_list.append(librosa.get_duration(filename=filename))
    
target_files_with_len = pd.DataFrame({'filename':unique_filenames, 
                                      'duration':duration_list})

In [5]:
#Standardize annotation table format
annot, label_dict = sl.standardize(annot_df, return_label_dict=True, trim_table=True)

segmented_annot = sl.segment_files(table=target_files_with_len, length=segment_length, step=segment_step, pad=True)

# Resetting index to change the multi-indexed dataframe to normal columns
annot=annot.reset_index()
del annot['annot_id']
segmented_annot=segmented_annot.reset_index()
del segmented_annot['sel_id']

# Store annotations of each file to a dictionary
annot_by_file_dict=extract_annot_by_file(annot)

In [6]:
# Populate the segmented annotation table with appropriate label
for index, row in segmented_annot.iterrows():
    annot_filtered_by_filename_df=annot_by_file_dict[row['filename']]
    # For multiple classes, change this part. Along with 1, also add conditions for other labels
    overlap_annot_result=check_overlap(row['start'], row['end'], annot_filtered_by_filename_df, 1)
    if(overlap_annot_result):
        segmented_annot.at[index, 'label']=1
    else:
        segmented_annot.at[index, 'label']=0

# Read model detections

In [7]:
model_detection_filename='detections_bh_detector_v02_'+test_annot_file+'.csv' # Change the model detections file name for diff versions of model/detector
model_detections_df=pd.read_csv("../../results/model_detections/"+model_detection_filename)

# Sort annotations and model detections. Check validity of correct serial

In [8]:
segmented_annot=segmented_annot.sort_values(['filename', 'start'], ascending=[True, True])
model_detections_df=model_detections_df.sort_values(['filename', 'start'], ascending=[True, True])
validate_annot_detections_serial(segmented_annot, model_detections_df)

True

# Convert model detection score to labels

In [9]:
# Threshold value to detect a prediction as Bowhead (BH)
threshold=0.2
def get_label_from_score(value):
    """
        Returns label (1 if BH, else 0) based on prediction score
        Args:
            value: int
                Prediction score.

        Returns:
            0 or 1. 
            1 meaning detected as a BH, 0 means other/background
    
    """
    if(value>threshold):
        return 1
    else:
        return 0
    
model_detections_df['label']=model_detections_df['score'].apply(get_label_from_score)

# Calculate performance

In [10]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
target_names = ['Positive (BH)', 'Negative']
y_true=segmented_annot['label'].values
y_pred=model_detections_df['label'].values
print(classification_report(y_true, y_pred, target_names=target_names))
print(confusion_matrix(y_true, y_pred))

               precision    recall  f1-score   support

Positive (BH)       0.98      0.85      0.91     89889
     Negative       0.11      0.48      0.17      3311

     accuracy                           0.84     93200
    macro avg       0.54      0.67      0.54     93200
 weighted avg       0.95      0.84      0.88     93200

[[76594 13295]
 [ 1725  1586]]
