In [None]:
import pandas as pd

In [None]:
# Directory locations used by this notebook
path_dict = {'original_annot': '../../annotations/original/'}

# Load the validated DFO detections and preview the first rows
annot_csv_path = path_dict['original_annot'] + "DFOCRP_H50bjRcb-WCV1_ValidatedDetections.csv"
annot_df = pd.read_csv(annot_csv_path)
annot_df.head()

In [None]:
# Distinct file names referenced by the annotations, in order of first appearance
all_filenames = annot_df.loc[:, 'filename'].unique()
len(all_filenames)

In [None]:
# Row count before overlapping annotations are merged
print("Len before: ", annot_df.shape[0])

In [None]:
def merge_intervals(intervals_list, freq_intervals_list):
    """ Merge overlapping time annotations and combine their frequency ranges

        Two annotations are merged when the later one starts at or before the
        end of the earlier one (touching intervals count as overlapping). The
        frequency range of a merged annotation is the envelope (lowest minimum,
        highest maximum) of every annotation that was merged into it.

        Neither input list is modified.

        Args:
            intervals_list: list
                [start, end] pairs, possibly overlapping, in any order.
            freq_intervals_list: list
                [freq_min, freq_max] pairs; entry i belongs to intervals_list[i].

        Returns:
            merged_time_intervals: list
                [start, end] pairs sorted by start, with overlaps merged.
            merged_freq_intervals: list
                [freq_min, freq_max] pairs; entry i belongs to
                merged_time_intervals[i].

        Raises:
            ValueError: if the two input lists differ in length.
    """
    if len(intervals_list) != len(freq_intervals_list):
        raise ValueError("intervals_list and freq_intervals_list must have the same length")

    # Sort time and frequency entries together so they stay aligned. sorted()
    # builds a new list (and is stable), so the caller's lists are untouched.
    paired = sorted(zip(intervals_list, freq_intervals_list),
                    key=lambda pair: pair[0][0])

    merged_time_intervals = []
    merged_freq_intervals = []

    for time_interval, freq_interval in paired:
        overlaps_previous = (bool(merged_time_intervals)
                             and time_interval[0] <= merged_time_intervals[-1][1])
        if overlaps_previous:
            previous_time = merged_time_intervals[-1]
            previous_freq = merged_freq_intervals[-1]
            # Extend the running interval and widen its frequency envelope
            previous_time[1] = max(previous_time[1], time_interval[1])
            previous_freq[0] = min(previous_freq[0], freq_interval[0])
            previous_freq[1] = max(previous_freq[1], freq_interval[1])
        else:
            # Start a new merged interval; fresh lists so inputs are never aliased
            merged_time_intervals.append([time_interval[0], time_interval[1]])
            merged_freq_intervals.append([freq_interval[0], freq_interval[1]])

    return merged_time_intervals, merged_freq_intervals

In [None]:
# Extract all the annotations of each unique file
# Map the logical field names used in this notebook to the CSV column headers
column_name_mapping = {
    'filename': 'filename',
    'start': 'start',
    'end': 'end',
    'freq_min': 'lowFreq',
    'freq_max': 'highFreq',
    'label': 'Sound.ID.Species',
}
overlap_count_dict = {}  # NOTE(review): never filled in this cell - confirm whether it is still needed
row_list = []

# Resolve the column headers once instead of on every access
filename_col = column_name_mapping['filename']
start_col = column_name_mapping['start']
end_col = column_name_mapping['end']
freq_min_col = column_name_mapping['freq_min']
freq_max_col = column_name_mapping['freq_max']
label_col = column_name_mapping['label']

for filename in all_filenames:
    # Annotations belonging to the current file
    rows_for_file = annot_df[annot_df[filename_col] == filename]

    # Go through every distinct label that occurs in this file
    for each_label in rows_for_file[label_col].unique():
        overlap_count = 0  # NOTE(review): reset per label but never read in this cell
        rows_for_label = rows_for_file[rows_for_file[label_col] == each_label]

        # A single annotation for this file/label has nothing to merge with: copy it as-is
        if len(rows_for_label) <= 1:
            row_list.append([rows_for_label[column].values[0]
                             for column in (filename_col, start_col, end_col,
                                            freq_min_col, freq_max_col, label_col)])
            continue

        # Several annotations share this file and label: merge the overlapping ones
        time_intervals_list = [list(pair) for pair in zip(rows_for_label[start_col].values,
                                                          rows_for_label[end_col].values)]
        freq_intervals_list = [list(pair) for pair in zip(rows_for_label[freq_min_col].values,
                                                          rows_for_label[freq_max_col].values)]

        merged_time_intervals_list, merged_freq_intervals_list = merge_intervals(
            time_intervals_list, freq_intervals_list)

        assert len(merged_time_intervals_list)==len(merged_freq_intervals_list)
        for time_span, freq_span in zip(merged_time_intervals_list, merged_freq_intervals_list):
            row_list.append([filename,      # filename
                             time_span[0],  # start
                             time_span[1],  # end
                             freq_span[0],  # freq_min
                             freq_span[1],  # freq_max
                             each_label])   # label

In [None]:
# Rebuild the annotation table from the merged rows, keeping the original column headers.
# Note: this rebinds annot_df, so the un-merged frame is no longer available afterwards.
merged_columns = [column_name_mapping[field]
                  for field in ('filename', 'start', 'end', 'freq_min', 'freq_max', 'label')]
annot_df = pd.DataFrame(row_list, columns=merged_columns)
print("Len after removing overlapping annotations:", len(annot_df))