In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from datetime import datetime

In [2]:
# Project root, relative to this notebook. Absolute alternatives used on other machines:
# root_path='/home/sadmans/KW_detector_multiclass/' # Orca-VM
# root_path='/home/sadman/KW_detector_multiclass/' # DL Training pc
# root_path='/home/sadman/projects/ctb-ruthjoy/sadman/Projects/KW_detector_multiclass/' # Cedar
root_path='../../'
cedar_username='sadman'

# Annotation directories. Every value ends with '/' because the other cells
# build file paths by plain string concatenation.
path_dict={ 'original_annot': root_path+'annotations/original/',
            'train_annot': root_path+'annotations/train/',
            'test_annot': root_path+'annotations/test/' }

# Database-related parameters
db_name_string='dfo_no_overlap' # onc_barkley_canyon, jasco_malahat_vfpa, dfo, dfo_no_overlap.

# Annotation file name for each supported database name
annot_filename_by_db={ 'onc_barkley_canyon': 'annot_ONC-BarkleyCanyon-JK_KW-HB-D_HALLO_train.csv',
                       'jasco_malahat_vfpa': 'JASCO_Malahat_VFPA_reformatted_annotations.csv',
                       'dfo': 'DFOCRP_H50bjRcb-WCV1_ValidatedDetections.csv',
                       'dfo_no_overlap': 'dfo_annot_no_overlap.csv' }

# Fail here with a clear message instead of leaving annot_filename undefined
# (which would only surface later as a NameError when the file is loaded).
if(db_name_string not in annot_filename_by_db):
    raise ValueError("Unknown db_name_string '"+db_name_string+"'; expected one of: "+', '.join(sorted(annot_filename_by_db)))
annot_filename=annot_filename_by_db[db_name_string]

In [3]:
# Read the original annotation table of the selected database
annot_csv_path = path_dict['original_annot'] + annot_filename
annot_df = pd.read_csv(annot_csv_path, sep=',')
print(annot_df.head())

                                            filename    start         end  \
0  /data/audio/DFO/DFOCRP_H50bjRcb-WCV1/DFOCRP.H5...  188.343  188.624250   
1  /data/audio/DFO/DFOCRP_H50bjRcb-WCV1/DFOCRP.H5...  190.718  190.983625   
2  /data/audio/DFO/DFOCRP_H50bjRcb-WCV1/DFOCRP.H5...    2.781    3.077875   
3  /data/audio/DFO/DFOCRP_H50bjRcb-WCV1/DFOCRP.H5...  193.217  193.638875   
4  /data/audio/DFO/DFOCRP_H50bjRcb-WCV1/DFOCRP.H5...    0.620    0.948125   

   lowFreq  highFreq Sound.ID.Species  
0     1856      2048     Vessel Noise  
1     2816      3040     Vessel Noise  
2     1696      1824     Vessel Noise  
3     2560      2720     Vessel Noise  
4      960      1056     Vessel Noise  


In [4]:
# DFO 'Sound.ID.Species' value -> class label. Species not listed here are dropped.
_DFO_SPECIES_TO_LABEL = {
    'Vessel Noise': 'OTHER',
    'Nothing': 'OTHER',
    'Anchor Noise': 'OTHER',
    'Chain': 'OTHER',
    'Clang': 'OTHER',
    'Rubbing': 'OTHER',
    'Vessel Noise?': 'OTHER',
    'KW': 'KW',
    'HW': 'HB',
    'PWSD': 'D',
}

# JASCO 'comments' value -> (class label, required 'validation' value).
# 'validation' uses the following convention:
#   - 0: not KW
#   - 1: KW
#   - 2: uncertain
# while the annotator's best guess at a species/ecotype identification is
# saved in the 'comments' field.
# NOTE(review): the original selection notes list "VESSEL CHAIN" as a single
# comment and mention "blanks"; the code has always matched the separate
# strings 'VESSEL' and 'CHAIN' and the literal string 'blanks'. That behaviour
# is kept as-is here -- confirm against the annotation file.
_JASCO_COMMENT_RULES = {
    'BELL': ('OTHER', 0),
    'Chain': ('OTHER', 0),
    'VESSEL': ('OTHER', 0),
    'CHAIN': ('OTHER', 0),
    'blanks': ('OTHER', 0),
    'KW': ('KW', 1),
    'KWSR': ('KW', 1),
    'KWT': ('KW', 1),
    'HB': ('HB', 0),
}


def _preprocess_onc(annot_df, project_root):
    """ Pre-process the ONC Barkley Canyon annotations (returns a new table). """
    out = annot_df.copy()

    # Replacing all PWSD with D
    out['sound_id_species'] = out['sound_id_species'].replace('PWSD', 'D')

    # Adding data dir path with the filename
    out['filename'] = project_root + 'SRKW/ONC/BarkleyCanyon/' + out['filename'].astype(str)

    # Replace incorrect end times (duration above 60 s) by start + the mean
    # duration of the well-formed 'D' annotations (one annotation end time was
    # incorrectly set to the same value as the max-freq, 4708.4).
    duration = out['end'] - out['start']
    mean_d_duration = duration[(duration < 60.0) & (out['sound_id_species'] == 'D')].mean()
    too_long = duration > 60.0
    out['end'] = out['end'].where(~too_long, out['start'] + mean_d_duration)

    # Keep the 'duration' column of the original output, but computed from the
    # corrected end times so that it is consistent with 'start'/'end'.
    out['duration'] = out['end'] - out['start']

    # Rename the label column
    return out.rename(columns={'sound_id_species': 'label'})


def _preprocess_jasco(annot_df, project_root):
    """ Keep and label the JASCO Malahat/VFPA rows matching _JASCO_COMMENT_RULES. """
    comments = annot_df['comments']
    label = comments.map({c: rule[0] for c, rule in _JASCO_COMMENT_RULES.items()})
    required_validation = comments.map({c: rule[1] for c, rule in _JASCO_COMMENT_RULES.items()})

    # A row is kept only if its comment is known AND its validation flag matches
    keep = label.notna() & (annot_df['validation'] == required_validation)
    out = annot_df.loc[keep].copy()

    malahat_dir = project_root + 'SRKW/JASCO/Malahat/'
    vfpa_dir = project_root + 'SRKW/JASCO/VFPA/'

    def add_data_dir(name):
        # Update filenames with the appropriate paths
        if('SHMALAHAT' in name):
            return malahat_dir + name
        if('VFPA' in name):
            return vfpa_dir + name
        return name

    out['filename'] = out['filename'].map(add_data_dir)
    out['label'] = label[keep].to_numpy()
    return out


def _preprocess_dfo(annot_df, username, project_root, drop_columns=()):
    """ Keep and label the DFO rows whose species is in _DFO_SPECIES_TO_LABEL. """
    # First, delete unnecessary columns (columns that are absent are ignored)
    out = annot_df.drop(columns=list(drop_columns), errors='ignore')

    # Filter out relevant species or background sound
    label = out['Sound.ID.Species'].map(_DFO_SPECIES_TO_LABEL)
    keep = label.notna()
    out = out.loc[keep].copy()

    # Adding path where the tar will be extracted: only the parent directory
    # and the file name of the original path are kept. The annotation paths are
    # '/'-separated, so split on '/' rather than on the platform separator.
    database_dir = project_root + username + '/Projects/KW_detector_multiclass/saved_database/'
    out['filename'] = out['filename'].map(lambda path: database_dir + '/'.join(str(path).split('/')[-2:]))
    out['label'] = label[keep].to_numpy()
    return out


def preprocess(annot_df, db_name=None, username=None):
    """ Delete unwanted columns from the annotations dataframe and 
        apply necessary pre-processing on the annotation columns

        The input table is not modified; a new table is returned. Rows keep
        their original index, and the class name is stored in a 'label' column.

        Args:
            annot_df: pandas DataFrame
                Annotation table.
            db_name: str
                Database name ('onc_barkley_canyon', 'jasco_malahat_vfpa',
                'dfo' or 'dfo_no_overlap'). Defaults to the notebook-level
                `db_name_string`.
            username: str
                User name inserted in the rewritten file paths. Defaults to
                the notebook-level `cedar_username`.

        Returns:
            annot_df: pandas DataFrame
                Annotation table after pre-processing

        Raises:
            ValueError: if the database name is not one of the supported names.

    """
    if(db_name is None):
        db_name = db_name_string
    if(username is None):
        username = cedar_username

    project_root = '/home/' + username + '/projects/ctb-ruthjoy/'

    if(db_name=='onc_barkley_canyon'):
        return _preprocess_onc(annot_df, project_root)
    if(db_name=='jasco_malahat_vfpa'):
        return _preprocess_jasco(annot_df, project_root)
    if(db_name=='dfo'):
        return _preprocess_dfo(annot_df, username, project_root,
                               drop_columns=['Deploy.ID', 'Loc.ID', 'CRP_UID', 'PG.Id'])
    if(db_name=='dfo_no_overlap'):
        return _preprocess_dfo(annot_df, username, project_root)

    raise ValueError("Unknown database name: " + repr(db_name))

In [5]:
# Report the table size before and after pre-processing
n_rows_before = len(annot_df)
print("Len before:", n_rows_before)

annot_df = preprocess(annot_df)
print(annot_df.head())

n_rows_after = len(annot_df)
print("Len after:", n_rows_after)

Len before: 126353
                                            filename    start         end  \
0  /home/sadman/projects/ctb-ruthjoy/sadman/Proje...  188.343  188.624250   
1  /home/sadman/projects/ctb-ruthjoy/sadman/Proje...  190.718  190.983625   
2  /home/sadman/projects/ctb-ruthjoy/sadman/Proje...    2.781    3.077875   
3  /home/sadman/projects/ctb-ruthjoy/sadman/Proje...  193.217  193.638875   
4  /home/sadman/projects/ctb-ruthjoy/sadman/Proje...    0.620    0.948125   

   lowFreq  highFreq Sound.ID.Species  label  
0     1856      2048     Vessel Noise  OTHER  
1     2816      3040     Vessel Noise  OTHER  
2     1696      1824     Vessel Noise  OTHER  
3     2560      2720     Vessel Noise  OTHER  
4      960      1056     Vessel Noise  OTHER  
Len after: 125847


In [6]:
stratify_col='label'
split_seed=42 # Fixed seed so that re-running the notebook reproduces the same partition

# Split the processed original annotation into train (80%) and test (20%),
# keeping the class proportions of `stratify_col` in both parts
train_annot_df, test_annot_df = train_test_split(
    annot_df,
    test_size=0.2,
    stratify=annot_df[stratify_col].values,
    random_state=split_seed,
)

In [7]:
# Preview the first five rows of the training split
train_annot_df.head(n=5)

Unnamed: 0,filename,start,end,lowFreq,highFreq,Sound.ID.Species,label
79826,/home/sadman/projects/ctb-ruthjoy/sadman/Proje...,70.15,70.571875,1216,1728,HW,HB
22276,/home/sadman/projects/ctb-ruthjoy/sadman/Proje...,267.875,268.25,2080,2368,HW,HB
44397,/home/sadman/projects/ctb-ruthjoy/sadman/Proje...,116.139,116.48275,800,960,HW,HB
49122,/home/sadman/projects/ctb-ruthjoy/sadman/Proje...,49.499,49.90525,960,1056,HW,HB
59073,/home/sadman/projects/ctb-ruthjoy/sadman/Proje...,57.609,58.030875,1184,1536,HW,HB


In [8]:
# Make sure the output directories exist; to_csv does not create them
for out_dir in (path_dict['train_annot'], path_dict['test_annot']):
    os.makedirs(out_dir, exist_ok=True)

# Write both splits without the index column.
# NOTE(review): the two file names use different suffixes ('_train_multiclass.csv'
# vs '_test_multiclass_annot.csv'); kept unchanged because other code may read
# these exact names -- confirm before harmonising.
train_annot_df.to_csv(path_dict['train_annot']+db_name_string+'_train_multiclass.csv', index=False)
test_annot_df.to_csv(path_dict['test_annot']+db_name_string+'_test_multiclass_annot.csv', index=False)