# **Stage Zero: Data Selection** 



* Here, I will do some data analysis within the dataset and decide which species I want to work on for the project.
* I will load the metadata csv and clean it up, then do some data analysis and select species based on audio duration.
* This is based on audio duration, assuming the species with more audio duration contains more data to assist the models in training
* From the selected species, I will then convert the audio samples into npy format to remove that step in later stages

## Libraries

In [36]:
# Standard libraries
import numpy as np
import pandas as pd
import os
import time

# Libraries for audio
from IPython.display import Audio
import librosa

# Training and Testing Split
from sklearn.model_selection import train_test_split

# Splitting data
import tensorflow as tf

# Operational
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

## Cleaning the dataset

In [3]:
md_raw = pd.read_csv('C:/Users/thato/OneDrive - University of Cape Town/Bird Classification Project/AudioSamples/birdclef-2023/train_metadata.csv') # raw metadata
md_raw.head()

Unnamed: 0,primary_label,secondary_labels,type,latitude,longitude,scientific_name,common_name,author,license,rating,url,filename
0,abethr1,[],['song'],4.3906,38.2788,Turdus tephronotus,African Bare-eyed Thrush,Rolf A. de By,Creative Commons Attribution-NonCommercial-Sha...,4.0,https://www.xeno-canto.org/128013,abethr1/XC128013.ogg
1,abethr1,[],['call'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363501,abethr1/XC363501.ogg
2,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,3.5,https://www.xeno-canto.org/363502,abethr1/XC363502.ogg
3,abethr1,[],['song'],-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,5.0,https://www.xeno-canto.org/363503,abethr1/XC363503.ogg
4,abethr1,[],"['call', 'song']",-2.9524,38.2921,Turdus tephronotus,African Bare-eyed Thrush,James Bradley,Creative Commons Attribution-NonCommercial-Sha...,4.5,https://www.xeno-canto.org/363504,abethr1/XC363504.ogg


In [4]:
sr = 16000 # sample rate

### Removing the duplicates

In [5]:
path = 'C:/Users/thato/OneDrive - University of Cape Town/Bird Classification Project/AudioSamples/birdclef-2023/train_audio/'

In [6]:
# Function to get audio duration
def audioduration(file_path, sr=sr):
  audio, _ = librosa.load(file_path, sr=sr)
  return librosa.get_duration(y=audio, sr=sr)

In [7]:
durations = []
for index, row in tqdm(md_raw.iterrows(), total=len(md_raw), desc="Processing durations"):
  filename = row['filename']
  duration = audioduration(path + filename)
  durations.append(duration)

md_raw['duration'] = durations

Processing durations: 100%|██████████| 16941/16941 [10:30<00:00, 26.88it/s] 


In [8]:
md_clean = md_raw.drop_duplicates(subset=['duration', 'latitude', 'type', 'primary_label', 'author'], keep='first')

### Dropping unused columns

In [9]:
column_delete = ['secondary_labels', 'scientific_name', 'common_name', 'author', 'license', 'url', 'latitude', 'longitude']
md_clean.drop(column_delete, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  md_clean.drop(column_delete, axis=1, inplace=True)


# Data Analysis

### Checking for class imbalance by sample size

In [11]:
counts = md_clean['primary_label'].value_counts().reset_index()
counts.columns = ['primary_label', 'Number']

# Sort by the 'primary_label' column (which was formerly 'index')
counts = counts.sort_values('Number', ascending=False)

# Display the result
counts.head(50)

Unnamed: 0,primary_label,Number
0,wlwwar,500
1,thrnig1,500
2,eaywag1,499
3,comsan,497
4,barswa,496
5,woosan,486
6,combuz1,475
7,hoopoe,436
8,eubeat1,434
9,cohmar1,424


In [12]:
np.max(counts['primary_label'])

'yewgre1'

### Checking for class imbalance by duration

In [13]:
duration_sum = md_clean[['primary_label', 'duration']].groupby('primary_label').sum('duration').reset_index().sort_values('primary_label', ascending=False)
duration_sum['duration_mins'] = duration_sum['duration'] / 60

duration_sum = duration_sum.sort_values('duration', ascending=False)
duration_sum.head(50)

Unnamed: 0,primary_label,duration,duration_mins
221,thrnig1,61606.555,1026.775917
245,wlwwar,33523.055875,558.717598
89,eubeat1,28229.312125,470.488535
123,hoopoe,25088.625188,418.143753
73,combuz1,24756.75925,412.612654
70,cohmar1,20435.822688,340.597045
20,barswa,19109.527312,318.492122
84,eaywag1,16745.787688,279.096461
74,comsan,15471.654,257.8609
72,combul2,14317.793687,238.629895


### I am only interested in the top 50 species with the most samples (more variation)

In [21]:
species_counts = md_clean['primary_label'].value_counts().reset_index()
species_counts.columns = ['primary_label', 'num_samples']
species_counts_sorted = species_counts.sort_values('num_samples', ascending=False)
top_50_species = species_counts_sorted.head(50)
top_50_species_list = top_50_species['primary_label'].tolist()
top_50_species_list

['wlwwar',
 'thrnig1',
 'eaywag1',
 'comsan',
 'barswa',
 'woosan',
 'combuz1',
 'hoopoe',
 'eubeat1',
 'cohmar1',
 'litegr',
 'combul2',
 'rbsrob1',
 'blakit1',
 'greegr',
 'gnbcam2',
 'rerswa1',
 'somgre1',
 'colsun2',
 'ratcis1',
 'blbpuf2',
 'categr',
 'tafpri1',
 'carcha1',
 'egygoo',
 'fotdro5',
 'gargan',
 'grecor',
 'yertin1',
 'wbrcha2',
 'hadibi1',
 'abhori1',
 'reccuc1',
 'strher',
 'piekin1',
 'rindov',
 'cibwar1',
 'bkctch1',
 'yewgre1',
 'laudov1',
 'yebapa1',
 'varsun2',
 'afpfly1',
 'grewoo2',
 'trobou1',
 'tamdov1',
 'grbcam1',
 'spmthr1',
 'piecro1',
 'afecuc1']

In [22]:
md_filtered = md_clean[md_clean['primary_label'].isin(top_50_species_list)] # filtered metadata with interested species (chosen due to similar number of auddio samples)

### Finding out class balance by using number of samples and audio duration

In [23]:
counts = md_filtered['primary_label'].value_counts().reset_index()
counts.columns = ['primary_label', 'num_samples']

In [24]:
duration_sum = md_filtered.groupby('primary_label')['duration'].sum().reset_index()
duration_sum.columns = ['primary_label', 'total_duration_mins']
duration_sum['total_duration_mins'] = duration_sum['total_duration_mins'] / 60

In [26]:
merged_df = pd.merge(counts, duration_sum, on='primary_label')
merged_df.head()

Unnamed: 0,primary_label,num_samples,total_duration_mins
0,thrnig1,500,1026.775917
1,wlwwar,500,558.717598
2,eaywag1,499,279.096461
3,comsan,497,257.8609
4,barswa,496,318.492122


In [28]:
merged_df['normalized_samples'] = merged_df['num_samples'] / merged_df['num_samples'].max()
merged_df['normalized_duration'] = merged_df['total_duration_mins'] / merged_df['total_duration_mins'].max()

In [33]:
merged_df.head(30)

Unnamed: 0,primary_label,num_samples,total_duration_mins,normalized_samples,normalized_duration,balance_score
0,thrnig1,500,1026.775917,1.0,1.0,1.0
1,wlwwar,500,558.717598,1.0,0.544148,0.544148
2,eaywag1,499,279.096461,0.998,0.271818,0.273818
3,comsan,497,257.8609,0.994,0.251136,0.257136
4,barswa,496,318.492122,0.992,0.310187,0.318187
5,woosan,486,224.76278,0.972,0.218901,0.246901
6,combuz1,475,412.612654,0.95,0.401853,0.451853
7,hoopoe,436,418.143753,0.872,0.40724,0.53524
8,eubeat1,434,470.488535,0.868,0.458219,0.590219
9,cohmar1,424,340.597045,0.848,0.331715,0.483715


The goal is to find the species with the smallest difference between these two metrics in order to find the best balanced data

In [34]:
interested_species = ['litegr',
 'combul2',
 'rbsrob1',
 'blakit1',
 'greegr',
 'gnbcam2',
 'rerswa1',
 'somgre1',
 'colsun2',
 'ratcis1',
 'blbpuf2',
 'categr',
 'tafpri1',
 'carcha1',
 'egygoo',
 'fotdro5',
 'gargan',
 'grecor',
 'yertin1',
 'wbrcha2']
df = md_clean[md_clean['primary_label'].isin(interested_species)] 

df.head()

Unnamed: 0,primary_label,type,rating,filename,duration
1837,blakit1,['alarm call'],4.0,blakit1/XC115289.ogg,32.914313
1838,blakit1,['advertising call'],3.5,blakit1/XC120438.ogg,6.06
1839,blakit1,['call'],3.5,blakit1/XC128222.ogg,30.0
1840,blakit1,['call'],2.5,blakit1/XC129824.ogg,22.704
1841,blakit1,['call'],2.5,blakit1/XC129825.ogg,30.048


In [35]:
df['primary_label'].value_counts()

primary_label
litegr     369
combul2    291
rbsrob1    281
blakit1    261
greegr     252
gnbcam2    238
rerswa1    227
somgre1    198
colsun2    180
ratcis1    172
categr     166
blbpuf2    166
tafpri1    160
carcha1    152
egygoo     152
fotdro5    137
grecor     136
gargan     136
yertin1    134
wbrcha2    131
Name: count, dtype: int64

# Splitting into training and testing

In [43]:
train, test = train_test_split(df, test_size=0.3, random_state=1414) # For reproducibility

## Standardize 'type' column

In [44]:
train.loc[train['type'].str.contains('call') & train['type'].str.contains('song'), 'type'] = 'both'
train.loc[train['type'].str.contains('call'), 'type'] = 'call'
train.loc[train['type'].str.contains('song'), 'type'] = 'song'
train.loc[(train['type'] != ('call')) & (train['type'] != ('song')) & (train['type'] != ('both')), 'type'] = 'blank'

test.loc[test['type'].str.contains('call') & test['type'].str.contains('song'), 'type'] = 'both'
test.loc[test['type'].str.contains('call'), 'type'] = 'call'
test.loc[test['type'].str.contains('song'), 'type'] = 'song'
test.loc[(test['type'] != ('call')) & (test['type'] != ('song')) & (test['type'] != ('both')), 'type'] = 'blank'

## Extracting the numpy arrays

In [40]:
# Function to load audio
def extract_audio(filename, sr=16000):
  filepath = 'C:/Users/thato/OneDrive - University of Cape Town/Bird Classification Project/AudioSamples/birdclef-2023/train_audio/' + filename
  audio, _ = librosa.load(filepath, sr=sr)
  return audio

In [46]:
# Function to save the audio as np arrays
def save_audio_np(filenames, is_train):
    folder_dir = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V2/train_audio_npy/' if is_train else 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V2/test_audio_npy/'
    os.makedirs(folder_dir, exist_ok=True)  # Use exist_ok=True to avoid errors if the folder exists
    filepaths = []

    # Loop over the filenames, extract the np objects, and save them to the respective directory
    for filename in tqdm(filenames, desc="Saving audio files", unit="file"):
        audio = extract_audio(filename)
        folder = filename.split('/')[0]
        name = filename.split('/')[1]
        new_filename = name.replace('.ogg', '.npy')
        filepaths.append(f'{folder}/{new_filename}')

        # Create save directory
        save_dir = f'{folder_dir}/{folder}/'
        os.makedirs(save_dir, exist_ok=True)
        save_filename = f'{save_dir}{new_filename}'

        # Check if the file already exists
        if os.path.exists(save_filename):
            print(f"File {save_filename} already exists. Skipping...")
            continue  # Skip the current iteration if the file exists

        # Save .npy object
        np.save(save_filename, audio)

    # Assert all filenames have been looped through, and all files have been saved
    assert len(filenames) == len(filepaths)
    return filepaths

In [47]:
train_filepaths = save_audio_np(train['filename'], is_train=True)

Saving audio files: 100%|██████████| 2757/2757 [03:10<00:00, 14.44file/s]


In [48]:
train['filename_npy'] = train_filepaths # Adding new column with the location of the npy filepaths

In [49]:
test_filepaths = save_audio_np(test['filename'], is_train=False)

Saving audio files: 100%|██████████| 1182/1182 [00:56<00:00, 20.75file/s]


In [50]:
test['filename_npy'] = test_filepaths # Adding new column with the location of the npy filepaths

In [51]:
train.to_csv('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V2/traintest-split/train.csv')
test.to_csv('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V2/traintest-split/test.csv')