# **Consistent Annotated Train-Validation Split**

- This notebook attaches the existing train/validation assignment (read from `trainval.csv`) to the annotated training data, so that every later stage of training and feature extraction uses the same split.
- The split is roughly 75% training / 25% validation.

In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import os
import time

# Libraries for audio
from IPython.display import Audio
import librosa

# Training and Testing Split
from sklearn.model_selection import train_test_split

# for normalization & avgpooling features
from sklearn.preprocessing import MinMaxScaler # to squeeze all the features to be within 0 and 1
import tensorflow as tf

# Operational
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [6]:
# Variables reused throughout the notebook.
# The project root is taken from the BIRD_PROJECT_DIR environment variable when
# it is set, so the notebook can run on another machine without editing this
# cell; otherwise it falls back to the original local directory.
PROJECT_DIR = os.environ.get(
    'BIRD_PROJECT_DIR',
    'C:/Users/thato/Documents/Final-Year-Project/Dataset/Final-Version-Of-Bird-Classification-Project',
).rstrip('/')

# Forward slashes are used on purpose (instead of os.path.join) so the resulting
# strings are identical to the previous hard-coded values on every platform.
path = f'{PROJECT_DIR}/audio_files'  # no trailing slash
npy_path = f'{PROJECT_DIR}/train_audio_npy/'  # trailing slash kept: presumably prefixed to `filename_npy` values -- TODO confirm
train_csv = f'{PROJECT_DIR}/train-not-annotated.csv'
annotated_train_csv = f'{PROJECT_DIR}/train-annotated.csv'
not_annotated_splt = f'{PROJECT_DIR}/trainval-split/trainval.csv'
sr = 22050  # presumably the audio sampling rate in Hz -- TODO confirm

In [5]:
# Load the annotated training table (the displayed columns include
# start / end / label for each row).
df_train = pd.read_csv(filepath_or_buffer=annotated_train_csv)
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,audio_name,duration,filename_npy,start,end,label
0,2669,Dendrocopos minor,XC470657.mp3,33.267528,Dendrocopos minor/XC470657.npy,0.10952,0.654736,drumming
1,1617,Porphyrio porphyrio,XC357911.mp3,10.293832,Porphyrio porphyrio/XC357911.npy,5.949768,6.382371,call
2,3643,Acrocephalus arundinaceus,XC473571.mp3,98.38517,Acrocephalus arundinaceus/XC473571.npy,54.70896,60.05328,song
3,2448,Alcedo atthis,XC511677.mp3,186.024036,Alcedo atthis/XC511677.npy,128.688746,128.90135,call
4,2836,Alcedo atthis,XC511674.mp3,82.56,Alcedo atthis/XC511674.npy,48.46272,49.241137,call


In [7]:
# Load the previously saved train/validation assignment; its `set` column
# holds the split label for each listed recording.
split_df = pd.read_csv(filepath_or_buffer=not_annotated_splt)
split_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,audio_name,duration,filename_npy,set
0,552,Himantopus himantopus,XC184422.mp3,155.899864,Himantopus himantopus/XC184422.npy,tr
1,560,Himantopus himantopus,XC67351.mp3,13.826168,Himantopus himantopus/XC67351.npy,tr
2,508,Himantopus himantopus,XC312572.mp3,34.678639,Himantopus himantopus/XC312572.npy,tr
3,561,Himantopus himantopus,XC144533.mp3,39.428571,Himantopus himantopus/XC144533.npy,tr
4,468,Himantopus himantopus,XC154931.mp3,14.18449,Himantopus himantopus/XC154931.npy,tr


In [8]:
# Number of rows in the annotated table per species, most frequent first.
species_counts = df_train['species'].value_counts()
species_counts

species
Ixobrychus minutus           428
Botaurus stellaris           313
Alcedo atthis                300
Acrocephalus arundinaceus    277
Charadrius alexandrinus      262
Fulica atra                  253
Dendrocopos minor            237
Himantopus himantopus        228
Circus aeruginosus           228
Motacilla flava              223
Coracias garrulus            179
Acrocephalus melanopogon     171
Gallinula chloropus          167
Ardea purpurea               151
Porphyrio porphyrio          128
Tachybaptus ruficollis       119
Acrocephalus scirpaceus       92
Ciconia ciconia               83
Anas strepera                 81
Anas platyrhynchos            55
Name: count, dtype: int64

In [9]:
# Attach each recording's train/val assignment (`set`) to its annotation rows.
# The lookup is de-duplicated first: a left merge repeats a left-hand row once
# per matching right-hand row, so a repeated (audio_name, set) pair in split_df
# would otherwise silently duplicate annotations.
split_lookup = split_df[['audio_name', 'set']].drop_duplicates()
df = df_train.merge(split_lookup, on='audio_name', how='left')

# Fail loudly here rather than write a corrupted CSV later:
# - a changed row count means some audio_name is assigned to more than one set;
# - a missing `set` means an annotated recording is absent from the split file.
assert len(df) == len(df_train), (
    "merge changed the number of rows: an audio_name maps to more than one set"
)
assert df['set'].notna().all(), (
    "some annotated recordings have no train/val assignment in split_df"
)

# Check the updated DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,species,audio_name,duration,filename_npy,start,end,label,set
0,2669,Dendrocopos minor,XC470657.mp3,33.267528,Dendrocopos minor/XC470657.npy,0.10952,0.654736,drumming,tr
1,1617,Porphyrio porphyrio,XC357911.mp3,10.293832,Porphyrio porphyrio/XC357911.npy,5.949768,6.382371,call,val
2,3643,Acrocephalus arundinaceus,XC473571.mp3,98.38517,Acrocephalus arundinaceus/XC473571.npy,54.70896,60.05328,song,val
3,2448,Alcedo atthis,XC511677.mp3,186.024036,Alcedo atthis/XC511677.npy,128.688746,128.90135,call,val
4,2836,Alcedo atthis,XC511674.mp3,82.56,Alcedo atthis/XC511674.npy,48.46272,49.241137,call,tr


In [10]:
# Number of annotation rows that ended up in each split.
set_counts = df['set'].value_counts()
set_counts

set
tr     2964
val    1011
Name: count, dtype: int64

In [11]:
# `filename_npy` column of the merged table, one entry per annotation row.
filepaths = df.loc[:, 'filename_npy']

In [12]:
# Extract unique combinations of 'audio_name' and 'set' in both DataFrames
train_unique = df[['audio_name', 'set']].drop_duplicates().reset_index(drop=True)
split_unique = split_df[['audio_name', 'set']].drop_duplicates().reset_index(drop=True)

# Sort by 'audio_name' to ensure order is consistent for comparison
train_unique = train_unique.sort_values(by='audio_name').reset_index(drop=True)
split_unique = split_unique.sort_values(by='audio_name').reset_index(drop=True)

if train_unique.equals(split_unique):
    print("All unique audio_name and set pairs match between df_train and split_df.")
else:
    print("There are mismatches between df_train and split_df.")
    # Show mismatched rows by doing an anti-join (rows in one DataFrame but not the other)
    mismatched_rows = pd.concat([train_unique, split_unique]).drop_duplicates(keep=False)
    print("Mismatched rows:")
    print(mismatched_rows)

All unique audio_name and set pairs match between df_train and split_df.


In [13]:
# Persist the annotated rows together with their train/val assignment.
annotated_split_csv = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Final-Version-Of-Bird-Classification-Project/trainval-split/trainval-annotated.csv'
df.to_csv(annotated_split_csv, index=False)