# **Consistent Annotated Train-Validation Split**

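- Reuses the existing train-validation split for the annotated training data, so that every other stage of the training and feature extraction process works with the same partition
- The split is intended to be 75-25 over the training data; counted per annotation row, the result below is 3444 train / 834 validation rows (roughly 80-20)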

In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import os
import time

# Libraries for audio
from IPython.display import Audio
import librosa

# Training and Testing Split
from sklearn.model_selection import train_test_split

# for normalization & avgpooling features
from sklearn.preprocessing import MinMaxScaler # to squeeze all the features to be within 0 and 1
import tensorflow as tf

# Operational
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.ndimage

In [5]:
# Variables to be reused
# Every dataset location hangs off one project root, spelled out once here.
project_dir = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4'

path = project_dir + '/audio_files'
npy_path = project_dir + '/train_audio_npy/'  # note: keeps its trailing slash
train_csv = project_dir + '/train-not-annotated.csv'
annotated_train_csv = project_dir + '/train-annotated.csv'
not_annotated_splt = project_dir + '/trainval-split/trainval.csv'

sr = 22050  # presumably the audio sample rate in Hz — TODO confirm against the loader

In [9]:
# Load the annotated training table; the preview suggests one row per
# annotated (start, end, label) segment of a recording.
# NOTE(review): the preview shows two leading 'Unnamed' columns, which look like
# row indexes written by earlier to_csv calls — confirm before relying on or dropping them.
df_train = pd.read_csv(filepath_or_buffer=annotated_train_csv)
df_train.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,audio_name,duration,filename_npy,start,end,label
0,0,Ciconia ciconia,XC171088.mp3,32.470204,Ciconia ciconia/XC171088.npy,11.021315,16.283807,clapping
1,1,Ciconia ciconia,XC171088.mp3,32.470204,Ciconia ciconia/XC171088.npy,19.004347,27.850152,clapping
2,2,Ciconia ciconia,XC171088.mp3,32.470204,Ciconia ciconia/XC171088.npy,28.105281,29.315961,clapping
3,3,Alcedo atthis,XC503772.mp3,9.195102,Alcedo atthis/XC503772.npy,1.875801,3.946005,call
4,4,Alcedo atthis,XC503772.mp3,9.195102,Alcedo atthis/XC503772.npy,4.429412,4.707893,call


In [11]:
# Load the existing train/validation assignment; its 'set' column holds the
# split label for each recording (the preview shows 'tr').
split_df = pd.read_csv(filepath_or_buffer=not_annotated_splt)
split_df.head(n=5)

Unnamed: 0.1,Unnamed: 0,species,audio_name,duration,filename_npy,set
0,602,Acrocephalus melanopogon,XC542357.mp3,37.955918,Acrocephalus melanopogon/XC542357.npy,tr
1,195,Acrocephalus melanopogon,XC360704.mp3,18.796553,Acrocephalus melanopogon/XC360704.npy,tr
2,116,Acrocephalus melanopogon,XC213784.mp3,75.455782,Acrocephalus melanopogon/XC213784.npy,tr
3,596,Acrocephalus melanopogon,XC358629.mp3,40.449161,Acrocephalus melanopogon/XC358629.npy,tr
4,219,Acrocephalus melanopogon,XC357368.mp3,86.232018,Acrocephalus melanopogon/XC357368.npy,tr


In [4]:
# Number of annotation rows per species, most frequent first
species_counts = df_train['species'].value_counts()
species_counts

species
Dendrocopos minor            378
Alcedo atthis                374
Ixobrychus minutus           374
Acrocephalus arundinaceus    339
Motacilla flava              326
Botaurus stellaris           281
Fulica atra                  276
Charadrius alexandrinus      245
Gallinula chloropus          216
Circus aeruginosus           201
Himantopus himantopus        196
Acrocephalus melanopogon     179
Coracias garrulus            179
Ardea purpurea               164
Porphyrio porphyrio          161
Ciconia ciconia               96
Acrocephalus scirpaceus       90
Tachybaptus ruficollis        89
Anas strepera                 76
Anas platyrhynchos            38
Name: count, dtype: int64

In [13]:
# Attach each recording's train/validation label ('set') to its annotation rows.
# Exact duplicate (audio_name, set) rows in split_df are harmless, so they are
# collapsed first; validate='many_to_one' then raises a MergeError if any
# recording is still listed more than once (i.e. with conflicting labels),
# instead of silently duplicating annotation rows.
split_lookup = split_df[['audio_name', 'set']].drop_duplicates()
df = df_train.merge(split_lookup, on='audio_name', how='left', validate='many_to_one')

# A left merge leaves 'set' as NaN for recordings missing from the split file;
# stop here rather than carrying unassigned rows into the saved CSV.
unassigned = df.loc[df['set'].isna(), 'audio_name'].unique()
assert len(unassigned) == 0, (
    f"{len(unassigned)} audio file(s) have no train/val assignment, e.g. {list(unassigned[:5])}"
)

# Check the updated DataFrame
df.head()

Unnamed: 0.1,Unnamed: 0,species,audio_name,duration,filename_npy,start,end,label,set
0,0,Ciconia ciconia,XC171088.mp3,32.470204,Ciconia ciconia/XC171088.npy,11.021315,16.283807,clapping,tr
1,1,Ciconia ciconia,XC171088.mp3,32.470204,Ciconia ciconia/XC171088.npy,19.004347,27.850152,clapping,tr
2,2,Ciconia ciconia,XC171088.mp3,32.470204,Ciconia ciconia/XC171088.npy,28.105281,29.315961,clapping,tr
3,3,Alcedo atthis,XC503772.mp3,9.195102,Alcedo atthis/XC503772.npy,1.875801,3.946005,call,tr
4,4,Alcedo atthis,XC503772.mp3,9.195102,Alcedo atthis/XC503772.npy,4.429412,4.707893,call,tr


In [14]:
# Number of annotation rows assigned to each split label
set_counts = df['set'].value_counts()
set_counts

set
tr     3444
val     834
Name: count, dtype: int64

In [15]:
# Column of per-row .npy file names.
# NOTE(review): not used by any cell visible here — confirm whether it is still needed.
filepaths = df.loc[:, 'filename_npy']

In [18]:
def _unique_pairs(frame):
    """Return the distinct (audio_name, set) rows of `frame`, ordered by audio_name with a fresh 0..n-1 index."""
    pairs = frame[['audio_name', 'set']].drop_duplicates()
    return pairs.sort_values(by='audio_name').reset_index(drop=True)


# Reduce both tables to one row per distinct (audio_name, set) pair, in the
# same order, so they can be compared element by element.
train_unique = _unique_pairs(df)
split_unique = _unique_pairs(split_df)

pairs_match = train_unique.equals(split_unique)
if not pairs_match:
    print("There are mismatches between df_train and split_df.")
    # Stack both sides and drop every pair that occurs more than once
    # (keep=False); what remains occurs on one side only.
    mismatched_rows = pd.concat([train_unique, split_unique]).drop_duplicates(keep=False)
    print("Mismatched rows:")
    print(mismatched_rows)
else:
    print("All unique audio_name and set pairs match between df_train and split_df.")

All unique audio_name and set pairs match between df_train and split_df.


In [19]:
# Write the annotated table, now carrying its 'set' column, next to the split file.
# index=False keeps the DataFrame's row index out of the CSV.
annotated_split_csv = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/trainval-split/trainval-annotated.csv'
df.to_csv(annotated_split_csv, index=False)