# **Feature Extraction Methods: Imbalanced Data With Annotations**

- *Key Features*: [MFCCs, Mel-Spectrograms, Chroma Frequencies, RMS Power]
- *Key Manipulations*: [Varying Window Sizes, Normalization, Average Pooling (Compression), Filtering]
- *Process Assistence*: [Converting them to numpy arrays now, easy label access across features]
- *Conversion*: [To numpy arrays and pkl files]


In [1]:
# Standard libraries
import numpy as np
import pandas as pd
import os
import time

# Libraries for audio
from IPython.display import Audio
import librosa
import librosa.display

# Training and Testing Split
from sklearn.model_selection import train_test_split

# for normalization & avgpooling features
import tensorflow as tf
# for preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

# Operational
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import scipy.ndimage
import pygame
import time
from scipy.signal import butter, filtfilt
import random
import IPython.display as ipd
from functools import partial

pygame 2.6.1 (SDL 2.28.4, Python 3.12.4)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# Variabels to be reused
path = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/audio_files' 
npy_path = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/train_audio_npy/' 
train_csv = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/train-not-annotated.csv' 
annotated_train_csv = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/trainval-split/trainval-annotated.csv'
not_annotated_splt = 'C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/trainval-split/trainval.csv'
sr = 22050

In [3]:
trainval_data = pd.read_csv(annotated_train_csv)
train_data = trainval_data[trainval_data['set'] == 'tr']
val_data = trainval_data[trainval_data['set'] == 'val']

# **Creating a class to do the extraction**

In [35]:
class Extraction:
  def __init__(self, train_df, val_df, window_size, overlap=0.5, npy_path=npy_path, sr=sr, n_mels=60, n_mfcc=20, n_chroma=12, features=['mfcc'], normalize=True, avgpool=False):
    """
    Instantiate the Extraction class to extract features.

    Parameters:
      sr (int): Sample rate of the audio files.
      n_mfccs (int): Number of MFCCs to extract.
      n_mels (int): Number of Mel bands to extract.
      n_chroma (int): Number of chroma bins to use.
      features (list): List of features to extract.
        accepted features: 'mfcc', 'chroma', 'rms', 'melspectrogram'.
      normalize (bool): Whether to normalize the features.
      maxpool (bool): Whether to maxpool the features.
    """
    
    self.train_df = train_df
    self.val_df = val_df
    self.npy_path = npy_path
    self.window_size = window_size
    self.overlap = overlap
    self.sr = sr
    self.n_mels = n_mels
    self.n_mfcc = n_mfcc
    self.n_chroma = n_chroma

     # confirm features have been specified
    assert len(features) != 0, "Must Specify At Least One Feature In The Form Of A List."
    self.features = features

    self.accepted_feature = ['mfcc', 'chroma', 'rms', 'melspectrogram']
    for feature in self.features:
      assert feature in self.accepted_feature, f"{feature} is not an accepted feature, only 'mfcc', 'chroma', 'rms', 'melspectrogram' are accepted features."

    self.normalize = normalize
    self.avgpool = avgpool

    print(f"Train DataFrame shape: {train_df.shape}")
    print(f"Validation DataFrame shape: {val_df.shape}")

    # extract train and val labels and features
    self.train_y, self.train_features = self.feature_extraction(self.train_df, window_size=self.window_size)
    self.val_y, self.val_features = self.feature_extraction(self.val_df, window_size=self.window_size)

    # process the features by average pooling
    self.train_features, self.val_features = self.process_features(self.train_features, self.val_features)

    
  

  def normalize_audio(self, audio):
    return (audio - np.min(audio)) / (np.max(audio) - np.min(audio))
  
  def bandpass_filter(self, audio, lowcut=800, highcut=8000, order=4):
    nyquist = 0.5 * self.sr  # Nyquist frequency
    low = lowcut / nyquist
    high = highcut / nyquist

    b, a = butter(order, [low, high], btype='band')
    filtered_audio = filtfilt(b, a, audio)
    return filtered_audio
  
  def generate_pink_noise(self, num_samples):
    white_noise = np.random.randn(num_samples)
    
    # Apply a filter to convert white noise into pink noise (1/f noise)
    X = np.fft.rfft(white_noise)
    S = np.arange(1, len(X) + 1)  # Frequency scaling
    pink_noise = np.fft.irfft(X / S)

    if len(pink_noise) < num_samples:
        # Pad with zeros if the length is less than num_samples
        pink_noise = np.pad(pink_noise, (0, num_samples - len(pink_noise)), mode='constant')
    elif len(pink_noise) > num_samples:
        # Trim if necessary
        pink_noise = pink_noise[:num_samples]
    
    return self.normalize_audio(pink_noise)
  
  def pad_with_noise(self, audio_data, window_length, window_samples):
    current_length = librosa.get_duration(y=audio_data, sr=self.sr)

    if current_length > window_length:
        return audio_data
    
    target_length_samples = int(window_length * sr) 
    current_length_samples = window_samples
    padding_length_samples = target_length_samples - current_length_samples

    assert target_length_samples == (current_length_samples+padding_length_samples)
    
    # Generate pink noise to pad with
    pink_noise = self.generate_pink_noise(padding_length_samples)
    padded_audio = np.concatenate([audio_data, pink_noise])
    # if len(padded_audio) < target_length_samples:
    #     padded_audio = np.append(padded_audio, self.generate_pink_noise(1))

    assert target_length_samples == len(padded_audio)
    
    return padded_audio
  
  def avg_pooling_keras(self, feature):
    # Clear the previous Keras session
    tf.keras.backend.clear_session()

    # Define the input shape based on features
    input_shape = feature.shape[1:]  # (n_mels, time_steps)

    # Create the Keras model for average pooling
    inputs = tf.keras.layers.Input(shape=input_shape)
    pooled = tf.keras.layers.GlobalAveragePooling1D()(inputs)
    pooling_model = tf.keras.models.Model(inputs=inputs, outputs=pooled)

    # Perform pooling using the model
    pooled_features = pooling_model.predict(feature)

    return pooled_features

#-------------------------Feature Extraction---------------------------------------
  def extract_mfcc(self, window):
    mfcc = librosa.feature.mfcc(y=window, sr=self.sr, n_mfcc=self.n_mfcc)
    if self.normalize:
      return librosa.util.normalize(mfcc)
    else:
      return mfcc


  def extract_chroma(self, window):
    chroma = librosa.feature.chroma_stft(y=window, sr=self.sr, n_chroma=self.n_chroma)
    if self.normalize:
      return librosa.util.normalize(chroma)
    else:
      return chroma
   

  def extract_rms(self, window):
    return librosa.feature.rms(y=window)

  def extract_melspectrogram(self, window):
    mel = librosa.feature.melspectrogram(y=window, sr=self.sr, n_mels=self.n_mels)
    if self.normalize:
      return librosa.util.normalize(mel)
    else:
      return mel
    
  def avgpooling(self, train_X, val_X, n_time, n_features):
    """
    Average pooling the train and val features.

    Parameters:
      train_X (npy): Training feature array of shape (batch_size, n_features, n_time)
      val_X (npy): Validation feature array of shape (batch_size, n_features, n_time)
      n_time (int): Time axis
      n_features (int): Feature axis

    Returns:
      train_X (npy): Avgpooled training feature array of shape (batch_size, n_features)
      val_X (npy): Avgpooled validation feature array of shape (batch_size, n_features)
    """
    # Clear the Keras session
    tf.keras.backend.clear_session()
    
    # Create the Keras input layer with shape (n_features, n_time)
    input_layer = tf.keras.layers.Input(shape=(n_features, n_time))
    
    # Apply average pooling over the time axis (axis=-1) to reduce n_time
    avg_pool = tf.keras.layers.Lambda(lambda x: tf.reduce_mean(x, axis=-1))(input_layer)
    
    # Build the model
    pooling_model = tf.keras.models.Model(inputs=input_layer, outputs=avg_pool)

    # Use the model to apply average pooling on the training and validation features
    train_X = pooling_model.predict(train_X)
    val_X = pooling_model.predict(val_X)

    return train_X, val_X

    
  def process_features(self, train_features_dict, val_features_dict):
    for each in train_features_dict.keys():
      
      if each == 'mfcc':
        n_features=self.n_mfcc
      elif each == 'chroma':
        n_features=self.n_chroma
      elif each == 'rms':
        n_features=1
      elif each == 'melspectrogram':
        n_features=self.n_mels
      
      train_feature = train_features_dict[each]
      val_feature = val_features_dict[each]

      if self.avgpool:
        train_features_dict[each], val_features_dict[each] = self.avgpooling(train_feature, val_feature, n_time=train_feature.shape[2], n_features=n_features)
      else:
        train_features_dict[each], val_features_dict[each] = train_features_dict[each], val_features_dict[each]
    
    return train_features_dict, val_features_dict
      

  def feature_extraction(self, dataframe, window_size, filter=True):
    y = [] # To hold the labels
    features_dict = {item: [] for item in self.features} # Create a key for each feature listed
    print(f"Number of rows in dataframe: {len(dataframe)}")
    for _, row in tqdm(dataframe.iterrows(), desc="Processing data", total=len(dataframe)):
          label = row['species']
          file_path = os.path.join(self.npy_path, row['filename_npy'])
          start = row['start']
          end = row['end']

          # print(f"Processing file: {file_path}")

          try:
            
              audio = np.load(file_path)
          except FileNotFoundError:
              print(f"File not found: {file_path}")
              continue


          start = int(start * sr)
          end = int(end * sr)+512



          if end > len(audio):
             end = len(audio)

          sample = audio[start:end]

          if len(sample) < 512:
                continue

          sample = self.normalize_audio(sample)

          sample = self.pad_with_noise(sample, window_length=self.window_size, window_samples=len(sample))
          # print(len(sample))

          if filter:
                  sample = self.bandpass_filter(sample)

          window_samples = int(window_size * self.sr)
          hop_samples = int(window_samples * (1 - self.overlap))  # For overlapping

          # Break the audio into windows with the specified overlap
          audio_windows = librosa.util.frame(sample, frame_length=window_samples, hop_length=hop_samples).T
          
          
          # display(label)
          
          for _, window in enumerate(audio_windows):
              
              y.append(label)

              if len(window) < window_samples:
                  if len(window) < 512*2:
                     continue
                  else:
                      window = self.pad_with_noise(window, window_length=window_size)
              
              # Feature Extraction FR --------------------------------------------------------------------
              # dynatically call the extract_x function to extract the listed features
              for feature in self.features:
                extract = f"extract_{feature}"
                if hasattr(self, extract) and callable(func := getattr(self, extract)):
                  features_dict[feature].append(func(window))

          # cast lists to np arrays
    for each in features_dict.keys():
              features_dict[each] = np.array(features_dict[each])

    y = np.array(y)

          # If not using average pooling, return resized features
    return y, features_dict

# **No Average Pooling**

## **Window Size = 1s**

## **Window Size = 6s**

### **['melspectrogram']**

In [None]:
features_list = ['melspectrogram']

In [None]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


Processing data: 100%|██████████| 3444/3444 [00:43<00:00, 79.01it/s] 


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:11<00:00, 74.51it/s] 


In [None]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

In [None]:
val_y = features.val_y
display(val_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(3318,)

'melspectrogram'

(3318, 60, 44)

### Encode Classes

In [None]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [None]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [None]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [None]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [None]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_mel.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

### **['melspectrogram', 'mfcc']**

In [None]:
features_list = ['melspectrogram', 'mfcc']

In [None]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


Processing data: 100%|██████████| 3444/3444 [01:25<00:00, 40.11it/s]


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:22<00:00, 37.20it/s]


In [None]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

'mfcc'

(12565, 20, 44)

In [None]:
val_y = features.val_y
display(train_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(12565,)

'melspectrogram'

(3318, 60, 44)

'mfcc'

(3318, 20, 44)

### Encode Classes

In [None]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [None]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [None]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [None]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [None]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_mel_mfcc.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

### **['melspectrogram', 'mfcc', 'chroma']**

In [None]:
features_list = ['melspectrogram', 'mfcc', 'chroma']

In [None]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


  return pitch_tuning(
Processing data: 100%|██████████| 3444/3444 [02:20<00:00, 24.49it/s]


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:37<00:00, 22.29it/s]


In [None]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

'mfcc'

(12565, 20, 44)

'chroma'

(12565, 12, 44)

In [None]:
val_y = features.val_y
display(val_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(3318,)

'melspectrogram'

(3318, 60, 44)

'mfcc'

(3318, 20, 44)

'chroma'

(3318, 12, 44)

### Encode Classes

In [None]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [None]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [None]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [None]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [None]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_mel_mfcc_chroma.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

### **['melspectrogram', 'mfcc', 'chroma', 'rms']**

In [None]:
features_list = ['melspectrogram', 'mfcc', 'chroma', 'rms']

In [None]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


  return pitch_tuning(
Processing data: 100%|██████████| 3444/3444 [02:47<00:00, 20.60it/s]


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:41<00:00, 20.01it/s]


In [None]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

'mfcc'

(12565, 20, 44)

'chroma'

(12565, 12, 44)

'rms'

(12565, 1, 44)

In [None]:
val_y = features.val_y
display(val_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(3318,)

'melspectrogram'

(3318, 60, 44)

'mfcc'

(3318, 20, 44)

'chroma'

(3318, 12, 44)

'rms'

(3318, 1, 44)

### Encode Classes

In [None]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [None]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [None]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [None]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [None]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_all.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

### **['melspectrogram']**

In [74]:
features_list = ['melspectrogram']

In [75]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


Processing data: 100%|██████████| 3444/3444 [00:43<00:00, 79.01it/s] 


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:11<00:00, 74.51it/s] 


In [76]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

In [77]:
val_y = features.val_y
display(val_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(3318,)

'melspectrogram'

(3318, 60, 44)

### Encode Classes

In [78]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [79]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [80]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [81]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [82]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_mel.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

### **['melspectrogram', 'mfcc']**

In [83]:
features_list = ['melspectrogram', 'mfcc']

In [84]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


Processing data: 100%|██████████| 3444/3444 [01:25<00:00, 40.11it/s]


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:22<00:00, 37.20it/s]


In [85]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

'mfcc'

(12565, 20, 44)

In [86]:
val_y = features.val_y
display(train_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(12565,)

'melspectrogram'

(3318, 60, 44)

'mfcc'

(3318, 20, 44)

### Encode Classes

In [87]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [88]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [89]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [90]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [91]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_mel_mfcc.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

### **['melspectrogram', 'mfcc', 'chroma']**

In [92]:
features_list = ['melspectrogram', 'mfcc', 'chroma']

In [93]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


  return pitch_tuning(
Processing data: 100%|██████████| 3444/3444 [02:20<00:00, 24.49it/s]


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:37<00:00, 22.29it/s]


In [94]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

'mfcc'

(12565, 20, 44)

'chroma'

(12565, 12, 44)

In [95]:
val_y = features.val_y
display(val_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(3318,)

'melspectrogram'

(3318, 60, 44)

'mfcc'

(3318, 20, 44)

'chroma'

(3318, 12, 44)

### Encode Classes

In [96]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [97]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [98]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [99]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [100]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_mel_mfcc_chroma.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)

### **['melspectrogram', 'mfcc', 'chroma', 'rms']**

In [101]:
features_list = ['melspectrogram', 'mfcc', 'chroma', 'rms']

In [102]:
features = Extraction(train_data,
                      val_data,
                      window_size=1,
                      features=features_list,
                      avgpool=False
                      )

Train DataFrame shape: (3444, 9)
Validation DataFrame shape: (834, 9)
Number of rows in dataframe: 3444


  return pitch_tuning(
Processing data: 100%|██████████| 3444/3444 [02:47<00:00, 20.60it/s]


Number of rows in dataframe: 834


Processing data: 100%|██████████| 834/834 [00:41<00:00, 20.01it/s]


In [103]:
train_y = features.train_y
display(train_y.shape)

train_features = features.train_features
for key in train_features.keys():
  display(key)
  display(train_features[key].shape)

(12565,)

'melspectrogram'

(12565, 60, 44)

'mfcc'

(12565, 20, 44)

'chroma'

(12565, 12, 44)

'rms'

(12565, 1, 44)

In [104]:
val_y = features.val_y
display(val_y.shape)

val_features = features.val_features
for key in val_features.keys():
  display(key)
  display(val_features[key].shape)

(3318,)

'melspectrogram'

(3318, 60, 44)

'mfcc'

(3318, 20, 44)

'chroma'

(3318, 12, 44)

'rms'

(3318, 1, 44)

### Encode Classes

In [105]:
label_encoder = LabelEncoder().fit(train_y)
train_y_encoded = label_encoder.transform(train_y)
val_y_encoded = label_encoder.transform(val_y)

classes = list(label_encoder.inverse_transform([0, 1, 2]))
print("Encoded classes for [0, 1, 2]:", classes)
print("Encoded training labels:", train_y_encoded)
print("Encoded validation labels:", val_y_encoded)

Encoded classes for [0, 1, 2]: ['Acrocephalus arundinaceus', 'Acrocephalus melanopogon', 'Acrocephalus scirpaceus']
Encoded training labels: [ 9  9  9 ... 17 17 17]
Encoded validation labels: [14 14 14 ...  4  4  4]


In [106]:
display(len(train_y_encoded))
display(train_y_encoded[:10])

display(len(val_y_encoded))
display(val_y_encoded[:10])

12565

array([9, 9, 9, 9, 9, 9, 9, 9, 9, 9])

3318

array([14, 14, 14, 14, 14, 14, 10, 10, 10, 10])

In [107]:
train_features['label'] = train_y_encoded
val_features['label'] = val_y_encoded

In [108]:
merged_dict = {'train': train_features, 'val': val_features}
merged_dict

{'train': {'melspectrogram': array([[[1.49619878e-04, 9.91906688e-07, 2.48130880e-14, ...,
           7.57408160e-14, 2.46620584e-05, 2.42136864e-04],
          [1.51059693e-04, 1.00149214e-06, 4.29179038e-14, ...,
           1.54229494e-13, 2.47809445e-05, 2.43302732e-04],
          [1.53461197e-04, 1.01720986e-06, 7.69849280e-12, ...,
           5.91120706e-12, 2.49865924e-05, 2.45357940e-04],
          ...,
          [8.10535547e-08, 1.13486703e-09, 1.55103726e-10, ...,
           9.95547819e-11, 3.92484929e-06, 3.89781243e-05],
          [6.54161468e-08, 4.35279562e-10, 1.70086070e-13, ...,
           3.73563449e-12, 3.73668669e-06, 3.71356338e-05],
          [5.58002542e-08, 3.70017484e-10, 1.19116812e-15, ...,
           8.60750760e-15, 3.62438557e-06, 3.60217485e-05]],
  
         [[2.90496083e-04, 1.33911613e-04, 1.18182281e-11, ...,
           2.01018052e-13, 9.45688580e-06, 9.98487150e-05],
          [2.97352272e-04, 1.36961177e-04, 1.73004844e-11, ...,
           3.01842594e

### Save the merged dictionary to a pkl

In [109]:
with open('C:/Users/thato/Documents/Final-Year-Project/Dataset/Project-V4/feature-extraction/Annotated/NotAveragePooled/split_features_1s_all.pkl', 'wb') as file:
  pickle.dump(merged_dict, file)