# Birdsong Classification Model
by Beck, Carly, River, and Solomon

## Part 0. Importing Packages and Embedding Data

In [None]:
## clean up imports after :) ##
# don't forget 'pip install -r requirements.txt'


# standard library
from collections import Counter

# the usual
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# audio stuff
import librosa
from transformers import Wav2Vec2Processor, HubertModel


# tensorflow and tings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D

# sklearn tings
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix
)
from sklearn import metrics

ModuleNotFoundError: No module named 'librosa'

Below is Beck's draft code for loading the audio files and extracting an embedding for each one with HuBERT.

In [None]:
# Read the dataset index; the CSV is expected to carry "audio_file_path" and "label" columns
metadata = pd.read_csv('metadata.csv')

# Both the processor (prepares raw waveforms) and the model (produces the
# embeddings) are loaded from the same pretrained HuBERT checkpoint
HUBERT_CHECKPOINT = "facebook/hubert-large-ls960-ft"
processor = Wav2Vec2Processor.from_pretrained(HUBERT_CHECKPOINT)
model = HubertModel.from_pretrained(HUBERT_CHECKPOINT)

# # Function to extract MFCC features from an audio file
# # We can come back to this if we need more information? I think HuBERT is better.
# def extract_mfcc_tensor(audio_path):
#     # Load the audio file
#     audio, sr = librosa.load(audio_path, sr=None)
#     # Extract MFCC features
#     mfccs = librosa.feature.mfcc(y=audio, sr=sr)
#     return torch.tensor(mfccs, dtype=torch.float32)  # convert to a tensor

def extract_hubert_embedding_tensor(audio_path):
    """Return the HuBERT last-hidden-state embeddings for one audio file.

    The file is loaded with librosa at a 16 kHz sample rate, preprocessed by the
    module-level ``processor`` and passed through the module-level ``model``
    with gradient tracking switched off.

    Args:
        audio_path: Path of the audio file to embed.

    Returns:
        The model's ``last_hidden_state`` with the leading batch axis squeezed
        away, i.e. a 2D tensor (sequence_len, feature_dim).
    """
    # Resample on load: the pipeline feeds HuBERT 16 kHz audio
    waveform, sample_rate = librosa.load(audio_path, sr=16000)

    # Turn the raw waveform into the tensor layout the model consumes
    processed = processor(waveform, sampling_rate=sample_rate, return_tensors="pt")

    # Inference only, so no autograd graph is needed
    with torch.no_grad():
        hidden_states = model(processed.input_values).last_hidden_state

    # Drop the batch axis of size 1
    return hidden_states.squeeze(0)

# Build one (embedding, label) pair per metadata row.
# (extract_mfcc_tensor could be swapped in for the embedding step if MFCC features are wanted.)
data = [
    (
        extract_hubert_embedding_tensor(audio_path),
        torch.tensor(label, dtype=torch.long),
    )
    for audio_path, label in zip(metadata['audio_file_path'], metadata['label'])
]



## Part 1. Preprocessing
Here, we should:
- shape our data so we can input it into the model
- ensure all data is the same shape and size 
- print the size and shape of each class before and after reshaping (for debugging purposes)

Consider [this link](https://pr454nt.medium.com/audio-embeddings-a-short-guide-for-understanding-audio-signals-in-vector-space-0e6551e50747#:~:text=Audio%20embeddings%20represent%20a%20groundbreaking,known%20as%20a%20vector%20space.) for embedding knowledge.

In [None]:
# we'll use this to pad the embeddings to the max sequence length within each batch

# Define the collate function for 2D flattening
def collate_fn_2d(batch):
    """Pad variable-length embeddings to a common length and flatten each to 1D.

    Two batch layouts are accepted:

    * ``(embedding, label)`` pairs, which is what a ``DataLoader`` passes in
      when the dataset is a list of such tuples. Returns ``(inputs, labels)``.
    * bare embedding tensors. Returns ``(inputs, inputs)`` (the same tensor
      twice), e.g. for an autoencoder.

    Args:
        batch: Non-empty list of samples. Each embedding is a 2D tensor of
            shape ``(seq_len, feature_dim)``; ``seq_len`` may differ between
            samples, ``feature_dim`` must not.

    Returns:
        Tuple ``(inputs, targets)``. ``inputs`` has shape
        ``(batch_size, max_seq_len * feature_dim)``, where ``max_seq_len`` is
        the longest sequence in this batch and shorter sequences are
        zero-padded at the end. ``targets`` is a ``(batch_size,)`` tensor of
        labels for labelled batches, otherwise ``inputs`` itself.

    Raises:
        ValueError: If ``batch`` is empty.
    """
    if len(batch) == 0:
        raise ValueError("collate_fn_2d received an empty batch")

    # A labelled dataset yields (embedding, label) pairs; split them apart so
    # the padding below only ever sees tensors
    labelled = isinstance(batch[0], (tuple, list))
    if labelled:
        embeddings = [sample[0] for sample in batch]
        labels = [sample[1] for sample in batch]
    else:
        embeddings = list(batch)
        labels = None

    # pad_sequence zero-pads along the sequence axis up to the longest item and
    # keeps the dtype/device of the inputs: (batch_size, max_seq_len, feature_dim)
    padded = pad_sequence(embeddings, batch_first=True)

    # Flatten every sample to one row: (batch_size, max_seq_len * feature_dim)
    flat = padded.reshape(len(embeddings), -1)

    if labels is None:
        # Unlabelled batch: input doubles as the target
        return flat, flat

    # as_tensor lets plain Python ints work as labels alongside 0-dim tensors
    targets = torch.stack([torch.as_tensor(label) for label in labels])
    return flat, targets


In [None]:
# Wrap the (embedding, label) pairs in a DataLoader so training can iterate over
# shuffled mini-batches of 32; collate_fn_2d assembles each batch
dataloader = DataLoader(
    data,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn_2d,
)


## Part 2. Visualizing Our Data
Here, we should:
- check class distribution of the full dataset
- split the data into training, validation, and testing sets
- check the class distribution of the training and validation sets

In [None]:
### this is a class distribution plotting function I used in text classification - Beck ###

def plot_class_distribution(y, title, x_label='Class', y_label='Count', ax=None,
                            y_max_override=None, class_labels=None):
    """Draw a bar chart of class counts, annotating each bar with its share of ``y``.

    Args:
        y: Sequence of class labels, one entry per sample.
        title: Title for the axes.
        x_label: Label for the x-axis.
        y_label: Label for the y-axis.
        ax: Matplotlib axes to draw on. When ``None``, the axes seaborn
            chooses (and returns from ``countplot``) is used.
        y_max_override: Upper limit for the y-axis. When not given, the limit
            is 1.2x the largest class count, leaving room for the annotations.
        class_labels: Optional display names for the x tick labels, one per
            plotted class, in the order the bars are drawn. When ``None``, the
            tick labels seaborn generated are left as they are.

    Returns:
        The axes that was drawn on.

    Raises:
        ValueError: If ``y`` is empty and no ``y_max_override`` is given, or if
            ``class_labels`` does not have one entry per x tick.
    """
    if y_max_override:
        y_max = y_max_override
    else:
        if len(y) == 0:
            raise ValueError(
                "cannot infer the y-axis limit from an empty label sequence; "
                "pass y_max_override"
            )
        # Dynamically size the axis from the data, with 20% headroom
        y_max = max(Counter(y).values()) * 1.2

    # countplot returns the axes it drew on, which also covers ax=None
    ax = sns.countplot(x=y, palette='Accent', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label, size=12, color='grey')

    if class_labels is not None:
        n_ticks = len(ax.get_xticks())
        if len(class_labels) != n_ticks:
            raise ValueError(
                f"class_labels has {len(class_labels)} entries but the plot has {n_ticks} classes"
            )
        ax.set_xticklabels(class_labels)

    ax.set_ylabel(y_label, size=12, color='grey')
    ax.grid()

    # Set y-axis limit
    ax.set_ylim(0, y_max)

    # Add percentage text on top of the bars
    n_samples = len(y)
    for bar in ax.patches:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, height + (y_max * 0.025),
                f'{height/n_samples:.2%}', ha='center', color='black', fontsize=10)

    return ax

In [1]:
# plot the distribution of the whole dataset



In [None]:
# split the whole dataset into training (to train), validation (to check training), and testing (to present results) sets



In [None]:
# plot the distribution of our subsets

