# Birdsong Classification Model
by Beck, Carly, River, and Solomon

## Part 0. Importing Packages

In [None]:
## clean up imports after :) ##
# don't forget 'pip install -r requirements.txt'


# the usual
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from collections import Counter
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# audio stuff
import librosa
from transformers import Wav2Vec2Processor, HubertModel


# tensorflow and tings
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, GlobalMaxPooling1D

# sklearn tings
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    confusion_matrix
)
from sklearn import metrics

In [6]:
# import the data <3



## Part 1. Splitting and Visualizing Our Data
Here, we should:
- check class distribution of the full dataset
- split the data into training, validation, and testing sets
- check the class distribution of the training and validation sets

In [5]:
### this is a class distribution plotting function I used for text classification - Beck ###

def plot_class_distribution(y, title, x_label='Class', y_label='Count', ax=None, y_max_override=None,
                            class_labels=None):
    """Draw a count plot of the labels in `y` with each bar's share of the total above it.

    Parameters
    ----------
    y : sequence of hashable labels
        One label per sample; must support `len()` and iteration.
    title : str
        Title placed on the axes.
    x_label, y_label : str
        Axis captions.
    ax : matplotlib Axes, optional
        Axes to draw on. When omitted, the current axes (`plt.gca()`) is used.
    y_max_override : number, optional
        Upper y-axis limit. When omitted (or 0), the limit is 1.2x the largest
        class count so the percentage text fits above the tallest bar.
    class_labels : sequence of str, optional
        Display names for the bars, one per bar in plot order. When omitted,
        the tick labels seaborn derives from the values in `y` are kept.

    Returns
    -------
    None. The plot is drawn on `ax` as a side effect.
    """
    # The signature advertises ax=None, so fall back to the current axes
    # instead of failing on `None.set_title`.
    if ax is None:
        ax = plt.gca()

    n_samples = len(y)
    counts = Counter(y)

    # Dynamically set y_max based on the current input data
    if y_max_override:
        y_max = y_max_override
    elif counts:
        y_max = max(counts.values()) * 1.2
    else:
        # no samples: nothing to scale against, so use a placeholder limit
        y_max = 1

    sns.countplot(x=y, palette='Accent', ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label, size=12, color='grey')
    # Only rename the ticks when the caller supplies names; a fixed list of
    # names would mislabel (or fail on) data with a different set of classes.
    if class_labels is not None:
        ax.set_xticklabels(list(class_labels))
    ax.set_ylabel(y_label, size=12, color='grey')
    ax.grid()

    # Set y-axis limit
    ax.set_ylim(0, y_max)

    # Without samples there is no total to take percentages of.
    if n_samples == 0:
        return

    # Add percentage text on top of the bars
    text_offset = y_max * 0.025    # small gap between the bar top and its text
    for bar in ax.patches:
        bar_height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2, bar_height + text_offset,
                f'{bar_height/n_samples:.2%}', ha='center', color='black', fontsize=10)

In [None]:
# bar chart of how many rows carry each value of the 'label' column (whole dataset)
class_counts = metadata['label'].value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
class_counts.plot(kind='bar', color='yellow', ax=ax)
ax.set_title('Class Distribution')
ax.set_xlabel('Class Labels')
ax.set_ylabel('Number of Samples')
plt.setp(ax.get_xticklabels(), rotation=45)    # slant the class names
ax.grid(False)
plt.show()


In [None]:
# split the whole dataset into training, validation, and testing sets



In [None]:
# plot the distribution of our subsets



## Part 2. Developing the Model
Here, we should decide on and explain our CNN architecture.

In [None]:
# define our model


In [None]:
# compile the model

## Part 3. Training the Model

In [None]:
# train the model on the training set

In [None]:
# plot the training and validation curves recorded during training (copied from other code - Beck)

hist = model.history.history    # shorter name for the recorded training history

# One figure per metric: (training key, validation key, training legend,
# validation legend, y-axis caption). Accuracy is drawn first, then loss.
curve_specs = [
    ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Loss'),
]

for train_key, val_key, train_legend, val_legend, y_caption in curve_specs:
    plt.plot(hist[train_key], label=train_legend)
    plt.plot(hist[val_key], label=val_legend)
    plt.xlabel('Epochs')
    plt.ylabel(y_caption)
    plt.legend()
    plt.show()

In [None]:
# present more evaluation metrics on the test data (copied from other code - Beck)
# NOTE(review): assumes y_test holds integer class indices that match the
# model's output columns (not one-hot vectors) — confirm once the split exists.

y_test_pred_prob = model.predict(X_test)    # get the model's predictions for the test data (in probabilities)
y_test_pred = np.argmax(y_test_pred_prob, axis=1)    # convert the predictions to class labels

# calculate and print simpler evaluation metrics
# ('weighted' averages the per-class scores weighted by each class's support)
accuracy = accuracy_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Final Evaluation Metrics (Normalized)')
print(f'Accuracy: {accuracy*100:.0f}')
print(f'Precision: {precision*100:.0f}')
print(f'Recall: {recall*100:.0f}')
print(f'F1 Score: {f1*100:.0f}')

# create and plot the confusion matrix
# Take the class ids from the data rather than hard-coding label names, so the
# tick labels always match the matrix rows/columns whatever the class count is.
class_ids = np.union1d(y_test, y_test_pred)    # sorted ids seen in either array
cm = confusion_matrix(y_test, y_test_pred, labels=class_ids, normalize='true') * 100  # normalize the confusion matrix and multiply by 100 to show percentages
cm_labels = [str(class_id) for class_id in class_ids]
# Named cm_display (not `display`) so the notebook's built-in display() helper is not overwritten.
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cm_labels)
cm_display.plot(cmap='Blues', values_format='.2f') # add color map and format to 2 decimal places
plt.title("Normalized Confusion Matrix (Percentages)")
plt.grid(False)
plt.show()
