## Data Distribution

In [None]:
import pandas as pd
import numpy as np
import librosa
import os
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from CNN.utility import train_routine
from ds_creation.ds_utility import get_file_count, get_other_class
from ds_creation.plot_utility import process_metrics, process_audio_files, tsne_calc

import seaborn as sns


# Train/validation proportions handed to every train_routine call below.
SPLIT_PERC = {'train': 0.8, 'val': 0.2}

# Dataset and output locations, all relative to the working directory.
DATA_DIR = os.path.join('data', 'mammals_calls', 'data')
AUDIO_DIR = os.path.join('data', 'audio')
MODELS_METRICS_DIR = "models_metrics"

# Passed to train_routine as the (w, h) pair.
# presumably the spectrogram width/height in pixels — TODO confirm
w, h = 397, 164

# Seed every random number generator the notebook relies on.
seed = 2025
tf.random.set_seed(seed)
np.random.seed(seed)
random.seed(seed)

In [None]:
# Build the per-species count table for DATA_DIR. Later cells read its
# 'species' and 'file_count' columns.
count_df = get_file_count(DATA_DIR)
print(count_df)

In [None]:
# Split the species into three size bands using half-open intervals.
# With strict comparisons on both sides, a species with exactly 100 or exactly
# 1000 files matched no band and silently vanished from the chart and from the
# counts printed below.
count_thousands = count_df[count_df['file_count'] >= 1000]
count_hundreds = count_df[(count_df['file_count'] >= 100) & (count_df['file_count'] < 1000)]
count_tens = count_df[count_df['file_count'] < 100]

# Every species must land in exactly one band.
assert len(count_thousands) + len(count_hundreds) + len(count_tens) == len(count_df), \
    'size bands do not cover every species'

# One bar series per band, each with its own colour; the y axis is logarithmic
# because the counts span several orders of magnitude.
plt.figure(figsize=(15, 6))
plt.bar(count_thousands['species'], count_thousands['file_count'], color="#87CEEB")
plt.bar(count_hundreds['species'], count_hundreds['file_count'], color='#00688B')
plt.bar(count_tens['species'], count_tens['file_count'], color="#191970")
plt.xlabel('Species')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Log Scaled Number of Spectrograms')
plt.yscale("log")
plt.title('Number of Spectrograms per Species')

plt.tight_layout()
plt.show()

print(f'Total number of species: {count_df.shape[0]}')
print(count_thousands)
print(f'Number of species with at least 1000 spectrograms: {count_thousands.shape[0]}')
print(f'Number of species with at least 100 but less than 1000 spectrograms: {count_hundreds.shape[0]}')
print(f'Number of species with less than 100 spectrograms: {count_tens.shape[0]}')


In [None]:
# Run the audio pipeline for every species listed in count_df. Both return
# values are handed to tsne_calc in the next cell.
# NOTE(review): MODELS_METRICS_DIR is passed as the third argument — presumably
# an output/cache location; confirm against process_audio_files.
audio_df_exploded, mfcc_matrix = process_audio_files(count_df['species'].tolist(), AUDIO_DIR, MODELS_METRICS_DIR)
print(f'Different species in audio dataset: {audio_df_exploded["species"].unique()}')
print(audio_df_exploded.head())

In [None]:
# Feed the audio frame and the matrix produced by process_audio_files into
# tsne_calc; the result is kept in tsne_df.
tsne_df = tsne_calc(audio_df_exploded, mfcc_matrix, MODELS_METRICS_DIR)

## Training CNN

In [None]:
CNN_CACHE_DIR = os.path.join("data_cache", "CNN")

# Knobs forwarded to the train_routine calls in the cells below.
PATIENCE = 3
TO_TRAIN = True
FROM_START = True

# exist_ok makes the call safe to re-run and avoids the check-then-create race.
os.makedirs(MODELS_METRICS_DIR, exist_ok=True)

# Lower-case alias of the notebook-wide SPLIT_PERC, kept for backward
# compatibility. Aliasing (rather than repeating the literal) keeps a single
# source of truth for the split proportions.
split_perc = SPLIT_PERC
count_df = get_file_count(DATA_DIR)

Training della CNN classica con le classi che contengono più di 1000 sample.

In [None]:

# Stage 1: first training run, selected with cardinality=999. The first
# return value is forwarded to the next stage as the start of its class tuple.
# NOTE(review): from_start is hard-coded to False here even though FROM_START
# is defined in the config cell — confirm this is intentional.
n_classes_1000, _ = train_routine(
    count_df,
    PATIENCE,
    SPLIT_PERC,
    DATA_DIR,
    (w, h),
    (0, 0),
    subfolder='23-09_training_01',
    from_start=False,
    to_train=TO_TRAIN,
    cardinality=999,
)

Si ripete il training aggiungendo 10 classi per volta in ordine decrescente in numero di sample contenuti.

In [None]:
# Up to class 23: stage 2 continues from the stage-1 class count and asks for 10 more.
# NOTE(review): from_start is hard-coded to False here even though FROM_START
# is defined in the config cell — confirm this is intentional.
n_classes_plus_10, _ = train_routine(
    count_df,
    PATIENCE,
    SPLIT_PERC,
    DATA_DIR,
    (w, h),
    (n_classes_1000, 10),
    subfolder='23-09_training_02',
    from_start=False,
    to_train=TO_TRAIN,
)

In [None]:
# Up to class 33: stage 3 continues from the stage-2 class count and asks for 10 more.
n_classes_plus_20, _ = train_routine(
    count_df,
    PATIENCE,
    SPLIT_PERC,
    DATA_DIR,
    (w, h),
    (n_classes_plus_10, 10),
    subfolder='23-09_training_03',
    from_start=FROM_START,
    to_train=TO_TRAIN,
)

In [None]:
# Up to class 43: stage 4 continues from the stage-3 class count and asks for 10 more.
# to_train follows the shared TO_TRAIN flag like every other stage, so that
# toggling the flag in the config cell really disables all training runs.
n_classes_plus_30, _ = train_routine(
    count_df,
    PATIENCE,
    SPLIT_PERC,
    DATA_DIR,
    (w, h),
    (n_classes_plus_20, 10),
    subfolder='23-09_training_04',
    from_start=FROM_START,
    to_train=TO_TRAIN,
)

In [None]:
# Up to the last class: final stage, continuing from the stage-4 class count.
n_classes_plus_rem, _ = train_routine(
    count_df,
    PATIENCE,
    SPLIT_PERC,
    DATA_DIR,
    (w, h),
    (n_classes_plus_30, 10),
    subfolder='23-09_training_05',
    from_start=FROM_START,
    to_train=TO_TRAIN,
)

## Output Plots

In [None]:
curr_training_date = '13-10'

# Every model size reads from the same run directory.
# NOTE(review): the '13-10_training' folder is the subfolder used by the
# "other class" training cell further down the notebook — confirm it exists
# before this cell runs on a fresh kernel.
training_metrics_dir = os.path.join(MODELS_METRICS_DIR, f'{curr_training_date}_training')
for class_count in (13, 23, 33, 43, 51):
    process_metrics(count_df, class_count, training_metrics_dir, MODELS_METRICS_DIR)

## Training delle prime 23 classi + classe altro
(tot files 87757)

In [None]:
# Collect the species whose file count is below the threshold; they are handed
# to get_other_class (presumably merged into a single "other" class — confirm).
# NOTE(review): the filter keeps file_count < 999 while the training call below
# uses cardinality=1000; a species with exactly 999 or 1000 files may match
# neither — confirm against train_routine's cardinality semantics.
count_df_truncated = get_file_count(DATA_DIR)
count_df_truncated = count_df_truncated[count_df_truncated['file_count'] < 999]
other_species_list = count_df_truncated['species'].tolist()
# shape[0] is the number of species; the bare .shape printed a (rows, cols) tuple.
print(
    f'Other total files: {count_df_truncated["file_count"].sum()}, '
    f'species count: {count_df_truncated.shape[0]}, '
    f'species: {other_species_list}'
)

# NOTE(review): get_other_class receives DATA_DIR and count_df is re-read right
# after it, so it appears to change the dataset on disk — re-running this cell
# may not be idempotent; confirm.
get_other_class(DATA_DIR, other_species_list)
count_df = get_file_count(DATA_DIR)
print(count_df)

# to_train follows the shared TO_TRAIN flag, like the staged training cells.
other_ds, _ = train_routine(
    count_df,
    PATIENCE,
    SPLIT_PERC,
    DATA_DIR,
    (w, h),
    (0, 0),
    cardinality=1000,
    subfolder='13-10_training',
    from_start=FROM_START,
    to_train=TO_TRAIN,
)


## Old output plots
(valido per il training fino al 16-09 incluso)

In [None]:
def get_class_metrics(row, all_classes_df, metrics_list):
    """Copy one class's metrics from a flat metrics record onto its label row.

    Args:
        row: record with an 'index' entry holding the class number; it is
            updated in place.
        all_classes_df: mapping-like object keyed by '<class number>_<metric>'.
        metrics_list: metric names to copy; each becomes an entry of ``row``.

    Returns:
        The same ``row`` object, extended with one entry per metric.
    """
    for metric_name in metrics_list:
        # The class number is looked up on every pass and joined with the
        # metric name to form the source key.
        source_key = f"{row['index']}_{metric_name}"
        row[metric_name] = all_classes_df[source_key]
    return row

def process_metrics(n_classes, training_date):
    """Merge the last row of a run's per-class metrics with its class labels.

    Reads ``{n_classes}_CNN_metrics.csv`` and ``{n_classes}_label_to_index.csv``
    from ``MODELS_METRICS_DIR/{training_date}_training`` and writes
    ``{n_classes}_merged_metrics.csv`` to
    ``MODELS_METRICS_DIR/{training_date}_merged_metrics``, creating that
    directory when it does not exist yet.

    NOTE(review): this definition shadows the ``process_metrics`` imported from
    ``ds_creation.plot_utility``, which the "Output Plots" cell calls with four
    arguments; re-running that cell after this one raises a TypeError.

    Args:
        n_classes: class count used as the file-name prefix.
        training_date: run identifier used as the directory-name prefix.
    """
    training_dir = os.path.join(MODELS_METRICS_DIR, f'{training_date}_training')
    merged_dir = os.path.join(MODELS_METRICS_DIR, f'{training_date}_merged_metrics')

    all_classes_df = pd.read_csv(os.path.join(training_dir, f'{n_classes}_CNN_metrics.csv'))
    # Keep only the columns whose name starts with a digit (the
    # '<class number>_<metric>' columns read by get_class_metrics).
    all_classes_df = all_classes_df.loc[:, all_classes_df.columns.str.match(r'^\d')]
    # Last row of the metrics file — presumably the final epoch; confirm.
    last_epoch_metrics = all_classes_df.iloc[-1]

    classes_df = pd.read_csv(os.path.join(training_dir, f'{n_classes}_label_to_index.csv'))

    metrics_list = ['precision', 'recall', 'f1-score', 'support']
    classes_df = classes_df.apply(get_class_metrics, axis=1, all_classes_df=last_epoch_metrics, metrics_list=metrics_list)
    # Labels longer than 20 characters are cut down to their first two words.
    classes_df['label'] = classes_df['label'].apply(lambda x: ' '.join(x.split(' ')[:2]) if len(x) > 20 else x)

    # to_csv does not create missing parent directories, so create the output
    # folder first instead of failing on the first run for a new date.
    os.makedirs(merged_dir, exist_ok=True)
    classes_df.to_csv(os.path.join(merged_dir, f'{n_classes}_merged_metrics.csv'), index=False)

def metrics_plot_builder(metrics_df):
    """Draw precision, recall and f1-score bar charts on a 2x2 subplot grid.

    Each bar is annotated with ``n:<support>`` taken from the 'support' column.
    Bars follow the row order of ``metrics_df``; the function does not sort,
    so the caller is responsible for the ordering mentioned in the titles.

    Args:
        metrics_df: frame with 'label', 'support' and one column per metric.

    Returns:
        The matplotlib figure containing the three charts.
    """
    fig = plt.figure(figsize=(15, 15))
    for slot, metric in enumerate(('precision', 'recall', 'f1-score'), start=1):
        axs = plt.subplot(2, 2, slot)
        axs.bar(metrics_df['label'], metrics_df[metric], color="#87CEEB")

        # Place the support label at mid-height of the bar, or just above the
        # baseline when the metric is not positive.
        annotated = zip(metrics_df[metric], metrics_df['support'])
        for bar_idx, (metric_value, support) in enumerate(annotated):
            if metric_value > 0:
                label_pos = metric_value - (metric_value/2)
            else:
                label_pos = metric_value + 0.02
            plt.text(bar_idx, label_pos, f'n:{int(support)}', ha='center', va='bottom', fontsize=9, rotation=90)

        metric_title = metric.capitalize()
        axs.set_xlabel('Class')
        axs.tick_params(axis='x', rotation=90)
        axs.set_ylabel(metric_title)
        axs.set_title(f'{metric_title} per Class (ordered by Support - descending)')

    fig.tight_layout()
    return fig

In [None]:
# Merge the metrics of the 13-class model of the 16-09 run, using the
# two-argument process_metrics defined in this section (not the one imported
# from ds_creation.plot_utility).
curr_training_date = '16-09'
process_metrics(13, curr_training_date)
# Calls for the remaining model sizes, currently disabled:
#process_metrics(n_classes_plus_10, curr_training_date)
#process_metrics(n_classes_plus_20, curr_training_date)
#process_metrics(n_classes_plus_30, curr_training_date)
#process_metrics(n_classes_plus_rem, curr_training_date)

In [None]:
# Load the merged metrics of the 13-class model, order the classes by support
# (largest first) and draw the per-metric bar charts.
curr_metrics_date = '16-09_merged_metrics'
merged_metrics_path = os.path.join(MODELS_METRICS_DIR, curr_metrics_date, '13_merged_metrics.csv')
classes_df = pd.read_csv(merged_metrics_path)
classes_df_sorted = classes_df.sort_values(by='support', ascending=False)
fig = metrics_plot_builder(classes_df_sorted)
plt.show()