## Data Distribution

In [None]:
import pandas as pd
import numpy as np
import librosa
import os
import random
import tensorflow as tf
import matplotlib.pyplot as plt
from CNN.utility import train_routine
from CNN.loader import load_dataset, get_split
from sklearn.manifold import TSNE
import seaborn as sns


# Split percentages handed to every train_routine call below.
SPLIT_PERC = dict(train=0.8, val=0.2)
# Folder scanned for one sub-directory per species (spectrogram files).
DATA_DIR = os.path.join('data', 'mammals_calls')
# Folder holding one sub-directory of .wav files per species.
AUDIO_DIR = os.path.join('data', 'audio')
# Value forwarded as `to_train` to the train_routine calls that use it.
TO_TRAIN = False
# Passed to train_routine as the pair (w, h).
h, w = 164, 397

# Seed Python, NumPy and TensorFlow with the same value.
seed = 2025
random.seed(seed)
np.random.seed(seed)
tf.random.set_seed(seed)

In [None]:
# Count the files stored in each per-species sub-directory of DATA_DIR.
subfolders = sorted(f.path for f in os.scandir(DATA_DIR) if f.is_dir())
data_info = {}
for subfolder in subfolders:
    species_name = os.path.basename(subfolder)
    # scandir avoids one extra stat() per entry and is closed deterministically.
    with os.scandir(subfolder) as entries:
        file_count = sum(1 for entry in entries if entry.is_file())
    data_info[species_name] = file_count
count_df = pd.DataFrame(list(data_info.items()), columns=['species', 'file_count'])
# Largest classes first. Ties are broken by species name so the row order
# (this frame is handed to train_routine further down) does not depend on the
# arbitrary order in which the filesystem lists directories.
count_df = count_df.sort_values(by=['file_count', 'species'], ascending=[False, True])
print(count_df)


In [None]:
# Bucket the species by number of spectrograms. The middle bucket is inclusive
# on both ends (Series.between), so species with exactly 100 or exactly 1000
# files are no longer left out of the plot and of the counts printed below.
count_thousands = count_df[count_df['file_count'] > 1000]
count_hundreds = count_df[count_df['file_count'].between(100, 1000)]
count_tens = count_df[count_df['file_count'] < 100]
# The three buckets must partition count_df.
assert len(count_thousands) + len(count_hundreds) + len(count_tens) == len(count_df)

plt.figure(figsize=(15, 6))
# One bar series per bucket, from the lightest colour (largest classes) to the darkest.
plt.bar(count_thousands['species'], count_thousands['file_count'], color="#87CEEB")
plt.bar(count_hundreds['species'], count_hundreds['file_count'], color='#00688B')
plt.bar(count_tens['species'], count_tens['file_count'], color="#191970")
plt.xlabel('Species')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Log Scaled Number of Spectrograms')
plt.yscale("log")
plt.title('Number of Spectrograms per Species')

plt.tight_layout()
plt.show()

print(f'Total number of species: {count_df.shape[0]}')
print(count_thousands)
print(f'Number of species with more than 1000 spectrograms: {count_thousands.shape[0]}')
print(f'Number of species with 100 to 1000 spectrograms (inclusive): {count_hundreds.shape[0]}')
print(f'Number of species with less than 100 spectrograms: {count_tens.shape[0]}')


In [None]:
def mfcc_extractor(row, chunk_size):
    """Attach per-window mean MFCC vectors to ``row`` under ``'chunk_list'``.

    The file at ``row['audio_files']`` is loaded with ``librosa.load`` and cut
    into windows of ``chunk_size`` seconds whose start positions advance by one
    second, so consecutive windows overlap whenever ``chunk_size`` exceeds 1.
    Only full-length windows are used. For each of them 50 MFCCs are computed
    and averaged along axis 1, giving one 50-element vector per window.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an
            ``'audio_files'`` entry holding the path to load. It is modified
            in place.
        chunk_size: Window length in seconds.

    Returns:
        The same ``row`` with ``row['chunk_list']`` set to the list of mean
        MFCC vectors. The list is empty when the file cannot be loaded or is
        shorter than one window.
    """
    try:
        signal, sr = librosa.load(row['audio_files'])
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole DataFrame.apply.
        print(f"Error loading audio file {row['audio_files']}: {e}")
        row['chunk_list'] = []
        return row

    # Window length in samples; int() keeps slicing valid for a fractional chunk_size.
    chunk_len = int(chunk_size * sr)
    mfcc_chunks = []

    # Stopping at len(signal) - chunk_len yields exactly the full-length windows.
    for start in range(0, len(signal) - chunk_len + 1, sr):
        y_chunk = signal[start:start + chunk_len]
        mfcc = librosa.feature.mfcc(y=y_chunk, sr=sr, n_mfcc=50)
        mfcc_chunks.append(np.mean(mfcc, axis=1))

    row['chunk_list'] = mfcc_chunks
    return row

In [None]:
# NOTE(review): `n` is not read anywhere in the visible part of the notebook.
n = 1700
# Map each species of the largest bucket to the .wav paths found in its AUDIO_DIR folder.
audio_files = {}
for species in count_thousands['species']:
    curr_path = os.path.join(AUDIO_DIR, species)
    audio_files[species] = [
        os.path.join(curr_path, name)
        for name in os.listdir(curr_path)
        if name.endswith('.wav')
    ]

# One row per (species, audio file).
audio_df = (
    pd.DataFrame(list(audio_files.items()), columns=['species', 'audio_files'])
    .explode('audio_files')
    .reset_index(drop=True)
)

# Compute the MFCC vectors of every file (2-second windows), then one row per vector.
audio_df = audio_df.apply(mfcc_extractor, axis=1, chunk_size=2)
audio_df_exploded = audio_df.explode('chunk_list').reset_index(drop=True)

print(f'Different species in audio dataset: {audio_df_exploded["species"].nunique()}')

In [None]:
# List the species present in the exploded frame, then preview its first rows.
print('Different species in audio dataset: {}'.format(audio_df_exploded["species"].unique()))
print(audio_df_exploded.head(n=5))

In [None]:
# Drop the rows that carry no MFCC vector in 'chunk_list'.
audio_df_exploded_clean = audio_df_exploded.dropna(axis=0, subset=['chunk_list'])
valid_chunks = audio_df_exploded_clean['chunk_list']
valid_chunks = valid_chunks[valid_chunks.apply(lambda x: isinstance(x, np.ndarray) and len(x) == 50)]

# Convert to a matrix with one row per vector.
mfcc_matrix = np.array(valid_chunks.tolist())

tsne = TSNE(n_components=2, random_state=42)

x_transformed = tsne.fit_transform(mfcc_matrix)
# Labels are taken from exactly the rows that went into mfcc_matrix, so the two
# stay aligned even if the filter above removed something. The frame is built
# column by column so X and Y stay numeric; np.column_stack with the string
# labels would turn every column into dtype object.
tsne_df = pd.DataFrame({
    'X': x_transformed[:, 0],
    'Y': x_transformed[:, 1],
    'Targets': audio_df_exploded_clean.loc[valid_chunks.index, 'species'].to_numpy(),
})



In [None]:
# Plain column assignment replaces the column and therefore its dtype.
# `.loc[:, "Targets"] = ...` writes the values into the existing column instead
# (pandas >= 2.0), which silently keeps the object dtype.
tsne_df["Targets"] = tsne_df["Targets"].astype('category')

In [None]:
# FacetGrid creates its own figure (sized through `height`), so plt.figure() is
# not called first: it would only leave an extra empty figure in the output.
g = sns.FacetGrid(data=tsne_df, hue='Targets', height=8, palette="tab10")
g.map(plt.scatter, 'X', 'Y').add_legend()
plt.show()

## Training CNN

In [None]:
CNN_CACHE_DIR = os.path.join("data_cache", "CNN")
# Folder read by process_metrics (training logs, label maps) and written with the metrics CSVs.
MODELS_METRICS_DIR = os.path.join("models_metrics")
PATIENCE = 3
FROM_START = False
# exist_ok avoids the check-then-create race of a separate os.path.exists test.
os.makedirs(MODELS_METRICS_DIR, exist_ok=True)

# Independent copy of SPLIT_PERC, so the two split definitions cannot drift apart.
split_perc = dict(SPLIT_PERC)

Training della CNN classica con le classi che contengono più di 1000 sample.

In [None]:

# Stage 1 of the incremental training. `to_train` follows the notebook-wide
# TO_TRAIN flag, like the later stages, instead of a hard-coded False.
# NOTE(review): 'subfloder' looks like a typo but is spelled this way in every
# call of this notebook; presumably it matches train_routine's signature —
# confirm before renaming.
n_classes_1000 = train_routine(
    count_df, PATIENCE, SPLIT_PERC, DATA_DIR, (w, h), (0, 0),
    subfloder='22-09_training_01',
    to_train=TO_TRAIN,
    cardinality=999,
)

Si ripete il training aggiungendo 10 classi per volta, in ordine decrescente per numero di sample.

In [None]:
# Up to class 23
n_classes_plus_10 = train_routine(
    count_df, PATIENCE, SPLIT_PERC, DATA_DIR, (w, h), (n_classes_1000, 10),
    subfloder='22-09_training_02',
    to_train=TO_TRAIN,
)

In [None]:
# Up to class 33
n_classes_plus_20 = train_routine(
    count_df, PATIENCE, SPLIT_PERC, DATA_DIR, (w, h), (n_classes_plus_10, 10),
    subfloder='22-09_training_03',
    to_train=TO_TRAIN,
)

In [None]:
# Up to class 43
n_classes_plus_30 = train_routine(
    count_df, PATIENCE, SPLIT_PERC, DATA_DIR, (w, h), (n_classes_plus_20, 10),
    subfloder='22-09_training_04',
    to_train=TO_TRAIN,
)

In [None]:
# Up to the last class.
# `to_train` follows the notebook-wide TO_TRAIN flag like the previous stages,
# so a plain "Run All" no longer retrains this stage unconditionally; set
# TO_TRAIN = True in the configuration cell to retrain.
# NOTE(review): the call adds 10 classes, like the earlier stages; presumably
# that covers all remaining classes — confirm.
n_classes_plus_rem = train_routine(
    count_df, PATIENCE, SPLIT_PERC, DATA_DIR, (w, h), (n_classes_plus_30, 10),
    subfloder='22-09_training_05',
    to_train=TO_TRAIN,
)

## Output Plots

In [None]:
def get_metrics(cm, label_dict):
    """Build a per-class metrics table from a square confusion matrix.

    Column sums minus the diagonal are counted as false positives and row sums
    minus the diagonal as false negatives, i.e. rows are treated as true
    classes and columns as predicted classes.

    Args:
        cm: 2-D NumPy array of counts.
        label_dict: Mapping from class index ``i`` (0 .. n_classes - 1) to a
            mapping with at least the keys ``"label"`` and ``"support"``.

    Returns:
        DataFrame with columns label, precision, recall, f1-score, support, tp,
        fp, fn and tn, sorted by support in descending order with a fresh index.
        Ratios whose denominator is zero are reported as 0.0.
    """
    class_ids = range(cm.shape[0])

    def _ratio(numerator, denominator):
        # Element-wise division that leaves 0.0 wherever the denominator is 0.
        return np.divide(
            numerator,
            denominator,
            out=np.zeros_like(numerator, dtype=float),
            where=denominator != 0,
        )

    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    tn = cm.sum() - (tp + fp + fn)

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1_score = _ratio(2 * precision * recall, precision + recall)

    table = pd.DataFrame({
        'label': [label_dict[i]["label"] for i in class_ids],
        'precision': precision,
        'recall': recall,
        'f1-score': f1_score,
        'support': np.asarray([label_dict[i]["support"] for i in class_ids], dtype=int),
        'tp': tp,
        'fp': fp,
        'fn': fn,
        'tn': tn,
    })
    return table.sort_values(by='support', ascending=False).reset_index(drop=True)

def process_metrics(n_classes, training_date):
    """Compute, save and plot the validation metrics of one training run.

    Reads ``{n_classes}_training_log.csv`` and ``{n_classes}_label_to_index.csv``
    from ``MODELS_METRICS_DIR/{training_date}_training``, takes the confusion
    matrix of the row with the highest ``val_accuracy``, derives the per-class
    metrics with ``get_metrics``, writes them to ``{n_classes}_metrics.csv`` in
    the same folder and draws the metric bar charts and the confusion matrix.

    Args:
        n_classes: Value used as file-name prefix of the run's CSV files.
        training_date: Prefix of the ``*_training`` folder holding the run.

    Notes:
        Assumes the label CSV has a ``label`` and a ``file_count`` column and
        that its row order follows the class index of the confusion matrix —
        TODO confirm against train_routine.
    """
    run_dir = os.path.join(MODELS_METRICS_DIR, f'{training_date}_training')
    all_classes_df = pd.read_csv(os.path.join(run_dir, f'{n_classes}_training_log.csv'))
    label_df = pd.read_csv(os.path.join(run_dir, f'{n_classes}_label_to_index.csv'))
    label_df = label_df.merge(count_df, left_on='label', right_on='species', how='left', validate='one_to_one', suffixes=('_training', '_total')).drop(columns=['species'])
    # Support = files of the species that are not counted in the label CSV's file_count.
    label_df['support'] = label_df['file_count_total'] - label_df['file_count_training']

    # First row among those reaching the best validation accuracy.
    best_weights = all_classes_df[all_classes_df['val_accuracy'] == all_classes_df['val_accuracy'].max()]
    cm = best_weights['val_confusion_matrix'].values[0]
    # The matrix is stored as text. The parsing assumes two wrapper characters
    # at each end of the string, rows separated by ', ', one wrapper character
    # on each side of a row and whitespace-separated integers inside it —
    # TODO confirm against the code that writes the training log.
    cm = cm[2:-2]
    cm_matrix = np.array([[int(i) for i in r[1:-1].split()] for r in cm.split(', ')])

    label_dict = label_df.to_dict('index')
    metrics_df = get_metrics(cm_matrix, label_dict)
    metrics_df.to_csv(os.path.join(run_dir, f'{n_classes}_metrics.csv'), index=False)
    metrics_plot_builder(metrics_df)

    # The matrix rows/columns are in class-index order, whereas metrics_df is
    # sorted by support, so the tick labels must come from label_dict and not
    # from metrics_df (which would attach names to the wrong rows).
    cm_labels = [label_dict[i]["label"] for i in range(cm_matrix.shape[0])]
    confusion_matrix_plot(cm_matrix, cm_labels)


def confusion_matrix_plot(cm, labels):
    """Draw a row-normalised confusion matrix as an annotated heatmap.

    Each row of ``cm`` is divided by its sum. Rows that sum to zero are shown
    as all zeros instead of NaN.

    Args:
        cm: 2-D NumPy array of counts; the y axis is labelled 'True Label' and
            the x axis 'Predicted Label'.
        labels: Tick labels, used for both axes, in the row/column order of ``cm``.
    """
    row_totals = cm.sum(axis=1)[:, np.newaxis]
    # Safe division: a class without samples would otherwise produce NaN cells
    # and a RuntimeWarning.
    cmn = np.divide(cm.astype('int'), row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)

    def custom_format(val):
        # Hide values that would round to 0.00 behind a plain "0".
        if val < 0.01:
            return "0"
        else:
            return f"{val:.2f}"
    formatted_annotations = np.vectorize(custom_format)(cmn)
    plt.figure(figsize=(20, 20))
    # fmt='' because the annotations are already formatted strings.
    sns.heatmap(cmn, annot=formatted_annotations, fmt='', cmap='Blues', xticklabels=labels, yticklabels=labels)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title('Confusion Matrix')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

def metrics_plot_builder(metrics_df):
    """Draw one bar chart per metric (precision, recall, f1-score) on a 2x2 grid.

    Every bar is annotated with ``n:<support>`` written vertically: at half the
    bar height when the value is positive, otherwise 0.02 above it.

    Args:
        metrics_df: DataFrame with the columns 'label', 'support', 'precision',
            'recall' and 'f1-score'.

    Returns:
        The matplotlib figure holding the three charts.
    """
    fig = plt.figure(figsize=(15, 15))
    for position, metric in enumerate(('precision', 'recall', 'f1-score'), start=1):
        axs = plt.subplot(2, 2, position)
        axs.bar(metrics_df['label'], metrics_df[metric], color="#87CEEB")

        pairs = zip(metrics_df[metric], metrics_df['support'])
        for i, (metric_value, support) in enumerate(pairs):
            if metric_value > 0:
                label_pos = metric_value - (metric_value/2)
            else:
                label_pos = metric_value + 0.02
            # axs is the current axes here, so this matches a pyplot-level text call.
            axs.text(i, label_pos, f'n:{int(support)}', ha='center', va='bottom', fontsize=9, rotation=90)

        metric_title = metric.capitalize()
        axs.set_xlabel('Class')
        axs.tick_params(axis='x', rotation=90)
        axs.set_ylabel(metric_title)
        axs.set_title(f'{metric_title} per Class (ordered by Support - descending)')
    fig.tight_layout()
    return fig

In [None]:
# NOTE(review): the training cells use sub-folders named '22-09_training_0X',
# while the metrics are read from '21-09_training' — confirm this is the
# intended run.
curr_training_date = '21-09'
# Same report for every training stage, from the smallest class set to the largest.
for stage_n_classes in (
    n_classes_1000,
    n_classes_plus_10,
    n_classes_plus_20,
    n_classes_plus_30,
    n_classes_plus_rem,
):
    process_metrics(stage_n_classes, curr_training_date)