## Data Distribution

In [None]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from CNN.loader import get_split
from CNN.model import train
    
# Root folder of the dataset; later cells treat each sub-directory as one species.
DATA_DIR = os.path.join('data', 'mammals_calls')

# Image dimensions handed to get_split(..., h, w) and to train(w_h=(w, h)).
# Presumably pixel height/width of the spectrograms — TODO confirm.
h, w = 164, 397

# Seed NumPy's global random generator once, before any other cell runs.
np.random.seed(2025)

In [None]:
# Every directory directly under DATA_DIR is treated as one species.
subfolders = [entry.path for entry in os.scandir(DATA_DIR) if entry.is_dir()]

# Map species name (the folder's base name) -> number of regular files in it.
data_info = {}
for subfolder in subfolders:
    species_name = os.path.basename(subfolder)
    # Nested directories are not counted: only entries that are files.
    file_count = sum(
        os.path.isfile(os.path.join(subfolder, name))
        for name in os.listdir(subfolder)
    )
    data_info[species_name] = file_count

# Two-column table of counts, largest species first.
count_df = pd.DataFrame(
    {
        'species': list(data_info.keys()),
        'file_count': list(data_info.values()),
    }
).sort_values(by='file_count', ascending=False)


In [None]:
# Bucket the species by how many spectrogram files they have. Lower bounds are
# inclusive so that a species with exactly 100 or 1000 files lands in a bucket
# instead of being silently left out of the plot and of the summary counts
# (the original strict `>` / `<` comparisons excluded both boundary values).
# `>= 1000` also matches the `> 999` selection used by the first training cell.
file_counts = count_df['file_count']
count_thousands = count_df[file_counts >= 1000]
count_hundreds = count_df[(file_counts >= 100) & (file_counts < 1000)]
count_tens = count_df[file_counts < 100]

# One bar series per bucket, each with its own colour. count_df is already
# sorted by descending file_count, so drawing the buckets largest-first keeps
# the x axis in that same order.
plt.figure(figsize=(15, 6))
plt.bar(count_thousands['species'], count_thousands['file_count'], color="#87CEEB")
plt.bar(count_hundreds['species'], count_hundreds['file_count'], color='#00688B')
plt.bar(count_tens['species'], count_tens['file_count'], color="#191970")
plt.xlabel('Species')
plt.xticks(rotation=45, ha='right')
plt.ylabel('Log Scaled Number of Spectrograms')
# Counts span several orders of magnitude, hence the logarithmic y axis.
plt.yscale("log")
plt.title('Number of Spectrograms per Species')

plt.tight_layout()
plt.show()

# Textual summary of the same buckets; the wording mirrors the inclusive bounds.
print(f'Total number of species: {count_df.shape[0]}')
print(f'Number of species with at least 1000 spectrograms: {count_thousands.shape[0]}')
print(f'Number of species with at least 100 but less than 1000 spectrograms: {count_hundreds.shape[0]}')
print(f'Number of species with less than 100 spectrograms: {count_tens.shape[0]}')


## Training CNN

Training della cnn con le classi da più di 1000 immagini

In [None]:
# All entries directly under DATA_DIR; reused by the later class-selection cells.
dataset = os.listdir(DATA_DIR)

# Keep the class folders holding at least 1000 entries. The isdir() test must
# come first: `and` short-circuits, so os.listdir() is never called on a stray
# regular file in DATA_DIR (which would raise NotADirectoryError).
class_list_1000 = [
    f for f in dataset
    if os.path.isdir(os.path.join(DATA_DIR, f))
    and len(os.listdir(os.path.join(DATA_DIR, f))) > 999
]
n_classes_1000 = len(class_list_1000)
print(f'Total classes found: {n_classes_1000}')

# Train/validation fractions, shared by every get_split() call in this notebook.
split_perc = {'train': 0.8, 'val': 0.2}
# get_split is a project helper (CNN.loader); its result is indexed with
# 'train' and 'val' by the training cell that follows.
split_ds_1000 = get_split(DATA_DIR, class_list_1000, split_perc, h, w)

In [None]:
# Train on the >= 1000-image classes. Note that w_h receives (w, h) whereas
# get_split() was given h, w in that order.
# NOTE(review): every training run in this notebook passes the same
# cp_path='checkpoints' — confirm train() does not overwrite earlier runs.
train(
    split_ds_1000['train'],
    split_ds_1000['val'],
    patience=5,
    cp_path='checkpoints',
    w_h=(w, h),
    n_classes=n_classes_1000,
)

Training della cnn con le classi da più di 500 immagini

In [None]:
# Keep the class folders holding at least 500 entries. isdir() is evaluated
# first so that os.listdir() is never called on a stray regular file in
# DATA_DIR (which would raise NotADirectoryError).
class_list_500 = [
    f for f in dataset
    if os.path.isdir(os.path.join(DATA_DIR, f))
    and len(os.listdir(os.path.join(DATA_DIR, f))) > 499
]
n_classes_500 = len(class_list_500)
print(f'Total classes found: {n_classes_500}')

# Same train/val fractions and image size as the other splits.
split_ds_500 = get_split(DATA_DIR, class_list_500, split_perc, h, w)

In [None]:
# Train on the >= 500-image classes. w_h receives (w, h) whereas get_split()
# was given h, w in that order.
# NOTE(review): every training run in this notebook passes the same
# cp_path='checkpoints' — confirm train() does not overwrite earlier runs.
train(
    split_ds_500['train'],
    split_ds_500['val'],
    patience=5,
    cp_path='checkpoints',
    w_h=(w, h),
    n_classes=n_classes_500,
)

Training della cnn con le classi da più di 100 immagini

In [None]:
# Keep the class folders holding at least 100 entries. The threshold was a
# copy-paste leftover (> 499) that made this cell select exactly the same
# classes as the 500-image experiment; it now matches the cell's caption and
# variable names. isdir() is evaluated first so that os.listdir() is never
# called on a stray regular file in DATA_DIR.
class_list_100 = [
    f for f in dataset
    if os.path.isdir(os.path.join(DATA_DIR, f))
    and len(os.listdir(os.path.join(DATA_DIR, f))) > 99
]
n_classes_100 = len(class_list_100)
print(f'Total classes found: {n_classes_100}')

# Same train/val fractions and image size as the other splits.
split_ds_100 = get_split(DATA_DIR, class_list_100, split_perc, h, w)

In [None]:
# Train on the split built for the 100-image experiment. w_h receives (w, h)
# whereas get_split() was given h, w in that order.
# NOTE(review): every training run in this notebook passes the same
# cp_path='checkpoints' — confirm train() does not overwrite earlier runs.
train(
    split_ds_100['train'],
    split_ds_100['val'],
    patience=5,
    cp_path='checkpoints',
    w_h=(w, h),
    n_classes=n_classes_100,
)

Training della cnn con le classi da più di 50 immagini

In [None]:
# Keep the class folders holding at least 50 entries. The threshold was a
# copy-paste leftover (> 499) that made this cell select exactly the same
# classes as the 500-image experiment; it now matches the cell's caption and
# variable names. isdir() is evaluated first so that os.listdir() is never
# called on a stray regular file in DATA_DIR.
class_list_50 = [
    f for f in dataset
    if os.path.isdir(os.path.join(DATA_DIR, f))
    and len(os.listdir(os.path.join(DATA_DIR, f))) > 49
]
n_classes_50 = len(class_list_50)
print(f'Total classes found: {n_classes_50}')

# Same train/val fractions and image size as the other splits.
split_ds_50 = get_split(DATA_DIR, class_list_50, split_perc, h, w)

In [None]:
# Train on the split built for the 50-image experiment. w_h receives (w, h)
# whereas get_split() was given h, w in that order.
# NOTE(review): every training run in this notebook passes the same
# cp_path='checkpoints' — confirm train() does not overwrite earlier runs.
train(
    split_ds_50['train'],
    split_ds_50['val'],
    patience=5,
    cp_path='checkpoints',
    w_h=(w, h),
    n_classes=n_classes_50,
)

Training della cnn con tutte le classi

In [None]:
# Select every class folder. The previous `> 499` filter was a copy-paste
# leftover that restricted this "all classes" experiment to the 500-image
# subset. Empty folders are still skipped because they hold nothing to train
# on. isdir() is evaluated first so that os.listdir() is never called on a
# stray regular file in DATA_DIR.
class_list_all = [
    f for f in dataset
    if os.path.isdir(os.path.join(DATA_DIR, f))
    and len(os.listdir(os.path.join(DATA_DIR, f))) > 0
]
n_classes_all = len(class_list_all)
print(f'Total classes found: {n_classes_all}')

# Same train/val fractions and image size as the other splits.
split_ds_all = get_split(DATA_DIR, class_list_all, split_perc, h, w)

In [None]:
# Train on the split built for the all-classes experiment. w_h receives (w, h)
# whereas get_split() was given h, w in that order.
# NOTE(review): every training run in this notebook passes the same
# cp_path='checkpoints' — confirm train() does not overwrite earlier runs.
train(
    split_ds_all['train'],
    split_ds_all['val'],
    patience=5,
    cp_path='checkpoints',
    w_h=(w, h),
    n_classes=n_classes_all,
)