In [1]:
import torch
torch.cuda.is_available()#pregunta si hay una gpu disponible

True

## Entrenamiento con HTS-AT para clasificar sonidos

<!-- Referece: 

[HTS-AT: A Hierarchical Token-Semantic Audio Transformer for Sound Classification and Detection, ICASSP 2022](https://arxiv.org/abs/2202.00874)

Following the HTS-AT's paper, in this tutorial, we would show how to use the HST-AT in the training of the ESC-50 Dataset.

The [ESC-50 dataset](https://github.com/karolpiczak/ESC-50) is a labeled collection of 2000 environmental audio recordings suitable for benchmarking methods of environmental sound classification. The dataset consists of 5-second-long recordings organized into 50 semantical classes (with 40 examples per class) loosely arranged into 5 major categories

Before running this tutorial, please make sure that you install the below packages by following steps:

1. download [the codebase](https://github.com/RetroCirce/HTS-Audio-Transformer), and put this tutorial notebook inside the codebase folder.

2. In the github code folder:

    > pip install -r requirements.txt

3. We do not include the installation of PyTorch in the requirment, since different machines require different vereions of CUDA and Toolkits. So make sure you install the PyTorch from [the official guidance](https://pytorch.org/).

4. Install the 'SOX' and the 'ffmpeg', we recommend that you run this code in Linux inside the Conda environment. In that, you can install them by:

    > sudo apt install sox
    
    > conda install -c conda-forge ffmpeg -->


In [2]:
# import basic packages
import os
import numpy as np
import wget
import sys
import gdown
import zipfile
import librosa
import subprocess
# in the notebook, we only can use one GPU
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
# Build the workspace and download the needed files

def create_path(path):
    if not os.path.exists(path):
        os.mkdir(path)

workspace = 'D:\\SEMESTRE 2-2023\\INTELIGENCIA ARTIFICIAL 3 - SIS330\\modelos_deteccion_sonidos\\HTS-Audio-Transformer\\workspace'
dataset_path = os.path.join(workspace, "dataset_sonidos_peligrosos")
checkpoint_path = os.path.join(workspace, "ckpt")
esc_raw_path = os.path.join(dataset_path, 'raw')


create_path(workspace)
create_path(dataset_path)
create_path(checkpoint_path)
create_path(esc_raw_path)

# Descargamos los pesos pre-entrenados para el modelo
if not os.path.exists(os.path.join(checkpoint_path,'htsat_audioset_pretrain.ckpt')):
    gdown.download(id='1OK8a5XuMVLyeVKF117L8pfxeZYdfSDZv', output=os.path.join(checkpoint_path,'htsat_audioset_pretrain.ckpt'))




In [4]:
sox_path = 'C:\\Program Files (x86)\\sox-14-4-2\\sox.exe'  # Asegúrate de que esta ruta sea correcta


In [5]:
# Preprocesamiento de los datos
meta_path = os.path.join(esc_raw_path, 'sonidos_peligrosos', 'meta', 'dataset_sonidos_peligrosos_recortado.csv')
audio_path = os.path.join(esc_raw_path, 'sonidos_peligrosos', 'audio')
resample_path = os.path.join(dataset_path, 'resample_sonidos_peligrosos')
savedata_path = os.path.join(dataset_path, 'sonidos_peligrosos-data.npy')
create_path(resample_path)

meta = np.loadtxt(meta_path , delimiter=',', dtype='str', skiprows=1)
audio_list = os.listdir(audio_path)

# resample
print("-------------Resample dataset_sonidos_peligrosos-------------")
for f in audio_list:
    full_f = os.path.join(audio_path, f)
    resample_f = os.path.join(resample_path, f)
    
    if not os.path.exists(resample_f):
        command = [sox_path, '-V1', full_f, '-r', '32000', resample_f]
        result = subprocess.run(command, capture_output=True, text=True)
        if result.returncode != 0:
            print(f"Error resampling {f}: {result.stderr}")
        else:
            print(f"Resampled {f} successfully.")

print("-------------Success-------------")

print("-------------Build Dataset-------------")

output_dict = [[] for _ in range(5)]
for label in meta:
    name = label[0]
    fold = int(float(label[1]))
    target = label[2]
    y, sr = librosa.load(os.path.join(resample_path, name), sr = None)
    output_dict[int(fold) - 1].append(
        {
            "name": name,
            "target": int(float(target)),
            "waveform": y
        }
    )
np.save(savedata_path, output_dict)
print("-------------Success-------------")
    

-------------Resample dataset_sonidos_peligrosos-------------
Resampled 1-102121-A-6.wav successfully.
Resampled 1-102955-A-1.wav successfully.
Resampled 1-103015-A-3.wav successfully.
Resampled 1-108676-A-5.wav successfully.
Resampled 1-109272-A-5.wav successfully.
Resampled 1-109543-A-4.wav successfully.
Resampled 1-110814-A-6.wav successfully.
Resampled 1-115200-A-6.wav successfully.
Resampled 1-120097-A-3.wav successfully.
Resampled 1-120866-A-5.wav successfully.
Resampled 1-125273-A-5.wav successfully.
Resampled 1-12566-A-2.wav successfully.
Resampled 1-127347-A-9.wav successfully.
Resampled 1-12739-A-0.wav successfully.
Resampled 1-128378-A-4.wav successfully.
Resampled 1-132413-A-9.wav successfully.
Resampled 1-133063-A-6.wav successfully.
Resampled 1-133529-A-7.wav successfully.
Resampled 1-133845-A-6.wav successfully.
Resampled 1-135068-A-1.wav successfully.
Resampled 1-1359-A-5.wav successfully.
Resampled 1-136190-A-8.wav successfully.
Resampled 1-138565-A-5.wav successfully.

In [6]:
# Load the model package
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

from utils import create_folder, dump_config
import esc_config as config
from sed_model import SEDWrapper
from data_generator import ESC_Dataset
from model.htsat import HTSAT_Swin_Transformer



In [7]:
# Data Preparation
class data_prep(pl.LightningDataModule):
    def __init__(self, train_dataset, eval_dataset, device_num):
        super().__init__()
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.device_num = device_num

    def train_dataloader(self):
        train_sampler = DistributedSampler(self.train_dataset, shuffle = False) if self.device_num > 1 else None
        train_loader = DataLoader(
            dataset = self.train_dataset,
            num_workers = config.num_workers,
            batch_size = config.batch_size // self.device_num,
            shuffle = False,
            sampler = train_sampler
        )
        return train_loader
    def val_dataloader(self):
        eval_sampler = DistributedSampler(self.eval_dataset, shuffle = False) if self.device_num > 1 else None
        eval_loader = DataLoader(
            dataset = self.eval_dataset,
            num_workers = config.num_workers,
            batch_size = config.batch_size // self.device_num,
            shuffle = False,
            sampler = eval_sampler
        )
        return eval_loader
    def test_dataloader(self):
        test_sampler = DistributedSampler(self.eval_dataset, shuffle = False) if self.device_num > 1 else None
        test_loader = DataLoader(
            dataset = self.eval_dataset,
            num_workers = config.num_workers,
            batch_size = config.batch_size // self.device_num,
            shuffle = False,
            sampler = test_sampler
        )
        return test_loader
    

In [8]:
print(config.classes_num)

10


In [9]:
# Set the workspace
device_num = torch.cuda.device_count()

print("Using", device_num, "GPUs")
print("each batch size:", config.batch_size // device_num)

full_dataset = np.load(os.path.join(config.dataset_path, "sonidos_peligrosos-data.npy"), allow_pickle = True)

# set exp folder
exp_dir = os.path.join(config.workspace, "results", config.exp_name)
checkpoint_dir = os.path.join(config.workspace, "results", config.exp_name, "checkpoint")
if not config.debug:
    create_folder(os.path.join(config.workspace, "results"))
    create_folder(exp_dir)
    create_folder(checkpoint_dir)
    dump_config(config, os.path.join(exp_dir, config.exp_name), False)

print("Using ESC")
dataset = ESC_Dataset(
    dataset = full_dataset,
    config = config,
    eval_mode = False
)
eval_dataset = ESC_Dataset(
    dataset = full_dataset,
    config = config,
    eval_mode = True
)

audioset_data = data_prep(dataset, eval_dataset, device_num)
checkpoint_callback = ModelCheckpoint(
    monitor = "acc",
    filename='l-{epoch:d}-{acc:.3f}',
    save_top_k = 20,
    mode = "max"
)




Using 1 GPUs
each batch size: 32
Using ESC


In [10]:
# Aqui en esta parte se prepara para el entrenamiento
trainer = pl.Trainer(
    deterministic=False,#para que los resultados sean reproducibles
    default_root_dir = checkpoint_dir,#ruta donde se guardan los checkpoints
    gpus = device_num, #numero de gpus
    val_check_interval = 1.0,#cada cuantas epocas se valida
    max_epochs = config.max_epoch,#numero de epocas
    auto_lr_find = True,    #para que busque el mejor learning rate
    sync_batchnorm = True, #para que sincronice el batchnorm
    callbacks = [checkpoint_callback], #para que guarde los checkpoints cada cierto tiempo
    accelerator = "ddp" if device_num > 1 else None, #para que use todas las gpus
    num_sanity_val_steps = 0, 
    resume_from_checkpoint = None, 
    replace_sampler_ddp = False,
    gradient_clip_val=1.0
)

sed_model = HTSAT_Swin_Transformer(
    spec_size=config.htsat_spec_size,
    patch_size=config.htsat_patch_size,
    in_chans=1,
    num_classes=config.classes_num,
    window_size=config.htsat_window_size,
    config = config,
    depths = config.htsat_depth,
    embed_dim = config.htsat_dim,
    patch_stride=config.htsat_stride,
    num_heads=config.htsat_num_head
)

model = SEDWrapper(
    sed_model = sed_model, 
    config = config,
    dataset = dataset
)

if config.resume_checkpoint is not None:
    print("Load Checkpoint from ", config.resume_checkpoint)
    ckpt = torch.load(config.resume_checkpoint, map_location="cpu")
    ckpt["state_dict"].pop("sed_model.head.weight")
    ckpt["state_dict"].pop("sed_model.head.bias")
    # finetune on the esc and spv2 dataset
    ckpt["state_dict"].pop("sed_model.tscam_conv.weight")
    ckpt["state_dict"].pop("sed_model.tscam_conv.bias")
    model.load_state_dict(ckpt["state_dict"], strict=False)



GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Load Checkpoint from  ./workspace/ckpt/htsat_audioset_pretrain.ckpt


In [11]:
# vaciar la memoria de la gpu
torch.cuda.empty_cache()
# Comenzar el entrenamiento
trainer.fit(model, audioset_data)

# Guardar el estado del modelo manualmente después del entrenamiento
model_path = 'modelo_entrenado/modelo_final_entrenado.ckpt'
torch.save(model.state_dict(), model_path)



LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                   | Params
-----------------------------------------------------
0 | sed_model | HTSAT_Swin_Transformer | 28.7 M
-----------------------------------------------------
27.6 M    Trainable params
1.1 M     Non-trainable params
28.7 M    Total params
114.657   Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Epoch 0:   0%|          | 0/63 [00:00<?, ?it/s] 



Epoch 0:  30%|███       | 19/63 [03:56<09:08, 12.46s/it, loss=2.21, v_num=4, loss_step=2.060]



Epoch 0: 100%|██████████| 63/63 [13:06<00:00, 12.48s/it, loss=1.59, v_num=4, loss_step=1.350]cuda:0 {'acc': 0.88}
Epoch 1: 100%|██████████| 63/63 [13:21<00:00, 12.72s/it, loss=0.0847, v_num=4, loss_step=0.0616, acc=0.880, loss_epoch=1.910]cuda:0 {'acc': 0.895}
Epoch 2: 100%|██████████| 63/63 [13:19<00:00, 12.69s/it, loss=0.0312, v_num=4, loss_step=0.0141, acc=0.895, loss_epoch=0.370] cuda:0 {'acc': 0.8925}
Epoch 3: 100%|██████████| 63/63 [13:47<00:00, 13.14s/it, loss=0.0057, v_num=4, loss_step=0.00386, acc=0.892, loss_epoch=0.0398] cuda:0 {'acc': 0.895}
Epoch 4: 100%|██████████| 63/63 [13:12<00:00, 12.58s/it, loss=0.011, v_num=4, loss_step=0.00222, acc=0.895, loss_epoch=0.00627]  cuda:0 {'acc': 0.8975}
Epoch 5: 100%|██████████| 63/63 [14:09<00:00, 13.49s/it, loss=0.00391, v_num=4, loss_step=0.00272, acc=0.897, loss_epoch=0.00947]cuda:0 {'acc': 0.9}
Epoch 6: 100%|██████████| 63/63 [14:11<00:00, 13.51s/it, loss=0.00591, v_num=4, loss_step=0.0014, acc=0.900, loss_epoch=0.0118]  cuda:0 {'a



Epoch 7: 100%|██████████| 63/63 [14:00<00:00, 13.35s/it, loss=0.00199, v_num=4, loss_step=0.0168, acc=0.905, loss_epoch=0.00484]  cuda:0 {'acc': 0.905}
Epoch 8: 100%|██████████| 63/63 [13:27<00:00, 12.81s/it, loss=0.00719, v_num=4, loss_step=0.000818, acc=0.905, loss_epoch=0.00164] cuda:0 {'acc': 0.915}
Epoch 9: 100%|██████████| 63/63 [14:32<00:00, 13.85s/it, loss=0.029, v_num=4, loss_step=0.000682, acc=0.915, loss_epoch=0.0047]   cuda:0 {'acc': 0.885}
Epoch 10: 100%|██████████| 63/63 [14:38<00:00, 13.94s/it, loss=0.00151, v_num=4, loss_step=0.00226, acc=0.885, loss_epoch=0.0179] cuda:0 {'acc': 0.905}
Epoch 11: 100%|██████████| 63/63 [16:31<00:00, 15.74s/it, loss=0.000701, v_num=4, loss_step=0.000417, acc=0.905, loss_epoch=0.00526]cuda:0 {'acc': 0.8975}
Epoch 12: 100%|██████████| 63/63 [15:27<00:00, 14.72s/it, loss=0.000645, v_num=4, loss_step=0.00063, acc=0.897, loss_epoch=0.000794] cuda:0 {'acc': 0.8975}
Epoch 13: 100%|██████████| 63/63 [14:53<00:00, 14.18s/it, loss=0.000418, v_num=4

REALIZAR PREDICCIONES DE SONIDOS

In [15]:

# recuperar el modelo entrenado para hacer predicciones
model_path = 'workspace/results/exp_htsat_dataset_sonidos_peligrosos/checkpoint/lightning_logs/version_4/checkpoints/l-epoch=49-acc=0.905.ckpt'

meta = np.loadtxt(meta_path , delimiter=',', dtype='str', skiprows=1)
gd = {}
for label in meta:
    name = label[0]
    target = label[2]
    gd[name] = target

class Audio_Classification:
    def __init__(self, model_path, config):
        super().__init__()

        self.device = torch.device('cuda')
        self.sed_model = HTSAT_Swin_Transformer(
            spec_size=config.htsat_spec_size,
            patch_size=config.htsat_patch_size,
            in_chans=1,
            num_classes=config.classes_num,
            window_size=config.htsat_window_size,
            config = config,
            depths = config.htsat_depth,
            embed_dim = config.htsat_dim,
            patch_stride=config.htsat_stride,
            num_heads=config.htsat_num_head
        )
        ckpt = torch.load(model_path, map_location="cpu")
        temp_ckpt = {}
        for key in ckpt["state_dict"]:
            temp_ckpt[key[10:]] = ckpt['state_dict'][key]
        self.sed_model.load_state_dict(temp_ckpt)
        self.sed_model.to(self.device)
        self.sed_model.eval()


    def predict(self, audiofile):

        if audiofile:
            waveform, sr = librosa.load(audiofile, sr=32000)

            with torch.no_grad():
                x = torch.from_numpy(waveform).float().to(self.device)
                output_dict = self.sed_model(x[None, :], None, True)
                pred = output_dict['clipwise_output']
                pred_post = pred[0].detach().cpu().numpy()
                pred_label = np.argmax(pred_post)
                pred_prob = np.max(pred_post)
            return pred_label, pred_prob


In [24]:
# Inferencia de audio
Audiocls = Audio_Classification(model_path, config)

# realizamos la prediccion para un audio de prueba
pred_label, pred_prob = Audiocls.predict('G:/Mi unidad/DatasetAudiosPrueba/sonidos_prueba_funcionamiento/disparo.wav')

print('Audiocls predict output: ', pred_label, pred_prob)

Audiocls predict output:  2 10.048427
