In [1]:
# Torch imports
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset
import torch.nn.functional as F
import torchvision.models as models

# Metrics and visualization
import time
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Models and feature extractor
from transformers import Wav2Vec2Processor, Wav2Vec2Model

# Others
import os
from tqdm import tqdm
import numpy as np
import re
import copy

  from .autonotebook import tqdm as notebook_tqdm


## Datasets

In [2]:
class CustomSpeechCommands(Dataset):
    """
    Subset of torchaudio's SPEECHCOMMANDS dataset selected through a text file of
    relative paths, plus helpers to turn the selected clips into feature tensors.

    Attributes set by the constructor / `splitter`:
        target_len: number of samples every waveform is padded or truncated to.
        mode: feature extraction mode used by `extract_feature_single`.
        cnn_model: model called on the mel spectrogram when mode == 'cnn'.
        dataset: the underlying torchaudio SPEECHCOMMANDS dataset.
        file_paths: lines read from `files_list` (stripped).
        all_paths: relative path computed for every entry of the dataset walker.
        indices: positions in `dataset` whose relative path appears in `file_paths`.
    """

    def __init__(self, root, files_list, download=True, target_len=16000, mode="mfcc", cnn_model=None):
        """
        Args:
            root: directory handed to torchaudio.datasets.SPEECHCOMMANDS.
            files_list: path to a text file with one relative audio path per line.
            download: forwarded to torchaudio.datasets.SPEECHCOMMANDS.
            target_len: waveform length (in samples) enforced by `pad_waveform`.
            mode: 'mfcc', 'mfcc_delta', 'mfcc_delta_delta', 'cnn' or 'wav2vec2'.
            cnn_model: pretrained or custom CNN used for extraction in 'cnn' mode.
        """
        self.target_len = target_len
        self.mode = mode
        self.cnn_model = cnn_model
        self.dataset = torchaudio.datasets.SPEECHCOMMANDS(root=root, download=download)
        self.indices = None
        self.splitter(files_list, root)

    def splitter(self, files_list, root):
        """
        Populate `file_paths`, `all_paths` and `indices` from the file list.

        Every walker entry is converted to a path relative to
        `<root>/SpeechCommands/speech_commands_v0.02` (with forward slashes) and
        kept when that relative path is listed in `files_list`.
        """
        with open(files_list, 'r') as f:
            self.file_paths = [line.strip() for line in f]

        # Membership is tested once per dataset entry; a set keeps that O(1)
        # instead of scanning the whole list every time.
        wanted_paths = set(self.file_paths)

        base_dir = os.path.join(root, "SpeechCommands", "speech_commands_v0.02")

        self.all_paths = []
        # NOTE(review): `_walker` is a private torchaudio attribute; this assumes its
        # entries are paths that `os.path.relpath` can relate to `base_dir` - confirm
        # against the installed torchaudio version.
        for item in tqdm(self.dataset._walker, desc=f"Splitting {files_list}"):
            relative_path = os.path.relpath(item, start=base_dir).replace("\\", "/")
            self.all_paths.append(relative_path)

        self.indices = [i for i, path in enumerate(self.all_paths) if path in wanted_paths]
        print(f"Archivos encontrados: {len(self.indices)} / {len(self.file_paths)}")

    def pad_waveform(self, waveform):
        """
        Return `waveform` with its last dimension equal to `target_len`.

        Shorter inputs are zero-padded on the right; longer inputs are truncated.
        Works for any number of leading dimensions (e.g. [N] or [1, N]).
        """
        length = waveform.shape[-1]
        if length < self.target_len:
            waveform = F.pad(waveform, (0, self.target_len - length))
        elif length > self.target_len:
            # Ellipsis indexing truncates the last axis regardless of rank; the
            # previous `[:, :target_len]` raised on 1-D waveforms.
            waveform = waveform[..., :self.target_len]
        return waveform

    def extract_feature_single(self, waveform, sample_rate, feature_extractor=None, processor=None, device="cuda"):
        """
        Extract the features of ONE sample according to `self.mode`.

        Args:
            waveform: audio tensor; it is padded/truncated and moved to `device`.
            sample_rate: sampling rate, used by the 'cnn' and 'wav2vec2' modes.
            feature_extractor: callable applied to the waveform in the MFCC modes,
                or called with the processor outputs in 'wav2vec2' mode.
            processor: callable that prepares the model inputs in 'wav2vec2' mode.
            device: device on which the extraction runs.

        Returns:
            The feature tensor, moved back to the CPU.

        Raises:
            ValueError: if `self.mode` is not one of the supported modes.
        """
        waveform = self.pad_waveform(waveform).to(device)

        if feature_extractor is not None:
            feature_extractor = feature_extractor.to(device)

        # --- MFCC ---
        if self.mode == "mfcc":
            # assumes the extractor returns (1, n_coeffs, frames), so the result is
            # (frames, n_coeffs) - TODO confirm
            feat = feature_extractor(waveform).squeeze(0).cpu().transpose(0, 1)

        # --- MFCC + Delta ---
        elif self.mode == "mfcc_delta":
            base = feature_extractor(waveform)
            delta = torchaudio.functional.compute_deltas(base)
            # Base and delta coefficients are concatenated along dim 1.
            feat = torch.cat([base, delta], dim=1).squeeze(0).cpu().transpose(0, 1)

        # --- MFCC + Delta + Delta-Delta ---
        elif self.mode == "mfcc_delta_delta":
            base = feature_extractor(waveform)
            delta = torchaudio.functional.compute_deltas(base)
            delta2 = torchaudio.functional.compute_deltas(delta)
            feat = torch.cat([base, delta, delta2], dim=1).squeeze(0).cpu().transpose(0, 1)

        # --- CNN ---
        elif self.mode == "cnn":
            spec_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate).to(device)
            # A leading dimension is added before the spectrogram is fed to the CNN.
            spec = spec_transform(waveform).unsqueeze(0)
            with torch.no_grad():
                embedding = self.cnn_model(spec.to(device)).cpu().squeeze()
            feat = embedding

        # --- Wav2Vec2 ---
        elif self.mode == "wav2vec2":
            waveform = waveform.squeeze(0)
            inputs = processor(
                waveform,
                sampling_rate=sample_rate,
                return_tensors="pt",
                padding=True
            ).to(device)
            with torch.no_grad():
                outputs = feature_extractor(**inputs)
            feat = outputs.last_hidden_state.squeeze(0).cpu()

        else:
            raise ValueError(f"Modo de extracci√≥n '{self.mode}' no soportado.")

        return feat

    def extract_features(self, feature_extractor=None, processor=None, device="cuda"):
        """
        Extract features for every selected sample.

        Returns:
            (features, labels): `features` is the result of `torch.stack` over the
            per-sample features (so they must all share one shape) and `labels` is
            the list of labels returned by the underlying dataset.
        """
        features, labels = [], []

        with torch.no_grad():
            for idx in tqdm(self.indices, desc=f"Extrayendo features ({self.mode})"):
                waveform, sample_rate, label, _, _ = self.dataset[idx]
                feat = self.extract_feature_single(
                    waveform, sample_rate, feature_extractor, processor, device
                )
                features.append(feat)
                labels.append(label)

        features = torch.stack(features)
        print(f"Features tensor: {features.shape}")

        return features, labels

    def save_features(self, feature_extractor=None, save_path=None, processor=None, device="cuda"):
        """
        Extract all features and store them with `torch.save` as
        {"features": ..., "labels": ...} at `save_path`.

        Best-effort: any exception raised during extraction or saving is caught
        and reported with a printed message instead of being propagated.
        """
        print(f"Guardando features ({self.mode}) en {save_path}")
        try:
            features, labels = self.extract_features(feature_extractor, processor, device)
            torch.save({"features": features, "labels": labels}, save_path)
            print(f"Features guardadas correctamente en {save_path}")
            print(f"Clases finales: {set(labels)}")
        except Exception as e:
            print(f"Error al guardar features en {save_path}: {e}")

    def __len__(self):
        """Number of samples selected by the file list."""
        return len(self.indices)

    def __getitem__(self, idx):
        """
        Return (waveform, sample_rate, label, speaker_id, utterance_number) for the
        idx-th selected sample, with the waveform padded/truncated to `target_len`.
        """
        original_idx = self.indices[idx]
        waveform, sample_rate, label, speaker_id, utterance_number = self.dataset[original_idx]
        waveform = self.pad_waveform(waveform)
        return waveform, sample_rate, label, speaker_id, utterance_number

class FeaturesDataset(Dataset):
    """Dataset over a saved .pt file that holds 'features' and 'labels' entries."""

    def __init__(self, features_path):
        """
        Load previously saved features and build the label <-> index mappings.

        features_path: path to the .pt file (for example 'data/train.pt').

        Class indices follow the sorted order of the distinct labels found in
        the file.
        """
        payload = torch.load(features_path)
        self.features = payload["features"]
        self.labels = payload["labels"]

        self.label_to_idx = {}
        self.idx_to_label = {}
        for position, class_name in enumerate(sorted(set(self.labels))):
            self.label_to_idx[class_name] = position
            self.idx_to_label[position] = class_name

        encoded = [self.label_to_idx[class_name] for class_name in self.labels]
        self.numeric_labels = torch.tensor(encoded)

        print(f"Dataset cargado desde {features_path}")
        print(f" - {len(self.features)} ejemplos")
        print(f" - {len(self.label_to_idx)} clases")

    def __len__(self):
        """Number of stored examples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the (feature, numeric label) pair stored at position `idx`."""
        return self.features[idx], self.numeric_labels[idx]


## Models

In [None]:
class RNNModel(nn.Module):
    """Recurrent classifier: a recurrent layer followed by a linear head."""

    def __init__(
        self,
        rnn_type,
        n_input_channels,
        hidd_size=256,
        out_features = 35,
        num_layers=1,
    ):
        """
        Pass rnn_type="RNN" for a vanilla RNN (built bidirectional),
        rnn_type="LSTM" for an LSTM, or rnn_type="GRU" for a GRU.

        Raises:
            ValueError: for any other `rnn_type`.
        """
        super().__init__()

        self.rnn_type = rnn_type

        if rnn_type not in ("GRU", "LSTM", "RNN"):
            raise ValueError(f"rnn_type {rnn_type} not supported.")

        recurrent_cls = {"GRU": nn.GRU, "LSTM": nn.LSTM, "RNN": nn.RNN}[rnn_type]
        # Only the vanilla RNN variant is bidirectional.
        self.rnn_layer = recurrent_cls(
            n_input_channels,
            hidd_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=(rnn_type == "RNN"),
        )

        self.net = nn.Sequential(nn.Linear(hidd_size, out_features))

        self.flatten_layer = nn.Flatten()

    def forward(self, x):
        """Classify `x` from the last entry of the recurrent layer's final hidden state."""
        _, state = self.rnn_layer(x)

        # nn.LSTM returns (h, c) as its state; the other layers return h directly.
        hidden = state[0] if self.rnn_type == "LSTM" else state

        # The last entry along dim 0 of the hidden state feeds the linear head.
        return self.net(hidden[-1])

class TCNNModel(nn.Module):
    """T-CNN (Temporal Convolutional Neural Network) classifier."""

    def __init__(self, n_input_channels, hidd_size=64, out_features=35):
        """
        Args:
            n_input_channels (int): input channels (e.g., 13 for MFCC)
            hidd_size (int): base number of channels of the convolutional layers
            out_features (int): number of output classes (e.g., 35)
        """
        super().__init__()

        mid_channels = hidd_size * 2
        top_channels = hidd_size * 4

        # nn.Conv1d expects its input as (Batch, Channels, SeqLen).
        # With the default hidd_size the channel widths are 64 -> 128 -> 256; the
        # first two blocks also halve the sequence length through max pooling.
        self.conv_block1 = self._make_block(n_input_channels, hidd_size, kernel_size=5, pool=True)
        self.conv_block2 = self._make_block(hidd_size, mid_channels, kernel_size=3, pool=True)
        self.conv_block3 = self._make_block(mid_channels, top_channels, kernel_size=3, pool=False)

        # Collapses the sequence dimension to length 1 before classification.
        self.global_pool = nn.AdaptiveAvgPool1d(1)

        self.flatten = nn.Flatten()

        self.fc = nn.Linear(top_channels, out_features)

    @staticmethod
    def _make_block(in_channels, out_channels, kernel_size, pool):
        """Conv1d -> BatchNorm1d -> ReLU, optionally followed by MaxPool1d(2)."""
        layers = [
            # kernel_size // 2 of padding keeps the sequence length for odd kernels.
            nn.Conv1d(in_channels, out_channels, kernel_size=kernel_size, padding=kernel_size // 2),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(),
        ]
        if pool:
            layers.append(nn.MaxPool1d(kernel_size=2))
        return nn.Sequential(*layers)

    def forward(self, x):
        """Swap the last two axes of `x`, run the conv stack, pool and classify."""
        features = x.permute(0, 2, 1)

        stages = (
            self.conv_block1,
            self.conv_block2,
            self.conv_block3,
            self.global_pool,
            self.flatten,
        )
        for stage in stages:
            features = stage(features)

        return self.fc(features)

class PositionalEncoding(nn.Module):
    """
    Sinusoidal positional encoding that adds position information to a
    batch-first sequence of embeddings.
    """
    def __init__(self, d_model: int, dropout: float = 0.1, max_len: int = 5000):
        """
        Args:
            d_model: embedding dimension (even or odd).
            dropout: dropout probability applied after adding the encoding.
            max_len: number of positions precomputed in the `pe` buffer.
        """
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Positional encoding table, stored as [max_len, 1, d_model].
        position = torch.arange(max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, 1, d_model)
        pe[:, 0, 0::2] = torch.sin(angles)
        # For an odd d_model there is one cosine column fewer than sine columns;
        # slicing keeps the assignment valid instead of raising a shape error.
        pe[:, 0, 1::2] = torch.cos(angles[:, : d_model // 2])
        self.register_buffer('pe', pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [BatchSize, SeqLen, EmbeddingDim]

        Returns:
            `x` plus the encoding of its first SeqLen positions, after dropout.
        """
        # x.size(1) is the sequence length; the transpose turns the table slice
        # into [1, SeqLen, EmbeddingDim] so it broadcasts over the batch.
        x = x + self.pe[:x.size(1)].transpose(0, 1)
        return self.dropout(x)


class TransformerModel(nn.Module):
    """Transformer-encoder classifier over batch-first feature sequences."""

    def __init__(
        self,
        n_input_features: int,  # e.g., 13 for MFCCs
        n_output_classes: int = 35,
        d_model: int = 128,      # width of the Transformer representation
        nhead: int = 8,          # number of attention heads
        d_hid: int = 256,        # width of the feed-forward sub-layer
        n_layers: int = 6,       # number of encoder blocks
        dropout: float = 0.1
    ):
        super().__init__()

        self.model_type = 'Transformer'
        self.d_model = d_model

        # Input projection from n_input_features to d_model.
        self.input_projection = nn.Linear(n_input_features, d_model)

        # Positional information added after the projection.
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        # Stack of encoder blocks; batch_first=True keeps inputs as [Batch, Seq, Feature].
        self.transformer_encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=d_hid,
                dropout=dropout,
                batch_first=True,
            ),
            n_layers,
        )

        # Classification head applied to the encoder output at the first time step
        # (used like BERT's [CLS] token, except that it is a regular audio frame).
        self.classifier = nn.Sequential(nn.Linear(d_model, n_output_classes))

        self.init_weights()

    def init_weights(self):
        """Zero the biases and draw uniform(-0.1, 0.1) weights for both linear layers."""
        initrange = 0.1
        for layer in (self.input_projection, self.classifier[0]):
            layer.bias.data.zero_()
            layer.weight.data.uniform_(-initrange, initrange)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        Args:
            x: Tensor, shape [BatchSize, SeqLen, n_input_features]

        Returns:
            Output of the classification head for the first time step of every
            sequence.
        """
        # Project the input and scale it by sqrt(d_model).
        projected = self.input_projection(x) * np.sqrt(self.d_model)

        # Add positions, then encode: [BatchSize, SeqLen, d_model].
        encoded = self.transformer_encoder(self.pos_encoder(projected))

        # The first time step (index 0) stands in for the whole sequence.
        first_step = encoded[:, 0, :]  # [BatchSize, d_model]

        return self.classifier(first_step)

# %%


class CNN1DModel(nn.Module):
    """One to three Conv1d/ReLU/MaxPool1d blocks followed by an `RNNModel` of type "RNN"."""

    def __init__(
        self,
        hidd_size=256,
        in_channels = 13,
        out_channels = 64,
        N_conv_blocks = 2,
    ):
        """
        Args:
            hidd_size: hidden size handed to the recurrent model.
            in_channels: channels of the input features.
            out_channels: channels produced by every convolutional block.
            N_conv_blocks: number of convolutional blocks (1, 2 or 3).

        Raises:
            ValueError: if `N_conv_blocks` is not 1, 2 or 3.
        """
        super().__init__()

        if N_conv_blocks not in (1, 2, 3):
            raise ValueError('Choose valid number (1-3)')

        # Every block is Conv1d(kernel 3, 'same' padding) -> ReLU -> MaxPool1d(2);
        # only the first block reads `in_channels`.
        layers = []
        block_in = in_channels
        for _ in range(int(N_conv_blocks)):
            layers.extend([
                nn.Conv1d(block_in, out_channels, kernel_size = 3, padding = 'same'),
                nn.ReLU(),
                nn.MaxPool1d(2),
            ])
            block_in = out_channels
        self.conv_blocks = nn.Sequential(*layers)

        # The recurrent model consumes the channels of the last conv block.
        self.rnn_layer = RNNModel(
            n_input_channels=out_channels,
            rnn_type="RNN",
            hidd_size=hidd_size
        )

    def forward(self, x):
        """Swap the last two axes for the conv stack, swap them back and run the recurrent model."""
        channels_first = torch.permute(x, (0, 2, 1))
        convolved = self.conv_blocks(channels_first)
        return self.rnn_layer(torch.permute(convolved, (0, 2, 1)))

class MejorCNN1DModel(nn.Module):
    """
    Three Conv1d blocks with batch normalisation (dropout after the first two),
    followed by an `RNNModel` of type "GRU" that produces the class scores.
    """

    def __init__(
        self,
        hidd_size=256,
        in_channels=13,   # e.g. 13 MFCCs
        num_classes=35    # SpeechCommands classes
    ):
        """
        Args:
            hidd_size: hidden size of the recurrent model.
            in_channels: channels of the input features.
            num_classes: number of output classes.
        """
        super().__init__()

        # --- CNN blocks ---
        # Channels grow 64 -> 128 -> 256 while the kernel shrinks 7 -> 5 -> 3.
        self.conv_blocks = nn.Sequential(
            # Block 1
            nn.Conv1d(
                in_channels=in_channels,
                out_channels=64,
                kernel_size=7,
                padding='same'
            ),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Dropout(0.2),

            # Block 2
            nn.Conv1d(
                in_channels=64,
                out_channels=128,
                kernel_size=5,
                padding='same'
            ),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Dropout(0.2),

            # Block 3
            nn.Conv1d(
                in_channels=128,
                out_channels=256,
                kernel_size=3,
                padding='same'
            ),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.MaxPool1d(2)
            # After three MaxPool1d(2) layers the sequence length is L // 8.
        )

        # --- RNN layer ---
        # The recurrent model reads the 256 channels produced by the last block.
        rnn_in_features = 256

        self.rnn_layer = RNNModel(
            n_input_channels=rnn_in_features,
            rnn_type="GRU",
            hidd_size=hidd_size,
            # RNNModel names its output-size parameter `out_features`; passing
            # `num_classes=` raised a TypeError at construction time.
            out_features=num_classes
        )

    def forward(self, x):
        """Swap the last two axes for the conv stack, swap them back and run the recurrent model."""
        perm_x = x.permute(0, 2, 1)
        conv_out = self.conv_blocks(perm_x)
        deperm_x = conv_out.permute(0, 2, 1)

        return self.rnn_layer(deperm_x)

def save_model(model, path, config=None):
    """
    Save a PyTorch model in a generic checkpoint format.

    Parameters:
    - model: instance of any nn.Module
    - path: destination file; its parent directory is created when missing
    - config: dictionary with the parameters needed to rebuild the model

    The checkpoint is a dict with the keys "state_dict" and "config".
    """
    # A bare file name has an empty dirname, and os.makedirs("") raises; only
    # create the parent directory when the path actually has one.
    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    checkpoint = {
        "state_dict": model.state_dict(),
        "config": config
    }

    torch.save(checkpoint, path)
    print(f"Modelo guardado en {path}")

def load_trained_model(model_class, checkpoint_path, device="cpu", **override_kwargs):
    """
    Rebuild a model from a checkpoint holding "state_dict" and "config" entries
    (the format written by save_model).

    Parameters:
    - model_class: class of the model to instantiate
    - checkpoint_path: path to the .pt file
    - device: "cpu" or "cuda"
    - override_kwargs: values that replace (or extend) the stored config

    Returns:
    - the rebuilt model with its weights loaded, moved to `device` and set to
      eval mode
    """
    checkpoint = torch.load(checkpoint_path, map_location=device)

    # A checkpoint saved without config stores None; treat it as empty.
    stored_config = checkpoint["config"] or {}
    stored_config.update(override_kwargs)

    model = model_class(**stored_config)
    model.load_state_dict(checkpoint["state_dict"])
    model.to(device)
    model.eval()

    print(f"Modelo cargado desde {checkpoint_path}")
    return model



## Trainers

In [None]:
def train_step(x_batch, y_batch, model, optimizer, criterion, use_gpu):
    """
    Run one optimisation step on a single batch.

    `use_gpu` is accepted but not used here; the batch is used as received.

    Returns:
        (y_predicted, loss): the model output for `x_batch` and the loss tensor
        computed by `criterion` before the parameter update.
    """
    # Forward pass and loss.
    outputs = model(x_batch)
    batch_loss = criterion(outputs, y_batch)

    # Clear stale gradients, backpropagate and update the parameters.
    optimizer.zero_grad()
    batch_loss.backward()
    optimizer.step()

    return outputs, batch_loss


def evaluate(val_loader, model, criterion, use_gpu):
    """
    Compute accuracy and mean loss of `model` over `val_loader`.

    The model is switched to eval mode for the duration of the call (so layers
    such as Dropout and BatchNorm behave deterministically) and gradients are
    disabled; the previous train/eval mode is restored before returning.

    Args:
        val_loader: iterable of (inputs, targets) batches.
        model: nn.Module to evaluate.
        criterion: loss function called as criterion(predictions, targets).
        use_gpu: when True, every batch is moved to CUDA before the forward pass.

    Returns:
        (val_acc, val_loss): fraction of correct predictions and loss averaged
        over all examples (each batch weighted by its size).
    """
    cumulative_loss = 0
    cumulative_predictions = 0
    data_count = 0

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for x_val, y_val in val_loader:
                if use_gpu:
                    x_val = x_val.cuda()
                    y_val = y_val.cuda()

                y_predicted = model(x_val)

                loss = criterion(y_predicted, y_val)

                class_prediction = torch.argmax(y_predicted, dim=1).long()

                cumulative_predictions += (y_val == class_prediction).sum().item()
                # Weight the batch loss by the batch size so the final value is a
                # per-example mean even when the last batch is smaller.
                cumulative_loss += loss.item() * y_val.shape[0]
                data_count += y_val.shape[0]
    finally:
        # Hand the model back in the mode the caller left it in.
        model.train(was_training)

    val_acc = cumulative_predictions / data_count
    val_loss = cumulative_loss / data_count

    return val_acc, val_loss

def measure_inference_time(model, dataloader, n_iters=30):
    """
    Measure the average forward-pass time per batch.

    The model is set to eval mode and each batch is moved to the device of the
    model's first parameter, so the function works for CPU and CUDA models.

    Args:
        model: nn.Module to time.
        dataloader: iterable of (inputs, targets) batches.
        n_iters: maximum number of batches to time.

    Returns:
        Elapsed seconds divided by the number of batches actually processed
        (0.0 when no batch was processed).
    """
    model.eval()

    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    on_cuda = device is not None and device.type == "cuda"

    timed_batches = 0

    # CUDA kernels run asynchronously; synchronise so the timer brackets the work.
    if on_cuda:
        torch.cuda.synchronize(device)
    t0 = time.perf_counter()
    with torch.no_grad():
        for i, (X, _) in enumerate(dataloader):
            if i >= n_iters: break
            if device is not None:
                X = X.to(device)
            _ = model(X)
            timed_batches += 1
    if on_cuda:
        torch.cuda.synchronize(device)
    elapsed = time.perf_counter() - t0

    # Divide by the batches really timed: the loader may hold fewer than n_iters.
    return elapsed / timed_batches if timed_batches else 0.0

def train_model(
    model,
    train_dataset,
    val_dataset,
    epochs,
    criterion,
    batch_size,
    lr,
    n_evaluations_per_epoch=6,
    use_gpu=False,
    patience=5,
    restore_best_weights=True,
    save_model = True,
    save_path=None,
):
    """
    Train `model` with Adam, validating after every epoch and stopping early
    when the validation loss stops improving.

    Args:
        model: nn.Module to train (moved to CUDA when `use_gpu` is True and back
            to the CPU before returning).
        train_dataset / val_dataset: datasets yielding (inputs, targets).
        epochs: maximum number of epochs.
        criterion: loss function.
        batch_size: batch size for both loaders.
        lr: Adam learning rate.
        n_evaluations_per_epoch: number of progress reports printed per epoch.
        use_gpu: train on CUDA when True.
        patience: epochs without validation-loss improvement before stopping.
        restore_best_weights: reload the weights of the best validation epoch.
        save_model: when True, the final state_dict is written to disk. Inside
            this function the flag shadows the module-level `save_model`
            helper, which is why `torch.save` is called directly.
        save_path: destination of the saved state_dict; defaults to
            "saved_models/<tag>_epochs<epochs>.pt", where <tag> is
            `model.rnn_type` when present and the model class name otherwise.

    Returns:
        (curves, total_time): `curves` maps "train_acc", "val_acc", "train_loss"
        and "val_loss" to per-epoch lists; `total_time` is the elapsed training
        time in seconds.
    """
    if use_gpu:
        model.cuda()

    # Dataloaders
    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=use_gpu
    )
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, pin_memory=use_gpu
    )

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

    curves = {"train_acc": [], "val_acc": [], "train_loss": [], "val_loss": []}

    t0 = time.perf_counter()
    iteration = 0
    n_batches = len(train_loader)

    # Report interval in batches; clamped to 1 so that small loaders (fewer
    # batches than reports) do not trigger a modulo-by-zero.
    report_every = max(1, n_batches // max(1, n_evaluations_per_epoch))

    best_val_loss = float("inf")
    epochs_without_improvement = 0
    best_weights = None

    print(n_batches)

    for epoch in range(epochs):
        print(f"\rEpoch {epoch + 1}/{epochs}")
        cumulative_train_loss = 0
        cumulative_train_corrects = 0
        examples_count = 0

        model.train()
        for i, (x_batch, y_batch) in enumerate(train_loader):
            if use_gpu:
                x_batch = x_batch.cuda()
                y_batch = y_batch.cuda()

            y_predicted, loss = train_step(x_batch, y_batch, model, optimizer, criterion, use_gpu)

            cumulative_train_loss += loss.item() * x_batch.shape[0]
            examples_count += y_batch.shape[0]

            class_prediction = torch.argmax(y_predicted, dim=1).long()
            cumulative_train_corrects += (y_batch == class_prediction).sum().item()

            if (i % report_every == 0) and (i > 0):
                train_loss = cumulative_train_loss / examples_count
                train_acc = cumulative_train_corrects / examples_count
                print(f"Iteration {iteration} - Batch {i}/{len(train_loader)} - Train loss: {train_loss}, Train acc: {train_acc}")

            iteration += 1

        # Validate in eval mode so Dropout/BatchNorm do not behave as in training;
        # model.train() is called again at the start of the next epoch.
        model.eval()
        with torch.no_grad():
            val_acc, val_loss = evaluate(val_loader, model, criterion, use_gpu)

        print(f"Val loss: {val_loss}, Val acc: {val_acc}")

        train_loss = cumulative_train_loss / examples_count
        train_acc = cumulative_train_corrects / examples_count

        curves["train_acc"].append(train_acc)
        curves["val_acc"].append(val_acc)
        curves["train_loss"].append(train_loss)
        curves["val_loss"].append(val_loss)

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            epochs_without_improvement = 0

            if restore_best_weights:
                # Deep copy: state_dict() returns references to the live tensors.
                best_weights = copy.deepcopy(model.state_dict())
        else:
            epochs_without_improvement += 1
            print(f"Sin mejora. Paciencia: {epochs_without_improvement}/{patience}")

            if epochs_without_improvement >= patience:
                print("Early stopping activado!")
                break

    # Restore the best weights
    if restore_best_weights and best_weights is not None:
        print("Restaurando mejores pesos del modelo‚Ä¶")
        model.load_state_dict(best_weights)

    total_time = time.perf_counter() - t0
    print(f"Tiempo total de entrenamiento: {total_time:.4f} [s]")

    if save_model:
        if save_path is None:
            # Not every model defines `rnn_type`; fall back to the class name.
            model_tag = getattr(model, "rnn_type", type(model).__name__)
            save_path = f"saved_models/{model_tag}_epochs{epochs}.pt"
        target_dir = os.path.dirname(save_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        torch.save(model.state_dict(), save_path)
        print(f"Modelo guardado en: {save_path}")
    model.cpu()
    return curves, total_time

def show_curves(all_curves, suptitle=''):
    """
    Plot mean +/- standard deviation of the loss and accuracy curves of several
    training runs and save the figure as 'img/<suptitle>.pdf'.

    Args:
        all_curves: list of dicts with the keys "train_acc", "val_acc",
            "train_loss" and "val_loss", each mapping to a per-epoch list.
        suptitle: figure title, also used as the output file name.

    NOTE(review): the element-wise mean/std needs every run to have the same
    number of epochs; runs that stopped early at different epochs would need to
    be aligned by the caller first.
    """
    final_curve_means = {k: np.mean([c[k] for c in all_curves], axis=0) for k in all_curves[0].keys()}
    final_curve_stds = {k: np.std([c[k] for c in all_curves], axis=0) for k in all_curves[0].keys()}

    fig, ax = plt.subplots(1, 2, figsize=(13, 5))
    fig.set_facecolor('white')

    epochs = np.arange(len(final_curve_means["val_loss"])) + 1

    # ==== Loss plot ====
    ax[0].plot(epochs, final_curve_means['val_loss'], label='validation')
    ax[0].plot(epochs, final_curve_means['train_loss'], label='training')
    ax[0].fill_between(epochs,
                       y1=final_curve_means["val_loss"] - final_curve_stds["val_loss"],
                       y2=final_curve_means["val_loss"] + final_curve_stds["val_loss"], alpha=.5)
    ax[0].fill_between(epochs,
                       y1=final_curve_means["train_loss"] - final_curve_stds["train_loss"],
                       y2=final_curve_means["train_loss"] + final_curve_stds["train_loss"], alpha=.5)
    ax[0].set_xlabel('Epoch')
    ax[0].set_ylabel('Loss')
    ax[0].set_title('Loss evolution during training')
    ax[0].legend()

    # ==== Accuracy plot ====
    ax[1].plot(epochs, final_curve_means['val_acc'], label='validation')
    ax[1].plot(epochs, final_curve_means['train_acc'], label='training')
    ax[1].fill_between(epochs,
                       y1=final_curve_means["val_acc"] - final_curve_stds["val_acc"],
                       y2=final_curve_means["val_acc"] + final_curve_stds["val_acc"], alpha=.5)
    ax[1].fill_between(epochs,
                       y1=final_curve_means["train_acc"] - final_curve_stds["train_acc"],
                       y2=final_curve_means["train_acc"] + final_curve_stds["train_acc"], alpha=.5)
    ax[1].set_xlabel('Epoch')
    ax[1].set_ylabel('Accuracy')
    ax[1].set_title('Accuracy evolution during training')
    ax[1].legend()

    fig.suptitle(suptitle, fontsize=16, weight="bold")

    # ==== Save and close ====
    # Create the output directory first; savefig fails when it does not exist.
    os.makedirs('img', exist_ok=True)
    filepath = os.path.join('img', f'{suptitle}.pdf')
    plt.savefig(filepath, bbox_inches='tight', format='pdf')
    plt.close(fig)

def get_metrics_and_confusion_matrix(models, dataset, name=''):
    """
    Evaluate several trained models on `dataset` and save two confusion-matrix
    figures averaged over the models:

    - Group A: the (up to) 10 most frequent classes plus one "others" class that
      gathers every remaining class -> 'img/conf_mat_groupA_<name>.pdf'.
    - Group B: only the samples whose true class is outside the top 10
      -> 'img/conf_mat_groupB_<name>.pdf' (skipped when there are none).

    Args:
        models: iterable of nn.Module; each one is moved to the CPU and set to
            eval mode before inference.
        dataset: dataset yielding (inputs, integer targets). Class names come
            from `dataset.idx_to_label`, else `dataset.labels`, else the indices.
        name: suffix used in the output file names.
    """
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=min(32, len(dataset)))

    y_true = torch.cat([y for _, y in dataloader])
    total_classes = len(torch.unique(y_true))

    if hasattr(dataset, "idx_to_label"):
        class_names = [dataset.idx_to_label[i] for i in range(total_classes)]
    elif hasattr(dataset, "labels"):
        class_names = dataset.labels
    else:
        class_names = [str(i) for i in range(total_classes)]

    counts = torch.bincount(y_true, minlength=total_classes)
    top10 = torch.argsort(counts, descending=True)[:10].tolist()
    top10_set = set(top10)

    # Group A ids: the top classes are renumbered 0..k-1 by frequency rank and
    # everything else becomes `others_id`. Renumbering avoids a clash between a
    # real class whose id equals the "others" id and keeps the ids aligned with
    # the rows/columns of the confusion matrix.
    others_id = len(top10)

    def map_groupA(class_ids):
        mapped = torch.full_like(class_ids, others_id)
        for rank, cls in enumerate(top10):
            mapped[class_ids == cls] = rank
        return mapped

    y_true_A = map_groupA(y_true)
    labels_A = list(range(others_id + 1))

    # Group B mask: compare plain ints. Iterating the tensor directly yields
    # 0-dim tensors, which never match the ints stored in the set.
    mask_B = torch.tensor([c not in top10_set for c in y_true.tolist()], dtype=torch.bool)
    y_true_B = y_true[mask_B]
    labels_B = sorted(torch.unique(y_true_B).tolist())

    def predict(model):
        """Class predictions of one model over the whole dataset (CPU, eval mode)."""
        model.cpu()
        model.eval()
        with torch.no_grad():
            return torch.cat([model(x).argmax(dim=1) for x, _ in dataloader])

    def compute_group(y_true_group, preds_per_model, label_ids):
        """Mean/std over the models of the row-normalised confusion matrix and accuracy (%)."""
        cms, accs = [], []
        targets = y_true_group.numpy()

        for preds in preds_per_model:
            predicted = preds.numpy()
            cm = confusion_matrix(
                targets,
                predicted,
                labels=label_ids,
                normalize="true",
            )
            cms.append(cm)
            accs.append(accuracy_score(targets, predicted))

        return (
            np.mean(cms, axis=0),
            np.std(cms, axis=0),
            np.mean(accs) * 100,
            np.std(accs) * 100,
        )

    # Run inference once per model and reuse the predictions for both groups.
    all_preds = [predict(model) for model in models]

    label_names_A = [class_names[cls] for cls in top10] + ["others"]

    cmA_mean, cmA_std, accA_mean, accA_std = compute_group(
        y_true_A, [map_groupA(p) for p in all_preds], labels_A
    )

    os.makedirs("img", exist_ok=True)

    def plot_cm(mean, std, classes, title, filename, fontsize, rotation = 45):
        fig, ax = plt.subplots(figsize=(9, 8))
        im = ax.imshow(mean, cmap=plt.cm.Blues)

        ax.set_xticks(np.arange(len(classes)))
        ax.set_yticks(np.arange(len(classes)))
        ax.set_xticklabels(classes, rotation=90, ha="center")
        ax.set_yticklabels(classes)

        # Cell text. The colour encodes the mean while the text shows the std.
        # NOTE(review): confirm that printing only the std is intended.
        for i in range(len(classes)):
            for j in range(len(classes)):
                # Background colour of this cell according to the colormap
                rgba = im.cmap(im.norm(mean[i, j]))
                r, g, b, _ = rgba

                # Perceived luminance
                luminance = 0.299*r + 0.587*g + 0.114*b

                # White text on dark cells, black text otherwise
                text_color = "white" if luminance < 0.5 else "black"

                ax.text(
                    j, i,
                    f"{std[i,j]:.2f}",
                    ha="center", va="center",
                    fontsize=fontsize,
                    rotation=rotation,
                    rotation_mode="anchor",
                    color=text_color
                )
        ax.set_title(title)
        fig.colorbar(im, ax=ax)
        plt.tight_layout()
        plt.savefig(filename, bbox_inches="tight")
        plt.close(fig)

    plot_cm(
        cmA_mean, cmA_std,
        label_names_A,
        f"Top-10 + others\nacc={accA_mean:.2f} ¬± {accA_std:.2f}%",
        f"img/conf_mat_groupA_{name}.pdf",
        8,
        0
    )

    print(f"[OK] Saved: img/conf_mat_groupA_{name}.pdf")

    if not labels_B:
        # With 10 classes or fewer nothing is left outside the top 10.
        print("[SKIP] No classes outside the top 10; group B not plotted.")
        return

    label_names_B = [class_names[c] for c in labels_B]

    # The matrix is indexed by the real class ids of the remaining classes.
    cmB_mean, cmB_std, accB_mean, accB_std = compute_group(
        y_true_B, [p[mask_B] for p in all_preds], labels_B
    )

    plot_cm(
        cmB_mean, cmB_std,
        label_names_B,
        f"Remaining classes\nacc={accB_mean:.2f} ¬± {accB_std:.2f}%",
        f"img/conf_mat_groupB_{name}.pdf",
        5
    )

    print(f"[OK] Saved: img/conf_mat_groupB_{name}.pdf")

## Visualization

In [5]:
def plot_waveform(wf, sample_rate, label="", figname=None):
    """
    Plot the waveform (left) and the MFCCs (right) of an audio signal.

    Args:
        wf (Tensor): audio signal, [1, N] or [N].
        sample_rate (int): sampling rate in Hz.
        label (str): optional label used in the figure title.
        figname (str): base name of the output file; the figure is written to
            ``img/<figname>.pdf``. If None (or empty), the figure is only shown.
    """
    if isinstance(wf, torch.Tensor):
        # detach() so the later .numpy() calls also work on tensors that track gradients
        wf = wf.detach().squeeze().cpu()

    # === MFCC transform ===
    mfcc_transform = torchaudio.transforms.MFCC(
        sample_rate=sample_rate,
        n_mfcc=13,
        melkwargs={"n_fft": 320, "hop_length": 160, "n_mels": 23},
        log_mels=True
    )
    mfcc = mfcc_transform(wf.unsqueeze(0)).squeeze().cpu().numpy()  # [n_mfcc, time]

    # === Figure with 2 subplots ===
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    # NOTE(review): called after the axes exist, so the style presumably only
    # affects axes created later — confirm whether it should precede plt.subplots.
    sns.set_style("whitegrid")

    # --- Waveform ---
    # Named time_axis (not `time`) so the imported `time` module is not shadowed.
    time_axis = torch.arange(0, len(wf)) / sample_rate
    axes[0].plot(time_axis, wf.numpy(), color="steelblue", linewidth=1.0)
    axes[0].set_title("Waveform", fontsize=12)
    axes[0].set_xlabel("Tiempo [s]")
    axes[0].set_ylabel("Amplitud")

    # --- MFCC ---
    sns.heatmap(mfcc, ax=axes[1], cmap="viridis", cbar=True)
    axes[1].set_title("MFCCs", fontsize=12)
    axes[1].set_xlabel("Tiempo (frames)")
    axes[1].set_ylabel("Coeficiente MFCC")

    fig.suptitle(f"Audio: {label}", fontsize=14, y=1.02)
    plt.tight_layout()

    # === Save or show ===
    if figname:
        name = os.path.join('img', f'{figname}.pdf')
        # Create the target directory so saving works on a fresh checkout
        os.makedirs(os.path.dirname(name), exist_ok=True)
        # Save through the figure handle instead of relying on the current figure
        fig.savefig(name, bbox_inches="tight")
        print(f"Figura guardada en {name}")
    else:
        plt.show()

    plt.close(fig)


## Feature extraction

In [6]:
# --- Base configuration ---
ROOT_DIR = "data"
SAVE_DIR = os.path.join(ROOT_DIR, "features")
os.makedirs(SAVE_DIR, exist_ok=True)
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Shared MFCC transform (used by the three mfcc* modes) ---
mfcc = torchaudio.transforms.MFCC(
    sample_rate=16000,
    n_mfcc=13,
    melkwargs=dict(n_fft=320, hop_length=160, n_mels=23),
)

# --- Wav2Vec2 is loaded a single time and reused for every split ---
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
wav2vec2 = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(device)

# --- Mode name -> extractor object handed to save_features ---
modes = dict.fromkeys(("mfcc", "mfcc_delta", "mfcc_delta_delta"), mfcc)
modes["wav2vec2"] = wav2vec2

# --- Process the train, val and test splits ---
for split in ("train", "val", "test"):
    list_path = os.path.join(ROOT_DIR, f"{split}_list.txt")

    for mode, extractor in modes.items():
        save_path = os.path.join(SAVE_DIR, f"{split}_{mode}.pt")

        if not os.path.isfile(save_path):
            print(f"\nExtrayendo {mode} para {split}...")

            dataset = CustomSpeechCommands(ROOT_DIR, list_path, mode=mode)

            # Only the wav2vec2 mode passes the processor along
            save_kwargs = {"processor": processor} if mode == "wav2vec2" else {}
            dataset.save_features(
                feature_extractor=extractor,
                device=device,
                save_path=save_path,
                **save_kwargs,
            )
        else:
            # Cached features are kept; nothing is recomputed
            print(f"{save_path} ya existe, saltando...")

print("\nExtracci√≥n de features completada.")




data/features/train_mfcc.pt ya existe, saltando...
data/features/train_mfcc_delta.pt ya existe, saltando...
data/features/train_mfcc_delta_delta.pt ya existe, saltando...
data/features/train_wav2vec2.pt ya existe, saltando...
data/features/val_mfcc.pt ya existe, saltando...
data/features/val_mfcc_delta.pt ya existe, saltando...
data/features/val_mfcc_delta_delta.pt ya existe, saltando...
data/features/val_wav2vec2.pt ya existe, saltando...
data/features/test_mfcc.pt ya existe, saltando...
data/features/test_mfcc_delta.pt ya existe, saltando...
data/features/test_mfcc_delta_delta.pt ya existe, saltando...
data/features/test_wav2vec2.pt ya existe, saltando...

Extracci√≥n de features completada.


In [7]:
SAVE_DIR = os.path.join(ROOT_DIR, 'petes')
# Make sure the output directory exists before anything is written into it
os.makedirs(SAVE_DIR, exist_ok=True)

# --- Extract MFCCs at several hop lengths for the train, val and test splits ---
for split in ["train", "val", "test"]:
    list_path = os.path.join(ROOT_DIR, f"{split}_list.txt")

    for hl in [320, 160, 54, 32, 16]:
        save_path = os.path.join(SAVE_DIR, f"{split}_{hl}_mfcc.pt")
        if os.path.isfile(save_path):
            print(f"{save_path} ya existe, saltando...")
            continue

        # Build the transform only when features actually have to be computed
        mfcc_transform = torchaudio.transforms.MFCC(
            sample_rate=16000,
            n_mfcc=13,
            log_mels=True,
            melkwargs={"n_fft": 320, "hop_length": hl, "n_mels": 23}
        )
        # Report the hop length rather than the repr of the transform module
        print(f"\nExtrayendo mfcc (hop_length={hl}) para {split}...")

        # This cell only ever produces MFCC features, so there is no wav2vec2
        # branch here (the old one compared a transform object against a string
        # and could never run).
        dataset = CustomSpeechCommands(ROOT_DIR, list_path, mode='mfcc')
        dataset.save_features(
            feature_extractor=mfcc_transform,
            device=device,
            save_path=save_path,
        )

print("\nExtracci√≥n de features completada.")

data/petes/train_320_mfcc.pt ya existe, saltando...
data/petes/train_160_mfcc.pt ya existe, saltando...
data/petes/train_54_mfcc.pt ya existe, saltando...
data/petes/train_32_mfcc.pt ya existe, saltando...
data/petes/train_16_mfcc.pt ya existe, saltando...
data/petes/val_320_mfcc.pt ya existe, saltando...
data/petes/val_160_mfcc.pt ya existe, saltando...
data/petes/val_54_mfcc.pt ya existe, saltando...
data/petes/val_32_mfcc.pt ya existe, saltando...
data/petes/val_16_mfcc.pt ya existe, saltando...
data/petes/test_320_mfcc.pt ya existe, saltando...
data/petes/test_160_mfcc.pt ya existe, saltando...
data/petes/test_54_mfcc.pt ya existe, saltando...
data/petes/test_32_mfcc.pt ya existe, saltando...
data/petes/test_16_mfcc.pt ya existe, saltando...

Extracci√≥n de features completada.


## Training

In [8]:
# Training reads the hop-length-specific MFCC features stored under data/petes
ROOT_DIR = os.path.join("data","petes")
SAVE_DIR = ROOT_DIR
# Fall back to CPU when CUDA is unavailable, same rule as the feature-extraction cell
device = "cuda" if torch.cuda.is_available() else "cpu"

# lr = 5e-4
# batch_size = 32
# criterion = nn.CrossEntropyLoss()
# n_trains = 2

# epochs = 3
# use_gpu = True

# pattern = re.compile(r"train_(\d+)_mfcc.pt")
# hop_lengths = sorted(
#     [int(pattern.search(f).group(1)) for f in os.listdir(SAVE_DIR) if pattern.search(f)]
# )

# f1_scores = []
# f1_stds = []
# seq_lengths = []

# for hop_length in hop_lengths:
#     seq_len = 1 + 16000 // hop_length
#     seq_lengths.append(seq_len)

#     print(f"\n--- Hop length {hop_length} -> Secuencia {seq_len} frames ---")

#     train_dataset = FeaturesDataset(os.path.join(SAVE_DIR, f"train_{hop_length}_mfcc.pt"))
#     val_dataset   = FeaturesDataset(os.path.join(SAVE_DIR, f"val_{hop_length}_mfcc.pt"))
#     test_dataset  = FeaturesDataset(os.path.join(SAVE_DIR, f"test_{hop_length}_mfcc.pt"))

#     models = []
#     curves = []

#     for k in range(n_trains):
#         print(f"Entrenamiento {k+1}/{n_trains}")

#         # Crear modelo T-CNN
#         model = TCNNModel(n_input_channels=13, hidd_size=64, out_features=35)

#         curve, _ = train_model(
#             model,
#             train_dataset,
#             val_dataset,
#             epochs,
#             criterion,
#             batch_size,
#             lr,
#             n_evaluations_per_epoch=3,
#             use_gpu=use_gpu,
#         )

#         curves.append(curve)
#         models.append(model)

#     show_curves(curves, suptitle=f"TCNN_seq{seq_len}")

#     test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
#     metrics_mean, metrics_std, _ = evaluate_models_metrics(models, test_loader, criterion, use_gpu=use_gpu)

#     f1_scores.append(metrics_mean["f1"])
#     f1_stds.append(metrics_std["f1"])

#     print(f"F1 TCNN_seq{seq_len}: {metrics_mean['f1']:.3f} ¬± {metrics_std['f1']:.3f}")

#     get_metrics_and_confusion_matrix(models, test_dataset, name=f"TCNN_seq{seq_len}")

# # --- Visualizaci√≥n de F1 vs longitud de secuencia ---
# plt.figure(figsize=(8, 5))
# plt.errorbar(seq_lengths, f1_scores, yerr=f1_stds, label="TCNN", marker="o", capsize=4)
# plt.xlabel("Cantidad de frames en la secuencia (MFCC)")
# plt.ylabel("F1-score promedio (¬± std)")
# plt.title("F1-score TCNN seg√∫n longitud de secuencia")
# plt.legend()
# plt.grid(True)
# plt.tight_layout()
# plt.savefig("img/f1_vs_length_tcnn.pdf", bbox_inches="tight")
# plt.show()


In [None]:
# Preliminary testing: train a Transformer on the hop_length=16 MFCC features
lr = 5e-4
batch_size = 32
criterion = nn.CrossEntropyLoss()
n_trains = 2  # number of repetitions used to report mean and std
epochs = 5    # more epochs than before; Transformers usually need more

train_dataset = FeaturesDataset(os.path.join(SAVE_DIR, "train_16_mfcc.pt"))
val_dataset = FeaturesDataset(os.path.join(SAVE_DIR, "val_16_mfcc.pt"))
test_dataset = FeaturesDataset(os.path.join(SAVE_DIR, "test_16_mfcc.pt"))

# Model dimensions are read from the dataset itself
N_INPUT_FEATURES = train_dataset.features.shape[2]  # 13 MFCCs
N_OUTPUT_CLASSES = len(train_dataset.label_to_idx)  # 35 classes

# --- Transformer configuration ---
TRANSFORMER_ARCH_PARAMS = dict(
    n_input_features=N_INPUT_FEATURES,
    n_output_classes=N_OUTPUT_CLASSES,
    d_model=128,
    nhead=8,
    n_layers=4,  # 4-6 layers is a reasonable starting point
    d_hid=512,   # should exceed d_model, e.g. 4 * d_model
)

ARCH = 'Transformer'
print(f'Entrenando Modelo {ARCH} con d_model={TRANSFORMER_ARCH_PARAMS["d_model"]}')

# NOTE(review): `models` rebinds the name the first cell imports as
# `torchvision.models as models`; the module is unreachable under that name afterwards.
times_of_training, models, curves = [], [], []

for k in range(n_trains):
    print(f'Entrenando modelo {k+1}/{n_trains}')

    # Fresh model for every repetition
    model = TransformerModel(**TRANSFORMER_ARCH_PARAMS)

    all_curves, times = train_model(
        model, train_dataset, val_dataset,
        epochs, criterion, batch_size, lr,
        n_evaluations_per_epoch=3,
        use_gpu=True,
    )

    models.append(model)
    curves.append(all_curves)
    times_of_training.append(times)

show_curves(curves, ARCH)
get_metrics_and_confusion_matrix(models, test_dataset, ARCH)

Dataset cargado desde data/petes/train_16_mfcc.pt
 - 32453 ejemplos
 - 35 clases
Dataset cargado desde data/petes/val_16_mfcc.pt
 - 3875 ejemplos
 - 35 clases
Dataset cargado desde data/petes/test_16_mfcc.pt
 - 4381 ejemplos
 - 35 clases
Entrenando Modelo Transformer con d_model=128
Entrenando modelo 1/2
1015
Epoch 1/5
Iteration 338 - Batch 338/1015 - Train loss: 1.6941541232893953, Train acc: 0.49557522123893805
Iteration 676 - Batch 676/1015 - Train loss: 1.331261065545075, Train acc: 0.6063977104874446
Iteration 1014 - Batch 1014/1015 - Train loss: 1.1365347909247074, Train acc: 0.6643145471913229
Val loss: 0.6124032637996059, Val acc: 0.8136774193548387
Epoch 2/5
Iteration 1353 - Batch 338/1015 - Train loss: 0.6150485655639024, Train acc: 0.8187684365781711
Iteration 1691 - Batch 676/1015 - Train loss: 0.5875045518711364, Train acc: 0.8263478581979321
Iteration 2029 - Batch 1014/1015 - Train loss: 0.5712779103587894, Train acc: 0.8308014667365112
Val loss: 0.5322634831705401, Val a

In [None]:
ROOT_DIR = "data"
SAVE_DIR = ROOT_DIR
# Fall back to CPU when CUDA is unavailable, same rule as the feature-extraction cell
device = "cuda" if torch.cuda.is_available() else "cpu"

lr = 5e-4
batch_size = 32
criterion = nn.CrossEntropyLoss()
n_trains = 5
epochs = 20
# NOTE(review): presumably train_model / evaluate_models_metrics run on CPU when
# use_gpu is False — confirm against their definitions.
use_gpu = device == "cuda"

neurons_on_hidd_layer = [256, 128, 64, 32, 16, 8]

# Every architecture that is trained needs its own result list; the plot at the
# end still shows GRU and LSTM only.
trained_archs = ["GRU", "LSTM", "RNN"]
plotted_archs = ["GRU", "LSTM"]

train_dataset = FeaturesDataset(os.path.join(SAVE_DIR, "train.pt"))
val_dataset   = FeaturesDataset(os.path.join(SAVE_DIR, "val.pt"))
test_dataset  = FeaturesDataset(os.path.join(SAVE_DIR, "test.pt"))

# The test loader does not depend on the architecture or hidden size: build it once
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Result containers, keyed by architecture (one entry per hidden size).
# Keyed by trained_archs so that appending the "RNN" results cannot raise KeyError.
f1_scores = {arch: [] for arch in trained_archs}
f1_stds   = {arch: [] for arch in trained_archs}

for arch in trained_archs:
    print(f"\n======= Entrenando modelos tipo {arch} =======")

    for hidd_size in neurons_on_hidd_layer:
        print(f"\n--- Modelo con hidd_size = {hidd_size} ---")

        models = []
        curves = []

        # Repeat training n_trains times to estimate mean and std
        for k in range(n_trains):
            print(f"Entrenamiento {k+1}/{n_trains}")
            model = RNNModel(rnn_type=arch, n_input_channels=13, hidd_size=hidd_size)

            curve, _ = train_model(
                model,
                train_dataset,
                val_dataset,
                epochs,
                criterion,
                batch_size,
                lr,
                n_evaluations_per_epoch=3,
                use_gpu=use_gpu,
            )

            curves.append(curve)
            models.append(model)

        # Training curves for this configuration
        show_curves(curves, suptitle=f"{arch}_h{hidd_size}")

        # Test-set metrics across the trained repetitions
        metrics_mean, metrics_std, _ = evaluate_models_metrics(models, test_loader, criterion, use_gpu=use_gpu)

        f1_scores[arch].append(metrics_mean["f1"])
        f1_stds[arch].append(metrics_std["f1"])

        print(f"F1 {arch}_h{hidd_size}: {metrics_mean['f1']:.3f} ¬± {metrics_std['f1']:.3f}")

        # Confusion matrices for this configuration
        get_metrics_and_confusion_matrix(models, test_dataset, name=f"{arch}_h{hidd_size}")

# --- Plot: F1 vs number of hidden units ---
plt.figure(figsize=(8, 5))
for arch in plotted_archs:
    plt.errorbar(neurons_on_hidd_layer, f1_scores[arch], yerr=f1_stds[arch],
                 label=arch, marker="o", capsize=4)

plt.xlabel("N√∫mero de neuronas ocultas (hidd_size)")
plt.ylabel("F1-score promedio (¬± std)")
plt.title("F1-score seg√∫n tama√±o de capa oculta")
plt.legend()
plt.grid(True)
plt.tight_layout()
# Make sure the output directory exists before saving the figure
os.makedirs("img", exist_ok=True)
plt.savefig("img/f1_vs_hidd_size.pdf", bbox_inches="tight")
plt.show()


In [None]:
ROOT_DIR = "data/petes"
SAVE_DIR = ROOT_DIR
# Fall back to CPU when CUDA is unavailable, same rule as the feature-extraction cell
device = "cuda" if torch.cuda.is_available() else "cpu"

lr = 5e-4
batch_size = 32
criterion = nn.CrossEntropyLoss()
n_trains = 5
epochs = 20
# NOTE(review): presumably train_model / evaluate_models_metrics run on CPU when
# use_gpu is False — confirm against their definitions.
use_gpu = device == "cuda"

# Discover the available hop lengths from the training feature files.
# The dot is escaped and the whole file name must match, so stray files such as
# "train_16_mfcc.pt.bak" are ignored and cannot produce duplicate hop lengths.
pattern = re.compile(r"train_(\d+)_mfcc\.pt")
hop_matches = [pattern.fullmatch(f) for f in os.listdir(SAVE_DIR)]
hop_lengths = sorted(int(m.group(1)) for m in hop_matches if m)

# Result containers, keyed by architecture (one entry per hop length)
f1_scores = {"GRU": [], "LSTM": [], "TCNN": [], "RNN": []}
f1_stds   = {"GRU": [], "LSTM": [], "TCNN": [], "RNN": []}
seq_lengths = []

for hop_length in hop_lengths:
    # Number of MFCC frames for a 16000-sample clip at this hop length
    seq_len = 1 + 16000 // hop_length
    seq_lengths.append(seq_len)

    print(f"\n======= Hop length {hop_length} -> Secuencia {seq_len} frames =======")

    # Load the datasets for this hop length
    train_dataset = FeaturesDataset(os.path.join(SAVE_DIR, f"train_{hop_length}_mfcc.pt"))
    val_dataset   = FeaturesDataset(os.path.join(SAVE_DIR, f"val_{hop_length}_mfcc.pt"))
    test_dataset  = FeaturesDataset(os.path.join(SAVE_DIR, f"test_{hop_length}_mfcc.pt"))

    # The test loader is the same for every architecture at this hop length
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Train every model type
    for arch in ["GRU", "LSTM", "TCNN", "RNN"]:
        print(f"\n--- Entrenando modelo tipo {arch} ---")

        models = []
        curves = []

        for k in range(n_trains):
            print(f"Entrenamiento {k+1}/{n_trains}")

            # Build the model for this architecture
            if arch in ["GRU", "LSTM", "RNN"]:
                model = RNNModel(
                    rnn_type=arch,
                    n_input_channels=13,
                    hidd_size=128,
                )
            elif arch == "TCNN":
                model = TCNNModel(
                    n_input_channels=13,
                    hidd_size=128,
                )
            else:
                raise ValueError("Modelo no reconocido")

            # Training
            curve, _ = train_model(
                model,
                train_dataset,
                val_dataset,
                epochs,
                criterion,
                batch_size,
                lr,
                n_evaluations_per_epoch=3,
                use_gpu=use_gpu,
            )
            curves.append(curve)
            models.append(model)

        # Training curves
        show_curves(curves, suptitle=f"{arch}_seq{seq_len}")

        # Evaluation on the test split
        metrics_mean, metrics_std, _ = evaluate_models_metrics(models, test_loader, criterion, use_gpu=use_gpu)

        f1_scores[arch].append(metrics_mean["f1"])
        f1_stds[arch].append(metrics_std["f1"])

        print(f"F1 {arch}_seq{seq_len}: {metrics_mean['f1']:.3f} ¬± {metrics_std['f1']:.3f}")

        get_metrics_and_confusion_matrix(models, test_dataset, name=f"{arch}_seq{seq_len}")

# --- Plot: F1 vs sequence length (RNN results are collected but not plotted) ---
plt.figure(figsize=(8, 5))
for arch in ["GRU", "LSTM", "TCNN"]:
    plt.errorbar(seq_lengths, f1_scores[arch], yerr=f1_stds[arch], label=arch, marker="o", capsize=4)

plt.xlabel("Cantidad de frames en la secuencia (MFCC)")
plt.ylabel("F1-score promedio (¬± std)")
plt.title("Comparaci√≥n de F1-score seg√∫n longitud de secuencia")
plt.legend()
plt.grid(True)
plt.tight_layout()
# Make sure the output directory exists before saving the figure
os.makedirs("img", exist_ok=True)
plt.savefig("img/f1_vs_length_all.pdf", bbox_inches="tight")
plt.show()


In [None]:
# --- Paths and device ---
ROOT_DIR = "data/petes"
SAVE_DIR = ROOT_DIR
device = "cuda"

# --- Training hyper-parameters ---
lr = 5e-4
batch_size = 32
criterion = nn.CrossEntropyLoss()
n_trains = 5
epochs = 20
use_gpu = True

hop_length = 32
seq_len_input = 1 + 16000 // hop_length  # 501 input frames
print(f"Largo de secuencia de entrada: {seq_len_input}")

# Datasets generated with hop_length=32
train_dataset = FeaturesDataset(os.path.join(SAVE_DIR, f"train_{hop_length}_mfcc.pt"))
val_dataset   = FeaturesDataset(os.path.join(SAVE_DIR, f"val_{hop_length}_mfcc.pt"))
test_dataset  = FeaturesDataset(os.path.join(SAVE_DIR, f"test_{hop_length}_mfcc.pt"))

# Numbers of convolutional blocks to try
N_conv_blocks_list = range(4)

# Accumulated results, one entry per configuration
f1_scores, f1_stds, seq_lengths_seen = [], [], []

for N_conv_blocks in N_conv_blocks_list:
    run_name = f"CNN1D_{N_conv_blocks}blocks"

    # Sequence length fed to the RNN: the input length halved once per conv block
    seq_len_rnn = seq_len_input // (2 ** N_conv_blocks)
    seq_lengths_seen.append(seq_len_rnn)

    header = (
        f"\n=== Entrenando modelo con {N_conv_blocks} bloques conv "
        f"(seq_len RNN ‚âà {seq_len_rnn}) ==="
    )
    print(header)

    models, curves = [], []

    # Repeat training n_trains times to estimate mean and std
    for k in range(n_trains):
        print(f"Entrenamiento {k+1}/{n_trains}")

        model = MejorCNN1DModel(
            N_conv_blocks=N_conv_blocks,
            hidd_size=128,  # can be tuned
            in_channels=13,
            out_channels=64,
        )

        curve, _ = train_model(
            model, train_dataset, val_dataset,
            epochs, criterion, batch_size, lr,
            n_evaluations_per_epoch=3,
            use_gpu=use_gpu,
        )

        curves.append(curve)
        models.append(model)

    show_curves(curves, suptitle=run_name)

    # Test-set metrics across the trained repetitions
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    metrics_mean, metrics_std, _ = evaluate_models_metrics(models, test_loader, criterion, use_gpu=use_gpu)

    f1_scores.append(metrics_mean["f1"])
    f1_stds.append(metrics_std["f1"])

    print(f"F1 {run_name}: {metrics_mean['f1']:.3f} ¬± {metrics_std['f1']:.3f}")

    get_metrics_and_confusion_matrix(models, test_dataset, name=run_name)

# --- Plot: F1 vs sequence length seen by the RNN ---
plt.figure(figsize=(8, 5))
plt.errorbar(
    seq_lengths_seen, f1_scores,
    yerr=f1_stds, marker="o", capsize=4, label="CNN+RNN",
)
plt.xlabel("Largo de secuencia que entra a la RNN (frames)")
plt.ylabel("F1-score promedio (¬± std)")
plt.title("F1-score vs Largo de secuencia visto por la RNN")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("img/f1_vs_seq_len_rnn.pdf", bbox_inches="tight")
plt.show()
