In [10]:
import os
import torchaudio
from torch import nn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from models import RNNModel
from dataset import CustomSpeechCommands, FeaturesDataset
from trainer import train_step, evaluate, train_model, show_curves

In [11]:
# ==== Paths ====
# Everything lives under ROOT_DIR: the split file lists and the cached feature tensors.
ROOT_DIR = 'data'
train_pt = os.path.join(ROOT_DIR, 'train.pt')
val_pt = os.path.join(ROOT_DIR, 'val.pt')
test_pt = os.path.join(ROOT_DIR, 'test.pt')
TRAIN_LIST = os.path.join(ROOT_DIR, "train_list.txt")
VAL_LIST = os.path.join(ROOT_DIR, "val_list.txt")
TEST_LIST = os.path.join(ROOT_DIR, "test_list.txt")

# ==== Feature cache ====
# (cached feature file, split list) pairs, in the extraction order train -> test -> val.
feature_splits = [
    (train_pt, TRAIN_LIST),
    (test_pt, TEST_LIST),
    (val_pt, VAL_LIST),
]

# Check every cache file, not only train.pt: a partially populated cache
# (e.g. an interrupted earlier run) would otherwise skip extraction and then
# fail when the missing .pt file is loaded below.
missing_splits = [
    (feature_path, list_path)
    for feature_path, list_path in feature_splits
    if not os.path.isfile(feature_path)
]

if missing_splits:
    mfcc_transform = torchaudio.transforms.MFCC(
        sample_rate=16000,
        n_mfcc=13,  # number of MFCC coefficients to extract
        # At 16 kHz: n_fft=320 is a 20 ms window, hop_length=160 is a 10 ms hop,
        # n_mels=23 is the number of mel filters.
        melkwargs={"n_fft": 320, "hop_length": 160, "n_mels": 23},
        log_mels=True
    )
    # Only the splits whose cache file is absent are re-extracted.
    for feature_path, list_path in missing_splits:
        raw_split = CustomSpeechCommands(ROOT_DIR, list_path)
        raw_split.save_features(mfcc_transform, feature_path)

# ==== Load cached features ====
train_dataset = FeaturesDataset(train_pt)
test_dataset = FeaturesDataset(test_pt)
val_dataset = FeaturesDataset(val_pt)


print("¡Datasets cargados exitosamente!")
print(f"Train samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")
print(f"Test samples: {len(test_dataset)}")

Splitting data/train_list.txt: 100%|██████████| 105829/105829 [00:00<00:00, 129552.05it/s]


Total archivos en dataset: 105829
Archivos en data/train_list.txt: 32453
Archivos encontrados: 32453


Splitting data/val_list.txt: 100%|██████████| 105829/105829 [00:00<00:00, 166374.00it/s]


Total archivos en dataset: 105829
Archivos en data/val_list.txt: 3875
Archivos encontrados: 3875


Splitting data/test_list.txt: 100%|██████████| 105829/105829 [00:00<00:00, 190726.53it/s]


Total archivos en dataset: 105829
Archivos en data/test_list.txt: 4381
Archivos encontrados: 4381
Guardando features en: data/train.pt


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)
Extrayendo features: 100%|██████████| 32453/32453 [01:16<00:00, 424.74it/s]


Features tensor: torch.Size([32453, 13, 101])
Features guardadas correctamente en data/train.pt
Clases finales: {'on', 'stop', 'right', 'yes', 'unknown', 'off', 'up', 'left', 'down', 'no', 'go'}
Guardando features en: data/test.pt


Extrayendo features: 100%|██████████| 4381/4381 [00:12<00:00, 355.79it/s]


Features tensor: torch.Size([4381, 13, 101])
Features guardadas correctamente en data/test.pt
Clases finales: {'on', 'stop', 'right', 'yes', 'unknown', 'off', 'up', 'left', 'down', 'no', 'go'}
Guardando features en: data/val.pt


Extrayendo features: 100%|██████████| 3875/3875 [00:09<00:00, 395.49it/s]


Features tensor: torch.Size([3875, 13, 101])
Features guardadas correctamente en data/val.pt
Clases finales: {'on', 'stop', 'right', 'yes', 'unknown', 'off', 'up', 'left', 'down', 'no', 'go'}
Dataset cargado desde data/train.pt
 - 32453 ejemplos
 - 11 clases
Dataset cargado desde data/test.pt
 - 4381 ejemplos
 - 11 clases
Dataset cargado desde data/val.pt
 - 3875 ejemplos
 - 11 clases
¡Datasets cargados exitosamente!
Train samples: 32453
Validation samples: 3875
Test samples: 4381


# Entrenamiento

In [21]:

# Sanity check: show the shape of the cached training feature tensor.
# The first two axes match the 32453 training examples and n_mfcc=13 reported
# during feature extraction; the last axis is presumably time frames — TODO confirm.
print(train_dataset.features.shape)

torch.Size([32453, 13, 101])


In [None]:
# Hyperparameters and a single training run of the plain-RNN model.
lr = 5e-4  # learning rate passed to train_model
batch_size = 32
criterion = nn.CrossEntropyLoss()
n_trains = 5  # NOTE(review): not used in this cell — confirm whether a later cell reads it
# n_input_channels=13 matches n_mfcc=13 of the extracted features.
# NOTE(review): the recorded run of this cell stops with
# "IndexError: Target 10 is out of bounds." while the datasets report 11 classes,
# so the model presumably produces fewer than 11 outputs — TODO confirm how
# RNNModel sets its number of output classes and make it match the 11 labels.
model = RNNModel(rnn_type = 'RNN', n_input_channels=13)
epochs = 40

# The first returned value is what show_curves plots; `times` is presumably timing data — TODO confirm.
all_curves, times = train_model(model, train_dataset, val_dataset, epochs, criterion, batch_size, lr, n_evaluations_per_epoch=6, use_gpu=False)
show_curves(all_curves)


1015
Epoch 1/40


IndexError: Target 10 is out of bounds.