# Reconocimiento de voz (Speech Recognition) para español argentino

Este notebook entrena un modelo de reconocimiento de voz para español argentino.

In [1]:
# Cell 1: Install dependencies
import sys

# pip is invoked through the interpreter that runs this kernel (sys.executable),
# so the packages are installed for the same Python that executes the notebook.
!{sys.executable} -m pip install -q torch torchaudio datasets soundfile librosa torchcodec
# Shorter alternative install line, currently disabled:
#!{sys.executable} -m pip install -q  torch torchaudio torchcodec


In [1]:
# Celda 2: Imports y configuración inicial
import os
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram
import numpy as np
from datasets import load_dataset, concatenate_datasets, Audio
import random
import gc

# Reproducibility: seed NumPy, Python's `random` and PyTorch with the same value
np.random.seed(42)
random.seed(42)
torch.manual_seed(42)

# Pick the compute device (at the moment this runs on a Colab T4) and report it
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Usando dispositivo: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memoria disponible: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    device = torch.device('cpu')
    print(f"Usando dispositivo: {device}")
    # Hint for Colab users on how to enable the GPU runtime
    print("   En Colab: Runtime , Change runtime type , T4 GPU")

Usando dispositivo: cuda
GPU: Tesla T4
Memoria disponible: 14.74 GB


In [2]:
# Celda 3: Carga, Unión y Filtrado

from datasets import load_dataset, concatenate_datasets, Audio
import soundfile as sf
import io


print("Descargando/Cargando dataset...")

# 1. Load both speaker subsets of the corpus (one dataset config per gender)
ds_female, ds_male = (
    load_dataset("ylacombe/google-argentinian-spanish", subset, split="train")
    for subset in ("female", "male")
)

# 2. Merge them into a single dataset.
# The audio column is kept undecoded (decode=False): this avoids the
# missing-library decoding error. To be revisited later if it needs a real fix.
full_dataset = concatenate_datasets([ds_female, ds_male]).cast_column(
    "audio", Audio(decode=False)
)

# Maximum clip duration (seconds) kept by the filter below, so training does not fail
MAX_DURATION = 8.0

def filter_long_samples(example):
    """Tell whether a dataset example is short enough to keep.

    Reads the raw, undecoded audio bytes stored under
    ``example['audio']['bytes']`` and asks soundfile for the clip duration.

    Returns:
        True when the duration is at most ``MAX_DURATION`` seconds.
        False when it is longer, or when anything goes wrong while inspecting
        the audio (e.g. a corrupt file), so that such examples are discarded.
    """
    try:
        with io.BytesIO(example['audio']['bytes']) as stream:
            duration = sf.info(stream).duration
        return duration <= MAX_DURATION
    except Exception:
        # Unreadable / corrupt audio: drop the example
        return False

print(
    f"Cantidad antes de filtrar: {len(full_dataset)}",
    "Aplicando filtro de duración (Max 8s)...",
    sep="\n",
)

# Keep only the clips accepted by filter_long_samples
full_dataset = full_dataset.filter(filter_long_samples)
print(f"Cantidad después de filtrar: {len(full_dataset)}")

# Deterministic shuffle before splitting
full_dataset = full_dataset.shuffle(seed=42)

# Train / validation split (10% held out)
dataset_split = full_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset_hf, val_dataset_hf = dataset_split['train'], dataset_split['test']

print(f"Muestras de entrenamiento: {len(train_dataset_hf)}")
print(f"Muestras de validación: {len(val_dataset_hf)}")

Descargando/Cargando dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/910 [00:00<?, ?B/s]

female/train-00000-of-00004-6fb30f4d957d(…):   0%|          | 0.00/404M [00:00<?, ?B/s]

female/train-00001-of-00004-d6234d86f707(…):   0%|          | 0.00/412M [00:00<?, ?B/s]

female/train-00002-of-00004-f9730bbec196(…):   0%|          | 0.00/410M [00:00<?, ?B/s]

female/train-00003-of-00004-03ac2065ea9d(…):   0%|          | 0.00/399M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3921 [00:00<?, ? examples/s]

male/train-00000-of-00002-920b805572ae22(…):   0%|          | 0.00/357M [00:00<?, ?B/s]

male/train-00001-of-00002-f6f0bfbdc6bb1d(…):   0%|          | 0.00/350M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1818 [00:00<?, ? examples/s]

Cantidad antes de filtrar: 5739
Aplicando filtro de duración (Max 8s)...


Filter:   0%|          | 0/5739 [00:00<?, ? examples/s]

Cantidad después de filtrar: 5536
Muestras de entrenamiento: 4982
Muestras de validación: 554


In [3]:
# Cell 4: Build the character vocabulary

# Gather every transcript from both splits into one string.
# (The two joined halves are concatenated without a separator; this has no
# effect on the resulting set of characters.)
all_text = " ".join(train_dataset_hf['text']) + " ".join(val_dataset_hf['text'])
unique_chars = sorted(set(all_text.lower()))

# Index 0 is reserved for the <BLANK> token; real characters start at 1
char_map = {'<BLANK>': 0}
char_map.update((ch, pos) for pos, ch in enumerate(unique_chars, start=1))

# Reverse lookup: index -> character
index_map = dict(zip(char_map.values(), char_map.keys()))

print(f"Tamaño del vocabulario: {len(char_map)}")
print(f"Caracteres incluidos: {''.join(unique_chars[:50])}...")

def text_to_int_sequence(text):
    """Encode a string as a list of vocabulary indices.

    The text is lower-cased first; characters that are not keys of
    ``char_map`` are skipped.
    """
    indices = []
    for ch in text.lower():
        if ch in char_map:
            indices.append(char_map[ch])
    return indices

def int_sequence_to_text(seq):
    """Decode a sequence of vocabulary indices back into a string.

    Index 0 (the blank / padding value) is dropped; every other index is
    looked up in ``index_map``.
    """
    chars = []
    for token in seq:
        if token != 0:
            chars.append(index_map[token])
    return ''.join(chars)

Tamaño del vocabulario: 44
Caracteres incluidos:  !,-.3:?abcdefghijklmnopqrstuvwxyz¡¿áéíñóúü...


In [None]:
import io
import warnings

# NOTE(review): no category or module is given, so this suppresses every Python
# warning raised after this cell runs, which can hide useful diagnostics
# (deprecations, data-loading problems). Consider narrowing the filter.
warnings.filterwarnings("ignore")

class HFSpeechDataset(Dataset):
    """Torch dataset over a Hugging Face dataset whose audio column is undecoded.

    Every item of ``hf_dataset`` is expected to be a mapping where
    ``item['audio']['bytes']`` holds the encoded audio file and the transcript
    is stored under ``'transcription'``, ``'text'`` or ``'sentence'``.

    Args:
        hf_dataset: indexable collection of items as described above.
        transform: optional callable applied to the mono waveform
            (shape ``[1, samples]``). With the mel transforms used in this
            notebook it yields ``[1, 128, Time]``. When ``None`` the raw
            waveform is used and returned as ``[Time, 1]``.
        target_sample_rate: sample rate every clip is resampled to.
        fallback_n_features: feature width of the placeholder returned when a
            sample cannot be loaded; should match the transform's feature size.

    ``__getitem__`` returns ``(features, transcript)``: a float tensor of shape
    ``[Time, Features]`` and an ``int32`` tensor of vocabulary indices.
    """

    # Only the first few failures are logged so a broken dataset cannot flood
    # the output (each DataLoader worker keeps its own counter).
    _MAX_LOGGED_FAILURES = 5

    def __init__(self, hf_dataset, transform=None, target_sample_rate=16000,
                 fallback_n_features=128):
        self.dataset = hf_dataset
        self.transform = transform
        self.target_sample_rate = target_sample_rate
        self.fallback_n_features = fallback_n_features
        # One Resample module per source rate, built lazily and then reused
        self._resamplers = {}
        self._failures = 0

    def __len__(self):
        return len(self.dataset)

    def _resampler_for(self, orig_sr):
        """Return a cached resampler from ``orig_sr`` to the target rate."""
        resampler = self._resamplers.get(orig_sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(orig_sr, self.target_sample_rate)
            self._resamplers[orig_sr] = resampler
        return resampler

    def _placeholder(self, idx, exc):
        """Build the stand-in sample used when item ``idx`` failed with ``exc``.

        The transcript is EMPTY (length 0). The previous placeholder used the
        single label 0, which is the blank index and therefore not a valid
        CTC target label.
        """
        import logging

        self._failures += 1
        if self._failures <= self._MAX_LOGGED_FAILURES:
            logging.getLogger(__name__).warning(
                "HFSpeechDataset: no se pudo cargar la muestra %s (%s: %s); "
                "se devuelve una muestra silenciosa vacía",
                idx, type(exc).__name__, exc,
            )
        dummy_spec = torch.zeros(1, self.fallback_n_features)
        dummy_trans = torch.zeros(0, dtype=torch.int32)
        return dummy_spec, dummy_trans

    def __getitem__(self, idx):
        item = self.dataset[idx]

        try:
            with io.BytesIO(item['audio']['bytes']) as buffer:
                waveform, orig_sr = torchaudio.load(buffer)

            # Normalise to a mono waveform of shape [1, samples]
            if waveform.dim() == 1:
                waveform = waveform.unsqueeze(0)
            elif waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            if orig_sr != self.target_sample_rate:
                waveform = self._resampler_for(orig_sr)(waveform)

            # Feature extraction (e.g. mel spectrogram -> [1, n_mels, Time])
            if self.transform is not None:
                features = self.transform(waveform)
            else:
                features = waveform

            # Drop the channel axis and put time first for the LSTM
            features = features.squeeze(0)
            if features.dim() == 1:
                # Raw waveform without transform: [Time] -> [Time, 1]
                features = features.unsqueeze(1)
            else:
                # [Features, Time] -> [Time, Features]
                features = features.transpose(0, 1)

            # Accept several possible transcript keys to avoid a KeyError
            text = item.get('transcription') or item.get('text') or item.get('sentence') or ""
            transcript_seq = torch.tensor(text_to_int_sequence(text), dtype=torch.int32)

            return features, transcript_seq

        except Exception as exc:
            # Best effort: keep the batch alive, but report what went wrong
            return self._placeholder(idx, exc)

def collate_fn(batch):
    """Collate ``(spectrogram, transcript)`` pairs into padded batch tensors.

    Pairs whose spectrogram is ``None`` or has zero time steps are dropped.

    Returns:
        A 4-tuple ``(spectrograms, transcript_seqs, input_lengths, target_lengths)``:
        the two zero-padded, batch-first tensors followed by two Python lists
        with the unpadded time length of each spectrogram and the unpadded
        length of each transcript.
    """
    kept = [
        (spec, seq)
        for spec, seq in batch
        if spec is not None and spec.shape[0] != 0
    ]

    spectrograms = [spec for spec, _ in kept]
    transcript_seqs = [seq for _, seq in kept]
    input_lengths = [spec.shape[0] for spec in spectrograms]  # length along time
    target_lengths = [len(seq) for seq in transcript_seqs]

    pad = torch.nn.utils.rnn.pad_sequence
    return (
        pad(spectrograms, batch_first=True),
        pad(transcript_seqs, batch_first=True),
        input_lengths,
        target_lengths,
    )

In [None]:
# Cell 6: DataLoaders
BATCH_SIZE = 64  # 64 or 128 should fit comfortably on the T4 (not verified)

# Training features: mel spectrogram followed by random frequency/time masking
# as data augmentation
train_transform = nn.Sequential(
    MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35),
)

# Validation features: plain mel spectrogram, no augmentation
val_transform = MelSpectrogram(sample_rate=16000, n_mels=128)

# Wrap the Hugging Face splits
train_ds = HFSpeechDataset(train_dataset_hf, transform=train_transform)
val_ds = HFSpeechDataset(val_dataset_hf, transform=val_transform)

# Worker processes and pinned memory are only requested when running on GPU
if device.type == 'cuda':
    kwargs = {'num_workers': 4, 'pin_memory': True}
else:
    kwargs = {}

train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_fn,
    **kwargs,
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_fn,
    **kwargs,
)

print(f"Batch Size: {BATCH_SIZE}")
print(f"Batches por época: {len(train_loader)}")

Batch Size: 64
Batches por época: 78


In [None]:
class SpeechRecognitionModel(nn.Module):
    """Bidirectional 3-layer LSTM that emits per-frame log-probabilities.

    Args:
        input_size: number of features per time step.
        hidden_size: LSTM hidden units per direction.
        output_size: number of output classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(p=0.3)
        # Forward and backward states are concatenated, hence 2 * hidden_size
        self.fc = nn.Linear(in_features=2 * hidden_size, out_features=output_size)

    def forward(self, x):
        """Map ``[Batch, Time, Features]`` to log-probs ``[Batch, Time, Classes]``.

        A 4-D input ``[Batch, 1, Time, Features]`` is accepted too: its
        singleton axis 1 is squeezed away first.
        """
        if x.dim() == 4:
            x = x.squeeze(1)

        lstm_out, _ = self.lstm(x)
        logits = self.fc(self.dropout(lstm_out))
        return F.log_softmax(logits, dim=2)

In [7]:
# Celda 8: Inicializar el Modelooooo, lo corregi ;)

import gc
import torch.nn as nn
import torch.optim as optim


print("Inicializando modelo escalado...")

# Release memory left over from previous runs
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Hyperparameters (still being tuned for best performance)
input_size, hidden_size = 128, 512   # 512 hidden units so the model can learn better
output_size = len(char_map)

# Build the model on the selected device
model = SpeechRecognitionModel(input_size, hidden_size, output_size).to(device)

# Loss and optimizer
criterion = nn.CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-5)

# Halve the learning rate when the monitored loss plateaus (patience=2).
# verbose=True was removed so this call works.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
)

# Parameter count
total_params = sum(param.numel() for param in model.parameters())
print(
    f"Modelo mas pesado en {device}",
    f"  - Total parámetros: {total_params:,}",
    f"  - Hidden size: {hidden_size}",
    sep="\n",
)

Inicializando modelo escalado...
Modelo mas pesado en cuda
  - Total parámetros: 15,274,028
  - Hidden size: 512


In [None]:
# Cell 9: Training
num_epochs = 5

print(f"Entrenamiento - {num_epochs} épocas")

for epoch in range(num_epochs):
    # Enter training mode at the start of every epoch so dropout stays active
    # even if the model was switched to eval() in between.
    model.train()

    running_loss = 0.0
    completed_batches = 0  # batches that actually contributed to running_loss

    print(f"\nÉpoca {epoch+1}/{num_epochs}")
    print("-" * 60)

    for i, (inputs, targets, input_lengths, target_lengths) in enumerate(train_loader):
        try:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Forward pass
            optimizer.zero_grad()
            outputs = model(inputs)  # [Batch, Time, Class]

            # CTC loss expects (Time, Batch, Class)
            outputs = outputs.permute(1, 0, 2)

            loss = criterion(outputs, targets, input_lengths, target_lengths)

            # Backward pass with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            # Read the scalar once and reuse it for accumulation and logging
            loss_value = loss.item()
            running_loss += loss_value
            completed_batches += 1

            # Progress report every 10 batches
            if (i + 1) % 10 == 0:
                print(f"  Batch [{i+1}/{len(train_loader)}] - Loss: {loss_value:.4f}")

        except RuntimeError as e:
            # NOTE(review): this catches every RuntimeError, not only shape mismatches
            print(f"  [Advertencia] Salteando batch {i} por error de dimensiones: {e}")
            continue

    if completed_batches == 0:
        # Nothing was trained this epoch: do not feed a fake 0.0 loss to the scheduler
        print(f"\n Época {epoch+1}: ningún batch se procesó correctamente; se omite el scheduler")
        continue

    # Average over the batches that were actually processed. Dividing by
    # len(train_loader) would bias the value low whenever batches were skipped.
    epoch_loss = running_loss / completed_batches
    print(f"\n Época {epoch+1} completada - Loss promedio: {epoch_loss:.4f}")

    scheduler.step(epoch_loss)
    current_lr = optimizer.param_groups[0]['lr']
    print(f"  Learning Rate actual: {current_lr:.6f}")

Entrenamiento - 5 épocas

Época 1/5
------------------------------------------------------------
  Batch [10/78] - Loss: 7.5546
  Batch [20/78] - Loss: 3.8659
  Batch [30/78] - Loss: 3.1287
  Batch [40/78] - Loss: 3.0383
  Batch [50/78] - Loss: 3.0391
  Batch [60/78] - Loss: 2.9944
  Batch [70/78] - Loss: 2.9535

 Época 1 completada - Loss promedio: 4.5763
  Learning Rate actual: 0.000500

Época 2/5
------------------------------------------------------------
  Batch [10/78] - Loss: 2.9027
  Batch [20/78] - Loss: 2.8893
  Batch [30/78] - Loss: 2.9527
  Batch [40/78] - Loss: 2.9212
  Batch [50/78] - Loss: 2.8801
  Batch [60/78] - Loss: 2.8278
  Batch [70/78] - Loss: 2.7657

 Época 2 completada - Loss promedio: 2.8847
  Learning Rate actual: 0.000500

Época 3/5
------------------------------------------------------------
  Batch [10/78] - Loss: 2.6807
  Batch [20/78] - Loss: 2.6879
  Batch [30/78] - Loss: 2.5633
  Batch [40/78] - Loss: 2.5621
  Batch [50/78] - Loss: 2.4809
  Batch [60/78

In [9]:
# Cell 10: Validation on a few examples

model.eval()

with torch.no_grad():
    # Take a single validation batch
    data_iter = iter(val_loader)
    inputs, targets, input_lengths, target_lengths = next(data_iter)

    inputs = inputs.to(device)
    if inputs.dim() == 4:
        inputs = inputs.squeeze(1)

    outputs = model(inputs)

    # Greedy decoding: most likely class per frame
    decoded_indices = torch.argmax(outputs, dim=2)

    # Show up to 5 examples
    num_examples = min(5, len(inputs))
    for i in range(num_examples):
        # Prediction: only the real frames of this sample are decoded. Frames
        # past input_lengths[i] are batch padding and would otherwise add
        # spurious characters at the end.
        pred_idx = decoded_indices[i, :input_lengths[i]].cpu().numpy()

        # Collapse consecutive repeats, then drop blanks (index 0)
        pred_text_raw = []
        last_idx = -1
        for idx in pred_idx:
            if idx != 0 and idx != last_idx:
                pred_text_raw.append(idx)
            last_idx = idx

        pred_str = int_sequence_to_text(pred_text_raw)

        # Ground truth: slice off the padding using the true target length
        target_idx = targets[i, :target_lengths[i]].cpu().numpy()
        target_str = int_sequence_to_text(target_idx)

        print(f"Ejemplo {i+1}:")
        print(f"  Real:       {target_str}")
        print(f"  Predicción: {pred_str}")


Ejemplo 1:
  Real:       los gatos duermen casi todo el día y hacen sus necesidades solo una vez al día
  Predicción: los mtos durrmencacitoa cencs mececias somorrunae sara
Ejemplo 2:
  Real:       creo que necesitás una cita con el doctor.
  Predicción: le te nesescitasunacita conmendoto
Ejemplo 3:
  Real:       la mermelada de zarzamora es muy fácil de hacer
  Predicción: lam melae sarsamura emusacilace
Ejemplo 4:
  Real:       la granizada destruyó toda la plantación de lechuga
  Predicción: ladrsaestuto po latlantaciande chu
Ejemplo 5:
  Real:       ¿él solo es director o también escribe sus guiones?
  Predicción: e soeeretor cotades fre sudanes


In [None]:
# Celda 11: Para guardar el modelo
import datetime

# Timestamped file name so successive runs do not reuse the same path
timestamp = format(datetime.datetime.now(), "%Y%m%d_%H%M%S")
model_path = f"speech_recognition_model_{timestamp}.pt"

# Persist the weights together with the vocabulary and the constructor
# arguments needed to rebuild the model
torch.save(
    {
        'model_state_dict': model.state_dict(),
        'char_map': char_map,
        'index_map': index_map,
        'config': {'input_size': input_size, 'hidden_size': hidden_size, 'output_size': output_size},
    },
    model_path,
)

print(f"Modelo guardado en: {model_path}")