# Speech Recognition con español argentino

Este notebook entrena un modelo RNN con BiLSTM, o sea, Bidirectional Long Short-Term Memory. Esto permite hacer un reconocimiento de voz más robusto.

In [None]:
# Cell 1: install dependencies
import sys

# Invoke pip through the interpreter that is running this kernel
# (sys.executable) so the packages land in the kernel's own environment.
!{sys.executable} -m pip install -q torch torchaudio datasets soundfile librosa torchcodec
# Disabled alternative with a reduced package list:
#!{sys.executable} -m pip install -q  torch torchaudio torchcodec


Instalando dependencias...
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2.0/2.0 MB[0m [31m73.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalaci√≥n completada


In [2]:
# Celda 2: Imports y configuraci√≥n inicial
import os
import torch
import torchaudio
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram
import numpy as np
from datasets import load_dataset, concatenate_datasets, Audio
import random
import gc

# Seed every random number generator used by the notebook for reproducibility.
for _seed_fn in (torch.manual_seed, random.seed, np.random.seed):
    _seed_fn(42)

# Pick the compute device (the notebook is currently run on a Colab T4).
_has_cuda = torch.cuda.is_available()
device = torch.device('cuda') if _has_cuda else torch.device('cpu')
print(f"Usando dispositivo: {device}")

if not _has_cuda:
    # No GPU visible: remind the user how to enable one in Colab.
    print("   En Colab: Runtime , Change runtime type , T4 GPU")
else:
    # Report the GPU model and its total memory in GiB.
    _gpu_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memoria disponible: {_gpu_mem_gb:.2f} GB")

Usando dispositivo: cuda
GPU: Tesla T4
Memoria disponible: 14.74 GB


In [None]:
# Celda 3: Carga, Uni√≥n y Filtrado ahora permitiendo audios de hasta 20s

from datasets import load_dataset, concatenate_datasets, Audio
import soundfile as sf
import io

print("Descargando/Cargando dataset...")

# 1. Load the "female" and "male" configurations of the same Hub dataset.
_HF_REPO = "ylacombe/google-argentinian-spanish"
ds_female = load_dataset(_HF_REPO, "female", split="train")
ds_male = load_dataset(_HF_REPO, "male", split="train")

# 2. Merge both subsets and keep the audio column undecoded (decode=False),
#    so rows expose the raw encoded bytes. The original author notes this
#    "bytes only" mode avoids decoding errors on Windows.
full_dataset = concatenate_datasets([ds_female, ds_male]).cast_column(
    "audio", Audio(decode=False)
)

# Longest clip (in seconds) kept by the duration filter.
MAX_DURATION = 20.0

def filter_long_samples(example):
    """Return True when a row's audio lasts at most ``MAX_DURATION`` seconds.

    The audio column is expected in undecoded form, i.e. a mapping with a
    ``'bytes'`` entry (encoded file content) and/or a ``'path'`` entry. The
    duration is read from the file header with ``soundfile.info`` without
    decoding the samples.

    Args:
        example: dataset row containing an ``'audio'`` mapping.

    Returns:
        bool: True to keep the row. Rows whose audio cannot be inspected
        (missing column, unreadable data) are dropped by returning False.
    """
    try:
        audio = example['audio']
        audio_bytes = audio.get('bytes')
        if audio_bytes is not None:
            with io.BytesIO(audio_bytes) as buffer:
                duration = sf.info(buffer).duration
        else:
            # Rows stored by reference carry no embedded bytes; inspect the
            # file on disk instead of silently discarding the sample.
            duration = sf.info(audio['path']).duration
    except Exception:
        # Best effort: an unreadable sample is excluded rather than aborting
        # the whole filter pass.
        return False
    return duration <= MAX_DURATION

print(f"Cantidad total inicial: {len(full_dataset)}")
print(f"Aplicando filtro de duraci√≥n (Max {MAX_DURATION}s)...")

# Drop every clip longer than MAX_DURATION (and any clip that cannot be read).
# NOTE: this rebinds full_dataset, so re-running the cell filters the
# already-filtered dataset.
full_dataset = full_dataset.filter(filter_long_samples)
print(f"Cantidad final para entrenar: {len(full_dataset)}")

# Shuffle with a fixed seed, then hold out 10% of the rows for validation.
full_dataset = full_dataset.shuffle(seed=42)
dataset_split = full_dataset.train_test_split(test_size=0.1, seed=42)

# train_test_split names the held-out part 'test'; it is used as validation here.
train_dataset_hf = dataset_split['train']
val_dataset_hf = dataset_split['test']

print(f"Muestras de entrenamiento: {len(train_dataset_hf)}")
print(f"Muestras de validaci√≥n: {len(val_dataset_hf)}")

Descargando/Cargando dataset...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/910 [00:00<?, ?B/s]

female/train-00000-of-00004-6fb30f4d957d(‚Ä¶):   0%|          | 0.00/404M [00:00<?, ?B/s]

female/train-00001-of-00004-d6234d86f707(‚Ä¶):   0%|          | 0.00/412M [00:00<?, ?B/s]

female/train-00002-of-00004-f9730bbec196(‚Ä¶):   0%|          | 0.00/410M [00:00<?, ?B/s]

female/train-00003-of-00004-03ac2065ea9d(‚Ä¶):   0%|          | 0.00/399M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3921 [00:00<?, ? examples/s]

male/train-00000-of-00002-920b805572ae22(‚Ä¶):   0%|          | 0.00/357M [00:00<?, ?B/s]

male/train-00001-of-00002-f6f0bfbdc6bb1d(‚Ä¶):   0%|          | 0.00/350M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1818 [00:00<?, ? examples/s]

Cantidad total inicial: 5739
Aplicando filtro de duraci√≥n (Max 20.0s)...


Filter:   0%|          | 0/5739 [00:00<?, ? examples/s]

Cantidad final para entrenar: 5739
Muestras de entrenamiento: 5165
Muestras de validaci√≥n: 574


In [None]:
# Cell 4: build the character vocabulary

# Concatenate every transcript of both splits; only the set of distinct
# lower-cased characters matters, so the exact joining is irrelevant.
all_text = " ".join(train_dataset_hf['text']) + " ".join(val_dataset_hf['text'])
unique_chars = sorted(set(all_text.lower()))

# Index 0 is reserved for the blank symbol; real characters start at 1.
char_map = {'<BLANK>': 0}
char_map.update({symbol: position for position, symbol in enumerate(unique_chars, start=1)})

# Reverse lookup: index -> character.
index_map = dict((position, symbol) for symbol, position in char_map.items())

print(f"Tama√±o del vocabulario: {len(char_map)}")
print(f"Caracteres incluidos: {''.join(unique_chars[:50])}...")

def text_to_int_sequence(text):
    """Encode ``text`` as a list of vocabulary indices.

    The text is lower-cased first; characters that are not keys of the
    module-level ``char_map`` are skipped.
    """
    encoded = []
    for symbol in text.lower():
        if symbol in char_map:
            encoded.append(char_map[symbol])
    return encoded

def int_sequence_to_text(seq):
    """Decode a sequence of vocabulary indices back into a string.

    Index 0 (the blank / padding value) is skipped; every other index is
    looked up in the module-level ``index_map``.
    """
    pieces = []
    for token in seq:
        if token == 0:
            continue
        pieces.append(index_map[token])
    return ''.join(pieces)

Tama√±o del vocabulario: 44
Caracteres incluidos:  !,-.3:?abcdefghijklmnopqrstuvwxyz¬°¬ø√°√©√≠√±√≥√∫√º...


In [None]:
# Celda 5: Dataset Usando SoundFile directamente (lo otro daba mucho problema)
import io
import soundfile as sf 
import torch
import torchaudio
from torch.utils.data import Dataset
import numpy as np

class HFSpeechDataset(Dataset):
    """Torch ``Dataset`` over Hugging Face rows whose audio is stored as raw bytes.

    Every item is decoded with ``soundfile`` (bypassing ``torchaudio.load``),
    down-mixed to mono, resampled to ``target_sample_rate`` and passed through
    ``transform``. The transcript is encoded with ``text_to_int_sequence``.

    Args:
        hf_dataset: indexable collection of rows. Each row must provide
            ``row['audio']['bytes']`` and a ``'text'`` (or ``'transcription'``)
            entry.
        transform: optional callable applied to the ``[1, time]`` waveform,
            e.g. a mel-spectrogram pipeline returning ``[1, n_mels, frames]``.
            When ``None`` the raw waveform is returned as ``[time, 1]``.
        target_sample_rate: sample rate (Hz) every clip is converted to.
    """

    def __init__(self, hf_dataset, transform=None, target_sample_rate=16000):
        self.dataset = hf_dataset
        self.transform = transform
        self.target_sample_rate = target_sample_rate
        # Resample modules keyed by source rate, built on first use and reused
        # instead of being re-created for every item.
        self._resamplers = {}

    def __len__(self):
        return len(self.dataset)

    def _resample(self, waveform, orig_sr):
        """Resample ``waveform`` from ``orig_sr`` to ``target_sample_rate``."""
        resampler = self._resamplers.get(orig_sr)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(orig_sr, self.target_sample_rate)
            self._resamplers[orig_sr] = resampler
        return resampler(waveform)

    def __getitem__(self, idx):
        """Return ``(features, transcript)`` for row ``idx``.

        Returns:
            tuple: ``features`` is a float tensor shaped ``[time, n_features]``
            and ``transcript`` an ``int32`` tensor of character indices. If
            anything fails while loading the row, the error is printed and a
            dummy pair (``zeros(10, 128)``, ``[0]``) is returned instead.
        """
        item = self.dataset[idx]

        try:
            audio_bytes = item['audio']['bytes']
            with io.BytesIO(audio_bytes) as buffer:
                # sf.read returns (samples, sample_rate); samples are
                # [time] for mono files or [time, channels] otherwise.
                audio_np, orig_sr = sf.read(buffer)

            waveform = torch.from_numpy(audio_np).float()

            if waveform.dim() == 1:
                # Mono: [time] -> [1, time]
                waveform = waveform.unsqueeze(0)
            else:
                # Multi-channel: [time, channels] -> [channels, time]
                waveform = waveform.t()

            # Down-mix to a single channel.
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            if orig_sr != self.target_sample_rate:
                waveform = self._resample(waveform, orig_sr)

            # Test against None explicitly: container modules such as
            # nn.Sequential define __len__, so plain truthiness would treat an
            # empty pipeline as "no transform".
            if self.transform is not None:
                features = self.transform(waveform)
            else:
                features = waveform

            # [1, n_features, time] -> [n_features, time]. Without a transform
            # the squeeze leaves a 1-D [time] tensor, which cannot be
            # transposed; restore a single feature row in that case.
            features = features.squeeze(0)
            if features.dim() == 1:
                features = features.unsqueeze(0)
            # [n_features, time] -> [time, n_features]
            spectrogram = features.transpose(0, 1)

            # The dataset uses the 'text' key; 'transcription' is a fallback.
            text = item.get('text') or item.get('transcription') or ""

            transcript_seq = text_to_int_sequence(text)
            transcript_seq = torch.tensor(transcript_seq, dtype=torch.int32)

            return spectrogram, transcript_seq

        except Exception as e:
            print(f"Error cargando √≠ndice {idx}: {e}")
            # Safety dummy so one bad row does not abort the epoch. Ten frames
            # are used so the sample is longer than its one-symbol target.
            # NOTE(review): the feature width is hard-coded to 128 and the
            # target is the blank index 0 — confirm both suit the model in use.
            dummy_spec = torch.zeros(10, 128)
            dummy_trans = torch.tensor([0], dtype=torch.int32)
            return dummy_spec, dummy_trans
        
def collate_fn(batch):
    """Pad a list of ``(spectrogram, transcript)`` pairs into batch tensors.

    Pairs whose spectrogram is ``None`` or has zero frames are discarded.

    Returns:
        tuple: ``(spectrograms, transcripts, input_lengths, target_lengths)``
        where the first two are zero-padded, batch-first tensors and the last
        two are Python lists with the unpadded length of every kept sample.
    """
    kept = [
        (spec, seq)
        for spec, seq in batch
        if spec is not None and spec.shape[0] != 0
    ]

    spectrograms = [spec for spec, _ in kept]
    transcript_seqs = [seq for _, seq in kept]
    input_lengths = [spec.shape[0] for spec in spectrograms]
    target_lengths = [len(seq) for seq in transcript_seqs]

    pad = torch.nn.utils.rnn.pad_sequence
    return (
        pad(spectrograms, batch_first=True),
        pad(transcript_seqs, batch_first=True),
        input_lengths,
        target_lengths,
    )

In [None]:
# Cell 6: DataLoaders

BATCH_SIZE = 16  # 32 turned out to be too much

# Training features: mel spectrogram followed by frequency and time masking
# as data augmentation.
train_transform = nn.Sequential(
    MelSpectrogram(sample_rate=16000, n_mels=128),
    torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
    torchaudio.transforms.TimeMasking(time_mask_param=35)
)

# Validation features: the same mel spectrogram, without any masking.
val_transform = MelSpectrogram(sample_rate=16000, n_mels=128)

# Instantiate the datasets
train_ds = HFSpeechDataset(train_dataset_hf, transform=train_transform)
val_ds = HFSpeechDataset(val_dataset_hf, transform=val_transform)


# Worker processes and pinned memory are only requested when running on CUDA.
kwargs = {'num_workers': 2, 'pin_memory': True} if device.type == 'cuda' else {}

# Training batches are shuffled and the last incomplete batch is dropped;
# validation keeps its order and every sample.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn, drop_last=True, **kwargs)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn, drop_last=False, **kwargs)

print(f"Configuraci√≥n de Velocidad: Batch Size {BATCH_SIZE}")
print(f"Pasos por √©poca: {len(train_loader)}")

Configuraci√≥n de Velocidad: Batch Size 16
Pasos por √©poca: 322




In [None]:
# Cell 7: model architecture (with some modifications)
class SpeechRecognitionModel(nn.Module):
    """Stacked bidirectional LSTM followed by a per-frame linear classifier.

    Args:
        input_size: number of features per input frame.
        hidden_size: hidden units per LSTM direction.
        output_size: number of output classes per frame.
        num_layers: number of stacked LSTM layers (default 5, previously 3).
        dropout: dropout probability used between LSTM layers and before the
            classifier. Defaults to 0.3, the value that used to be hard-coded.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=5, dropout=0.3):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
            # Inter-layer dropout to reduce memorisation. nn.LSTM only applies
            # it between layers and warns when it is non-zero with a single
            # layer, so it is disabled in that case.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.dropout = nn.Dropout(dropout)
        # Both directions are concatenated, hence hidden_size * 2 inputs.
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        """Map ``[batch, time, input_size]`` features to per-frame log-probabilities.

        A 4-D input ``[batch, 1, time, input_size]`` has its singleton
        dimension 1 squeezed first.

        Returns:
            Tensor ``[batch, time, output_size]`` of log-softmax scores.
        """
        if x.dim() == 4:
            x = x.squeeze(1)

        x, _ = self.lstm(x)
        x = self.dropout(x)
        x = self.fc(x)
        x = F.log_softmax(x, dim=2)
        return x

In [None]:
# Celda 8: Inicializar Modelo "Balanceado"
import gc
import torch.nn as nn
import torch.optim as optim

# Free Python garbage and cached CUDA memory before building a new model.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

input_size = 128    # matches n_mels=128 of the MelSpectrogram transforms
hidden_size = 512   # 512 because 768 was too slow.
num_layers = 3      # 3 layers; 5 was too heavy.
output_size = len(char_map)  # one class per vocabulary entry, blank included


model = SpeechRecognitionModel(input_size, hidden_size, output_size, num_layers=num_layers).to(device)

# Loss and optimizer. blank=0 matches the '<BLANK>' entry of char_map;
# zero_infinity=True zeroes out infinite losses and their gradients.
criterion = nn.CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4, weight_decay=1e-4)

# Scheduler: halve the learning rate after 3 scheduler steps without a
# decrease of the monitored value.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=3
)

# Report the model size.
total_params = sum(p.numel() for p in model.parameters())
print(f" Modelo Listo en {device}")
print(f"   Tama√±o: {hidden_size} neuronas | {num_layers} capas")
print(f"   Par√°metros: {total_params:,} ")

Inicializando modelo optimizado para velocidad...
 Modelo Listo en cuda
   Tama√±o: 512 neuronas | 3 capas
   Par√°metros: 15,274,028 


In [None]:
# Celda 9: Entrenamiento sin Gradient Accumulation, (Rompia todo)

# Character error rate (CER)
def compute_cer_batch(preds, targets):
    """Return the character error rate of ``preds`` against ``targets``.

    The CER is the summed Levenshtein distance of every (prediction, target)
    pair divided by the summed target length. When the targets contain no
    characters at all, 1.0 is returned.
    """
    def levenshtein(first, second):
        # Work with the longer sequence on the outer loop.
        if len(first) < len(second):
            first, second = second, first
        if len(second) == 0:
            return len(first)

        prev_row = list(range(len(second) + 1))
        for row_no, ch_a in enumerate(first, start=1):
            row = [row_no]
            for col, ch_b in enumerate(second):
                row.append(min(
                    prev_row[col + 1] + 1,          # insertion
                    row[col] + 1,                   # deletion
                    prev_row[col] + (ch_a != ch_b)  # substitution
                ))
            prev_row = row
        return prev_row[-1]

    pairs = list(zip(preds, targets))
    total_dist = sum(levenshtein(pred, ref) for pred, ref in pairs)
    total_len = sum(len(ref) for _, ref in pairs)

    if total_len > 0:
        return total_dist / total_len
    return 1.0

# Training configuration
num_epochs = 35
print(f"Inicio del entrenamiento {num_epochs} √âpocas con Batch {BATCH_SIZE}")

model.train()
best_cer = 1.0

for epoch in range(num_epochs):
    running_loss = 0.0
    completed_batches = 0  # batches that actually contributed to running_loss

    for i, (inputs, targets, input_lengths, target_lengths) in enumerate(train_loader):
        try:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # 1. Forward (gradients are reset every step: no accumulation)
            optimizer.zero_grad()
            outputs = model(inputs)
            # CTCLoss expects [time, batch, classes]
            outputs = outputs.permute(1, 0, 2)

            # 2. Loss
            loss = criterion(outputs, targets, input_lengths, target_lengths)

            # 3. Backward and immediate optimizer step, with gradient clipping
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            running_loss += loss.item()
            completed_batches += 1

        except RuntimeError as e:
            # Only an out-of-memory error can be recovered by skipping the
            # batch. Any other RuntimeError (shape mismatch, device error, ...)
            # is a real failure and must not be silently ignored.
            if "out of memory" not in str(e):
                raise
            print(f"  [WARN] OOM en batch {i}. Saltando...")
            torch.cuda.empty_cache()
            continue

    # Average over the batches that ran, so skipped batches do not make the
    # reported loss look lower than it is.
    epoch_loss = running_loss / max(completed_batches, 1)

    # Quick validation CER: first 5 samples of the first validation batch only.
    model.eval()
    with torch.no_grad():
        v_inputs, v_targets, v_input_lengths, v_target_lengths = next(iter(val_loader))
        v_inputs = v_inputs.to(device)

        v_out = model(v_inputs)
        decoded = torch.argmax(v_out, dim=2).cpu()

        pred_strs = []
        target_strs = []
        for k in range(min(5, len(v_inputs))):
            # Decode only the real frames of this sample; frames beyond its
            # length are batch padding and would add spurious characters.
            frame_ids = decoded[k, :v_input_lengths[k]].tolist()
            # Greedy CTC decoding: drop blanks (0) and collapse repeats.
            pred_tokens = [
                tok for tok, prev in zip(frame_ids, [-1] + frame_ids[:-1])
                if tok != 0 and tok != prev
            ]
            pred_strs.append(int_sequence_to_text(pred_tokens))
            target_strs.append(
                int_sequence_to_text(v_targets[k, :v_target_lengths[k]].tolist())
            )

        current_cer = compute_cer_batch(pred_strs, target_strs)

    model.train()

    # Keep the checkpoint with the best CER seen so far
    msg = ""
    if current_cer < best_cer:
        best_cer = current_cer
        torch.save(model.state_dict(), "best_model_fast.pth")
        msg = "Guardado"

    print(f"Ep {epoch+1}/{num_epochs} | Loss: {epoch_loss:.4f} | CER: {current_cer:.2%} | {msg}")

    # The plateau scheduler monitors the training loss.
    scheduler.step(epoch_loss)

Inicio del entrenamiento 35 √âpocas con Batch 16
Ep 1/35 | Loss: 2.8505 | CER: 65.28% | üíæ
Ep 2/35 | Loss: 1.8396 | CER: 40.38% | üíæ
Ep 3/35 | Loss: 1.4551 | CER: 32.83% | üíæ
Ep 4/35 | Loss: 1.2450 | CER: 27.55% | üíæ
Ep 5/35 | Loss: 1.1072 | CER: 25.28% | üíæ
Ep 6/35 | Loss: 0.9919 | CER: 25.28% | 
Ep 7/35 | Loss: 0.8930 | CER: 21.51% | üíæ
Ep 8/35 | Loss: 0.8118 | CER: 23.02% | 
Ep 9/35 | Loss: 0.7449 | CER: 20.38% | üíæ
Ep 10/35 | Loss: 0.6795 | CER: 18.49% | üíæ
Ep 11/35 | Loss: 0.6428 | CER: 20.00% | 
Ep 12/35 | Loss: 0.5933 | CER: 17.74% | üíæ
Ep 13/35 | Loss: 0.5541 | CER: 18.87% | 
Ep 14/35 | Loss: 0.5175 | CER: 15.85% | üíæ
Ep 15/35 | Loss: 0.4803 | CER: 16.60% | 
Ep 16/35 | Loss: 0.4539 | CER: 18.11% | 
Ep 17/35 | Loss: 0.4221 | CER: 13.58% | üíæ
Ep 18/35 | Loss: 0.4028 | CER: 15.09% | 
Ep 19/35 | Loss: 0.3868 | CER: 14.34% | 
Ep 20/35 | Loss: 0.3626 | CER: 15.47% | 
Ep 21/35 | Loss: 0.3495 | CER: 14.34% | 
Ep 22/35 | Loss: 0.3367 | CER: 17.36% | 


KeyboardInterrupt: 

In [None]:
# Cell 10: qualitative check on a few validation examples

model.eval()

with torch.no_grad():
    # Take one validation batch
    data_iter = iter(val_loader)
    inputs, targets, input_lengths, target_lengths = next(data_iter)

    inputs = inputs.to(device)
    if inputs.dim() == 4:
        inputs = inputs.squeeze(1)

    outputs = model(inputs)

    # Greedy decoding: most likely class per frame
    decoded_indices = torch.argmax(outputs, dim=2)

    # Show up to 5 examples
    num_examples = min(5, len(inputs))
    for i in range(num_examples):
        # Prediction: use only the real frames of this sample; frames beyond
        # its length are batch padding and would add spurious characters.
        pred_idx = decoded_indices[i, :input_lengths[i]].cpu().numpy()

        # Collapse repeated indices and drop blanks (index 0).
        pred_text_raw = []
        last_idx = -1
        for idx in pred_idx:
            if idx != 0 and idx != last_idx:
                pred_text_raw.append(idx)
            last_idx = idx

        pred_str = int_sequence_to_text(pred_text_raw)

        # Reference transcript, without its padding
        target_idx = targets[i, :target_lengths[i]].cpu().numpy()
        target_str = int_sequence_to_text(target_idx)

        print(f"Ejemplo {i+1}:")
        print(f"  Real:       {target_str}")
        print(f"  Predicci√≥n: {pred_str}")


Ejemplo 1:
  Real:       actualmente este actor tiene una pel√≠cula en cartelera
  Predicci√≥n: atumente este actor tiene unla pel√≠culan cartelera
Ejemplo 2:
  Real:       me podes mandar mas informacion sobre los hechos.
  Predicci√≥n: me pod√©s manlar m√°s informaci√≥n sob re los hechos
Ejemplo 3:
  Real:       ¬øme pod√©s buscar tres calcoman√≠as diferentes?
  Predicci√≥n: ¬øme pod√©s buscar tres calcoman√≠as diferentes?
Ejemplo 4:
  Real:       ¬øpensaste darles un yogurt con frutas? ¬°a los chicos les encanta!
  Predicci√≥n: ensatedales un logrs con futas a los chicos ses encanta
Ejemplo 5:
  Real:       √©l es una persona que viene de un barrio muy humilde
  Predicci√≥n: ¬øel es una persona qe dien un varri muy nide.


In [None]:
# Celda 11: Para guardar el modelo
import datetime

# Timestamped file name so successive saves never overwrite each other.
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
model_path = f"speech_recognition_model_{timestamp}.pt"

# Save the weights together with the vocabulary and every constructor
# argument needed to rebuild the network.
torch.save({
    'model_state_dict': model.state_dict(),
    'char_map': char_map,
    'index_map': index_map,
    'config': {
        'input_size': input_size,
        'hidden_size': hidden_size,
        'output_size': output_size,
        # Required to reload: the class default for num_layers differs from
        # the value used for this model, so the state dict would not match a
        # model rebuilt without it.
        'num_layers': num_layers
    }
}, model_path)

print(f"Modelo guardado en: {model_path}")