In [None]:
# Instalaciones
import sys

!pip install -q datasets>=2.6.1 git+https://github.com/huggingface/transformers
!pip install -q librosa evaluate jiwer gradio accelerate soundfile

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
sentence-transformers 5.1.2 requires transformers<5.0.0,>=4.41.0, but you have transformers 5.0.0.dev0 which is incompatible.[0m[31m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Celda 1: Instalaciones, imports y configuración

import torch
from datasets import load_dataset, concatenate_datasets, Audio, DatasetDict
from transformers import WhisperFeatureExtractor, WhisperTokenizer, WhisperProcessor
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
from dataclasses import dataclass
from typing import Any, Dict, List, Union
import io
import librosa
import numpy as np

# Pretrained checkpoint used for the feature extractor, tokenizer, processor and model.
MODEL_ID = "openai/whisper-small"
# Language and task passed to the Whisper tokenizer and processor.
LANG = "Spanish"
TASK = "transcribe"
# Hub dataset id; its "female" and "male" configurations are loaded and merged.
DATASET_NAME = "ylacombe/google-argentinian-spanish"


In [None]:
# Celda 2: Cargar y preparar datos

# Load both speaker configurations and merge them into a single dataset.
ds_female = load_dataset(DATASET_NAME, "female", split="train")
ds_male = load_dataset(DATASET_NAME, "male", split="train")
raw_dataset = concatenate_datasets([ds_female, ds_male])

# Shuffle and split. Both steps are seeded: `train_test_split` shuffles again by
# default, so without its own seed the held-out set would change on every run
# (including when training is resumed from a checkpoint in a new session).
SPLIT_SEED = 42
raw_dataset = raw_dataset.shuffle(seed=SPLIT_SEED)
dataset_split = raw_dataset.train_test_split(test_size=0.1, seed=SPLIT_SEED)
data_argentina = DatasetDict({
    "train": dataset_split["train"],
    "test": dataset_split["test"]
})

# As with the earlier RNN model, decode=False is used to avoid the torchcodec
# error; the audio is decoded manually later on.
data_argentina = data_argentina.cast_column("audio", Audio(sampling_rate=16000, decode=False))

print(f"Datos listos: {len(data_argentina['train'])} entrenamiento | {len(data_argentina['test'])} prueba")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Datos listos: 5165 entrenamiento | 574 prueba


In [None]:
# Celda 3: Procesadores

# Every preprocessing component is loaded from the same checkpoint; the tokenizer
# and the processor additionally receive the language and task settings.
whisper_text_kwargs = {"language": LANG, "task": TASK}

feature_extractor = WhisperFeatureExtractor.from_pretrained(MODEL_ID)
tokenizer = WhisperTokenizer.from_pretrained(MODEL_ID, **whisper_text_kwargs)
processor = WhisperProcessor.from_pretrained(MODEL_ID, **whisper_text_kwargs)

def prepare_dataset(batch):
    """Turn one raw example into model inputs.

    Despite the parameter name, this receives a single example: the ``map``
    call that uses it does not pass ``batched=True``.

    Args:
        batch: Example dict with an undecoded ``"audio"`` entry (a dict with
            ``"bytes"`` and/or ``"path"``) and a ``"transcription"`` or
            ``"text"`` field.

    Returns:
        The same dict with ``"input_features"`` (output of the module-level
        ``feature_extractor``) and ``"labels"`` (token ids produced by the
        module-level ``tokenizer``) added.

    Raises:
        ValueError: If the audio entry has neither bytes nor a path.
    """
    target_sr = 16000
    audio = batch["audio"]
    audio_bytes = audio.get("bytes")
    audio_path = audio.get("path")

    # With decode=False the audio can be embedded bytes or just a file path;
    # handle both instead of failing obscurely on `io.BytesIO(None)`.
    if audio_bytes is not None:
        with io.BytesIO(audio_bytes) as buffer:
            array, _ = librosa.load(buffer, sr=target_sr)
    elif audio_path:
        array, _ = librosa.load(audio_path, sr=target_sr)
    else:
        raise ValueError("Audio example has neither 'bytes' nor 'path'.")

    batch["input_features"] = feature_extractor(array, sampling_rate=target_sr).input_features[0]

    # Tokenize the transcript; the column is named "transcription" or "text".
    text_col = "transcription" if "transcription" in batch else "text"
    batch["labels"] = tokenizer(batch[text_col]).input_ids

    return batch

# Run the preprocessing once. The map removes the raw columns (including
# "audio"), so re-running this cell on the already-processed dataset would fail
# inside `prepare_dataset`; the guard makes the cell safe to execute again.
if "audio" in data_argentina.column_names["train"]:
    data_argentina = data_argentina.map(prepare_dataset, remove_columns=data_argentina.column_names["train"], num_proc=1)


Map:   0%|          | 0/5165 [00:00<?, ? examples/s]

Map:   0%|          | 0/574 [00:00<?, ? examples/s]

In [None]:
# Celda 4: Data Collator y Métricas

@dataclass
class DataCollatorSpeechSeq2SeqWithPadding:
    """Collate preprocessed examples into one padded batch of tensors.

    Audio features and label token ids are padded independently, using the
    processor's feature extractor and tokenizer respectively.
    """

    # Object exposing `.feature_extractor` and `.tokenizer`, each with a `pad` method.
    processor: Any

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Build the batch.

        Args:
            features: Examples holding ``"input_features"`` and ``"labels"``.

        Returns:
            The padded audio batch with an added ``"labels"`` tensor in which
            padded positions are set to -100.
        """
        text_tokenizer = self.processor.tokenizer

        audio_items, token_items = [], []
        for example in features:
            audio_items.append({"input_features": example["input_features"]})
            token_items.append({"input_ids": example["labels"]})

        batch = self.processor.feature_extractor.pad(audio_items, return_tensors="pt")
        padded_tokens = text_tokenizer.pad(token_items, return_tensors="pt")

        # Positions outside the attention mask are padding; mark them with -100.
        is_padding = padded_tokens.attention_mask.ne(1)
        labels = padded_tokens["input_ids"].masked_fill(is_padding, -100)

        # If every sequence starts with the tokenizer's BOS id, drop that column.
        # NOTE(review): Whisper labels may start with a different start token than
        # `tokenizer.bos_token_id`, in which case this never fires — confirm
        # against the model's decoder start token.
        first_is_bos = labels[:, 0] == text_tokenizer.bos_token_id
        if first_is_bos.all().cpu().item():
            labels = labels[:, 1:]

        batch["labels"] = labels
        return batch

# Collator instance built around the shared processor; it is later passed to the trainer.
data_collator = DataCollatorSpeechSeq2SeqWithPadding(processor=processor)
# "wer" metric loaded through `evaluate`; `compute_metrics` calls its `compute` method.
metric = evaluate.load("wer")

def compute_metrics(pred):
    """Compute the "wer" metric, as a percentage, for one evaluation pass.

    Args:
        pred: Prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids with -100 on padded positions).

    Returns:
        ``{"wer": value}`` where ``value`` is 100 times the metric's result.
    """
    pred_ids = pred.predictions
    # Defensive: if predictions arrive as a tuple, use its first element as the ids.
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # Replace the -100 sentinel with the pad id on copies. `np.where` leaves the
    # arrays owned by `pred` untouched, whereas an in-place assignment would
    # silently modify `pred.label_ids` for any later consumer.
    pad_id = tokenizer.pad_token_id
    label_ids = np.where(pred.label_ids != -100, pred.label_ids, pad_id)
    pred_ids = np.where(pred_ids != -100, pred_ids, pad_id)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    wer = 100 * metric.compute(predictions=pred_str, references=label_str)
    return {"wer": wer}

In [None]:
# Celda 5: Entrenamiento
from transformers import WhisperForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)

# Disable the decoding cache for training; it is re-enabled before the final save.
model.config.use_cache = False

# Take language and task from the notebook-level constants instead of repeating
# the literals, so generation cannot drift from the tokenizer/processor settings.
generation_language = LANG.lower()
model.generation_config.language = generation_language
model.generation_config.task = TASK
forced_decoder_ids = processor.get_decoder_prompt_ids(language=generation_language, task=TASK)
# NOTE(review): recent transformers releases appear to ignore `forced_decoder_ids`
# when language/task are set on the generation config — confirm this is still needed.
model.generation_config.forced_decoder_ids = forced_decoder_ids

model.generation_config.pad_token_id = processor.tokenizer.pad_token_id
model.generation_config.eos_token_id = processor.tokenizer.eos_token_id

# Clear these on the main model config to avoid errors when saving.
model.config.forced_decoder_ids = None
model.config.suppress_tokens = None

print("Configuración corregida: Idioma forzado a Español.")

# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-argentino",
    per_device_train_batch_size=16,       # 16 is right at the limit of what fits on a T4
    gradient_accumulation_steps=1,
    gradient_checkpointing=False,        # disabled for stability
    learning_rate=1e-5,
    warmup_steps=50,
    max_steps=1000,
    # Mixed precision only when a GPU is present, matching the CPU fallback the
    # inference cell already has; a hard-coded True cannot work without CUDA.
    fp16=torch.cuda.is_available(),
    eval_strategy="steps",
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    generation_max_length=225,
    # Checkpoints and evaluations share the same interval so the best model
    # (lowest "wer") can be reloaded at the end.
    save_steps=200,
    eval_steps=200,
    logging_steps=25,
    report_to=["tensorboard"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
)

# Build the trainer from the model, its arguments, the data splits and the
# collator/metric helpers defined earlier in the notebook.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=data_argentina["train"],
    eval_dataset=data_argentina["test"],
    # Only the feature extractor is handed over as the processing class.
    processing_class=processor.feature_extractor,
)

# Resume from the newest checkpoint in the output directory when there is one.
# The checkpoint is looked up explicitly instead of wrapping `trainer.train` in
# `except ValueError`: that handler also caught any ValueError raised in the
# middle of training and silently restarted the whole run from scratch.
import os
from transformers.trainer_utils import get_last_checkpoint

checkpoint_dir = training_args.output_dir
# `get_last_checkpoint` lists the folder, so only call it when the folder exists.
last_checkpoint = get_last_checkpoint(checkpoint_dir) if os.path.isdir(checkpoint_dir) else None

if last_checkpoint is None:
    print(" No se encontró checkpoint previo, iniciando de cero ")
    trainer.train()
else:
    trainer.train(resume_from_checkpoint=last_checkpoint)

# Save once training is done: re-enable the cache, then write the model,
# processor and tokenizer into the same folder.
model.config.use_cache = True
for artifact in (model, processor, tokenizer):
    artifact.save_pretrained("./modelo_final")
print(" Modelo guardado en ./modelo_final")

In [13]:
# Celda 6: Inferencia (Corregida)
import torch
import librosa
import numpy as np


device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

# Make sure the model sits on the same device as the inputs and is in
# inference mode before generating.
model.to(device)
model.eval()

print("\n Prueba con data set ")
sample = data_argentina["test"][0]

# Reference transcript, recovered by decoding the stored label ids.
real_text = tokenizer.decode(sample['labels'], skip_special_tokens=True)
print(f"Real: {real_text}")
# Add a leading batch dimension and move the features to the chosen device.
input_features = torch.tensor(sample["input_features"]).unsqueeze(0).to(device)

# Generate the prediction without tracking gradients.
with torch.no_grad():
    generated_ids = model.generate(input_features)
transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(f"Predicción Modelo: {transcription}")


print("\n Prueba real con microphono")
import os
from transformers import pipeline

# Optional check on a local recording; skipped when the file is not present.
if os.path.exists("prueba.wav"):
    # Reuse the device selected earlier in this cell: a hard-coded `device=0`
    # fails on machines without a GPU.
    pipe = pipeline("automatic-speech-recognition", model=model, tokenizer=tokenizer, feature_extractor=feature_extractor, device=device)
    result = pipe("prueba.wav")
    print(f"Archivo 'prueba.wav': {result['text']}")
else:
    print("No se encontró 'prueba.wav' para probar la pipeline.")

Usando dispositivo: cuda

 Prueba con data set 
Real: Los finlandeses se bañan en piscinas heladas después de darse un baño de vapor
Predicción Modelo: Los finlandeses se bañan en piscinas heladas después de darse un baño de vapor

 Prueba real con microphono
Archivo 'prueba.wav': Hola buenos días ¿Cómo estás? Está lindo el clima ¿no? ¿Qué me puedo decirle? Hay que ver si infiere todo bien


In [None]:
# Celda 7 Guardado y Empaquetado

import shutil
from google.colab import files

# Write the trained model, processor and tokenizer into one export folder,
# zip that folder, and hand the archive to Colab's `files.download`.
export_dir = "./whisper-argentino-final"
trainer.save_model(export_dir)
for component in (processor, tokenizer):
    component.save_pretrained(export_dir)

shutil.make_archive("whisper_argentino", 'zip', export_dir)

files.download("whisper_argentino.zip")