In [1]:
!pip install numpy pandas matplotlib librosa torch torchaudio torchcodec

Collecting torchcodec
  Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Downloading torchcodec-0.8.1-cp312-cp312-manylinux_2_28_x86_64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m85.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torchcodec
Successfully installed torchcodec-0.8.1


# Data Preprocessing

In [2]:
import os
import pandas as pd
import numpy as np
import librosa
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram
from sklearn.model_selection import train_test_split
import soundfile as sf
import torchaudio.transforms as T
import torch.nn as nn
import torch.nn.functional as F

In [3]:
from datasets import load_dataset, concatenate_datasets

ds_f = load_dataset("ylacombe/google-argentinian-spanish", "female")
ds_m = load_dataset("ylacombe/google-argentinian-spanish", "male")

ds = concatenate_datasets([ds_f["train"], ds_m["train"]])

print(ds.column_names)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/910 [00:00<?, ?B/s]

female/train-00000-of-00004-6fb30f4d957d(…):   0%|          | 0.00/404M [00:00<?, ?B/s]

female/train-00001-of-00004-d6234d86f707(…):   0%|          | 0.00/412M [00:00<?, ?B/s]

female/train-00002-of-00004-f9730bbec196(…):   0%|          | 0.00/410M [00:00<?, ?B/s]

female/train-00003-of-00004-03ac2065ea9d(…):   0%|          | 0.00/399M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3921 [00:00<?, ? examples/s]

male/train-00000-of-00002-920b805572ae22(…):   0%|          | 0.00/357M [00:00<?, ?B/s]

male/train-00001-of-00002-f6f0bfbdc6bb1d(…):   0%|          | 0.00/350M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1818 [00:00<?, ? examples/s]

['audio', 'text', 'speaker_id']


In [4]:
import re

ALLOWED_CHARS = "abcdefghijklmnñopqrstuvwxyzáéíóúü "

def clean_text(batch):
    text = batch["text"].lower()

    text = re.sub(r"[¿?¡!.,:;\-_/()\[\]\"']", " ", text)

    text = re.sub(r"[0-9]", "", text)

    text = "".join(ch for ch in text if ch in ALLOWED_CHARS)

    text = re.sub(r"\s+", " ", text).strip()

    batch["text"] = text
    return batch

ds = ds.map(clean_text)

Map:   0%|          | 0/5739 [00:00<?, ? examples/s]

In [5]:
print(ds)

splits = ds.train_test_split(test_size=0.1, seed=59)
train_df = splits["train"]
val_df = splits["test"]

print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

Dataset({
    features: ['audio', 'text', 'speaker_id'],
    num_rows: 5739
})
Training samples: 5165, Validation samples: 574


In [6]:
print(ds[0]["text"])
print(ds[7]["text"])

para la caída del cabello tengo un nuevo champú
adidas sacó una versión especial de zapatillas a las que no les entra el agua


In [7]:
extra_chars = list("áéíóúüñ")

all_text = " ".join(train_df['text']) + " " + " ".join(val_df['text'])
all_text = all_text.lower()

dataset_chars = set(all_text)  # todos los que realmente aparecen
allowed_chars = sorted(dataset_chars.union(extra_chars))

char_map = {"<BLANK>": 0}
for i, ch in enumerate(allowed_chars, start=1):
    char_map[ch] = i

index_map = {v: k for k, v in char_map.items()}

In [8]:
def text_to_int_sequence(text):
    text = text.lower()
    return [char_map.get(c, char_map[' ']) for c in text]

def int_sequence_to_text(seq):
    #return ''.join([index_map[i] for i in seq])
    return ''.join([index_map.get(i, '') for i in seq])

# Creating a Custom Dataset

In [9]:
augment = nn.Sequential(
    T.FrequencyMasking(freq_mask_param=15),
    T.TimeMasking(time_mask_param=35)
)

In [10]:
class SpeechDataset(Dataset):
    def __init__(self, df, char_map, transform=None, augment=None, mean=None, std=None):
        self.df = df
        self.char_map = char_map
        self.transform = transform
        self.augment = augment
        self.mean = mean
        self.std = std

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        item = self.df[idx]
        audio = item["audio"]["array"]
        sample_rate = item["audio"]["sampling_rate"]
        transcript = item["text"]

        waveform = torch.tensor(audio)
        if waveform.dim() == 1:
          waveform = waveform.unsqueeze(0)

        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        # Resample to 16kHz
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(sample_rate, 16000)
            waveform = resampler(waveform)
            sample_rate = 16000

        max_val = torch.max(torch.abs(waveform))
        if max_val > 0:
          waveform = waveform/max_val

        spectrogram = self.transform(waveform)
        spectrogram = torch.log(spectrogram + 1e-6)
        if self.mean is not None and self.std is not None:
          spectrogram = (spectrogram - self.mean) / (self.std + 1e-6)
        #else:
        #  spectrogram = (spectrogram - spectrogram.mean()) / (spectrogram.std() + 1e-6)

        if self.augment is not None:
            spectrogram = self.augment(spectrogram)

        spectrogram = spectrogram.squeeze(0).transpose(0, 1)

        # Convert transcript to int sequence
        transcript_seq = torch.tensor(text_to_int_sequence(transcript), dtype=torch.long)

        return spectrogram, transcript_seq

In [11]:
def collate_fn(batch):
    spectrograms = []
    transcript_seqs = []
    input_lengths = []
    target_lengths = []

    for (spectrogram, transcript_seq) in batch:
        if spectrogram.dim() != 2:
            raise ValueError(f"Expected spectrogram with 2 dims (T, n_mels), got {spectrogram.shape}")

        spectrograms.append(spectrogram)
        transcript_seqs.append(transcript_seq)
        input_lengths.append(spectrogram.shape[0])
        target_lengths.append(len(transcript_seq))

    spectrograms = torch.nn.utils.rnn.pad_sequence(spectrograms, batch_first=True)
    transcript_seqs = torch.nn.utils.rnn.pad_sequence(transcript_seqs, batch_first=True)

    return spectrograms, transcript_seqs, input_lengths, target_lengths

In [12]:
def compute_dataset_norm(loader):
    total_sum = 0.0
    total_sq_sum = 0.0
    total_count = 0

    for spectrograms, _, _, _ in loader:
        # spectrograms shape: (B, T, M)
        B, T_spectrograms_shape, M = spectrograms.shape
        count = B * T_spectrograms_shape * M

        total_sum += spectrograms.sum()
        total_sq_sum += (spectrograms ** 2).sum()
        total_count += count

    mean = total_sum / total_count
    var = (total_sq_sum / total_count) - mean**2
    std = torch.sqrt(var)

    return mean.item(), std.item()

In [22]:
transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)

#estos train_tmp son una chanchada pero los hago para calcular la media y el desvio de todo el dataset
train_dataset_tmp = SpeechDataset(train_df, char_map, transform=transform, augment=augment)
train_loader_tmp = DataLoader(train_dataset_tmp, batch_size=32, shuffle=False, collate_fn=collate_fn)

dataset_mean, dataset_std = compute_dataset_norm(train_loader_tmp)

#train_dataset = SpeechDataset(train_df.reset_index(drop=True), char_map, transform=transform)
#val_dataset = SpeechDataset(val_df.reset_index(drop=True), char_map, transform=transform)
train_dataset = SpeechDataset(train_df, char_map, transform=transform, augment=augment, mean=dataset_mean, std=dataset_std)
val_dataset = SpeechDataset(val_df, char_map, transform=transform, augment=None, mean=dataset_mean, std=dataset_std)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)



In [23]:
torch.save({"mean": dataset_mean, "std": dataset_std}, "dataset_norm.pth")

In [24]:
print(f"Dataset mean: {dataset_mean}")
print(f"Dataset std: {dataset_std}")

Dataset mean: -4.2623724937438965
Dataset std: 5.515571594238281


# Building the model

In [31]:
class SpeechRecognitionModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SpeechRecognitionModel, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU()
        )

        self.post_cnn_size = 32*input_size

        #lstm original
        #self.lstm = nn.LSTM(input_size, hidden_size, num_layers=3, bidirectional=True, batch_first=True)
        self.lstm = nn.LSTM(self.post_cnn_size, hidden_size, num_layers=3, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        #codigo nuevo
        B, T, F_dim = x.shape
        x = x.permute(0, 2, 1).unsqueeze(1)
        x = self.conv(x)
        x = x.permute(0, 3, 1, 2).contiguous()
        x = x.view(B, T, -1)
        #fin codigo nuevo
        x, _ = self.lstm(x)
        x = self.fc(x)
        x = F.log_softmax(x, dim=2)
        return x

In [34]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memoria disponible: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    print("   En Colab: Runtime , Change runtime type , T4 GPU")

input_size = 128  # Number of Mel features
hidden_size = 512 #representa el estado interno, info que la red "recuerda" o "aprende"
output_size = len(char_map)  # Number of characters

model = SpeechRecognitionModel(input_size, hidden_size, output_size).to(device)

GPU: Tesla T4
Memoria disponible: 14.74 GB


## Training the model

In [35]:
criterion = nn.CTCLoss(blank=0, zero_infinity=True)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [30]:
#OJITO CON ESTO limpia la ram de la gpu
torch.cuda.empty_cache()

In [36]:
num_epochs = 13  # Increase this number for better performance

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, targets, input_lengths, target_lengths) in enumerate(train_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        # CTC Loss expects (T, N, C)
        outputs = outputs.permute(1, 0, 2)

        input_lengths_tensor = torch.IntTensor(input_lengths)    # shape (N,)
        target_lengths_tensor = torch.IntTensor(target_lengths)  # shape (N,)

        targets_list = []
        for t, l in zip(targets, target_lengths):
            targets_list.append(t[:l])
        targets_concat = torch.cat(targets_list).to(dtype=torch.long)  # 1D

        outputs = outputs.to(device)               # (T, N, C)
        targets_concat = targets_concat.to(device)
        input_lengths_tensor = input_lengths_tensor.to(device)
        target_lengths_tensor = target_lengths_tensor.to(device)

        loss = criterion(outputs, targets_concat, input_lengths_tensor, target_lengths_tensor)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Step {i+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1} completed with average loss: {running_loss/len(train_loader):.4f}")

Epoch 1/13, Step 1/323, Loss: 25.9831
Epoch 1/13, Step 11/323, Loss: 3.1350
Epoch 1/13, Step 21/323, Loss: 2.9811
Epoch 1/13, Step 31/323, Loss: 2.9777
Epoch 1/13, Step 41/323, Loss: 2.9894
Epoch 1/13, Step 51/323, Loss: 3.0291
Epoch 1/13, Step 61/323, Loss: 2.9852
Epoch 1/13, Step 71/323, Loss: 3.0011
Epoch 1/13, Step 81/323, Loss: 3.0551
Epoch 1/13, Step 91/323, Loss: 3.0089
Epoch 1/13, Step 101/323, Loss: 3.0515
Epoch 1/13, Step 111/323, Loss: 3.0430
Epoch 1/13, Step 121/323, Loss: 3.0339
Epoch 1/13, Step 131/323, Loss: 3.0123
Epoch 1/13, Step 141/323, Loss: 2.9902
Epoch 1/13, Step 151/323, Loss: 3.0153
Epoch 1/13, Step 161/323, Loss: 3.0197
Epoch 1/13, Step 171/323, Loss: 2.9385
Epoch 1/13, Step 181/323, Loss: 2.9503
Epoch 1/13, Step 191/323, Loss: 2.9294
Epoch 1/13, Step 201/323, Loss: 2.9835
Epoch 1/13, Step 211/323, Loss: 2.9388
Epoch 1/13, Step 221/323, Loss: 2.9485
Epoch 1/13, Step 231/323, Loss: 2.8986
Epoch 1/13, Step 241/323, Loss: 2.9219
Epoch 1/13, Step 251/323, Loss: 2.9

## Validate the model

In [37]:
import editdistance

def cer(pred, ref):
    return editdistance.eval(pred, ref) / len(ref)

In [38]:
def ctc_decode(indices, blank_index=0):
    """
    indices: tensor shape (T,) con los índices ya argmaxeados.
    """
    if isinstance(indices, torch.Tensor):
        indices = indices.cpu().numpy()

    collapsed = []
    prev = None
    for idx in indices:
        if idx != prev:
            collapsed.append(idx)
        prev = idx

    result = [i for i in collapsed if i != blank_index]

    return result

In [39]:
model.eval()
total_cer = 0.0
num_samples = 0

with torch.no_grad():
    for i, (inputs, targets, input_lengths, target_lengths) in enumerate(val_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        outputs = model(inputs)
        # CTC Loss expects (T, N, C)
        outputs = outputs.permute(1, 0, 2)

        decoded_indices = torch.argmax(outputs, dim=2)  # (T, N)

        T, N = decoded_indices.shape
        for j in range(N):
            pred_indices = decoded_indices[:, j].cpu().numpy()
            pred_indices_decoded = ctc_decode(pred_indices, blank_index=0)
            pred_text = int_sequence_to_text(pred_indices_decoded)

            target_indices = targets[j].cpu().numpy()
            target_length = target_lengths[j]
            target_indices = target_indices[:target_length]
            target_text = int_sequence_to_text([int(i) for i in target_indices])

            current_cer = cer(pred_text, target_text)
            total_cer += current_cer
            num_samples += 1

            if i == 0 and j < 7:
                print(f"Predicho: '{pred_text}'")
                print(f"Real:     '{target_text}'")
                print(f"CER: {current_cer:.4f}")
                print("---")

    print(f"Average CER: {total_cer / num_samples:.4f}")

Predicho: 'yosa de canto da n acustarlas en tradas'
Real:     'ya sabés cuánto van a costar las entradas'
CER: 0.3171
---
Predicho: 'de echos siquereste mane una catura de pantayla de mispoti fay teroeyas comparter poroca sap'
Real:     'de hecho si querés te mando una captura de pantalla de mi spotify te lo voy a compartir por whatsapp'
CER: 0.2500
---
Predicho: 'la calle está cina de hoyos'
Real:     'la calle está llena de hoyos'
CER: 0.1071
---
Predicho: 'verifiques su instalacion electerca'
Real:     'verifique su instalación eléctrica'
CER: 0.1471
---
Predicho: 'en qué pise estál de partavento'
Real:     'en qué piso está el departamento'
CER: 0.1562
---
Predicho: 'me podés pandar fotos de tugato'
Real:     'me podés mandar fotos de tu gato'
CER: 0.0625
---
Predicho: 'veaente ma rcomende u sa alsicleca o uy entaxci'
Real:     'realmente me recomendás usar la bicicleta o voy en taxi'
CER: 0.3455
---
Average CER: 0.1581


# Inferencia

In [40]:
def predict(audio_path, model, transform, device):
    stats = torch.load("dataset_norm.pth", map_location=device)
    dataset_mean = stats["mean"]
    dataset_std = stats["std"]
    waveform, sample_rate = torchaudio.load(audio_path)

    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(sample_rate, 16000)
        waveform = resampler(waveform)

    max_val = torch.max(torch.abs(waveform))
    if max_val > 0:
      waveform = waveform/max_val

    spectrogram = transform(waveform)
    spectrogram = torch.log(spectrogram + 1e-6)
    #spectrogram = (spectrogram - spectrogram.mean()) / (spectrogram.std() + 1e-6)
    spectrogram = (spectrogram - dataset_mean) / (dataset_std + 1e-6)
    spectrogram = spectrogram.squeeze(0).transpose(0, 1)

    #spectrogram = spectrogram.to(device)
    spectrogram = spectrogram.unsqueeze(0).to(device)

    with torch.no_grad():
        outputs = model(spectrogram)
        outputs = outputs.permute(1, 0, 2)  # (T, N, C)

        decoded_indices = torch.argmax(outputs, dim=2)  # (T, N)
        time = outputs.shape[0] #time es T
        #pred_indices = decoded_indices[:spectrogram.shape[1], 0].cpu().numpy()  # (T,)
        pred_indices = decoded_indices[:  , 0].cpu().numpy()

        pred_indices_decoded = ctc_decode(pred_indices, blank_index=0)
        pred_text = int_sequence_to_text(pred_indices_decoded)
        return pred_text

In [41]:
def calculate_dbfs(waveform):
    if isinstance(waveform, np.ndarray):
        waveform = torch.tensor(waveform, dtype=torch.float32)
    else:
        waveform = waveform.float()

    if waveform.dim() > 1:
        waveform = waveform.mean(dim=0)

    rms = torch.sqrt(torch.mean(waveform ** 2))

    dbfs = 20 * torch.log10(rms + 1e-12)

    return dbfs.item()


In [43]:
#test_audio_path = "/content/test.flac"  # Provide path to a .flac audio file
test_audio_path = "/content/UsuarioNoEncontrado.wav"

audio_dataset = train_df[0]["audio"]["array"]
waveform, sample_rate = torchaudio.load(test_audio_path)

print(f"Decibeles del dataset: {calculate_dbfs(audio_dataset)}")
print(f"Decibeles del audio a predecir: {calculate_dbfs(waveform)}")

Decibeles del dataset: -21.254837036132812
Decibeles del audio a predecir: -25.840717315673828


In [44]:
#test_audio_path = "/content/test.flac"  # Provide path to a .flac audio file
predicted_text = predict(test_audio_path, model, transform, device)
print()
print(f"Predicted Transcript: {predicted_text}")


Predicted Transcript: yario ne tontrado


# Exportar el modelo

## Pesos

In [None]:
torch.save(model.state_dict(), "speech_model_weights.pth")

# Cargar pesos
#model = SpeechRecognitionModel(input_size, hidden_size, output_size).to(device)
#model.load_state_dict(torch.load("speech_model_weights.pth", map_location=device))
#model.eval()  # poner en modo evaluación

## Pesos + arquitectura

In [None]:
# Guardar modelo completo
torch.save(model, "speech_model_full.pth")

# Cargar modelo completo
#model = torch.load("speech_model_full.pth", map_location=device)
#model.eval()

## Pesos + optimizador
####(Para continuar el entrenamiento)

In [45]:
# Guardar checkpoint
torch.save({
    'epoch': epoch,                     # opcional, para saber en qué época estás
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': running_loss,               # opcional
}, "checkpoint.pth")

# Reconstruir modelo y optimizador
#model = SpeechRecognitionModel(input_size, hidden_size, output_size).to(device)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Cargar checkpoint
#checkpoint = torch.load("checkpoint.pth", map_location=device)
#model.load_state_dict(checkpoint['model_state_dict'])
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#start_epoch = checkpoint['epoch'] + 1
#running_loss = checkpoint['loss']
#model.train()

In [None]:
# Reconstruir modelo y optimizador
model = SpeechRecognitionModel(input_size, hidden_size, output_size).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Cargar checkpoint
checkpoint = torch.load("checkpoint.pth", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
start_epoch = checkpoint['epoch'] + 1
running_loss = checkpoint['loss']
model.train()

SpeechRecognitionModel(
  (lstm): LSTM(128, 512, num_layers=3, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=1024, out_features=35, bias=True)
)