<a href="https://colab.research.google.com/github/ValentinPastre/proyecto-2025/blob/Speech-to-Text/speech_to_text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install numpy pandas matplotlib librosa torch torchaudio torchcodec



# Data Preprocessing

In [None]:
import os
import pandas as pd
import numpy as np
import librosa
import torch
import torchaudio
from torch.utils.data import Dataset, DataLoader
from torchaudio.transforms import MelSpectrogram
from sklearn.model_selection import train_test_split
import soundfile as sf

In [None]:
from datasets import load_dataset, concatenate_datasets

# Download the female and male voice configurations of the corpus and merge
# their "train" splits into one dataset.
DATASET_REPO = "ylacombe/google-argentinian-spanish"

ds_f, ds_m = (load_dataset(DATASET_REPO, voice) for voice in ("female", "male"))

ds = concatenate_datasets([part["train"] for part in (ds_f, ds_m)])

print(ds.column_names)

['audio', 'text', 'speaker_id']


In [None]:
print(ds)

# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible. Despite the `_df` suffix, these are the objects returned by
# `train_test_split` on `ds`, not pandas DataFrames.
splits = ds.train_test_split(test_size=0.1, seed=42)
train_df, val_df = splits["train"], splits["test"]

print(f"Training samples: {len(train_df)}, Validation samples: {len(val_df)}")

Dataset({
    features: ['audio', 'text', 'speaker_id'],
    num_rows: 5739
})
Training samples: 5165, Validation samples: 574


In [None]:
# Show the transcript of the first training example as a quick sanity check.
print(train_df[0]["text"])

Del subte al teatro son diez minutos


In [None]:
#char_map_str = """
#a 1
#b 2
#c 3
#d 4
#e 5
#f 6
#g 7
#h 8
#i 9
#j 10
#k 11
#l 12
#m 13
#n 14
#o 15
#p 16
#q 17
#r 18
#s 19
#t 20
#u 21
#v 22
#w 23
#x 24
#y 25
#z 26
#ñ 27
#á 28
#é 29
#í 30
#ó 31
#ú 32
#? 33
#¿ 34
#! 35
#¡ 36
#SPACE 37
#"""
#char_map = {}
#index_map = {}
#char_map['<blank>'] = 0
#index_map[0] = ''
#for line in char_map_str.strip().split('\n'):
#    ch, index = line.split()
#    if ch == "SPACE":
#        ch = " "
#    index = int(index) + 1
#    #char_map[ch] = int(index)
#    char_map[ch] = index
#    #index_map[int(index)] = ch
#    index_map[index] = ch

In [None]:
# Build the character vocabulary from every transcript (train + validation),
# lower-cased, so that only characters that actually occur get an index.
all_text = (" ".join(train_df['text']) + " " + " ".join(val_df['text'])).lower()

allowed_chars = sorted(set(all_text))  # every character that really appears

# Index 0 is reserved for the CTC blank; real characters start at 1.
char_map = {"<BLANK>": 0, **{ch: i for i, ch in enumerate(allowed_chars, start=1)}}

# Reverse lookup: index -> character.
index_map = dict((idx, symbol) for symbol, idx in char_map.items())

In [None]:
def text_to_int_sequence(text):
    """Encode a transcript as a list of vocabulary indices.

    The text is lower-cased first. Any character missing from the
    module-level ``char_map`` is encoded with the index of the space
    character.
    """
    encoded = []
    for ch in text.lower():
        encoded.append(char_map.get(ch, char_map[' ']))
    return encoded

def int_sequence_to_text(seq):
    """Decode a sequence of vocabulary indices back into a string.

    Indices that are not present in the module-level ``index_map`` are
    silently dropped (they contribute an empty string).
    """
    symbols = (index_map.get(idx, '') for idx in seq)
    return ''.join(symbols)

In [None]:
# Print one transcript next to its integer encoding to check the vocabulary.
print(train_df[0]["text"])
print(text_to_int_sequence(train_df[0]["text"]))

Del subte al teatro son diez minutos
[12, 13, 20, 1, 27, 29, 10, 28, 13, 1, 9, 20, 1, 28, 13, 9, 28, 26, 23, 1, 27, 23, 22, 1, 12, 17, 13, 34, 1, 21, 17, 22, 29, 28, 23, 27]


# Creating a Custom Dataset

In [None]:
class SpeechDataset(Dataset):
    """Dataset yielding ``(log-mel spectrogram, encoded transcript)`` pairs.

    Each row of ``df`` is expected to provide ``row["audio"]["array"]``,
    ``row["audio"]["sampling_rate"]`` and ``row["text"]``.

    Args:
        df: Indexable collection of rows (supports ``len`` and ``df[idx]``).
        char_map: Mapping from character to integer index. It must contain
            the space character, which is used for unknown characters.
        transform: Callable turning a ``(1, samples)`` waveform into a
            spectrogram. When ``None``, a 128-band ``MelSpectrogram`` at
            ``target_sample_rate`` is created.
        target_sample_rate: Sample rate every waveform is resampled to.
    """

    def __init__(self, df, char_map, transform=None, target_sample_rate=16000):
        self.df = df
        self.char_map = char_map
        self.target_sample_rate = target_sample_rate
        if transform is None:
            # The transform is called unconditionally in __getitem__, so a
            # missing one must be replaced by a usable default.
            transform = torchaudio.transforms.MelSpectrogram(
                sample_rate=target_sample_rate, n_mels=128
            )
        self.transform = transform
        # One Resample module per source rate, built lazily and reused instead
        # of being reconstructed for every item.
        self._resamplers = {}

    def __len__(self):
        return len(self.df)

    def _resample(self, waveform, sample_rate):
        """Return ``waveform`` converted to ``self.target_sample_rate``."""
        if sample_rate == self.target_sample_rate:
            return waveform
        resampler = self._resamplers.get(sample_rate)
        if resampler is None:
            resampler = torchaudio.transforms.Resample(sample_rate, self.target_sample_rate)
            self._resamplers[sample_rate] = resampler
        return resampler(waveform)

    def _encode(self, transcript):
        """Encode ``transcript`` with this dataset's own ``char_map``.

        Unknown characters are mapped to the index of the space character.
        """
        return [self.char_map.get(c, self.char_map[" "]) for c in transcript.lower()]

    def __getitem__(self, idx):
        """Return ``(spectrogram, transcript_seq)`` for row ``idx``.

        ``spectrogram`` is the normalised log spectrogram with the time axis
        first (assuming the transform returns ``(1, n_mels, T)`` for a
        ``(1, samples)`` input); ``transcript_seq`` is a 1-D ``torch.long``
        tensor of character indices.
        """
        item = self.df[idx]
        audio = item["audio"]["array"]
        sample_rate = item["audio"]["sampling_rate"]
        transcript = item["text"]

        # Force float32 so the waveform dtype does not depend on the source array.
        waveform = torch.as_tensor(audio, dtype=torch.float32)
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # Down-mix multi-channel audio to mono.
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        waveform = self._resample(waveform, sample_rate)

        # Peak-normalise; skip all-zero clips to avoid dividing by zero.
        max_val = torch.max(torch.abs(waveform))
        if max_val > 0:
            waveform = waveform / max_val

        spectrogram = self.transform(waveform)
        spectrogram = torch.log(spectrogram + 1e-6)
        # Per-utterance standardisation.
        spectrogram = (spectrogram - spectrogram.mean()) / (spectrogram.std() + 1e-6)
        spectrogram = spectrogram.squeeze(0).transpose(0, 1)

        transcript_seq = torch.tensor(self._encode(transcript), dtype=torch.long)

        return spectrogram, transcript_seq

In [None]:
def collate_fn(batch):
    """Pad a list of ``(spectrogram, transcript_seq)`` pairs into a batch.

    Returns:
        A 4-tuple ``(spectrograms, transcript_seqs, input_lengths,
        target_lengths)``: the first two are zero-padded, batch-first
        tensors; the last two are plain Python lists holding each sample's
        unpadded length.

    Raises:
        ValueError: If any spectrogram is not 2-dimensional.
    """
    specs, labels = [], []
    input_lengths, target_lengths = [], []

    for spec, label in batch:
        if spec.dim() != 2:
            raise ValueError(f"Expected spectrogram with 2 dims (T, n_mels), got {spec.shape}")

        specs.append(spec)
        labels.append(label)
        input_lengths.append(spec.shape[0])
        target_lengths.append(len(label))

    pad = torch.nn.utils.rnn.pad_sequence
    return (
        pad(specs, batch_first=True),
        pad(labels, batch_first=True),
        input_lengths,
        target_lengths,
    )

In [None]:
# Feature extractor shared by both datasets (and reused later for inference).
transform = torchaudio.transforms.MelSpectrogram(sample_rate=16000, n_mels=128)

train_dataset = SpeechDataset(train_df, char_map, transform=transform)
val_dataset = SpeechDataset(val_df, char_map, transform=transform)

# Only the training loader shuffles; both pad batches through collate_fn.
loader_kwargs = dict(batch_size=32, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)



In [None]:
# Pull a single batch and report basic statistics of its first spectrogram,
# to verify the shape and the normalisation.
for batch in train_loader:
    spectrograms, targets, input_lengths, target_lengths = batch

    spec = spectrograms[0]   # first spectrogram of the batch (shape: [Time, Mel])

    stats = {
        "Shape": spec.shape,
        "Min": spec.min().item(),
        "Max": spec.max().item(),
        "Mean": spec.mean().item(),
        "Std": spec.std().item(),
    }
    for label, value in stats.items():
        print(f"{label}:", value)
    break


Shape: torch.Size([601, 128])
Min: -1.5699036121368408
Max: 2.5417768955230713
Mean: -3.322388764104289e-08
Std: 0.738749623298645


# Building the model

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class SpeechRecognitionModel(nn.Module):
    """Three-layer bidirectional LSTM followed by a per-frame linear classifier.

    Args:
        input_size: Number of features per input frame.
        hidden_size: Hidden size of each LSTM direction.
        output_size: Number of output classes per frame.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers=3,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence twice the hidden size.
        self.fc = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Return per-frame log-probabilities for a batch-first input ``x``."""
        features, _state = self.lstm(x)
        logits = self.fc(features)
        return F.log_softmax(logits, dim=2)

In [None]:
# Pick the GPU when one is visible, otherwise fall back to the CPU.
has_cuda = torch.cuda.is_available()
device = torch.device('cuda' if has_cuda else 'cpu')
if has_cuda:
    gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1024**3
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memoria disponible: {gpu_memory_gb:.2f} GB")
else:
    print("   En Colab: Runtime , Change runtime type , T4 GPU")

input_size = 128  # Number of Mel features
hidden_size = 256
output_size = len(char_map)  # Number of characters

model = SpeechRecognitionModel(input_size, hidden_size, output_size)
model = model.to(device)

GPU: Tesla T4
Memoria disponible: 14.74 GB


## Training the model

In [None]:
# CTC loss with index 0 as the blank symbol; zero_infinity=True replaces
# infinite losses (and their gradients) with zero, per the torch.nn.CTCLoss docs.
criterion = nn.CTCLoss(blank=0, zero_infinity=True)
# Adam over all model parameters with a fixed learning rate.
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
num_epochs = 50  # Increase this number for better performance

# Train with CTC loss. `epoch` and `running_loss` stay defined after the loop.
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for i, (inputs, targets, input_lengths, target_lengths) in enumerate(train_loader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Build the length tensors directly on the target device.
        input_lengths_tensor = torch.tensor(input_lengths, dtype=torch.int32, device=device)    # shape (N,)
        target_lengths_tensor = torch.tensor(target_lengths, dtype=torch.int32, device=device)  # shape (N,)

        optimizer.zero_grad()
        # CTC Loss expects (T, N, C); the model output is already on `device`.
        outputs = model(inputs).permute(1, 0, 2)

        # CTCLoss accepts padded (N, S) targets together with target_lengths,
        # so the batch can be passed as-is: positions past each target length
        # are ignored, and no per-sample slicing/concatenation is needed.
        loss = criterion(outputs, targets, input_lengths_tensor, target_lengths_tensor)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        if i % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Step {i+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    print(f"Epoch {epoch+1} completed with average loss: {running_loss/len(train_loader):.4f}")

Epoch 1/50, Step 1/162, Loss: 29.8145
Epoch 1/50, Step 11/162, Loss: 3.2776
Epoch 1/50, Step 21/162, Loss: 3.1958
Epoch 1/50, Step 31/162, Loss: 3.0839
Epoch 1/50, Step 41/162, Loss: 3.0498
Epoch 1/50, Step 51/162, Loss: 2.9827
Epoch 1/50, Step 61/162, Loss: 3.0065
Epoch 1/50, Step 71/162, Loss: 3.0162
Epoch 1/50, Step 81/162, Loss: 2.9989
Epoch 1/50, Step 91/162, Loss: 2.9837
Epoch 1/50, Step 101/162, Loss: 2.9545
Epoch 1/50, Step 111/162, Loss: 2.9823
Epoch 1/50, Step 121/162, Loss: 2.9859
Epoch 1/50, Step 131/162, Loss: 2.9355
Epoch 1/50, Step 141/162, Loss: 2.9482
Epoch 1/50, Step 151/162, Loss: 2.9579
Epoch 1/50, Step 161/162, Loss: 2.9308
Epoch 1 completed with average loss: 3.5318
Epoch 2/50, Step 1/162, Loss: 2.9288
Epoch 2/50, Step 11/162, Loss: 2.9326
Epoch 2/50, Step 21/162, Loss: 2.9634
Epoch 2/50, Step 31/162, Loss: 2.9507
Epoch 2/50, Step 41/162, Loss: 2.9369
Epoch 2/50, Step 51/162, Loss: 2.9363
Epoch 2/50, Step 61/162, Loss: 2.9347
Epoch 2/50, Step 71/162, Loss: 2.9733


## Validate the model

In [None]:
def cer(prediction, reference):
    """Character error rate between ``prediction`` and ``reference``.

    Spaces are removed from both strings before comparison. The result is
    the Levenshtein edit distance (insertions, deletions, substitutions)
    divided by the number of reference characters, so one inserted or
    missing character counts as a single error instead of misaligning
    everything after it.

    Args:
        prediction: Hypothesis string.
        reference: Ground-truth string.

    Returns:
        float: ``0.0`` for a perfect match. May exceed ``1.0`` when the
        prediction is much longer than the reference. If the reference has
        no non-space characters, returns ``1.0`` when the prediction has
        any and ``0.0`` otherwise.
    """
    pred_chars = prediction.replace(' ', '')
    ref_chars = reference.replace(' ', '')

    # A reference made only of spaces would otherwise cause division by zero.
    if not ref_chars:
        return 1.0 if pred_chars else 0.0
    if not pred_chars:
        return 1.0

    # Two-row dynamic programme: previous[p] is the distance between the
    # reference prefix processed so far and the first p predicted characters.
    previous = list(range(len(pred_chars) + 1))
    for r, ref_ch in enumerate(ref_chars, start=1):
        current = [r]
        for p, pred_ch in enumerate(pred_chars, start=1):
            substitution = previous[p - 1] + (0 if ref_ch == pred_ch else 1)
            deletion = previous[p] + 1
            insertion = current[p - 1] + 1
            current.append(min(substitution, deletion, insertion))
        previous = current

    return previous[-1] / len(ref_chars)

In [None]:
def ctc_decode(indices, blank_index=0):
    """Greedy CTC collapse of a frame-level index sequence.

    Consecutive repeats are merged first, then every occurrence of
    ``blank_index`` is dropped.

    Args:
        indices: 1-D sequence (tensor, array or list) of already-argmaxed
            class indices, one per frame.
        blank_index: Index of the CTC blank symbol.

    Returns:
        list: The collapsed indices, in order.
    """
    frames = indices.cpu().numpy() if isinstance(indices, torch.Tensor) else indices

    decoded = []
    last = None
    for token in frames:
        # Keep a token only when it starts a new run and is not the blank.
        if token != last and token != blank_index:
            decoded.append(token)
        last = token

    return decoded

In [None]:
# Greedy-decode the validation set and report the mean per-utterance CER.
model.eval()
total_cer = 0.0
num_samples = 0

with torch.no_grad():
    for i, (inputs, targets, input_lengths, target_lengths) in enumerate(val_loader):
        inputs = inputs.to(device)

        outputs = model(inputs)  # (N, T, C)

        # Best class per frame; moved to the CPU once per batch.
        decoded_indices = torch.argmax(outputs, dim=2).cpu()  # (N, T)

        for j in range(decoded_indices.shape[0]):
            # Decode only the real frames of this utterance: frames beyond
            # input_lengths[j] are batch padding and must not add characters.
            pred_indices = decoded_indices[j, :input_lengths[j]].numpy()
            pred_indices_decoded = ctc_decode(pred_indices, blank_index=0)
            pred_text = int_sequence_to_text(pred_indices_decoded)

            # Drop the target padding before converting back to text.
            target_indices = targets[j, :target_lengths[j]].tolist()
            target_text = int_sequence_to_text(target_indices)

            current_cer = cer(pred_text, target_text)
            total_cer += current_cer
            num_samples += 1

            # Show a few examples from the first batch.
            if i == 0 and j < 3:
                print(f"Predicho: '{pred_text}'")
                print(f"Real:     '{target_text}'")
                print(f"CER: {current_cer:.4f}")
                print("---")

    # Guard against an empty validation loader.
    if num_samples > 0:
        print(f"Average CER: {total_cer / num_samples:.4f}")
    else:
        print("Average CER: n/a (validation set is empty)")

# Inferencia

In [None]:
def predict(audio_path, model, transform, device):
    """Transcribe a single audio file with greedy CTC decoding.

    The audio goes through the same steps as the training data: mono
    down-mix, resampling to 16 kHz, peak normalisation, log spectrogram and
    per-utterance standardisation.

    Args:
        audio_path: Path passed to ``torchaudio.load``.
        model: Network returning per-frame log-probabilities for a
            batch-first input.
        transform: Callable producing the spectrogram from the waveform.
        device: Device on which the model is evaluated.

    Returns:
        str: The decoded transcript.
    """
    signal, rate = torchaudio.load(audio_path)

    # Down-mix to mono.
    if signal.shape[0] > 1:
        signal = signal.mean(dim=0, keepdim=True)

    if rate != 16000:
        signal = torchaudio.transforms.Resample(rate, 16000)(signal)

    # Peak-normalise; silent clips are left untouched.
    peak = signal.abs().max()
    if peak > 0:
        signal = signal / peak

    features = torch.log(transform(signal) + 1e-6)
    features = (features - features.mean()) / (features.std() + 1e-6)

    # Drop the channel axis, put time first, then add a batch axis of size 1.
    batch = features.squeeze(0).transpose(0, 1).unsqueeze(0).to(device)

    with torch.no_grad():
        log_probs = model(batch)                  # (N=1, T, C)
        best_path = log_probs.argmax(dim=2)       # (N=1, T)
        frame_ids = best_path[0, :batch.shape[1]].cpu().numpy()  # (T,)

        return int_sequence_to_text(ctc_decode(frame_ids, blank_index=0))

In [None]:
# Run the trained model on a single audio file stored in the Colab filesystem.
# NOTE(review): the path is hard-coded; the file must be uploaded beforehand.
#test_audio_path = "/content/test.flac"  # Provide path to a .flac audio file
test_audio_path = "/content/CapturarImagen.wav"
predicted_text = predict(test_audio_path, model, transform, device)
print(f"Predicted Transcript: {predicted_text}")

Predicted Transcript: catura rigaf.


In [None]:
# Mount Google Drive at /content/drive (only available inside Colab).
# NOTE(review): the export cells below save to relative paths, not to the
# mounted Drive folder — confirm where the files are meant to end up.
from google.colab import drive
drive.mount('/content/drive')

# Exportar el modelo

## Pesos

In [None]:
# Save only the learned parameters (the state_dict) to a file.
torch.save(model.state_dict(), "speech_model_weights.pth")

# Load the weights
#model = SpeechRecognitionModel(input_size, hidden_size, output_size).to(device)
#model.load_state_dict(torch.load("speech_model_weights.pth", map_location=device))
#model.eval()  # switch to evaluation mode

## Pesos + arquitectura

In [None]:
# Save the complete model object (architecture + weights).
# NOTE(review): loading such a file presumably requires the
# SpeechRecognitionModel class to be importable at load time — confirm.
torch.save(model, "speech_model_full.pth")

# Load the complete model
#model = torch.load("speech_model_full.pth", map_location=device)
#model.eval()

## Pesos + optimizador
#### (Para continuar el entrenamiento)

In [None]:
# Guardar checkpoint
torch.save({
    'epoch': epoch,                     # opcional, para saber en qué época estás
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': running_loss,               # opcional
}, "checkpoint.pth")

# Reconstruir modelo y optimizador
#model = SpeechRecognitionModel(input_size, hidden_size, output_size).to(device)
#optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#
# Cargar checkpoint
#checkpoint = torch.load("checkpoint.pth", map_location=device)
#model.load_state_dict(checkpoint['model_state_dict'])
#optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#start_epoch = checkpoint['epoch'] + 1
#running_loss = checkpoint['loss']
#model.train()