In [1]:
!pip install kaggle==1.5.12



In [2]:
# !mkdir -p ~/.kaggle
# !cp kaggle.json ~/.kaggle/
# !chmod 600 ~/.kaggle/kaggle.json  # Set permissions

In [3]:
# !mkdir datasets

In [4]:
# !kaggle datasets download -d mozillaorg/common-voice -p /content/datasets --force

In [5]:
%cd datasets

/content/datasets


In [6]:
# !unzip common-voice.zip

In [7]:
import pandas as pd

In [8]:
dev_df = pd.read_csv('cv-valid-dev.csv')
# dev_df.head()

In [9]:
# dev_df.info()

In [10]:
# !pip install pydub

In [11]:
# from IPython.display import Audio, display
# import os

# def display_audio(audio_path):
#   display(Audio(audio_path))

# audio_directory = 'cv-valid-dev/cv-valid-dev'
# audio_files = [audio for audio in os.listdir(audio_directory)]

# for audio_file in audio_files[:5]:
#   audio_path = os.path.join(audio_directory, audio_file)
#   display_audio(audio_path)


In [12]:
# dev_df['down_votes'].value_counts()

In [13]:
# dev_df[dev_df['down_votes'] == 0][:10]

In [14]:
#

 => It seems that the difference between upvotes and downvotes doesn't relate to the quality of audios

**Preprocessing steps**



In [15]:
dev_df = dev_df.drop(columns=dev_df.columns[dev_df.columns.get_loc('up_votes') : dev_df.columns.get_loc('duration') + 1])

In [16]:
# dev_df.head()

In [17]:
import torch
import os
import torchaudio
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings('ignore')

In [18]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [19]:
train_df, test_df = torch.utils.data.random_split(dev_df, [0.8, 0.2])

In [20]:
def preprocess_for_rnnt_torchaudio(file_path, target_sr=16000, max_duration=5):
    """GPU-accelerated preprocessing pipeline for RNN-T using TorchAudio"""

    # 1. Load audio directly to GPU (PyTorch 2.0+)
    waveform, sr = torchaudio.load(file_path)
    waveform = waveform.to(device)

    # 2. Resample if needed
    if sr != target_sr:
        waveform = torchaudio.functional.resample(waveform, sr, target_sr)

    # 3. Trim silence (VAD)
    waveform = torchaudio.functional.vad(waveform, sample_rate=target_sr, trigger_level=20)

    # 4. Peak normalization (GPU)
    waveform = torch.nn.functional.normalize(waveform, dim=-1)

    # 5. Fixed-length padding/cropping
    max_samples = target_sr * max_duration
    if waveform.size(-1) > max_samples:
        waveform = waveform[..., :max_samples]
    else:
        pad_amount = max_samples - waveform.size(-1)
        waveform = torch.nn.functional.pad(waveform, (0, pad_amount))

    # 6. Extract Log-Mel features (GPU)
    mel_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=target_sr,
        n_fft=400,
        hop_length=160,
        n_mels=80,
    ).to(device)

    mel_spec = mel_transform(waveform)
    log_mel = torch.log(mel_spec + 1e-6)  # (1, 80, T)

    return log_mel.squeeze(0).T  # (T, 80)

In [21]:
audio_directory = 'cv-valid-dev'
audio_files = [train_df.dataset.loc[train_df.indices[i], 'filename'] for i in range(len(train_df))]

In [22]:
features = []
for audio in audio_files:
  audio_feature = preprocess_for_rnnt_torchaudio(os.path.join(audio_directory, audio))
  features.append(audio_feature)

In [23]:
texts = [dev_df.loc[idx, 'text'] for idx in train_df.indices]

In [24]:
# texts[:5]

In [25]:
import re

In [26]:
def clean_text(text):
    # Keep original case - REMOVED .lower()
    text = text.strip()

    # Handle apostrophes/contractions carefully
    text = re.sub(r"([!\"'#$%&()*\+,-./:;<=>?@\\\[\]^_`{|}~])", r" \1 ", text)
    text = re.sub("[^A-Za-z0-9]+", " ", text)
    text = re.sub(" +", " ", text)

    return text

In [27]:
# Apply cleaning
cleaned_texts = [clean_text(t) for t in texts if isinstance(t, str)]

In [28]:
# cleaned_texts[:5]

In [29]:
from tokenizers import Tokenizer, models, trainers
from tokenizers import pre_tokenizers

In [30]:
# Initialize with byte-level BPE (better for names/contractions)
tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))

# Simpler pre-tokenizer (preserve apostrophes)
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
    pre_tokenizers.Whitespace(),
    pre_tokenizers.Split("'", behavior="isolated")  # Special handling for apostrophes
])

# Trainer with larger vocab
trainer = trainers.BpeTrainer(
    special_tokens=["<pad>", "<unk>", "<bos>", "<eos>"],
    vocab_size=3800,  # Increased for better word coverage
    min_frequency=3,
    initial_alphabet=pre_tokenizers.ByteLevel.alphabet()  # Better for special characters
)

# Train on properly cleaned text
tokenizer.train_from_iterator([clean_text(t) for t in texts], trainer)
tokenizer.save("cv_tokenizer.json")

In [31]:
# # Load tokenizer
# tokenizer = Tokenizer.from_file("cv_tokenizer.json")

# # Test encoding
# sample_text = "Dr O'Neill 's patient said We 'll check again tomorrow"
# encoding = tokenizer.encode(sample_text.lower())
# print("Tokens:", encoding.tokens)
# print("IDs:", encoding.ids)

In [32]:
class Encoder(torch.nn.Module):
    def __init__(self, input_dim=80, hidden_dim=256, num_layers=3):
        super().__init__()
        self.lstm = torch.nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            bidirectional=True,
            batch_first=True,
        )
        self.linear = torch.nn.Linear(hidden_dim * 2, hidden_dim)

    def forward(self, x):
        x, _ = self.lstm(x)  # (B, T, 2*H)
        x = self.linear(x)    # (B, T, H)
        return x

In [33]:
class Decoder(torch.nn.Module):
    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256):
        super().__init__()
        self.embed = torch.nn.Embedding(vocab_size, embed_dim)
        self.lstm = torch.nn.LSTM(embed_dim, hidden_dim, batch_first=True)

    def forward(self, y):
        y = self.embed(y)     # (B, U, E)
        y, _ = self.lstm(y)   # (B, U, H)
        return y

In [34]:
class JointNetwork(torch.nn.Module):
    def __init__(self, hidden_dim, vocab_size):
        super().__init__()
        self.linear = torch.nn.Linear(hidden_dim, vocab_size)

    def forward(self, h_enc, h_dec):
        # h_enc: (B, T, H), h_dec: (B, U, H)
        h_enc = h_enc.unsqueeze(2)  # (B, T, 1, H)
        h_dec = h_dec.unsqueeze(1)  # (B, 1, U, H)
        out = torch.tanh(h_enc + h_dec)  # (B, T, U, H)
        return self.linear(out)  # (B, T, U, V)

In [35]:
class RNNTransducer(torch.nn.Module):
    def __init__(self, vocab_size, encoder_dim=256, decoder_dim=256):
        super().__init__()
        self.encoder = Encoder(hidden_dim=encoder_dim)
        self.decoder = Decoder(vocab_size, hidden_dim=decoder_dim)
        self.joint = JointNetwork(encoder_dim, vocab_size)

    def forward(self, x, y):
        h_enc = self.encoder(x)  # (B, T, H)
        h_dec = self.decoder(y)  # (B, U, H)
        logits = self.joint(h_enc, h_dec)  # (B, T, U, V)
        return logits.contiguous()

In [36]:
# Continue from where the notebook left off

# First, let's prepare the dataset class
class AudioTextDataset(torch.utils.data.Dataset):
    def __init__(self, features, texts, tokenizer, max_text_length=100):
        self.features = features
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_text_length = max_text_length

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        # Get audio features
        audio_feature = self.features[idx]

        # Get and tokenize text
        text = self.texts[idx]
        encoding = self.tokenizer.encode(text)
        text_ids = encoding.ids

        # Add BOS and EOS tokens
        text_ids = [tokenizer.token_to_id("<bos>")] + text_ids + [tokenizer.token_to_id("<eos>")]

        # Pad text sequence
        if len(text_ids) < self.max_text_length:
            pad_amount = self.max_text_length - len(text_ids)
            text_ids = text_ids + [tokenizer.token_to_id("<pad>")] * pad_amount
        else:
            text_ids = text_ids[:self.max_text_length]

        return {
            'audio': audio_feature,
            'text': torch.tensor(text_ids, dtype=torch.long)
        }

In [37]:
# Create datasets
train_dataset = AudioTextDataset(
    features=features,
    texts=cleaned_texts,
    tokenizer=tokenizer
)

In [38]:
test_cleaned_texts = [clean_text(dev_df.loc[idx, 'text']) for idx in test_df.indices]
test_features = []
audio_directory = 'cv-valid-dev'
audio_files = [test_df.dataset.loc[test_df.indices[i], 'filename'] for i in range(len(test_df))]
for audio in audio_files:
  audio_feature = preprocess_for_rnnt_torchaudio(os.path.join(audio_directory, audio))
  test_features.append(audio_feature)

test_dataset = AudioTextDataset(
    features=test_features,
    texts=test_cleaned_texts,
    tokenizer=tokenizer
)

In [39]:
# torch.cuda.empty_cache()

In [40]:
# Create data loaders
def collate_fn(batch):
    # Pad audio features to same length
    audio_features = [item['audio'] for item in batch]
    max_audio_len = max(f.size(0) for f in audio_features)

    padded_audio = []
    for f in audio_features:
        pad_amount = max_audio_len - f.size(0)
        if pad_amount > 0:
            f = torch.nn.functional.pad(f, (0, 0, 0, pad_amount), value=0)
        padded_audio.append(f)

    audio_tensor = torch.stack(padded_audio)
    text_tensor = torch.stack([item['text'] for item in batch])

    return {
        'audio': audio_tensor.to(device),
        'text': text_tensor.to(device)
    }

batch_size = 2
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn
)

In [45]:
def rnnt_loss(logits, targets, input_lengths, target_lengths, blank=0):
    # logits: (B, T, U, V)
    # targets: (B, U)
    # input_lengths: (B,) - length of each audio sequence
    # target_lengths: (B,) - length of each text sequence

    # Ensure logits are contiguous
    if not logits.is_contiguous():
        logits = logits.contiguous()

    # Convert to log-probs
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

    # Transpose for torchaudio's expectation (T, B, U, V)
    log_probs = log_probs.permute(1, 0, 2, 3)

    # Ensure the permuted tensor is contiguous
    if not log_probs.is_contiguous():
        log_probs = log_probs.contiguous()

    # Compute RNN-T loss
    loss = torchaudio.functional.rnnt_loss(
        log_probs,
        targets,
        input_lengths,
        target_lengths,
        blank=blank,
        reduction='mean'
    )

    return loss

In [46]:
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# Training loop
def train_epoch(model, dataloader, optimizer, accumulation_steps=4):
    model.train()
    total_loss = 0

    for batch in dataloader:
        audio = batch['audio']
        text = batch['text']

        # Get input/target lengths
        input_lengths = torch.tensor([a.size(0) for a in audio], dtype=torch.int32, device=device)
        target_lengths = torch.tensor([(t != tokenizer.token_to_id("<pad>")).sum() for t in text], dtype=torch.int32, device=device)

        # Forward pass
        optimizer.zero_grad()
        logits = model(audio, text[:, :-1])  # Exclude EOS token for decoder input

        # Compute loss
        loss = rnnt_loss(
            logits.contiguous(),
            text[:, 1:].type(torch.int32),  # Exclude BOS token for targets
            input_lengths,
            target_lengths - 1  # Subtract 1 because we excluded BOS
        )

        # Backward pass
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    return total_loss / len(dataloader)

In [43]:
# Evaluation loop
def evaluate(model, dataloader):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for batch in dataloader:
            audio = batch['audio']
            text = batch['text']

            input_lengths = torch.tensor([a.size(0) for a in audio], dtype=torch.int32, device=device)
            target_lengths = torch.tensor([(t != tokenizer.token_to_id("<pad>")).sum() for t in text], dtype=torch.int32, device=device)

            logits = model(audio, text[:, :-1])

            loss = rnnt_loss(
                logits.contiguous(),
                text[:, 1:],
                input_lengths,
                target_lengths - 1
            )

            total_loss += loss.item()

    return total_loss / len(dataloader)

In [47]:
# Training
num_epochs = 5
for epoch in range(num_epochs):
    train_loss = train_epoch(model, train_loader, optimizer)
    val_loss = evaluate(model, test_loader)

    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Val Loss: {val_loss:.4f}")


RuntimeError: batch dimension mismatch between logits and logit_lengths

In [None]:
# Inference function
def predict(model, audio_feature, tokenizer, max_decode_len=100):
    model.eval()

    # Add batch dimension
    audio_feature = audio_feature.unsqueeze(0).to(device)
    input_length = torch.tensor([audio_feature.size(1)], device=device)

    # Initialize with BOS token
    decoded = [tokenizer.token_to_id("<bos>")]

    with torch.no_grad():
        h_enc = model.encoder(audio_feature)  # (1, T, H)

        for _ in range(max_decode_len):
            # Get last predicted token
            last_token = torch.tensor([decoded[-1]], device=device).unsqueeze(0)

            # Decoder step
            h_dec = model.decoder(last_token)  # (1, 1, H)

            # Joint network
            logits = model.joint(h_enc, h_dec)  # (1, T, 1, V)
            log_probs = torch.nn.functional.log_softmax(logits.squeeze(2), dim=-1)  # (1, T, V)

            # Sum over time (approximate)
            scores = log_probs.sum(1)  # (1, V)
            next_token = scores.argmax(-1).item()

            # Stop if EOS is predicted
            if next_token == tokenizer.token_to_id("<eos>"):
                break

            decoded.append(next_token)

    # Convert to text
    tokens = tokenizer.decode(decoded[1:])  # Skip BOS
    return tokens


In [None]:
# Test inference
test_idx = 0
test_audio = features[test_idx]
test_text = cleaned_texts[test_idx]

predicted_text = predict(model, test_audio, tokenizer)
print(f"Original: {test_text}")
print(f"Predicted: {predicted_text}")