# End-to-End ASR Training with CTC

This notebook trains a complete Automatic Speech Recognition system
using Log-Mel spectrograms, character-level tokenization, and
Connectionist Temporal Classification (CTC).


In [1]:
import os
import sys

# Get project root
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Add to Python path
if PROJECT_ROOT not in sys.path:
    sys.path.insert(0, PROJECT_ROOT)

print("Project root added:", PROJECT_ROOT)


Project root added: C:\Users\intel\OneDrive\Desktop\PhD preparation\speech-recognition-asr


In [2]:
import torch
from torch.utils.data import DataLoader

from src.dataset import CommonVoiceAUSDataset
from src.features import extract_log_mel
from src.tokenizer import CharTokenizer
from src.model import ASRModel
from src.collate import collate_batch
from src.train import train_step


In [3]:
dataset = CommonVoiceAUSDataset("../data/raw/commonvoice_en_au")

tokenizer = CharTokenizer()
tokenizer.build_vocab(dataset.df["sentence"].astype(str))

def prepare_sample(idx):
    sample = dataset.get_sample(idx)
    log_mel = extract_log_mel(sample["audio_path"])
    tokens = tokenizer.encode(sample["text"])

    return {
        "log_mel": log_mel,
        "tokens": tokens
    }

samples = [prepare_sample(i) for i in range(20)]  # small subset
loader = DataLoader(samples, batch_size=2, collate_fn=collate_batch)


In [4]:
model = ASRModel(
    n_mels=80,
    hidden_dim=256,
    vocab_size=len(tokenizer.char2idx)
)

ctc_loss = torch.nn.CTCLoss(
    blank=tokenizer.char2idx[tokenizer.blank_token],
    zero_infinity=True
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

for epoch in range(3):
    for batch in loader:
        loss = train_step(model, batch, optimizer, ctc_loss)
    print(f"Epoch {epoch+1} | Loss: {loss:.4f}")


Epoch 1 | Loss: 6.2589


Epoch 2 | Loss: 6.0617


Epoch 3 | Loss: 5.5049
