# End-to-End ASR Training with CTC

This notebook trains a complete Automatic Speech Recognition system
using Log-Mel spectrograms, character-level tokenization, and
Connectionist Temporal Classification (CTC).


In [None]:
import os
import sys

# The project root is taken to be the parent of the current working
# directory (the notebook is expected to be launched from a subfolder).
_parent_of_cwd = os.path.join(os.getcwd(), "..")
PROJECT_ROOT = os.path.abspath(_parent_of_cwd)

# Prepend the root to the import search path, but only once, so that
# re-running this cell does not keep growing sys.path.
_root_already_listed = PROJECT_ROOT in sys.path
if not _root_already_listed:
    sys.path.insert(0, PROJECT_ROOT)

print("Project root added:", PROJECT_ROOT)


In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader

from src.decode import greedy_decode
from src.features import spec_augment
from src.dataset import CommonVoiceAUSDataset
from src.features import extract_log_mel
from src.tokenizer import CharTokenizer
from src.model import ASRModel
from src.collate import collate_batch
from src.train import train_step


In [None]:
# Location of the raw dataset, relative to the notebook's working directory.
DATA_DIR = "../data/raw/commonvoice_en_au"
dataset = CommonVoiceAUSDataset(DATA_DIR)

# Build the tokenizer vocabulary from the "sentence" column of the dataset's
# dataframe; values are cast to str before being handed to the tokenizer.
tokenizer = CharTokenizer()
transcripts = dataset.df["sentence"].astype(str)
tokenizer.build_vocab(transcripts)

def prepare_sample(idx, augment=True):
    """Build one example (features + encoded transcript) for sample ``idx``.

    Args:
        idx: Index forwarded unchanged to ``dataset.get_sample``.
        augment: When True (the default, which matches the previous
            behaviour) the features are passed through ``spec_augment``.
            Pass False to obtain un-augmented features, e.g. when preparing
            data for evaluation, where augmentation should not be applied.

    Returns:
        dict with two keys:
            "log_mel": result of ``extract_log_mel`` on the sample's audio
                path, augmented only if ``augment`` is True.
            "tokens": ``tokenizer.encode`` applied to the sample's text.

    Note:
        Augmentation happens here, at preparation time. A caller that
        materialises samples once and reuses them across epochs will see
        the same augmented features every epoch (assuming ``spec_augment``
        is randomised -- confirm in src.features).
    """
    sample = dataset.get_sample(idx)

    # Features are computed from the audio file on disk.
    log_mel = extract_log_mel(sample["audio_path"])
    if augment:
        log_mel = spec_augment(log_mel)

    # Encode the transcript with the notebook-level tokenizer.
    tokens = tokenizer.encode(sample["text"])

    return {"log_mel": log_mel, "tokens": tokens}


# Keep the demo quick: only the first NUM_SAMPLES indices are featurized,
# and they are served in mini-batches of BATCH_SIZE.
NUM_SAMPLES = 20
BATCH_SIZE = 2

samples = list(map(prepare_sample, range(NUM_SAMPLES)))  # small subset
loader = DataLoader(samples, batch_size=BATCH_SIZE, collate_fn=collate_batch)


In [None]:
# Seed PyTorch's global RNG before the model is constructed so that any
# torch-based random initialisation is repeatable across
# "Restart Kernel -> Run All".
SEED = 42
torch.manual_seed(SEED)

# The output layer size is taken from the tokenizer's vocabulary.
# NOTE(review): n_mels=80 presumably matches the feature dimension produced
# by extract_log_mel -- confirm in src.features.
model = ASRModel(
    n_mels=80,
    hidden_dim=256,
    vocab_size=len(tokenizer.char2idx)
)

# The blank index is looked up from the tokenizer rather than hard-coded.
# zero_infinity=True replaces infinite losses (and their gradients) with
# zero; these occur when an input is too short to be aligned to its target.
ctc_loss = torch.nn.CTCLoss(
    blank=tokenizer.char2idx[tokenizer.blank_token],
    zero_infinity=True
)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Per-iteration losses, stored as plain Python floats so they can be
# plotted and averaged later.
loss_history = []

# Put the model in training mode explicitly. The evaluation cell further
# down calls model.eval(); without this line, re-running the training cell
# afterwards would train with the model still in eval mode.
model.train()

for epoch in range(3):
    epoch_loss = 0.0

    for batch in loader:
        # float() accepts either a Python number or a 0-d tensor, so the
        # history never keeps tensors alive, whatever train_step returns.
        loss = float(train_step(model, batch, optimizer, ctc_loss))
        loss_history.append(loss)
        epoch_loss += loss

    # max(..., 1) avoids a ZeroDivisionError if the loader yields no batches.
    print(f"Epoch {epoch+1} | Avg Loss: {epoch_loss / max(len(loader), 1):.4f}")



In [None]:
window = 5  # moving-average width, in iterations
output_path = "../graphs/training_loss.png"

# A 'valid' moving average needs at least `window` points. With fewer,
# np.convolve would swap its operands (or raise on empty input) and the
# result would no longer be a window average, so the curve is skipped.
if len(loss_history) >= window:
    smoothed = np.convolve(loss_history, np.ones(window)/window, mode='valid')
else:
    smoothed = np.array([])

plt.figure(figsize=(8,4))
plt.plot(loss_history, alpha=0.4, label="Raw Loss")
if smoothed.size:
    # smoothed[i] is the mean of iterations i .. i+window-1, so it is drawn
    # at the last iteration of its window to line up with the raw curve.
    smoothed_x = range(window - 1, len(loss_history))
    plt.plot(smoothed_x, smoothed, label="Smoothed Loss", linewidth=2)
plt.legend()
plt.title("Training Loss Curve")
plt.xlabel("Iterations")
plt.ylabel("CTC Loss")
plt.grid(True)

# savefig does not create missing directories; make sure the target exists.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
plt.savefig(output_path)
plt.show()


In [None]:
# Switch the model to evaluation mode and decode one batch as a qualitative
# check; gradients are not needed here.
model.eval()

with torch.no_grad():
    # Only the first batch from the loader is decoded.
    for features, targets, feat_lens, tgt_lens in loader:
        log_probs = model(features)
        predictions = greedy_decode(log_probs, tokenizer)
        break

print("Example Predictions:")
# Show at most five decoded outputs.
for prediction in predictions[:5]:
    print(prediction)


### Observations

- CTC learns to map variable-length audio inputs to variable-length text outputs without requiring explicit frame-level alignments.
- The loss decreases steadily during training, which indicates that optimization is working as intended.
- The model struggles with longer utterances, which suggests it would benefit from:
  - Beam search
  - Language modeling
  - Attention-based encoders
