In [None]:
import sys
# Put the relative directory 'src' at the front of the module search path.
sys.path.insert(0, 'src')

In [2]:
import os
import pickle
import time
import yaml
import numpy as np
import torch
from edit_distance import SequenceMatcher
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [None]:
import math
import numbers
import torch
from torch import nn
from torch.nn import functional as F


class WhiteNoise(nn.Module):
    """
    Adds an independent Gaussian noise value to every element of the input.

    The noise is applied unconditionally (the module's training flag is not consulted).

    Arguments:
        std (float): Scale multiplied onto the standard-normal noise sample
    """

    def __init__(self, std=0.1):
        super().__init__()
        self.std = std

    def forward(self, x):
        # randn_like keeps the shape, dtype and device of x.
        return x + self.std * torch.randn_like(x)

class MeanDriftNoise(nn.Module):
    """
    Adds one Gaussian offset per column of a 2-D input, shared by every row.

    The offset is drawn once per call with shape (1, C) and broadcast over the
    first dimension, so all rows of the same call receive the same shift.

    Arguments:
        std (float): Scale multiplied onto the standard-normal offset sample
    """

    def __init__(self, std=0.1):
        super().__init__()
        self.std = std

    def forward(self, x):
        # The input must be exactly 2-D; the unpacking raises ValueError otherwise.
        _, C = x.shape
        # Draw the offset on the input's own device (and float dtype) so the
        # addition also works when x lives on an accelerator.
        noise_dtype = x.dtype if x.is_floating_point() else None
        noise = torch.randn(1, C, device=x.device, dtype=noise_dtype) * self.std
        return x + noise

class FeatureMasking(nn.Module):
    """
    Zeroes individual elements of the input at random while the module is in training mode.

    Every element is masked independently, so the mask has the same shape as the input.
    In eval mode, or when mask_prob is not positive, the input is returned unchanged.

    Arguments:
        mask_prob (float): Probability of masking each individual feature value (0.0 to 1.0)
    """

    def __init__(self, mask_prob=0.1):
        super().__init__()
        self.mask_prob = mask_prob

    def forward(self, x):
        # Pass-through: hand back the very same tensor object, no copy.
        if not self.training or self.mask_prob <= 0:
            return x
        # Uniform draw per element; True marks an element to be dropped.
        drop = torch.rand_like(x) < self.mask_prob
        return x.masked_fill(drop, 0)

class GaussianSmoothing(nn.Module):
    """
    Smooths every channel of a 1-D, 2-D or 3-D signal with the same fixed Gaussian kernel.

    The kernel is built once in the constructor, normalised to sum to 1, copied
    once per channel and stored as the (non-trainable) buffer ``weight``.  The
    forward pass is a grouped convolution with ``groups=channels`` and
    ``padding="same"``, so channels are filtered independently.

    Arguments:
        channels (int): Number of channels of the input; also the number of conv groups
        kernel_size (int or sequence): Kernel size; a single number is used for every dimension
        sigma (float or sequence): Gaussian standard deviation; a single number is used for
            every dimension.  Must be non-zero because the kernel formula divides by it.
        dim (int): Number of spatial dimensions (1, 2 or 3)

    Raises:
        RuntimeError: If ``dim`` is not 1, 2 or 3
    """

    def __init__(self, channels, kernel_size, sigma, dim=2):
        super(GaussianSmoothing, self).__init__()
        if isinstance(kernel_size, numbers.Number):
            kernel_size = [kernel_size] * dim
        if isinstance(sigma, numbers.Number):
            sigma = [sigma] * dim

        # The N-d kernel is the product of one 1-D Gaussian per dimension.
        kernel = 1
        # indexing="ij" is what meshgrid did implicitly; passing it explicitly
        # silences the "indexing argument will be required" warning.
        meshgrids = torch.meshgrid(
            [torch.arange(size, dtype=torch.float32) for size in kernel_size],
            indexing="ij",
        )
        for size, std, mgrid in zip(kernel_size, sigma, meshgrids):
            mean = (size - 1) / 2
            kernel *= (
                1
                / (std * math.sqrt(2 * math.pi))
                * torch.exp(-(((mgrid - mean) / std) ** 2) / 2)
            )

        # Normalise so that the kernel values sum to 1.
        kernel = kernel / torch.sum(kernel)

        # Reshape to the depthwise conv weight layout: (channels, 1, *kernel_size).
        kernel = kernel.view(1, 1, *kernel.size())
        kernel = kernel.repeat(channels, *[1] * (kernel.dim() - 1))

        self.register_buffer("weight", kernel)
        self.groups = channels

        if dim == 1:
            self.conv = F.conv1d
        elif dim == 2:
            self.conv = F.conv2d
        elif dim == 3:
            self.conv = F.conv3d
        else:
            raise RuntimeError(
                "Only 1, 2 and 3 dimensions are supported. Received {}.".format(dim)
            )

    def forward(self, input):
        """Filter ``input`` (channels on dimension 1) and return a tensor of the same shape."""
        return self.conv(input, weight=self.weight, groups=self.groups, padding="same")


In [4]:
from torch.utils.data import Dataset


class SpeechDataset(Dataset):
    """
    Flat, trial-indexed view over a sequence of per-day recording dicts.

    Each entry of ``data`` must provide the keys "sentenceDat", "phonemes" and
    "phoneLens", indexed by trial.  All trials of all days are concatenated in
    day order; the position of a day in ``data`` becomes its day index.

    Arguments:
        data (sequence of dict): One dict per recording day
        transform (callable, optional): Applied to the float32 feature tensor of a trial
    """

    def __init__(self, data, transform=None):
        self.data = data
        self.transform = transform
        self.n_days = len(data)
        self.n_trials = sum(len(day_data["sentenceDat"]) for day_data in data)

        self.neural_feats = []
        self.phone_seqs = []
        self.neural_time_bins = []
        self.phone_seq_lens = []
        self.days = []
        for day, day_data in enumerate(data):
            for trial, feats in enumerate(day_data["sentenceDat"]):
                self.neural_feats.append(feats)
                self.phone_seqs.append(day_data["phonemes"][trial])
                # Number of rows of the feature matrix = number of time bins.
                self.neural_time_bins.append(feats.shape[0])
                self.phone_seq_lens.append(day_data["phoneLens"][trial])
                self.days.append(day)

    def __len__(self):
        return self.n_trials

    def __getitem__(self, idx):
        """Return (features, phoneme ids, n time bins, n phonemes, day index) as tensors."""
        feats = torch.tensor(self.neural_feats[idx], dtype=torch.float32)
        if self.transform:
            feats = self.transform(feats)

        phones = torch.tensor(self.phone_seqs[idx], dtype=torch.int32)
        n_bins = torch.tensor(self.neural_time_bins[idx], dtype=torch.int32)
        n_phones = torch.tensor(self.phone_seq_lens[idx], dtype=torch.int32)
        day = torch.tensor(self.days[idx], dtype=torch.int64)
        return (feats, phones, n_bins, n_phones, day)

In [None]:
import torch
from torch import nn

class GRUDecoder(nn.Module):
    """
    GRU sequence decoder with a learned per-day affine input transform.

    Forward pass, in order:
      1. Gaussian smoothing of the input along time (per channel).
      2. Day-specific ``neural_dim x neural_dim`` weight and bias, selected by
         ``dayIdx``, followed by a Softsign non-linearity.
      3. Unfolding into overlapping windows of ``kernelLen`` time steps taken
         every ``strideLen`` steps; each window is flattened into one GRU input.
      4. A (optionally bidirectional) GRU stack started from a zero state.
      5. A linear projection of every GRU output step to ``n_classes + 1`` logits
         (the extra unit is presumably the CTC blank — confirm against the loss).

    Arguments:
        neural_dim (int): Number of input features per time step
        n_classes (int): Number of output classes, excluding the extra unit
        hidden_dim (int): GRU hidden size
        layer_dim (int): Number of stacked GRU layers
        nDays (int): Number of distinct day indices (size of the per-day parameter tables)
        dropout (float): Dropout between GRU layers
        device (str): Stored as ``self.device`` for callers; the forward pass takes the
            device from its input instead
        strideLen (int): Step between consecutive windows
        kernelLen (int): Number of time steps per window
        gaussianSmoothWidth (float): Sigma handed to GaussianSmoothing.  NOTE(review):
            GaussianSmoothing divides by sigma, so the default of 0 looks unusable — confirm.
        bidirectional (bool): Whether the GRU runs in both directions
    """

    def __init__(
        self,
        neural_dim,
        n_classes,
        hidden_dim,
        layer_dim,
        nDays=24,
        dropout=0,
        device="cuda",
        strideLen=4,
        kernelLen=14,
        gaussianSmoothWidth=0,
        bidirectional=False,
    ):
        super(GRUDecoder, self).__init__()

        self.layer_dim = layer_dim
        self.hidden_dim = hidden_dim
        self.neural_dim = neural_dim
        self.n_classes = n_classes
        self.nDays = nDays
        self.device = device
        self.dropout = dropout
        self.strideLen = strideLen
        self.kernelLen = kernelLen
        self.gaussianSmoothWidth = gaussianSmoothWidth
        self.bidirectional = bidirectional
        self.inputLayerNonlinearity = torch.nn.Softsign()
        # Unfold works on 4-D input; a (kernelLen, 1) kernel slides along time only.
        self.unfolder = torch.nn.Unfold(
            (self.kernelLen, 1), dilation=1, padding=0, stride=self.strideLen
        )
        self.gaussianSmoother = GaussianSmoothing(
            neural_dim, 20, self.gaussianSmoothWidth, dim=1
        )
        self.dayWeights = torch.nn.Parameter(torch.randn(nDays, neural_dim, neural_dim))
        self.dayBias = torch.nn.Parameter(torch.zeros(nDays, 1, neural_dim))

        # Start every day's transform at the identity (the randn values are overwritten).
        for x in range(nDays):
            self.dayWeights.data[x, :, :] = torch.eye(neural_dim)

        self.gru_decoder = nn.GRU(
            (neural_dim) * self.kernelLen,
            hidden_dim,
            layer_dim,
            batch_first=True,
            dropout=self.dropout,
            bidirectional=self.bidirectional,
        )

        for name, param in self.gru_decoder.named_parameters():
            if "weight_hh" in name:
                nn.init.orthogonal_(param)
            if "weight_ih" in name:
                nn.init.xavier_uniform_(param)

        # Per-day Linear layers.  forward() never calls them; they are kept because
        # they are registered parameters and therefore part of the state_dict keys.
        for x in range(nDays):
            setattr(self, "inpLayer" + str(x), nn.Linear(neural_dim, neural_dim))

        for x in range(nDays):
            thisLayer = getattr(self, "inpLayer" + str(x))
            thisLayer.weight = torch.nn.Parameter(
                thisLayer.weight + torch.eye(neural_dim)
            )

        if self.bidirectional:
            self.fc_decoder_out = nn.Linear(hidden_dim * 2, n_classes + 1)
        else:
            self.fc_decoder_out = nn.Linear(hidden_dim, n_classes + 1)

    def forward(self, neuralInput, dayIdx):
        """
        Arguments:
            neuralInput (Tensor): (batch, time, neural_dim) features
            dayIdx (Tensor): (batch,) integer day index of every sequence

        Returns:
            Tensor: (batch, n_windows, n_classes + 1) logits, one row per window
        """
        # The smoother convolves over the last dimension, so move time there and back.
        neuralInput = torch.permute(neuralInput, (0, 2, 1))
        neuralInput = self.gaussianSmoother(neuralInput)
        neuralInput = torch.permute(neuralInput, (0, 2, 1))

        # Gather each sequence's day-specific weight and bias, then apply them.
        dayWeights = torch.index_select(self.dayWeights, 0, dayIdx)
        transformedNeural = torch.einsum(
            "btd,bdk->btk", neuralInput, dayWeights
        ) + torch.index_select(self.dayBias, 0, dayIdx)
        transformedNeural = self.inputLayerNonlinearity(transformedNeural)

        # (batch, time, dim) -> (batch, dim, time, 1) -> unfold -> (batch, n_windows, dim * kernelLen)
        stridedInputs = torch.permute(
            self.unfolder(
                torch.unsqueeze(torch.permute(transformedNeural, (0, 2, 1)), 3)
            ),
            (0, 2, 1),
        )

        # Zero initial state, created on the input's own device so the module keeps
        # working after .to(...) regardless of the `device` string given at construction.
        num_directions = 2 if self.bidirectional else 1
        h0 = torch.zeros(
            self.layer_dim * num_directions,
            stridedInputs.size(0),
            self.hidden_dim,
            device=stridedInputs.device,
            dtype=stridedInputs.dtype,
        )

        hid, _ = self.gru_decoder(stridedInputs, h0)

        seq_out = self.fc_decoder_out(hid)
        return seq_out

In [6]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the YAML run configuration, the pickled dataset and the output folder.
CONFIG_PATH = "/content/drive/MyDrive/ECE_243A/final.yaml"
DATASET_PATH = os.path.expanduser("/content/drive/MyDrive/ECE_243A/ptDecoder_ctc")
OUTPUT_DIR = os.path.expanduser("/content/drive/MyDrive/ECE_243A")
# Use the GPU when one is attached; otherwise fall back to the CPU instead of
# failing at the first `.to(DEVICE)`.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [8]:
# Load config: parse the YAML file at CONFIG_PATH into `config` with the safe loader.
with open(CONFIG_PATH, 'r') as f:
    config = yaml.safe_load(f)

In [9]:
# Print a short summary of the hyperparameters read from the config file.
print(f"\nLoaded configuration from: {CONFIG_PATH}")
print(f"Model: {config['nLayers']} layers, {config['nUnits']} units, bidirectional={config['bidirectional']}")
print(f"Training: {config['nBatch']} batches, batch_size={config['batchSize']}")
print(f"Optimizer: SGD (momentum={config['momentum']}, nesterov={config['useNesterov']})")
print(f"Learning rate: {config['lrStart']} → {config['lrEnd']}")


Loaded configuration from: /content/drive/MyDrive/ECE_243A/final.yaml
Model: 5 layers, 1024 units, bidirectional=True
Training: 15000 batches, batch_size=64
Optimizer: SGD (momentum=0.9, nesterov=True)
Learning rate: 0.02 → 0.005


In [None]:
# Import LM module
# from neural_decoder.phoneme_lm import PhonemeLM, beam_search_decode, create_phoneme_map
# kenlm is optional.  Record whether its bindings imported so that PhonemeLM can
# raise its own clear error; without the False branch the flag would be undefined.
try:
    import kenlm
    LM_AVAILABLE = True
except ImportError:
    LM_AVAILABLE = False
    print("KenLM not available.")

class PhonemeLM:
    """
    Wrapper around a KenLM model that scores sequences of phoneme ids.

    Ids are translated to token strings through ``phoneme_map`` (ids missing from
    the map become "PH<id>"), joined with single spaces and scored without
    begin/end-of-sentence markers.  Scores are memoised per id sequence.

    Arguments:
        lm_path (str): Path of the language model file handed to kenlm.Model
        phoneme_map (dict, optional): Maps integer phoneme id -> token string

    Raises:
        RuntimeError: If the kenlm bindings could not be imported
        FileNotFoundError: If ``lm_path`` does not exist
    """

    def __init__(self, lm_path: str, phoneme_map: dict = None):
        if not LM_AVAILABLE:
            raise RuntimeError("kenlm python bindings not available. Install kenlm to use phoneme LM.")
        if not os.path.exists(lm_path):
            raise FileNotFoundError(f"LM file not found: {lm_path}")
        self.model = kenlm.Model(lm_path)
        self.phoneme_map = phoneme_map if phoneme_map else {}
        self._score_cache = {}

    @staticmethod
    def _tokens_to_str(tokens):
        """Join tokens into the space-separated sentence format scored by the model."""
        separator = " "
        return separator.join(tokens)

    def id_sequence_to_tokens(self, id_seq):
        """Translate phoneme ids to token strings, synthesising "PH<id>" for unknown ids."""
        def to_token(raw_id):
            phoneme_id = int(raw_id)
            token = self.phoneme_map.get(phoneme_id, None)
            return f"PH{phoneme_id}" if token is None else token

        return [to_token(raw_id) for raw_id in id_seq]

    def score(self, id_seq):
        """Return the model score of ``id_seq`` as a float, using the cache when possible."""
        key = tuple(id_seq)
        if key not in self._score_cache:
            sentence = self._tokens_to_str(self.id_sequence_to_tokens(id_seq))
            raw = self.model.score(sentence, bos=False, eos=False)
            self._score_cache[key] = float(raw)
        return self._score_cache[key]

    def clear_cache(self):
        """Forget every memoised score."""
        self._score_cache.clear()


def beam_search(log_probs, lm_wrapper: "PhonemeLM" = None, lm_weight=0.8, beam_width=8, blank_id=0, topk_acoustic=5):
    """
    Decode one utterance from frame-wise CTC log-probabilities, optionally with an LM.

    The search keeps the ``beam_width`` best hypotheses per frame.  A hypothesis is
    a pair (collapsed label prefix, label emitted at the previous frame); tracking
    the previous frame label is what lets the CTC collapse be applied correctly
    while searching:

      * blank                      -> prefix unchanged
      * same label as last frame   -> prefix unchanged (a repeat of the same emission)
      * any other label            -> label appended to the prefix

    so "a, blank, a" decodes to [a, a] while "a, a" decodes to [a].  The LM is only
    queried when the prefix grows, i.e. on collapsed sequences.  Hypotheses that
    reach the same state are merged by keeping the better score (best-path
    approximation; alignment probabilities are not summed).

    Arguments:
        log_probs (Tensor or array-like): (frames, classes) log-probabilities
        lm_wrapper (PhonemeLM, optional): Object with ``score(id_sequence) -> float``
        lm_weight (float): Multiplier on the LM score in the combined score
        beam_width (int): Number of hypotheses kept after every frame (>= 1)
        blank_id (int): Class index of the CTC blank
        topk_acoustic (int): Only the this-many most likely classes of a frame are
            expanded; a non-positive value expands every class

    Returns:
        list[int]: Decoded label ids with blanks and repeats removed

    Raises:
        ValueError: If ``log_probs`` is non-empty but not 2-D, or ``beam_width`` < 1
    """
    if isinstance(log_probs, torch.Tensor):
        lp = log_probs.detach().cpu().numpy()
    else:
        lp = np.asarray(log_probs)

    if lp.size == 0:
        return []
    if lp.ndim != 2:
        raise ValueError(f"log_probs must be 2-D (frames, classes); got shape {lp.shape}")
    if beam_width < 1:
        raise ValueError("beam_width must be >= 1")

    n_classes = lp.shape[1]
    n_candidates = n_classes if topk_acoustic <= 0 else min(topk_acoustic, n_classes)

    # (prefix tuple, previous frame label) -> (combined score, acoustic score, LM score)
    beams = {((), blank_id): (0.0, 0.0, 0.0)}
    for frame in lp:
        # Most likely classes of this frame, best first.
        candidates = np.argsort(frame)[-n_candidates:][::-1]
        expanded = {}
        for (prefix, last), (_, a_score, l_score) in beams.items():
            for idx in candidates:
                idx = int(idx)
                new_a = a_score + float(frame[idx])
                if idx == blank_id or idx == last:
                    # Blank or continued emission: the collapsed prefix does not grow.
                    new_prefix, new_l = prefix, l_score
                else:
                    new_prefix = prefix + (idx,)
                    new_l = lm_wrapper.score(new_prefix) if lm_wrapper is not None else 0.0

                combined = new_a + lm_weight * new_l
                state = (new_prefix, idx)
                if state not in expanded or combined > expanded[state][0]:
                    expanded[state] = (combined, new_a, new_l)

        kept = sorted(expanded.items(), key=lambda kv: kv[1][0], reverse=True)[:beam_width]
        beams = dict(kept)

    (best_prefix, _), _ = max(beams.items(), key=lambda kv: kv[1][0])
    return list(best_prefix)

def load_phoneme_map(path, n_classes):
    """
    Build the phoneme id -> token mapping used by the language model wrapper.

    The file is read as UTF-8 with one token per line; blank lines are skipped and
    the remaining lines are numbered from 1.  Ids in 1..n_classes that the file does
    not cover get the synthetic token "PH<id>".  If ``path`` is None, missing or
    unreadable, the mapping is entirely synthetic (best effort: a message is
    printed instead of raising).

    Arguments:
        path (str or None): Location of the phoneme map file
        n_classes (int): Highest id that must be present in the result

    Returns:
        dict[int, str]: Mapping from 1-based phoneme id to token
    """
    def synthetic():
        return {i: f"PH{i}" for i in range(1, n_classes + 1)}

    if path is None:
        return synthetic()
    if not os.path.exists(path):
        print(f"Phoneme map path not found: {path}. Falling back to synthetic tokens.")
        return synthetic()

    try:
        with open(path, "r", encoding="utf-8") as f:
            tokens = [line.strip() for line in f if line.strip()]
    except (OSError, UnicodeDecodeError) as exc:
        # Keep the fallback, but say why the file was ignored.
        print(f"Could not read phoneme map {path}: {exc}. Falling back to synthetic tokens.")
        return synthetic()

    mapping = dict(enumerate(tokens, start=1))
    for i in range(1, n_classes + 1):
        mapping.setdefault(i, f"PH{i}")
    return mapping

# Load the language model
LM_PATH = "/content/drive/MyDrive/ECE_243A/phoneme_lm.arpa"
PHONEME_MAP_PATH = "/content/drive/MyDrive/ECE_243A/phoneme_map.txt"
# NOTE(review): the class count 40 is hard-coded; presumably it should agree with
# config['nClasses'] — confirm.
phoneme_map = load_phoneme_map(PHONEME_MAP_PATH, 40)
lm = PhonemeLM(LM_PATH, phoneme_map=phoneme_map)

# LM weight and beam width passed to beam_search by the periodic evaluation in the training loop.
LM_WEIGHT = 0.6  
BEAM_WIDTH = 10 

print(f"Language Model loaded from: {LM_PATH}")

Language Model loaded from: /content/drive/MyDrive/ECE_243A/phoneme_lm.arpa


============================================================================
TRAINING CODE
============================================================================

In [11]:
def collate_fn(batch):
    """
    Merge a list of dataset items into one padded batch.

    Every item is a 5-tuple (features, labels, feature length, label length, day).
    Features and labels are right-padded with zeros to the longest sequence in the
    batch (batch dimension first); the three per-item scalars are stacked.

    Returns:
        tuple: (padded features, padded labels, feature lengths, label lengths, days)
    """
    feats, labels, feat_lens, label_lens, day_ids = zip(*batch)
    padded_feats = pad_sequence(feats, batch_first=True, padding_value=0)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=0)
    stacked = tuple(torch.stack(group) for group in (feat_lens, label_lens, day_ids))
    return (padded_feats, padded_labels, *stacked)

In [12]:
# Create output directory (exist_ok=True: no error when it is already there)
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [13]:
# Set seed: both the torch and the NumPy global RNG are seeded with config['seed']
torch.manual_seed(config['seed'])
np.random.seed(config['seed'])

In [14]:
# Load data
# SECURITY: pickle.load executes arbitrary code embedded in the file; only point
# DATASET_PATH at a file from a trusted source.
print(f"\nLoading data from: {DATASET_PATH}")
with open(DATASET_PATH, "rb") as f:
    data = pickle.load(f)


Loading data from: /content/drive/MyDrive/ECE_243A/ptDecoder_ctc


In [15]:
# Wrap the "train" and "test" entries of the loaded data; no transform is applied.
train_ds = SpeechDataset(data["train"])
test_ds = SpeechDataset(data["test"])

In [16]:
# Settings shared by both loaders; only the shuffling differs between them.
_loader_kwargs = dict(
    batch_size=config['batchSize'],
    num_workers=0,
    pin_memory=True,
    collate_fn=collate_fn,
)
# Training batches are drawn in shuffled order, test batches in dataset order.
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **_loader_kwargs)

In [17]:
# Report the number of trials in each split.
print(f"Train samples: {len(train_ds)}")
print(f"Test samples: {len(test_ds)}")

Train samples: 8780
Test samples: 880


In [18]:
# Create model
# Every hyperparameter comes from the config; the number of per-day input
# transforms is the number of entries in data["train"].
print(f"\nCreating model...")
model = GRUDecoder(
    neural_dim=config['nInputFeatures'],
    n_classes=config['nClasses'],
    hidden_dim=config['nUnits'],
    layer_dim=config['nLayers'],
    nDays=len(data["train"]),
    dropout=config['dropout'],
    device=DEVICE,
    strideLen=config['strideLen'],
    kernelLen=config['kernelLen'],
    gaussianSmoothWidth=config['gaussianSmoothWidth'],
    bidirectional=config['bidirectional'],
).to(DEVICE)


Creating model...


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


In [19]:
# Loss and optimizer
# CTC loss with class 0 as the blank; zero_infinity=True replaces infinite losses by zero.
loss_ctc = torch.nn.CTCLoss(blank=0, reduction="mean", zero_infinity=True)
optimizer = torch.optim.SGD(
    model.parameters(),
    lr=config['lrStart'],
    momentum=config['momentum'],
    nesterov=config['useNesterov'],
    weight_decay=config['l2_decay'],
)
# NOTE(review): the decay factor is a fixed literal and config['lrEnd'] is never
# read, although the config summary prints "lrStart → lrEnd" — confirm that the
# final learning rate reached by this schedule is the intended one.
scheduler = torch.optim.lr_scheduler.ExponentialLR(
    optimizer,
    gamma=0.9995
)

In [20]:
# Training loop
# Banner only; the loop itself is in a later cell.
print(f"\nStarting training for {config['nBatch']} batches...")
print("=" * 70)


Starting training for 15000 batches...


In [21]:
# History of the periodic test evaluations, written to trainingStats.pkl by the loop.
test_loss_list = []
test_cer_list = []
# Lowest error rate seen so far; None until the first evaluation has run.
best_cer = None
# Reference point for the time-per-batch figure printed at every evaluation.
start_time = time.time()

In [22]:
# Main optimisation loop: one SGD step per iteration, plus an LM-assisted
# evaluation on (at most) the first 20 test batches every 500 steps.
for batch_idx in range(config['nBatch']):
    model.train()

    # Get batch.
    # NOTE(review): a fresh iterator is created every step, so each step takes the
    # first batch of a new shuffle instead of walking through an epoch.
    X, y, X_len, y_len, day_idx = next(iter(train_loader))
    X, y, X_len, y_len, day_idx = (
        X.to(DEVICE),
        y.to(DEVICE),
        X_len.to(DEVICE),
        y_len.to(DEVICE),
        day_idx.to(DEVICE),
    )

    # Augmentation
    if config['whiteNoiseSD'] > 0:
        # Independent noise on every element.
        X += torch.randn(X.shape, device=DEVICE) * config['whiteNoiseSD']
    if config['constantOffsetSD'] > 0:
        # One offset per (sequence, feature), constant over time.
        X += torch.randn([X.shape[0], 1, X.shape[2]], device=DEVICE) * config['constantOffsetSD']
    if config.get('featureMaskProb', 0) > 0:
        # Feature masking: randomly zero out individual feature values
        mask = torch.rand_like(X) < config['featureMaskProb']
        X = X.masked_fill(mask, 0)

    # Forward
    pred = model.forward(X, day_idx)
    loss = loss_ctc(
        torch.permute(pred.log_softmax(2), [1, 0, 2]),  # CTCLoss wants (time, batch, classes)
        y,
        ((X_len - config['kernelLen']) / config['strideLen']).to(torch.int32),
        y_len,
    )
    loss = torch.sum(loss)

    # Backward
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    scheduler.step()

    # Evaluation every 500 batches (less frequent to save time)
    if batch_idx % 500 == 0:
        with torch.no_grad():
            model.eval()
            all_loss = []
            total_edit = 0
            total_len = 0

            for eval_idx, (X, y, X_len, y_len, test_day_idx) in enumerate(test_loader):
                if eval_idx >= 20:
                    break

                X, y, X_len, y_len, test_day_idx = (
                    X.to(DEVICE),
                    y.to(DEVICE),
                    X_len.to(DEVICE),
                    y_len.to(DEVICE),
                    test_day_idx.to(DEVICE),
                )

                pred = model.forward(X, test_day_idx)
                # Normalise once: both the CTC loss and beam_search take
                # log-probabilities, not raw logits.
                log_probs = pred.log_softmax(2)
                # CTC-adjusted frame lengths
                adjusted_lens = ((X_len - config['kernelLen']) / config['strideLen']).to(torch.int32)

                loss = loss_ctc(
                    torch.permute(log_probs, [1, 0, 2]),
                    y,
                    adjusted_lens,
                    y_len,
                )
                loss = torch.sum(loss)
                all_loss.append(loss.cpu().item())

                for i in range(pred.shape[0]):
                    T = adjusted_lens[i].item()

                    # LM-aware beam search over the valid frames of this sequence
                    decoded = beam_search(
                        log_probs[i, :T, :],
                        lm_wrapper=lm,
                        lm_weight=LM_WEIGHT,
                        beam_width=BEAM_WIDTH,
                        blank_id=0
                    )

                    # Convert to lists for edit distance
                    decoded = list(decoded)
                    target = y[i, :y_len[i]].cpu().tolist()

                    matcher = SequenceMatcher(a=target, b=decoded)
                    total_edit += matcher.distance()
                    total_len += len(target)

            avg_loss = np.sum(all_loss) / len(all_loss)
            cer = total_edit / total_len

            # Wall-clock time since the previous evaluation, averaged over its 500 steps.
            elapsed = (time.time() - start_time) / 500 if batch_idx > 0 else 0.0
            print(f"batch {batch_idx:5d}, ctc loss: {avg_loss:.4f}, cer: {cer:.4f}, time/batch: {elapsed:.3f}s")
            start_time = time.time()

            # Save stats
            test_loss_list.append(avg_loss)
            test_cer_list.append(cer)

            # Track best model
            if best_cer is None or cer < best_cer:
                best_cer = cer
                torch.save(model.state_dict(), os.path.join(OUTPUT_DIR, "modelWeights.pt"))
                print(f"  → New best CER: {cer:.4f}, model saved!")

            # Write stats
            stats = {
                "testLoss": np.array(test_loss_list),
                "testCER": np.array(test_cer_list),
            }
            with open(os.path.join(OUTPUT_DIR, "trainingStats.pkl"), "wb") as f:
                pickle.dump(stats, f)

  return self.conv(input, weight=self.weight, groups=self.groups, padding="same")


KeyboardInterrupt: 

In [None]:
# Final summary: best test error rate and where the weights and statistics were written.
print("\n" + "=" * 70)
print("TRAINING COMPLETE!")
print("=" * 70)
print(f"Best CER: {best_cer:.4f} ({best_cer*100:.2f}%)")
print(f"Model saved to: {OUTPUT_DIR}/modelWeights.pt")
print(f"Stats saved to: {OUTPUT_DIR}/trainingStats.pkl")
print("=" * 70)

# Language Model Evaluation

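Next, we evaluate a saved model on the test set twice: once with greedy CTC decoding (the baseline) and once with beam search guided by a phoneme language model, to measure how much the language model reduces the error rate.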

In [23]:
# Install KenLM if not already installed
# NOTE(review): an earlier cell already imports kenlm and builds the language
# model, so on a fresh kernel this install runs too late to help that cell; the
# archive URL is also not pinned to a version.
try:
    import kenlm
    print("✓ KenLM already installed")
except ImportError:
    print("Installing KenLM...")
    !pip install https://github.com/kpu/kenlm/archive/master.zip
    import kenlm
    print("✓ KenLM installed successfully")

✓ KenLM already installed


In [35]:
# Evaluate with Language Model
print("=" * 70)
print("EVALUATING WITH LANGUAGE MODEL")
print("=" * 70)

# Checkpoint under evaluation and the architecture it was saved with.  These are
# literals rather than config['nUnits'] / config['nLayers'], so they are named here
# to make a mismatch with the training configuration visible.
EVAL_CHECKPOINT_PATH = "/content/drive/MyDrive/ECE_243A/GRU-Opt-BEST.pt"
EVAL_HIDDEN_DIM = 512
EVAL_LAYER_DIM = 3

model_test = GRUDecoder(
    neural_dim=config['nInputFeatures'],
    n_classes=config['nClasses'],
    hidden_dim=EVAL_HIDDEN_DIM,
    layer_dim=EVAL_LAYER_DIM,
    nDays=len(data["train"]),
    dropout=config['dropout'],
    device=DEVICE,
    strideLen=config['strideLen'],
    kernelLen=config['kernelLen'],
    gaussianSmoothWidth=config['gaussianSmoothWidth'],
    bidirectional=config['bidirectional'],
).to(DEVICE)
model_test.load_state_dict(torch.load(EVAL_CHECKPOINT_PATH, map_location="cpu"))

model_test.eval()
all_predictions_baseline = []
all_predictions_lm = []
all_targets = []

LM_WEIGHT_TEST = 0.2
BEAM_WIDTH_TEST = 8

print(f"LM Weight: {LM_WEIGHT_TEST}")
print(f"Beam Width: {BEAM_WIDTH_TEST}")
print()

with torch.no_grad():
    for batch_idx, (X, y, X_len, y_len, day_idx) in enumerate(test_loader):
        X = X.to(DEVICE)
        day_idx = day_idx.to(DEVICE)

        # Forward pass
        logits = model_test(X, day_idx)
        # Number of valid output frames per sequence after windowing.
        lengths = ((X_len - config['kernelLen']) / config['strideLen']).long()
        log_probs = torch.log_softmax(logits, dim=-1)

        # Decode each sample in batch
        for i in range(len(y)):
            seq_len = int(lengths[i])
            lp = log_probs[i, :seq_len, :]  # [T, V]

            # Baseline: Greedy decoding (no LM).  tolist() yields plain ints so the
            # baseline, the beam-search output and the targets share one element type.
            greedy = torch.argmax(lp, dim=-1).cpu().tolist()
            decoded_baseline = []
            prev = None
            for tok in greedy:
                if tok == prev or tok == 0:  # Skip repeats and blank
                    prev = tok
                    continue
                decoded_baseline.append(tok)
                prev = tok
            all_predictions_baseline.append(decoded_baseline)

            # With LM: Beam search
            decoded_lm = beam_search(
                lp,
                lm_wrapper=lm,
                lm_weight=LM_WEIGHT_TEST,
                beam_width=BEAM_WIDTH_TEST,
                blank_id=0,
                topk_acoustic=5
            )
            all_predictions_lm.append(decoded_lm)

            # Get target (the first y_len[i] entries; the rest is padding)
            target = y[i, :y_len[i]].cpu().numpy().tolist()
            all_targets.append(target)

        if (batch_idx + 1) % 10 == 0:
            print(f"  Processed {batch_idx + 1}/{len(test_loader)} batches")

print("✓ Evaluation complete!")

EVALUATING WITH LANGUAGE MODEL
LM Weight: 0.2
Beam Width: 8

  Processed 10/14 batches
✓ Evaluation complete!


In [1]:
# Compute CER/PER
def compute_error_rate(predictions, targets):
    """
    Corpus-level error rate: total edit distance divided by total target length.

    Predictions and targets are paired positionally.  Returns 0.0 when the targets
    contain no tokens at all.
    """
    edits = 0
    reference_tokens = 0
    for hypothesis, reference in zip(predictions, targets):
        edits += SequenceMatcher(a=reference, b=hypothesis).distance()
        reference_tokens += len(reference)
    if reference_tokens > 0:
        return edits / reference_tokens
    return 0.0

cer_baseline = compute_error_rate(all_predictions_baseline, all_targets)
cer_lm = compute_error_rate(all_predictions_lm, all_targets)
# Relative improvement in percent.  It is undefined for a baseline error of zero;
# report 0.0 in that case instead of raising ZeroDivisionError.
improvement = (cer_baseline - cer_lm) / cer_baseline * 100 if cer_baseline > 0 else 0.0

print(f"\nBaseline (Greedy):")
print(f"  CER/PER: {cer_baseline:.4f} ({cer_baseline*100:.2f}%)")
print(f"\nWith Language Model:")
print(f"  CER/PER: {cer_lm:.4f} ({cer_lm*100:.2f}%)")
print(f"  Improvement: {improvement:.2f}% relative")
print(f"  Absolute gain: {(cer_baseline - cer_lm)*100:.2f} percentage points")
print(f"Language Model improved accuracy by {improvement:.1f}%")

NameError: name 'all_predictions_baseline' is not defined