In [1]:
import os
import sys
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import pandas as pd
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math
# Device setup for GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Optional: For debugging
from aux import VocabularyBuilder
import warnings
# Ignore all UserWarnings specifically from torchaudio and torch.nn.modules.transformer
warnings.filterwarnings("ignore", category=UserWarning, message=".*torchaudio.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*nested tensors.*")


Using device: cuda


In [2]:
# Paths
CSV_PATH = 'geo/train.csv'
VAL_CSV_PATH = 'geo/dev.csv'
DATA_DIR = 'geo/clips'
MAX_LENGTH = 200
SAMPLE_RATE = 16000  # Assuming 16kHz; adjust if needed
N_MELS = 80
HOP_LENGTH = 160  # 10ms hop
WIN_LENGTH = 400  # 25ms window

vocab_builder = VocabularyBuilder(train_csv_path=CSV_PATH, val_csv_path=VAL_CSV_PATH)
CHAR_MAP, INV_CHAR_MAP, VOCAB_SIZE = vocab_builder.build_vocab()

# 3. Display the results
print("--- Vocabulary Generation Successful ---")
print(f"VOCAB_SIZE: {VOCAB_SIZE}")

# Show the characters found (excluding the special tokens)
found_chars = ''.join([INV_CHAR_MAP[i] for i in sorted(INV_CHAR_MAP.keys()) if i >= 3])
print(f"Characters found: {found_chars}")

print("\nCHAR_MAP (First 5 entries):")
print({k: v for i, (k, v) in enumerate(CHAR_MAP.items()) if i < 5})

print("\n--- Next Steps ---")
print(f"The variables CHAR_MAP, INV_CHAR_MAP, and VOCAB_SIZE (={VOCAB_SIZE}) are now set and ready for model training.")

--- Vocabulary Generation Successful ---
VOCAB_SIZE: 35
Characters found:  abcdefghijklmnoprstuvz«»ĉĝĥĵŝŭﬁ

CHAR_MAP (First 5 entries):
{'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, ' ': 3, 'a': 4}

--- Next Steps ---
The variables CHAR_MAP, INV_CHAR_MAP, and VOCAB_SIZE (=35) are now set and ready for model training.


In [3]:
class AudioDataset(Dataset):
    def __init__(self, csv_path):
        self.df = pd.read_csv(csv_path)  # Assumes columns: 'file', 'transcript'

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['file']
        transcript = self.df.iloc[idx]['transcript'].strip().lower()

        waveform, sr = torchaudio.load(audio_path)
        if sr != SAMPLE_RATE:
            waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)

        mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, win_length=WIN_LENGTH, n_fft=WIN_LENGTH
        )
        mel_spec = torch.log(mel_transform(waveform) + 1e-9)  # Log-Mel
        mel_spec = mel_spec.squeeze(0).transpose(0, 1)  # (seq_len, n_mels)

        target = [CHAR_MAP['<SOS>']] + [CHAR_MAP[c] for c in transcript if c in CHAR_MAP] + [CHAR_MAP['<EOS>']]
        target = torch.tensor(target, dtype=torch.long)

        return mel_spec, target

def collate_fn(batch):
    mels, targets = zip(*batch)
    mel_lens = torch.tensor([len(m) for m in mels])
    target_lens = torch.tensor([len(t) for t in targets])
    mels_padded = pad_sequence(mels, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=CHAR_MAP['<PAD>'])
    return mels_padded, targets_padded, mel_lens, target_lens

# Model components
class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim=512):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[..., None] * emb[None, :]  
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb

class AudioEncoder(nn.Module):
    def __init__(self, input_dim=80, hidden_dim=512, num_layers=12, num_heads=8):
        super().__init__()
        self.conv_sub = nn.Sequential(
            nn.Conv1d(input_dim, hidden_dim, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=2048, dropout=0.3, batch_first=True)
        self.transformer_enc = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pos_emb = SinusoidalPosEmb(hidden_dim)

    def forward(self, x, lengths):
        # x: (batch, seq_len, 80)
        x = x.transpose(1, 2)  # For conv1d: (batch, 80, seq_len)
        x = self.conv_sub(x)  # (batch, 256, seq_len//4)
        x = x.transpose(1, 2)  # (batch, seq_len//4, 256)
        seq_len = x.size(1)
        pos = torch.arange(0, seq_len, device=x.device).unsqueeze(0).repeat(x.size(0), 1)
        x = x + self.pos_emb(pos)
        mask = torch.arange(seq_len, device=x.device)[None, :] >= (lengths // 4)[:, None]
        x = self.transformer_enc(x, src_key_padding_mask=mask)
        return x, mask

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.query_proj = nn.Linear(hidden_dim, hidden_dim)
        self.key_proj = nn.Linear(hidden_dim, hidden_dim)
        self.value_proj = nn.Linear(hidden_dim, hidden_dim)
        self.scale = hidden_dim ** -0.5

    def forward(self, query, keys, values, mask=None):
        q = self.query_proj(query)
        k = self.key_proj(keys)
        v = self.value_proj(values)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        if mask is not None:
            scores = scores.masked_fill(mask.unsqueeze(1), -1e9)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, v)
        return context

class RecurrentDecoder(nn.Module):
    def __init__(self, embed_dim=512, hidden_dim=1024, enc_dim=512, vocab_size=VOCAB_SIZE, num_layers=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=CHAR_MAP['<PAD>'])
        self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=0.1)
        self.memory_proj = nn.Linear(enc_dim, hidden_dim)  
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)  # Concat rnn_out + context
        self.pos_emb = SinusoidalPosEmb(embed_dim)
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, tgt, memory, tgt_lengths, memory_mask):
        # tgt: (batch, tgt_len)
        tgt_emb = self.embedding(tgt)
        tgt_seq_len = tgt.size(1)
        pos = torch.arange(0, tgt_seq_len, device=tgt.device).unsqueeze(0).repeat(tgt.size(0), 1)
        tgt_emb = tgt_emb + self.pos_emb(pos)
        packed_tgt = pack_padded_sequence(tgt_emb, tgt_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed_tgt)
        rnn_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        
        # Project memory to match decoder dim
        memory_proj = self.memory_proj(memory)
        
        # Attention: query=rnn_out, key=value=memory_proj
        context = self.attention(rnn_out, memory_proj, memory_proj, memory_mask)
        
        combined = torch.cat((rnn_out, context), dim=-1)
        logits = self.fc(combined)
        return logits

    def decode_step(self, tgt_token, memory_proj, memory_mask, hidden, pos):
        # tgt_token: (batch=1, 1)
        tgt_emb = self.embedding(tgt_token)
        pos_tensor = torch.tensor([[pos]], device=tgt_emb.device).repeat(tgt_emb.size(0), 1)
        tgt_emb = tgt_emb + self.pos_emb(pos_tensor)
        rnn_out, hidden = self.rnn(tgt_emb, hidden)
        context = self.attention(rnn_out, memory_proj, memory_proj, memory_mask)
        combined = torch.cat((rnn_out, context), dim=-1)
        logit = self.fc(combined)
        return logit, hidden

class ASRModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AudioEncoder(hidden_dim=512, num_layers=6, num_heads=8)  # Reduced from 12
        self.decoder = RecurrentDecoder(embed_dim=512, hidden_dim=1024, enc_dim=512, num_layers=2)  # Reduced from 4

    def forward(self, src, tgt, src_lengths, tgt_lengths):
        enc_out, enc_mask = self.encoder(src, src_lengths)
        logits = self.decoder(tgt, enc_out, tgt_lengths, enc_mask)
        return logits

    def predict(self, src, src_length, max_length=MAX_LENGTH, beam_width=1):
        # Added optional beam search (greedy if beam_width=1)
        self.eval()
        with torch.no_grad():
            enc_out, enc_mask = self.encoder(src.unsqueeze(0), src_length)
            memory_proj = self.decoder.memory_proj(enc_out)
            if beam_width == 1:
                # Existing greedy code
                hidden = (torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device),
                          torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device))
                tgt_token = torch.tensor([[CHAR_MAP['<SOS>']]], device=src.device)
                transcription = []
                for i in range(max_length):
                    logit, hidden = self.decoder.decode_step(tgt_token, memory_proj, enc_mask, hidden, i)
                    pred_token = logit.argmax(-1).squeeze().item()
                    if pred_token == CHAR_MAP['<EOS>']:
                        break
                    if pred_token in INV_CHAR_MAP:
                        transcription.append(INV_CHAR_MAP[pred_token])
                    tgt_token = torch.tensor([[pred_token]], device=src.device)
                return ''.join(transcription)
            else:
                # Simple beam search implementation
                beams = [{'seq': [CHAR_MAP['<SOS>']], 'score': 0.0, 'hidden': (torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device),
                                                                             torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device))}]
                for step in range(max_length):
                    new_beams = []
                    for beam in beams:
                        tgt_token = torch.tensor([[beam['seq'][-1]]], device=src.device)
                        logit, new_hidden = self.decoder.decode_step(tgt_token, memory_proj, enc_mask, beam['hidden'], step)
                        probs = torch.log_softmax(logit.squeeze(0).squeeze(0), dim=-1)
                        topk_probs, topk_tokens = probs.topk(beam_width)
                        for p, t in zip(topk_probs, topk_tokens):
                            new_seq = beam['seq'] + [t.item()]
                            new_score = beam['score'] + p.item()
                            new_beams.append({'seq': new_seq, 'score': new_score, 'hidden': new_hidden})
                    beams = sorted(new_beams, key=lambda b: b['score'], reverse=True)[:beam_width]
                    if beams[0]['seq'][-1] == CHAR_MAP['<EOS>']:
                        break
                best_seq = beams[0]['seq'][1:]  # Skip <SOS>
                transcription = [INV_CHAR_MAP[t] for t in best_seq if t in INV_CHAR_MAP and t != CHAR_MAP['<EOS>']]
                return ''.join(transcription)

def evaluate(model, dataloader, criterion, device):
    model.eval()  
    total_loss = 0
    with torch.no_grad():  # Disable gradient calculation
        for mels, targets, mel_lens, target_lens in dataloader:
            mels = mels.to(device)
            targets = targets.to(device)
            mel_lens = mel_lens.to(device)
            target_lens = target_lens.to(device)

            input_tgt = targets[:, :-1]
            label = targets[:, 1:]
            input_tgt_lens = target_lens - 1
            
            logits = model(mels, input_tgt, mel_lens, input_tgt_lens)
            loss = criterion(logits.reshape(-1, VOCAB_SIZE), label.reshape(-1))
            total_loss += loss.item()
            
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [6]:
train_dataset = AudioDataset(CSV_PATH)
val_dataset = AudioDataset(VAL_CSV_PATH)  

BATCH_SIZE = 64
dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn) 

model = ASRModel()
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.05,momentum=0.8)
criterion = nn.CrossEntropyLoss(ignore_index=CHAR_MAP['<PAD>'])

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6)
early_stopping_patience = 7
patience_counter = 3
best_val_loss = math.inf
best_model_path = 'asr_best_model.pth'

num_epochs = 70
CLIP_VALUE = 2.0

In [7]:

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{num_epochs} (Train)")
    
    for batch_idx, (mels, targets, mel_lens, target_lens) in progress_bar:
        if total_train_loss != total_train_loss and batch_idx > 0:
            print("\nGradient Explosion detected mid-epoch. Breaking...")
            break
        mels = mels.to(device)
        targets = targets.to(device)
        mel_lens = mel_lens.to(device)
        target_lens = target_lens.to(device)
        
        optimizer.zero_grad()
        
        input_tgt = targets[:, :-1]
        label = targets[:, 1:]
        input_tgt_lens = target_lens - 1

        logits = model(mels, input_tgt, mel_lens, input_tgt_lens)
        loss = criterion(logits.reshape(-1, VOCAB_SIZE), label.reshape(-1))

        if loss.isnan():
            print(f"\nWarning: NaN loss detected in batch {batch_idx+1}. Skipping batch.")
            continue
        if loss.isinf():
            print(f"\nWarning: Inf loss detected in batch {batch_idx+1}. Skipping batch.")
            continue
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)
        optimizer.step()
        total_train_loss += loss.item()
        progress_bar.set_postfix(batch_loss=f"{loss.item():.4f}")  

    if total_train_loss == total_train_loss: 
        avg_train_loss = total_train_loss / len(dataloader)
    else:
        avg_train_loss = float('nan')

    avg_val_loss = evaluate(model, val_dataloader, criterion, device)
    
    # Step the Learning Rate Scheduler
    if avg_val_loss == avg_val_loss:
        scheduler.step(avg_val_loss)
    current_lr = optimizer.param_groups[0]['lr']

    # Early Stopping Check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), best_model_path) 
        status = " (Saving Best Model)"
    else:
        patience_counter += 1
        status = ""
        
    print(f'Epoch {epoch+1} completed, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, LR: {current_lr:.6f}{status}')
    
    if patience_counter >= early_stopping_patience:
        print(f"\nEarly stopping triggered after {patience_counter} epochs without improvement on Val Loss.")
        break

if os.path.exists(best_model_path):
    print(f"\nLoading best model weights from {best_model_path} (Val Loss: {best_val_loss:.4f}).")
    model.load_state_dict(torch.load(best_model_path))


Epoch 1/50 (Train): 100%|█████████████████| 94/94 [00:43<00:00,  2.15it/s, batch_loss=2.9288]


Epoch 1 completed, Train Loss: 2.9573, Val Loss: 2.9212, LR: 0.050000 (Saving Best Model)


Epoch 2/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.8124]


Epoch 2 completed, Train Loss: 2.8571, Val Loss: 2.8184, LR: 0.050000 (Saving Best Model)


Epoch 3/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=2.6259]


Epoch 3 completed, Train Loss: 2.7165, Val Loss: 2.6490, LR: 0.050000 (Saving Best Model)


Epoch 4/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=2.4410]


Epoch 4 completed, Train Loss: 2.5406, Val Loss: 2.4842, LR: 0.050000 (Saving Best Model)


Epoch 5/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=2.3704]


Epoch 5 completed, Train Loss: 2.4117, Val Loss: 2.3991, LR: 0.050000 (Saving Best Model)


Epoch 6/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.2747]


Epoch 6 completed, Train Loss: 2.3386, Val Loss: 2.3390, LR: 0.050000 (Saving Best Model)


Epoch 7/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.2230]


Epoch 7 completed, Train Loss: 2.2798, Val Loss: 2.2871, LR: 0.050000 (Saving Best Model)


Epoch 8/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.1780]


Epoch 8 completed, Train Loss: 2.2319, Val Loss: 2.2455, LR: 0.050000 (Saving Best Model)


Epoch 9/50 (Train): 100%|█████████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=2.1595]


Epoch 9 completed, Train Loss: 2.1877, Val Loss: 2.2103, LR: 0.050000 (Saving Best Model)


Epoch 10/50 (Train): 100%|████████████████| 94/94 [00:43<00:00,  2.19it/s, batch_loss=2.1134]


Epoch 10 completed, Train Loss: 2.1480, Val Loss: 2.1702, LR: 0.050000 (Saving Best Model)


Epoch 11/50 (Train): 100%|████████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=2.0470]


Epoch 11 completed, Train Loss: 2.1118, Val Loss: 2.1311, LR: 0.050000 (Saving Best Model)


Epoch 12/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=2.0395]


Epoch 12 completed, Train Loss: 2.0806, Val Loss: 2.0969, LR: 0.050000 (Saving Best Model)


Epoch 13/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.0340]


Epoch 13 completed, Train Loss: 2.0479, Val Loss: 2.0775, LR: 0.050000 (Saving Best Model)


Epoch 14/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.24it/s, batch_loss=2.0346]


Epoch 14 completed, Train Loss: 2.0203, Val Loss: 2.0645, LR: 0.050000 (Saving Best Model)


Epoch 15/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.9875]


Epoch 15 completed, Train Loss: 1.9980, Val Loss: 2.0401, LR: 0.050000 (Saving Best Model)


Epoch 16/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.8912]


Epoch 16 completed, Train Loss: 1.9690, Val Loss: 2.0092, LR: 0.050000 (Saving Best Model)


Epoch 17/50 (Train): 100%|████████████████| 94/94 [00:43<00:00,  2.15it/s, batch_loss=1.9159]


Epoch 17 completed, Train Loss: 1.9453, Val Loss: 1.9838, LR: 0.050000 (Saving Best Model)


Epoch 18/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.9084]


Epoch 18 completed, Train Loss: 1.9240, Val Loss: 1.9815, LR: 0.050000 (Saving Best Model)


Epoch 19/50 (Train): 100%|████████████████| 94/94 [00:43<00:00,  2.15it/s, batch_loss=1.8507]


Epoch 19 completed, Train Loss: 1.9018, Val Loss: 1.9465, LR: 0.050000 (Saving Best Model)


Epoch 20/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.9019]


Epoch 20 completed, Train Loss: 1.8821, Val Loss: 1.9335, LR: 0.050000 (Saving Best Model)


Epoch 21/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.8213]


Epoch 21 completed, Train Loss: 1.8606, Val Loss: 1.9250, LR: 0.050000 (Saving Best Model)


Epoch 22/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.8256]


Epoch 22 completed, Train Loss: 1.8402, Val Loss: 1.9273, LR: 0.050000


Epoch 23/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.7919]


Epoch 23 completed, Train Loss: 1.8187, Val Loss: 1.8752, LR: 0.050000 (Saving Best Model)


Epoch 24/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.8003]


Epoch 24 completed, Train Loss: 1.8023, Val Loss: 1.8697, LR: 0.050000 (Saving Best Model)


Epoch 25/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.7600]


Epoch 25 completed, Train Loss: 1.7836, Val Loss: 1.8799, LR: 0.050000


Epoch 26/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.24it/s, batch_loss=1.7236]


Epoch 26 completed, Train Loss: 1.7637, Val Loss: 1.8420, LR: 0.050000 (Saving Best Model)


Epoch 27/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.7028]


Epoch 27 completed, Train Loss: 1.7465, Val Loss: 1.8248, LR: 0.050000 (Saving Best Model)


Epoch 28/50 (Train): 100%|████████████████| 94/94 [00:41<00:00,  2.25it/s, batch_loss=1.7130]


Epoch 28 completed, Train Loss: 1.7263, Val Loss: 1.8271, LR: 0.050000


Epoch 29/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.7201]


Epoch 29 completed, Train Loss: 1.7111, Val Loss: 1.8287, LR: 0.050000


Epoch 30/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.6770]


Epoch 30 completed, Train Loss: 1.6957, Val Loss: 1.8171, LR: 0.050000 (Saving Best Model)


Epoch 31/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.6812]


Epoch 31 completed, Train Loss: 1.6809, Val Loss: 1.8255, LR: 0.050000


Epoch 32/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.7477]


Epoch 32 completed, Train Loss: 1.6678, Val Loss: 1.7707, LR: 0.050000 (Saving Best Model)


Epoch 33/50 (Train): 100%|████████████████| 94/94 [00:43<00:00,  2.15it/s, batch_loss=1.5990]


Epoch 33 completed, Train Loss: 1.6487, Val Loss: 1.7811, LR: 0.050000


Epoch 34/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=1.6452]


Epoch 34 completed, Train Loss: 1.6281, Val Loss: 1.7456, LR: 0.050000 (Saving Best Model)


Epoch 35/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.5539]


Epoch 35 completed, Train Loss: 1.6154, Val Loss: 1.7573, LR: 0.050000


Epoch 36/50 (Train): 100%|████████████████| 94/94 [00:41<00:00,  2.24it/s, batch_loss=1.6469]


Epoch 36 completed, Train Loss: 1.6031, Val Loss: 1.7295, LR: 0.050000 (Saving Best Model)


Epoch 37/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.5994]


Epoch 37 completed, Train Loss: 1.5850, Val Loss: 1.7430, LR: 0.050000


Epoch 38/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.6169]


Epoch 38 completed, Train Loss: 1.5692, Val Loss: 1.7249, LR: 0.050000 (Saving Best Model)


Epoch 39/50 (Train): 100%|████████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=1.5662]


Epoch 39 completed, Train Loss: 1.5591, Val Loss: 1.7236, LR: 0.050000 (Saving Best Model)


Epoch 40/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.5419]


Epoch 40 completed, Train Loss: 1.5419, Val Loss: 1.7088, LR: 0.050000 (Saving Best Model)


Epoch 41/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=1.5100]


Epoch 41 completed, Train Loss: 1.5280, Val Loss: 1.7368, LR: 0.050000


Epoch 42/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.5429]


Epoch 42 completed, Train Loss: 1.5157, Val Loss: 1.6834, LR: 0.050000 (Saving Best Model)


Epoch 43/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.5096]


Epoch 43 completed, Train Loss: 1.5006, Val Loss: 1.6844, LR: 0.050000


Epoch 44/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.4315]


Epoch 44 completed, Train Loss: 1.4880, Val Loss: 1.6631, LR: 0.050000 (Saving Best Model)


Epoch 45/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.5272]


Epoch 45 completed, Train Loss: 1.4741, Val Loss: 1.7190, LR: 0.050000


Epoch 46/50 (Train): 100%|████████████████| 94/94 [00:41<00:00,  2.26it/s, batch_loss=1.4638]


Epoch 46 completed, Train Loss: 1.4626, Val Loss: 1.6932, LR: 0.050000


Epoch 47/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.4318]


Epoch 47 completed, Train Loss: 1.4488, Val Loss: 1.6580, LR: 0.050000 (Saving Best Model)


Epoch 48/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.4290]


Epoch 48 completed, Train Loss: 1.4392, Val Loss: 1.6520, LR: 0.050000 (Saving Best Model)


Epoch 49/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.3999]


Epoch 49 completed, Train Loss: 1.4204, Val Loss: 1.6603, LR: 0.050000


Epoch 50/50 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.3810]


Epoch 50 completed, Train Loss: 1.4101, Val Loss: 1.6575, LR: 0.050000

Loading best model weights from asr_best_model.pth (Val Loss: 1.6520).


In [8]:
import zipfile
from tqdm import tqdm
MODEL_PATH = 'asr_best_model.pth'
TEST_CSV_PATH = "geo/dev.csv"
model = ASRModel().to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# Load test CSV
test_df = pd.read_csv(TEST_CSV_PATH)
predicted_df = test_df.copy()
predicted_df['transcript'] = predicted_df['transcript'].astype('object') # Fix dtype upfront
# Predict for each row with progress and error handling
for idx, row in tqdm(predicted_df.iterrows(), total=len(predicted_df), desc="Predicting transcripts"):
    try:
        audio_path = row['file']
        waveform, sr = torchaudio.load(audio_path)
        if sr != SAMPLE_RATE:
            waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)
        mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, win_length=WIN_LENGTH, n_fft=WIN_LENGTH
        )
        mel_spec = torch.log(mel_transform(waveform) + 1e-9).squeeze(0).transpose(0, 1)
        src_length = torch.tensor([mel_spec.size(0)])
        prediction = model.predict(mel_spec.to(device), src_length.to(device), beam_width=3)
        predicted_df.at[idx, 'transcript'] = prediction
    except Exception as e:
        print(f"Error processing index {idx} (file: {audio_path}): {e}")
        predicted_df.at[idx, 'transcript'] = '' # Set empty on error
# Verify DataFrame before saving
print("\nSample of predicted_df:")
print(predicted_df.head())
# Ensure transcript column contains only strings
predicted_df['transcript'] = predicted_df['transcript'].fillna('').astype(str)
# Save updated CSV
output_csv = 'predicted_dev.csv'
predicted_df.to_csv(output_csv, index=False)

Predicting transcripts: 100%|████████████████████████████| 1000/1000 [01:10<00:00, 14.25it/s]


Sample of predicted_df:
                  file                                         transcript
0  geo/clips/dev_0.wav  teo eztaz tri la tinpo de la tinpo de la gunom...
1  geo/clips/dev_1.wav  pozti la blej gunomonu eztaz gunzederataj de l...
2  geo/clips/dev_2.wav                                  ĝi eztaz zenilita
3  geo/clips/dev_3.wav                     ĝi eztaz zenjuru de la zceemco
4  geo/clips/dev_4.wav  ĝi eztaz ĉive ĉive gaj la ĉiverzamtojm ĉive ĉi...





In [11]:
import pandas as pd
from jiwer import wer, cer

df_gt = pd.read_csv("geo/dev.csv")               # ground truth
df_pred = pd.read_csv("predicted_dev.csv")   # predictions

# Ensure both are sorted the same way (optional but recommended)
df_gt = df_gt.sort_values("file").reset_index(drop=True)
df_pred = df_pred.sort_values("file").reset_index(drop=True)

df_gt['transcript'] = df_gt['transcript'].fillna('').astype(str)
df_pred['transcript'] = df_pred['transcript'].fillna('').astype(str)

gt_texts = df_gt["transcript"].tolist()
pred_texts = df_pred["transcript"].tolist()


overall_wer = wer(gt_texts, pred_texts)
overall_cer = cer(gt_texts, pred_texts)

print("WER:", overall_wer)
print("CER:", overall_cer)


WER: 1.104522745201157
CER: 0.7869850937650265
