In [1]:
import os
import sys
import torch
import torchaudio
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import pandas as pd
from tqdm import tqdm
from torch.optim.lr_scheduler import ReduceLROnPlateau
import math
# Device setup for GPU/CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")  # Optional: For debugging
from aux import VocabularyBuilder
import warnings
# Ignore all UserWarnings specifically from torchaudio and torch.nn.modules.transformer
warnings.filterwarnings("ignore", category=UserWarning, message=".*torchaudio.*")
warnings.filterwarnings("ignore", category=UserWarning, message=".*nested tensors.*")


Using device: cuda


In [2]:
# Paths
CSV_PATH = 'geo/train.csv'
VAL_CSV_PATH = 'geo/dev.csv'
DATA_DIR = 'geo/clips'
MAX_LENGTH = 200
SAMPLE_RATE = 16000  # Assuming 16kHz; adjust if needed
N_MELS = 80
HOP_LENGTH = 160  # 10ms hop
WIN_LENGTH = 400  # 25ms window

vocab_builder = VocabularyBuilder(train_csv_path=CSV_PATH, val_csv_path=VAL_CSV_PATH)
CHAR_MAP, INV_CHAR_MAP, VOCAB_SIZE = vocab_builder.build_vocab()

# 3. Display the results
print("--- Vocabulary Generation Successful ---")
print(f"VOCAB_SIZE: {VOCAB_SIZE}")

# Show the characters found (excluding the special tokens)
found_chars = ''.join([INV_CHAR_MAP[i] for i in sorted(INV_CHAR_MAP.keys()) if i >= 3])
print(f"Characters found: {found_chars}")

print("\nCHAR_MAP (First 5 entries):")
print({k: v for i, (k, v) in enumerate(CHAR_MAP.items()) if i < 5})

print("\n--- Next Steps ---")
print(f"The variables CHAR_MAP, INV_CHAR_MAP, and VOCAB_SIZE (={VOCAB_SIZE}) are now set and ready for model training.")

--- Vocabulary Generation Successful ---
VOCAB_SIZE: 35
Characters found:  abcdefghijklmnoprstuvz«»ĉĝĥĵŝŭﬁ

CHAR_MAP (First 5 entries):
{'<PAD>': 0, '<SOS>': 1, '<EOS>': 2, ' ': 3, 'a': 4}

--- Next Steps ---
The variables CHAR_MAP, INV_CHAR_MAP, and VOCAB_SIZE (=35) are now set and ready for model training.


In [3]:
class AudioDataset(Dataset):
    def __init__(self, csv_path):
        self.df = pd.read_csv(csv_path)  # Assumes columns: 'file', 'transcript'

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        audio_path = self.df.iloc[idx]['file']
        transcript = self.df.iloc[idx]['transcript'].strip().lower()

        waveform, sr = torchaudio.load(audio_path)
        if sr != SAMPLE_RATE:
            waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)

        mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, win_length=WIN_LENGTH, n_fft=WIN_LENGTH
        )
        mel_spec = torch.log(mel_transform(waveform) + 1e-9)  # Log-Mel
        mel_spec = mel_spec.squeeze(0).transpose(0, 1)  # (seq_len, n_mels)

        target = [CHAR_MAP['<SOS>']] + [CHAR_MAP[c] for c in transcript if c in CHAR_MAP] + [CHAR_MAP['<EOS>']]
        target = torch.tensor(target, dtype=torch.long)

        return mel_spec, target

def collate_fn(batch):
    mels, targets = zip(*batch)
    mel_lens = torch.tensor([len(m) for m in mels])
    target_lens = torch.tensor([len(t) for t in targets])
    mels_padded = pad_sequence(mels, batch_first=True, padding_value=0)
    targets_padded = pad_sequence(targets, batch_first=True, padding_value=CHAR_MAP['<PAD>'])
    return mels_padded, targets_padded, mel_lens, target_lens

# Model components
class SinusoidalPosEmb(nn.Module):
    def __init__(self, dim=512):
        super().__init__()
        self.dim = dim

    def forward(self, x):
        device = x.device
        half_dim = self.dim // 2
        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, device=device) * -emb)
        emb = x[..., None] * emb[None, :]  
        emb = torch.cat((emb.sin(), emb.cos()), dim=-1)
        return emb

class AudioEncoder(nn.Module):
    def __init__(self, input_dim=80, hidden_dim=512, num_layers=12, num_heads=8):
        super().__init__()
        self.conv_sub = nn.Sequential(
            nn.Conv1d(input_dim, hidden_dim, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(hidden_dim, hidden_dim, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=num_heads, dim_feedforward=2048, dropout=0.3, batch_first=True)
        self.transformer_enc = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)
        self.pos_emb = SinusoidalPosEmb(hidden_dim)

    def forward(self, x, lengths):
        # x: (batch, seq_len, 80)
        x = x.transpose(1, 2)  # For conv1d: (batch, 80, seq_len)
        x = self.conv_sub(x)  # (batch, 256, seq_len//4)
        x = x.transpose(1, 2)  # (batch, seq_len//4, 256)
        seq_len = x.size(1)
        pos = torch.arange(0, seq_len, device=x.device).unsqueeze(0).repeat(x.size(0), 1)
        x = x + self.pos_emb(pos)
        mask = torch.arange(seq_len, device=x.device)[None, :] >= (lengths // 4)[:, None]
        x = self.transformer_enc(x, src_key_padding_mask=mask)
        return x, mask

class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.query_proj = nn.Linear(hidden_dim, hidden_dim)
        self.key_proj = nn.Linear(hidden_dim, hidden_dim)
        self.value_proj = nn.Linear(hidden_dim, hidden_dim)
        self.scale = hidden_dim ** -0.5

    def forward(self, query, keys, values, mask=None):
        q = self.query_proj(query)
        k = self.key_proj(keys)
        v = self.value_proj(values)
        scores = torch.matmul(q, k.transpose(-2, -1)) * self.scale
        if mask is not None:
            scores = scores.masked_fill(mask.unsqueeze(1), -1e9)
        attn = torch.softmax(scores, dim=-1)
        context = torch.matmul(attn, v)
        return context

class RecurrentDecoder(nn.Module):
    def __init__(self, embed_dim=512, hidden_dim=1024, enc_dim=512, vocab_size=VOCAB_SIZE, num_layers=4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=CHAR_MAP['<PAD>'])
        self.rnn = nn.LSTM(embed_dim, hidden_dim, num_layers=num_layers, batch_first=True, dropout=0.1)
        self.memory_proj = nn.Linear(enc_dim, hidden_dim)  
        self.attention = Attention(hidden_dim)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)  # Concat rnn_out + context
        self.pos_emb = SinusoidalPosEmb(embed_dim)
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, tgt, memory, tgt_lengths, memory_mask):
        # tgt: (batch, tgt_len)
        tgt_emb = self.embedding(tgt)
        tgt_seq_len = tgt.size(1)
        pos = torch.arange(0, tgt_seq_len, device=tgt.device).unsqueeze(0).repeat(tgt.size(0), 1)
        tgt_emb = tgt_emb + self.pos_emb(pos)
        packed_tgt = pack_padded_sequence(tgt_emb, tgt_lengths.cpu(), batch_first=True, enforce_sorted=False)
        packed_out, _ = self.rnn(packed_tgt)
        rnn_out, _ = pad_packed_sequence(packed_out, batch_first=True)
        
        # Project memory to match decoder dim
        memory_proj = self.memory_proj(memory)
        
        # Attention: query=rnn_out, key=value=memory_proj
        context = self.attention(rnn_out, memory_proj, memory_proj, memory_mask)
        
        combined = torch.cat((rnn_out, context), dim=-1)
        logits = self.fc(combined)
        return logits

    def decode_step(self, tgt_token, memory_proj, memory_mask, hidden, pos):
        # tgt_token: (batch=1, 1)
        tgt_emb = self.embedding(tgt_token)
        pos_tensor = torch.tensor([[pos]], device=tgt_emb.device).repeat(tgt_emb.size(0), 1)
        tgt_emb = tgt_emb + self.pos_emb(pos_tensor)
        rnn_out, hidden = self.rnn(tgt_emb, hidden)
        context = self.attention(rnn_out, memory_proj, memory_proj, memory_mask)
        combined = torch.cat((rnn_out, context), dim=-1)
        logit = self.fc(combined)
        return logit, hidden

class ASRModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.encoder = AudioEncoder(hidden_dim=512, num_layers=6, num_heads=8)  # Reduced from 12
        self.decoder = RecurrentDecoder(embed_dim=512, hidden_dim=1024, enc_dim=512, num_layers=2)  # Reduced from 4

    def forward(self, src, tgt, src_lengths, tgt_lengths):
        enc_out, enc_mask = self.encoder(src, src_lengths)
        logits = self.decoder(tgt, enc_out, tgt_lengths, enc_mask)
        return logits

    def predict(self, src, src_length, max_length=MAX_LENGTH, beam_width=1):
        # Added optional beam search (greedy if beam_width=1)
        self.eval()
        with torch.no_grad():
            enc_out, enc_mask = self.encoder(src.unsqueeze(0), src_length)
            memory_proj = self.decoder.memory_proj(enc_out)
            if beam_width == 1:
                # Existing greedy code
                hidden = (torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device),
                          torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device))
                tgt_token = torch.tensor([[CHAR_MAP['<SOS>']]], device=src.device)
                transcription = []
                for i in range(max_length):
                    logit, hidden = self.decoder.decode_step(tgt_token, memory_proj, enc_mask, hidden, i)
                    pred_token = logit.argmax(-1).squeeze().item()
                    if pred_token == CHAR_MAP['<EOS>']:
                        break
                    if pred_token in INV_CHAR_MAP:
                        transcription.append(INV_CHAR_MAP[pred_token])
                    tgt_token = torch.tensor([[pred_token]], device=src.device)
                return ''.join(transcription)
            else:
                # Simple beam search implementation
                beams = [{'seq': [CHAR_MAP['<SOS>']], 'score': 0.0, 'hidden': (torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device),
                                                                             torch.zeros(self.decoder.num_layers, 1, self.decoder.hidden_dim, device=src.device))}]
                for step in range(max_length):
                    new_beams = []
                    for beam in beams:
                        tgt_token = torch.tensor([[beam['seq'][-1]]], device=src.device)
                        logit, new_hidden = self.decoder.decode_step(tgt_token, memory_proj, enc_mask, beam['hidden'], step)
                        probs = torch.log_softmax(logit.squeeze(0).squeeze(0), dim=-1)
                        topk_probs, topk_tokens = probs.topk(beam_width)
                        for p, t in zip(topk_probs, topk_tokens):
                            new_seq = beam['seq'] + [t.item()]
                            new_score = beam['score'] + p.item()
                            new_beams.append({'seq': new_seq, 'score': new_score, 'hidden': new_hidden})
                    beams = sorted(new_beams, key=lambda b: b['score'], reverse=True)[:beam_width]
                    if beams[0]['seq'][-1] == CHAR_MAP['<EOS>']:
                        break
                best_seq = beams[0]['seq'][1:]  # Skip <SOS>
                transcription = [INV_CHAR_MAP[t] for t in best_seq if t in INV_CHAR_MAP and t != CHAR_MAP['<EOS>']]
                return ''.join(transcription)

def evaluate(model, dataloader, criterion, device):
    model.eval()  
    total_loss = 0
    with torch.no_grad():  # Disable gradient calculation
        for mels, targets, mel_lens, target_lens in dataloader:
            mels = mels.to(device)
            targets = targets.to(device)
            mel_lens = mel_lens.to(device)
            target_lens = target_lens.to(device)

            input_tgt = targets[:, :-1]
            label = targets[:, 1:]
            input_tgt_lens = target_lens - 1
            
            logits = model(mels, input_tgt, mel_lens, input_tgt_lens)
            loss = criterion(logits.reshape(-1, VOCAB_SIZE), label.reshape(-1))
            total_loss += loss.item()
            
    avg_loss = total_loss / len(dataloader)
    return avg_loss

In [4]:
train_dataset = AudioDataset(CSV_PATH)
val_dataset = AudioDataset(VAL_CSV_PATH)  

BATCH_SIZE = 64
dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn) 

model = ASRModel()
model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.06,momentum=0.8)
criterion = nn.CrossEntropyLoss(ignore_index=CHAR_MAP['<PAD>'])

scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=1e-6)
early_stopping_patience = 7
patience_counter = 3
best_val_loss = math.inf
best_model_path = 'asr_best_model.pth'

num_epochs = 120
CLIP_VALUE = 4.0

In [5]:

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0
    progress_bar = tqdm(enumerate(dataloader), total=len(dataloader), desc=f"Epoch {epoch+1}/{num_epochs} (Train)")
    
    for batch_idx, (mels, targets, mel_lens, target_lens) in progress_bar:
        if total_train_loss != total_train_loss and batch_idx > 0:
            print("\nGradient Explosion detected mid-epoch. Breaking...")
            break
        mels = mels.to(device)
        targets = targets.to(device)
        mel_lens = mel_lens.to(device)
        target_lens = target_lens.to(device)
        
        optimizer.zero_grad()
        
        input_tgt = targets[:, :-1]
        label = targets[:, 1:]
        input_tgt_lens = target_lens - 1

        logits = model(mels, input_tgt, mel_lens, input_tgt_lens)
        loss = criterion(logits.reshape(-1, VOCAB_SIZE), label.reshape(-1))

        if loss.isnan():
            print(f"\nWarning: NaN loss detected in batch {batch_idx+1}. Skipping batch.")
            continue
        if loss.isinf():
            print(f"\nWarning: Inf loss detected in batch {batch_idx+1}. Skipping batch.")
            continue
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_VALUE)
        optimizer.step()
        total_train_loss += loss.item()
        progress_bar.set_postfix(batch_loss=f"{loss.item():.4f}")  

    if total_train_loss == total_train_loss: 
        avg_train_loss = total_train_loss / len(dataloader)
    else:
        avg_train_loss = float('nan')

    avg_val_loss = evaluate(model, val_dataloader, criterion, device)
    
    # Step the Learning Rate Scheduler
    if avg_val_loss == avg_val_loss:
        scheduler.step(avg_val_loss)
    current_lr = optimizer.param_groups[0]['lr']

    # Early Stopping Check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model.state_dict(), best_model_path) 
        status = " (Saving Best Model)"
    else:
        patience_counter += 1
        status = ""
        
    print(f'Epoch {epoch+1} completed, Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}, LR: {current_lr:.6f}{status}')
    
    if patience_counter >= early_stopping_patience:
        print(f"\nEarly stopping triggered after {patience_counter} epochs without improvement on Val Loss.")
        break

if os.path.exists(best_model_path):
    print(f"\nLoading best model weights from {best_model_path} (Val Loss: {best_val_loss:.4f}).")
    model.load_state_dict(torch.load(best_model_path))


Epoch 1/120 (Train): 100%|████████████████| 94/94 [00:43<00:00,  2.14it/s, batch_loss=2.8116]


Epoch 1 completed, Train Loss: 2.9431, Val Loss: 2.8538, LR: 0.060000 (Saving Best Model)


Epoch 2/120 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.5413]


Epoch 2 completed, Train Loss: 2.7136, Val Loss: 2.5740, LR: 0.060000 (Saving Best Model)


Epoch 3/120 (Train): 100%|████████████████| 94/94 [00:41<00:00,  2.24it/s, batch_loss=2.3694]


Epoch 3 completed, Train Loss: 2.4513, Val Loss: 2.3936, LR: 0.060000 (Saving Best Model)


Epoch 4/120 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.3179]


Epoch 4 completed, Train Loss: 2.3213, Val Loss: 2.3020, LR: 0.060000 (Saving Best Model)


Epoch 5/120 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=2.1771]


Epoch 5 completed, Train Loss: 2.2342, Val Loss: 2.2154, LR: 0.060000 (Saving Best Model)


Epoch 6/120 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=2.0855]


Epoch 6 completed, Train Loss: 2.1621, Val Loss: 2.1586, LR: 0.060000 (Saving Best Model)


Epoch 7/120 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=2.0692]


Epoch 7 completed, Train Loss: 2.1056, Val Loss: 2.1019, LR: 0.060000 (Saving Best Model)


Epoch 8/120 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=2.0159]


Epoch 8 completed, Train Loss: 2.0529, Val Loss: 2.0795, LR: 0.060000 (Saving Best Model)


Epoch 9/120 (Train): 100%|████████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=1.9898]


Epoch 9 completed, Train Loss: 2.0092, Val Loss: 2.0267, LR: 0.060000 (Saving Best Model)


Epoch 10/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.16it/s, batch_loss=1.9493]


Epoch 10 completed, Train Loss: 1.9679, Val Loss: 2.0081, LR: 0.060000 (Saving Best Model)


Epoch 11/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.16it/s, batch_loss=1.9088]


Epoch 11 completed, Train Loss: 1.9307, Val Loss: 1.9726, LR: 0.060000 (Saving Best Model)


Epoch 12/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=1.8498]


Epoch 12 completed, Train Loss: 1.8923, Val Loss: 1.9198, LR: 0.060000 (Saving Best Model)


Epoch 13/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.17it/s, batch_loss=1.8963]


Epoch 13 completed, Train Loss: 1.8561, Val Loss: 1.8922, LR: 0.060000 (Saving Best Model)


Epoch 14/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.8216]


Epoch 14 completed, Train Loss: 1.8255, Val Loss: 1.8866, LR: 0.060000 (Saving Best Model)


Epoch 15/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.7569]


Epoch 15 completed, Train Loss: 1.7922, Val Loss: 1.8601, LR: 0.060000 (Saving Best Model)


Epoch 16/120 (Train): 100%|███████████████| 94/94 [00:41<00:00,  2.25it/s, batch_loss=1.7946]


Epoch 16 completed, Train Loss: 1.7646, Val Loss: 1.8265, LR: 0.060000 (Saving Best Model)


Epoch 17/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.7219]


Epoch 17 completed, Train Loss: 1.7349, Val Loss: 1.8145, LR: 0.060000 (Saving Best Model)


Epoch 18/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.6846]


Epoch 18 completed, Train Loss: 1.7073, Val Loss: 1.7759, LR: 0.060000 (Saving Best Model)


Epoch 19/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.24it/s, batch_loss=1.6567]


Epoch 19 completed, Train Loss: 1.6783, Val Loss: 1.7607, LR: 0.060000 (Saving Best Model)


Epoch 20/120 (Train): 100%|███████████████| 94/94 [00:41<00:00,  2.26it/s, batch_loss=1.6427]


Epoch 20 completed, Train Loss: 1.6544, Val Loss: 1.7550, LR: 0.060000 (Saving Best Model)


Epoch 21/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.6725]


Epoch 21 completed, Train Loss: 1.6279, Val Loss: 1.7571, LR: 0.060000


Epoch 22/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.6318]


Epoch 22 completed, Train Loss: 1.6067, Val Loss: 1.7348, LR: 0.060000 (Saving Best Model)


Epoch 23/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.17it/s, batch_loss=1.6055]


Epoch 23 completed, Train Loss: 1.5831, Val Loss: 1.6947, LR: 0.060000 (Saving Best Model)


Epoch 24/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=1.5732]


Epoch 24 completed, Train Loss: 1.5625, Val Loss: 1.6822, LR: 0.060000 (Saving Best Model)


Epoch 25/120 (Train): 100%|███████████████| 94/94 [00:41<00:00,  2.24it/s, batch_loss=1.5254]


Epoch 25 completed, Train Loss: 1.5352, Val Loss: 1.6651, LR: 0.060000 (Saving Best Model)


Epoch 26/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.16it/s, batch_loss=1.5309]


Epoch 26 completed, Train Loss: 1.5157, Val Loss: 1.6666, LR: 0.060000


Epoch 27/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.16it/s, batch_loss=1.5038]


Epoch 27 completed, Train Loss: 1.4927, Val Loss: 1.6527, LR: 0.060000 (Saving Best Model)


Epoch 28/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.4708]


Epoch 28 completed, Train Loss: 1.4743, Val Loss: 1.6515, LR: 0.060000 (Saving Best Model)


Epoch 29/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.4364]


Epoch 29 completed, Train Loss: 1.4506, Val Loss: 1.6212, LR: 0.060000 (Saving Best Model)


Epoch 30/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.24it/s, batch_loss=1.4108]


Epoch 30 completed, Train Loss: 1.4322, Val Loss: 1.6449, LR: 0.060000


Epoch 31/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.3911]


Epoch 31 completed, Train Loss: 1.4137, Val Loss: 1.6143, LR: 0.060000 (Saving Best Model)


Epoch 32/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=1.4014]


Epoch 32 completed, Train Loss: 1.3940, Val Loss: 1.6124, LR: 0.060000 (Saving Best Model)


Epoch 33/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.3242]


Epoch 33 completed, Train Loss: 1.3745, Val Loss: 1.6213, LR: 0.060000


Epoch 34/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=1.3356]


Epoch 34 completed, Train Loss: 1.3543, Val Loss: 1.6066, LR: 0.060000 (Saving Best Model)


Epoch 35/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.4006]


Epoch 35 completed, Train Loss: 1.3350, Val Loss: 1.6157, LR: 0.060000


Epoch 36/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.3045]


Epoch 36 completed, Train Loss: 1.3194, Val Loss: 1.5960, LR: 0.060000 (Saving Best Model)


Epoch 37/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.24it/s, batch_loss=1.2843]


Epoch 37 completed, Train Loss: 1.3021, Val Loss: 1.5933, LR: 0.060000 (Saving Best Model)


Epoch 38/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.2780]


Epoch 38 completed, Train Loss: 1.2807, Val Loss: 1.5681, LR: 0.060000 (Saving Best Model)


Epoch 39/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.2463]


Epoch 39 completed, Train Loss: 1.2572, Val Loss: 1.5772, LR: 0.060000


Epoch 40/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.2347]


Epoch 40 completed, Train Loss: 1.2431, Val Loss: 1.6207, LR: 0.060000


Epoch 41/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=1.2379]


Epoch 41 completed, Train Loss: 1.2174, Val Loss: 1.6046, LR: 0.060000


Epoch 42/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.2019]


Epoch 42 completed, Train Loss: 1.1958, Val Loss: 1.5897, LR: 0.060000


Epoch 43/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=1.1746]


Epoch 43 completed, Train Loss: 1.1714, Val Loss: 1.6347, LR: 0.060000


Epoch 44/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.20it/s, batch_loss=1.1253]


Epoch 44 completed, Train Loss: 1.1529, Val Loss: 1.5524, LR: 0.060000 (Saving Best Model)


Epoch 45/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=1.1040]


Epoch 45 completed, Train Loss: 1.1183, Val Loss: 1.5811, LR: 0.060000


Epoch 46/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.1481]


Epoch 46 completed, Train Loss: 1.0942, Val Loss: 1.5700, LR: 0.060000


Epoch 47/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=1.0706]


Epoch 47 completed, Train Loss: 1.0762, Val Loss: 1.5261, LR: 0.060000 (Saving Best Model)


Epoch 48/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=1.0445]


Epoch 48 completed, Train Loss: 1.0365, Val Loss: 1.5362, LR: 0.060000


Epoch 49/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.19it/s, batch_loss=0.9667]


Epoch 49 completed, Train Loss: 1.0014, Val Loss: 1.5181, LR: 0.060000 (Saving Best Model)


Epoch 50/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=0.9828]


Epoch 50 completed, Train Loss: 0.9687, Val Loss: 1.5121, LR: 0.060000 (Saving Best Model)


Epoch 51/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=0.9229]


Epoch 51 completed, Train Loss: 0.9318, Val Loss: 1.4874, LR: 0.060000 (Saving Best Model)


Epoch 52/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=0.8916]


Epoch 52 completed, Train Loss: 0.8934, Val Loss: 1.4920, LR: 0.060000


Epoch 53/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=0.9525]


Epoch 53 completed, Train Loss: 0.8631, Val Loss: 1.5131, LR: 0.060000


Epoch 54/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=0.7929]


Epoch 54 completed, Train Loss: 0.8264, Val Loss: 1.5212, LR: 0.060000


Epoch 55/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=0.7590]


Epoch 55 completed, Train Loss: 0.7810, Val Loss: 1.4691, LR: 0.060000 (Saving Best Model)


Epoch 56/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=0.7668]


Epoch 56 completed, Train Loss: 0.7436, Val Loss: 1.4853, LR: 0.060000


Epoch 57/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=0.6871]


Epoch 57 completed, Train Loss: 0.7094, Val Loss: 1.4643, LR: 0.060000 (Saving Best Model)


Epoch 58/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.21it/s, batch_loss=0.6459]


Epoch 58 completed, Train Loss: 0.6666, Val Loss: 1.4466, LR: 0.060000 (Saving Best Model)


Epoch 59/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.22it/s, batch_loss=0.7228]


Epoch 59 completed, Train Loss: 0.6476, Val Loss: 1.4825, LR: 0.060000


Epoch 60/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=0.6052]


Epoch 60 completed, Train Loss: 0.5941, Val Loss: 1.4766, LR: 0.060000


Epoch 61/120 (Train): 100%|███████████████| 94/94 [00:41<00:00,  2.25it/s, batch_loss=0.5738]


Epoch 61 completed, Train Loss: 0.5673, Val Loss: 1.5147, LR: 0.060000


Epoch 62/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=0.5770]


Epoch 62 completed, Train Loss: 0.5230, Val Loss: 1.5309, LR: 0.060000


Epoch 63/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.18it/s, batch_loss=0.5036]


Epoch 63 completed, Train Loss: 0.5123, Val Loss: 1.4629, LR: 0.060000


Epoch 64/120 (Train): 100%|███████████████| 94/94 [00:43<00:00,  2.17it/s, batch_loss=0.5054]


Epoch 64 completed, Train Loss: 0.4719, Val Loss: 1.4868, LR: 0.030000


Epoch 65/120 (Train): 100%|███████████████| 94/94 [00:42<00:00,  2.23it/s, batch_loss=0.3493]


Epoch 65 completed, Train Loss: 0.3798, Val Loss: 1.4898, LR: 0.030000

Early stopping triggered after 7 epochs without improvement on Val Loss.

Loading best model weights from asr_best_model.pth (Val Loss: 1.4466).


In [6]:
import zipfile
from tqdm import tqdm
MODEL_PATH = 'asr_best_model.pth'
TEST_CSV_PATH = "geo/dev.csv"
model = ASRModel().to(device)
model.load_state_dict(torch.load(MODEL_PATH, map_location=device))
# Load test CSV
test_df = pd.read_csv(TEST_CSV_PATH)
predicted_df = test_df.copy()
predicted_df['transcript'] = predicted_df['transcript'].astype('object') # Fix dtype upfront
# Predict for each row with progress and error handling
for idx, row in tqdm(predicted_df.iterrows(), total=len(predicted_df), desc="Predicting transcripts"):
    try:
        audio_path = row['file']
        waveform, sr = torchaudio.load(audio_path)
        if sr != SAMPLE_RATE:
            waveform = torchaudio.transforms.Resample(sr, SAMPLE_RATE)(waveform)
        mel_transform = torchaudio.transforms.MelSpectrogram(
            sample_rate=SAMPLE_RATE, n_mels=N_MELS, hop_length=HOP_LENGTH, win_length=WIN_LENGTH, n_fft=WIN_LENGTH
        )
        mel_spec = torch.log(mel_transform(waveform) + 1e-9).squeeze(0).transpose(0, 1)
        src_length = torch.tensor([mel_spec.size(0)])
        prediction = model.predict(mel_spec.to(device), src_length.to(device), beam_width=3)
        predicted_df.at[idx, 'transcript'] = prediction
    except Exception as e:
        print(f"Error processing index {idx} (file: {audio_path}): {e}")
        predicted_df.at[idx, 'transcript'] = '' # Set empty on error
# Verify DataFrame before saving
print("\nSample of predicted_df:")
print(predicted_df.head())
# Ensure transcript column contains only strings
predicted_df['transcript'] = predicted_df['transcript'].fillna('').astype(str)
# Save updated CSV
output_csv = 'predicted_dev.csv'
predicted_df.to_csv(output_csv, index=False)

Predicting transcripts: 100%|████████████████████████████| 1000/1000 [01:14<00:00, 13.38it/s]


Sample of predicted_df:
                  file                                         transcript
0  geo/clips/dev_0.wav  ili betez grizgaditam betomum giu lumgaz lin l...
1  geo/clips/dev_1.wav  por naljoma gaj baldaŭ truviĝas imtirmacea luĝ...
2  geo/clips/dev_2.wav                             li vareĝiz teez tirazi
3  geo/clips/dev_3.wav                    zid em la lamdoj eztez elegteta
4  geo/clips/dev_4.wav  gaj la ĝemerala magcimto em la jomaj gumzirfeztoj





In [7]:
import pandas as pd
from jiwer import wer, cer

df_gt = pd.read_csv("geo/dev.csv")               # ground truth
df_pred = pd.read_csv("predicted_dev.csv")   # predictions

# Ensure both are sorted the same way (optional but recommended)
df_gt = df_gt.sort_values("file").reset_index(drop=True)
df_pred = df_pred.sort_values("file").reset_index(drop=True)

df_gt['transcript'] = df_gt['transcript'].fillna('').astype(str)
df_pred['transcript'] = df_pred['transcript'].fillna('').astype(str)

gt_texts = df_gt["transcript"].tolist()
pred_texts = df_pred["transcript"].tolist()


overall_wer = wer(gt_texts, pred_texts)
overall_cer = cer(gt_texts, pred_texts)

print("WER:", overall_wer)
print("CER:", overall_cer)


WER: 1.039442545358927
CER: 0.7458841847365649
