# 🎯 AEMER - Accent Detection Model Training (16 Accents)

**Architecture:** CNN-BiLSTM with Attention

**Dataset:** Mozilla Common Voice (English) — Real speech data

**Output Classes:** 16 English accents worldwide

**Author:** Sanjula Sunath | w1999522


## 1️⃣ Setup & Dependencies


In [None]:
!pip install torch torchaudio librosa pandas numpy scikit-learn tqdm matplotlib seaborn datasets --quiet

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import librosa
import os
import random
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Download Speech Accent Archive from Kaggle
!pip install kagglehub --quiet
import kagglehub

print("üì• Downloading Speech Accent Archive from Kaggle...")
dataset_path = kagglehub.dataset_download("rtatman/speech-accent-archive")
print(f"‚úÖ Dataset downloaded to: {dataset_path}")

import os
print("\nFiles:")
for f in os.listdir(dataset_path):
    print(f"  {f}")

# Locate the directory that actually contains the audio recordings.
#
# A directory only qualifies if it directly contains at least one audio file.
# Matching on the directory name alone is not enough: if the archive nests the
# clips one level deeper (e.g. "recordings/recordings/" -- layout not verified
# here), the outer directory would be selected and zero files found.
AUDIO_EXTENSIONS = ('.mp3', '.wav')

recordings_dir = None
_fallback_dir = None  # first audio-bearing directory without "recording" in its name
for root, _dirs, files in os.walk(dataset_path):
    # Case-insensitive so that e.g. "CLIP.MP3" is recognised as audio too.
    if not any(name.lower().endswith(AUDIO_EXTENSIONS) for name in files):
        continue
    if 'recording' in os.path.basename(root).lower():
        recordings_dir = root
        break
    if _fallback_dir is None:
        _fallback_dir = root

# No "recording*" directory held audio: use the first directory that did
# (this covers audio files sitting directly in dataset_path).
if recordings_dir is None:
    recordings_dir = _fallback_dir

print(f"üìÇ Recordings directory: {recordings_dir}")
if recordings_dir:
    audio_files = [
        name for name in os.listdir(recordings_dir)
        if name.lower().endswith(AUDIO_EXTENSIONS)
    ]
    print(f"   Found {len(audio_files)} audio files")

In [None]:
# Build accent dataset from Speech Accent Archive metadata
import pandas as pd
import glob

# Find the CSV metadata file anywhere below the dataset root.
# Sorted so the file picked is deterministic when several CSVs are present.
csv_files = sorted(glob.glob(os.path.join(dataset_path, '**', '*.csv'), recursive=True))
print(f"Found CSV files: {csv_files}")

# Fail with an explicit message instead of an opaque IndexError on csv_files[0].
if not csv_files:
    raise FileNotFoundError(f"No CSV metadata file found under {dataset_path!r}")

# Load metadata
df = pd.read_csv(csv_files[0])
print(f"\nMetadata shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print(f"\nNative languages (top 20):")
print(df['native_language'].value_counts().head(20))

# Map native languages to our accent classes.
#
# The table is written accent-first (one accent -> the native languages folded
# into it) and then inverted into the language -> accent lookup that the rest
# of the notebook uses.
_LANGUAGES_BY_ACCENT = {
    # South Asian
    'indian': (
        'hindi', 'urdu', 'bengali', 'tamil', 'telugu', 'gujarati',
        'punjabi', 'marathi', 'kannada', 'malayalam', 'nepali',
        'sinhala', 'sinhalese', 'dari',
    ),
    # East Asian (Japanese is grouped with East Asian)
    'hongkong': ('mandarin', 'cantonese', 'chinese', 'japanese'),
    # Southeast Asian
    'malaysian': ('malay', 'bahasa'),
    'filipino': ('tagalog', 'cebuano', 'ilocano'),
    # Korean, Vietnamese, Thai and Indonesian are grouped with SE Asian
    'singaporean': ('korean', 'vietnamese', 'thai', 'indonesian'),
    # African accents (Arabic is folded into this class as well)
    'african': (
        'amharic', 'swahili', 'yoruba', 'igbo', 'hausa', 'zulu', 'twi',
        'shona', 'akan', 'luganda', 'wolof', 'somali', 'kinyarwanda',
        'xhosa', 'sesotho', 'afrikaans', 'arabic',
    ),
    'irish': ('irish', 'gaelic'),
    'scottish': ('scots',),
    'welsh': ('welsh',),
    # Spanish / French / Italian: mapped to an available class
    'bermudian': ('spanish', 'french', 'italian'),
    # German, Portuguese, Dutch, Russian / Eastern European, Turkish
    'southatlantic': (
        'german', 'portuguese', 'dutch', 'russian', 'polish', 'turkish',
        'romanian', 'czech', 'hungarian', 'serbian', 'croatian',
        'bulgarian', 'ukrainian',
    ),
}

# 'english' deliberately maps to None: native English speakers are split by
# country / birthplace later on instead of by language.
LANGUAGE_TO_ACCENT = {'english': None}
LANGUAGE_TO_ACCENT.update(
    (language, accent_name)
    for accent_name, languages in _LANGUAGES_BY_ACCENT.items()
    for language in languages
)

# Countries to help classify native English speakers.
#
# Written as ordered (accent, place names) groups and flattened into the
# place -> accent lookup. Insertion order is kept exactly as listed because
# the labelling loop iterates this dict and stops at the first key that
# matches.
_PLACES_BY_ACCENT = (
    ('american', ('usa', 'united states', 'us')),
    ('british', ('uk', 'england', 'united kingdom')),
    ('australian', ('australia',)),
    ('canadian', ('canada',)),
    ('newzealand', ('new zealand',)),
    ('scottish', ('scotland',)),
    ('irish', ('ireland',)),
    ('welsh', ('wales',)),
    ('african', ('south africa', 'nigeria', 'kenya', 'ghana')),
    ('indian', ('india', 'pakistan', 'bangladesh', 'sri lanka')),
    ('malaysian', ('malaysia',)),
    ('filipino', ('philippines',)),
    ('singaporean', ('singapore',)),
    ('hongkong', ('hong kong',)),
    ('bermudian', ('bermuda',)),
)

COUNTRY_TO_ACCENT = {
    place: accent_name
    for accent_name, places in _PLACES_BY_ACCENT
    for place in places
}

# Label every metadata row with an accent class and resolve its audio file.
#
# NOTE(review): ACCENT_TO_IDX is not defined in the cells visible here; it is
# expected to come from an earlier configuration cell. A missing definition now
# surfaces as a NameError instead of silently skipping every row.
import re  # cell-local import: whole-word place matching below

data_list = []
accent_counts = {}
skipped = 0

# One whole-word pattern per place name, in COUNTRY_TO_ACCENT order.
# Plain substring tests mislabel speakers: 'us' is a substring of 'australia'
# and 'russia', 'uk' of 'ukraine', and 'india' of 'indiana'.
_PLACE_PATTERNS = [
    (re.compile(r'\b' + re.escape(place) + r'\b'), place_accent)
    for place, place_accent in COUNTRY_TO_ACCENT.items()
]


def _accent_from_places(*fields):
    """Return the accent of the first place name found in any field, else None."""
    for pattern, place_accent in _PLACE_PATTERNS:
        if any(pattern.search(field) for field in fields):
            return place_accent
    return None


def _find_audio(filename):
    """Return the path of the recording for *filename*, or None if not found."""
    if not recordings_dir:
        return None
    # The metadata may or may not include the extension, so try both forms.
    for name in (filename + '.mp3', filename, filename + '.wav'):
        candidate = os.path.join(recordings_dir, name)
        if os.path.isfile(candidate):
            return candidate
    return None


for _, row in df.iterrows():
    try:
        lang = str(row.get('native_language', '')).strip().lower()
        country = str(row.get('country', '')).strip().lower()
        birth_place = str(row.get('birthplace', '')).strip().lower()
        filename = str(row.get('filename', '')).strip()
    except (TypeError, ValueError) as exc:
        # Malformed row: skip it, but say so rather than hiding the problem.
        print(f"Skipping malformed metadata row: {exc}")
        skipped += 1
        continue

    # Missing filenames arrive as NaN, which str() renders as 'nan'.
    if not filename or filename == 'nan':
        skipped += 1
        continue

    # Determine accent: language mapping first ('english' maps to None there),
    # then fall back to the country / birthplace.
    accent = LANGUAGE_TO_ACCENT.get(lang)
    if accent is None:
        accent = _accent_from_places(country, birth_place)
        if accent is None and lang == 'english':
            accent = 'american'  # Default English to American

    if accent is None or accent not in ACCENT_TO_IDX:
        skipped += 1
        continue

    label = ACCENT_TO_IDX[accent]

    # Find audio file
    audio_path = _find_audio(filename)
    if audio_path is None:
        skipped += 1
        continue

    data_list.append({
        'audio_path': audio_path,
        'label': label,
        'accent': accent,
    })
    accent_counts[accent] = accent_counts.get(accent, 0) + 1

print(f"\n‚úÖ Processed: {len(data_list)} samples")
print(f"‚è≠Ô∏è Skipped: {skipped}")
print(f"\nüìä Samples per accent:")
for accent, count in sorted(accent_counts.items(), key=lambda x: -x[1]):
    print(f"  {accent}: {count}")

# Balance & filter accents
from collections import defaultdict

# Classes with fewer clips than this are dropped (kept low: small dataset).
MIN_SAMPLES = 10

# Group every sample under its (original) label.
samples_by_class = defaultdict(list)
for d in data_list:
    samples_by_class[d['label']].append(d)

# Labels that have enough samples to be worth training on.
valid_classes = set()
for label, samples in samples_by_class.items():
    if len(samples) >= MIN_SAMPLES:
        valid_classes.add(label)

print(f"Accents with >= {MIN_SAMPLES} samples:")
for label in sorted(valid_classes):
    print(f"  [{label}] {ACCENT_LABELS[label]}: {len(samples_by_class[label])} samples")

# Keep only the valid classes and renumber their labels densely from 0,
# following the sorted order of the original labels.
sorted_valid = sorted(valid_classes)
old_to_new = {old: new for new, old in enumerate(sorted_valid)}
valid_data = [d for d in data_list if d['label'] in valid_classes]

FINAL_ACCENTS = [ACCENT_LABELS[old] for old in sorted_valid]
FINAL_NUM_CLASSES = len(FINAL_ACCENTS)
FINAL_IDX_TO_ACCENT = dict(enumerate(FINAL_ACCENTS))
FINAL_ACCENT_TO_IDX = {name: idx for idx, name in FINAL_IDX_TO_ACCENT.items()}

for d in valid_data:
    d['label'] = old_to_new[d['label']]

# Per-class target: the median size of the kept classes, but never below
# twice the minimum class size.
median_size = np.median([len(samples_by_class[c]) for c in valid_classes])
target_size = max(int(median_size), MIN_SAMPLES * 2)

new_samples_by_class = defaultdict(list)
for d in valid_data:
    new_samples_by_class[d['label']].append(d)

# Large classes are randomly down-sampled; small ones keep every sample and
# are topped up with random repeats.
balanced_data = []
for samples in new_samples_by_class.values():
    shortfall = target_size - len(samples)
    if shortfall <= 0:
        balanced_data.extend(random.sample(samples, target_size))
    else:
        balanced_data.extend(samples)
        balanced_data.extend(random.choices(samples, k=shortfall))

random.shuffle(balanced_data)
data_list = balanced_data

print(f"\n‚úÖ Final: {FINAL_NUM_CLASSES} accent classes, {len(data_list)} total samples")
print(f"\nüìä Balanced distribution:")
from collections import Counter
final_counts = Counter(d['label'] for d in data_list)
for label, n_samples in sorted(final_counts.items()):
    print(f"  [{label}] {FINAL_IDX_TO_ACCENT[label]}: {n_samples}")

In [None]:
class AccentDataset(Dataset):
    """Dataset over dicts holding an 'audio_path' and a 'label'.

    Each item is loaded from disk and converted to a normalised log-mel
    spectrogram tensor; waveform augmentation is applied when *augment* is set.
    """

    def __init__(self, data_list, augment=False):
        self.augment = augment
        self.data = data_list

    def __len__(self):
        return len(self.data)

    def process_audio(self, audio_path):
        """Load *audio_path* and return its spectrogram, N_MELS x MAX_LEN."""
        # Read one second more than needed so trimming still leaves enough audio.
        waveform, _sr = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION + 1)

        # Strip leading/trailing silence.
        waveform = librosa.effects.trim(waveform, top_db=20)[0]

        # Peak-normalise (skipped for an all-zero signal).
        peak = np.max(np.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

        # Force a fixed number of samples: zero-pad short clips; crop long ones
        # (random window when augmenting, leading window otherwise).
        n_target = int(DURATION * SAMPLE_RATE)
        n_have = len(waveform)
        if n_have < n_target:
            waveform = np.pad(waveform, (0, n_target - n_have))
        elif self.augment and n_have > n_target:
            offset = random.randint(0, n_have - n_target)
            waveform = waveform[offset:offset + n_target]
        else:
            waveform = waveform[:n_target]

        # Each augmentation is applied independently with its own probability.
        if self.augment:
            if random.random() < 0.3:
                waveform = waveform + np.random.randn(len(waveform)) * 0.005
            if random.random() < 0.2:
                semitones = random.uniform(-1, 1)
                waveform = librosa.effects.pitch_shift(waveform, sr=SAMPLE_RATE, n_steps=semitones)
            if random.random() < 0.2:
                speed = random.uniform(0.9, 1.1)
                waveform = librosa.effects.time_stretch(waveform, rate=speed)
                # Stretching changes the length, so fix it again.
                n_have = len(waveform)
                if n_have < n_target:
                    waveform = np.pad(waveform, (0, n_target - n_have))
                else:
                    waveform = waveform[:n_target]

        # Log-mel spectrogram, standardised to zero mean / unit variance.
        mel_power = librosa.feature.melspectrogram(
            y=waveform, sr=SAMPLE_RATE, n_mels=N_MELS,
            n_fft=N_FFT, hop_length=HOP_LENGTH
        )
        spec = librosa.power_to_db(mel_power, ref=np.max)
        spec = (spec - spec.mean()) / (spec.std() + 1e-8)

        # Pad or cut the time axis to exactly MAX_LEN frames.
        n_frames = spec.shape[1]
        if n_frames < MAX_LEN:
            spec = np.pad(spec, ((0, 0), (0, MAX_LEN - n_frames)))
        else:
            spec = spec[:, :MAX_LEN]

        return spec

    def __getitem__(self, idx):
        entry = self.data[idx]
        label = entry['label']
        try:
            spec = self.process_audio(entry['audio_path'])
        except Exception:
            # Best effort: an unreadable clip becomes an all-zero spectrogram.
            spec = np.zeros((N_MELS, MAX_LEN))
        # Add a leading channel axis for the CNN.
        return torch.FloatTensor(spec).unsqueeze(0), label

print("‚úÖ AccentDataset class defined (loads from audio files)")

## 4️⃣ Build Accent Dataset from Common Voice


In [None]:
# Process Common Voice data and filter by accent
import soundfile as sf
from collections import Counter

data_list = []
accent_counts = Counter()
skipped = 0

print("üîÑ Processing audio files and mapping accents...")

for sample in tqdm(cv_dataset, desc="Processing"):
    try:
        # Normalise the raw accent tag; a missing or blank tag cannot be mapped.
        accent_raw = (sample.get('accent', '') or '').strip().lower()

        # Map to our accent classes (None when the tag is empty or unknown).
        mapped_accent = CV_ACCENT_MAP.get(accent_raw) if accent_raw else None
        if mapped_accent is None:
            skipped += 1
            continue

        label = ACCENT_TO_IDX[mapped_accent]

        # Get audio
        audio_info = sample.get('audio', None)
        if audio_info is None:
            skipped += 1
            continue

        audio_array = audio_info['array']
        sr = audio_info['sampling_rate']

        # Skip very short clips (<0.5s)
        min_samples = sr * 0.5
        if len(audio_array) < min_samples:
            skipped += 1
            continue

        data_list.append({
            'audio': audio_array,
            'sample_rate': sr,
            'label': label,
        })
        accent_counts[mapped_accent] += 1

    except Exception:
        # Any failure on a sample counts as a skip.
        skipped += 1
        continue

print(f"\n‚úÖ Processed samples: {len(data_list)}")
print(f"‚è≠Ô∏è Skipped: {skipped}")
print(f"\nüìä Samples per accent:")
# Largest classes first; the flag marks classes below the minimum size.
for accent, count in sorted(accent_counts.items(), key=lambda pair: -pair[1]):
    if count >= MIN_SAMPLES_PER_CLASS:
        flag = '‚úÖ'
    else:
        flag = '‚ö†Ô∏è'
    print(f"  {flag} {accent}: {count}")

# Remove accents with too few samples
from collections import Counter

valid_accents = set()
for accent_name, n_clips in accent_counts.items():
    if n_clips >= MIN_SAMPLES_PER_CLASS:
        valid_accents.add(accent_name)
data_list = [d for d in data_list if ACCENT_LABELS[d['label']] in valid_accents]

print(f"\nüéØ Using {len(valid_accents)} accents with >= {MIN_SAMPLES_PER_CLASS} samples")
print(f"üìä Total training samples: {len(data_list)}")

# Rebuild label mapping for valid accents only (alphabetical order -> 0..N-1)
FINAL_ACCENTS = sorted(valid_accents)
FINAL_NUM_CLASSES = len(FINAL_ACCENTS)
FINAL_IDX_TO_ACCENT = dict(enumerate(FINAL_ACCENTS))
FINAL_ACCENT_TO_IDX = {name: idx for idx, name in FINAL_IDX_TO_ACCENT.items()}

# Remap labels from the original index space to the compact final one.
for d in data_list:
    d['label'] = FINAL_ACCENT_TO_IDX[ACCENT_LABELS[d['label']]]

print(f"\n‚úÖ Final accent classes ({FINAL_NUM_CLASSES}):")
_per_class = Counter(d['label'] for d in data_list)
for i, accent in FINAL_IDX_TO_ACCENT.items():
    count = _per_class[i]
    print(f"  [{i}] {accent}: {count} samples")

## 5️⃣ Balance Dataset


In [None]:
# Balance using oversampling (duplicate minority class samples)
from collections import defaultdict

samples_by_class = defaultdict(list)
for d in data_list:
    samples_by_class[d['label']].append(d)

# Target: the median class size (avoids extreme over/under sampling).
# The median can never exceed the largest class, so no extra cap is needed.
class_sizes = list(map(len, samples_by_class.values()))
target_size = int(np.median(class_sizes))

print(f"‚öñÔ∏è Balancing to ~{target_size} samples per class")

balanced_data = []
for samples in samples_by_class.values():
    if len(samples) >= target_size:
        # Big class: keep a random subset of exactly target_size samples.
        chosen = random.sample(samples, target_size)
    else:
        # Small class: keep everything and add random repeats up to the target.
        chosen = samples + random.choices(samples, k=target_size - len(samples))
    balanced_data.extend(chosen)

random.shuffle(balanced_data)
data_list = balanced_data

print(f"‚úÖ Balanced dataset: {len(data_list)} total samples")
print(f"üìä Per class:")
final_counts = Counter(d['label'] for d in data_list)
for label, n_samples in sorted(final_counts.items()):
    print(f"  [{label}] {FINAL_IDX_TO_ACCENT[label]}: {n_samples}")

## 6️⃣ Dataset Class


In [None]:
class AccentDataset(Dataset):
    """Dataset that turns audio items into normalised log-mel spectrograms.

    Two item layouts are accepted, so the class works with either data-building
    pipeline of this notebook:

    * ``{'audio': <samples>, 'sample_rate': <int>, 'label': <int>}``
    * ``{'audio_path': <str>, 'label': <int>}`` (the clip is loaded lazily)

    Args:
        data_list: Sequence of item dicts in one of the layouts above.
        augment: When true, apply random crop / noise / pitch / speed
            augmentation to the waveform.
    """

    def __init__(self, data_list, augment=False):
        self.data = data_list
        self.augment = augment

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _fit_length(audio, target, random_crop=False):
        """Zero-pad or crop *audio* to exactly *target* samples."""
        n_have = len(audio)
        if n_have < target:
            return np.pad(audio, (0, target - n_have))
        # Random window when requested, otherwise the leading window.
        start = random.randint(0, n_have - target) if (random_crop and n_have > target) else 0
        return audio[start:start + target]

    def _augment_waveform(self, audio, target):
        """Apply each augmentation independently with its own probability."""
        # Random noise
        if random.random() < 0.3:
            audio = audio + np.random.randn(len(audio)) * 0.005
        # Random pitch shift of up to one semitone either way
        if random.random() < 0.2:
            n_steps = random.uniform(-1, 1)
            audio = librosa.effects.pitch_shift(audio, sr=SAMPLE_RATE, n_steps=n_steps)
        # Random speed change; this alters the length, so re-fit afterwards
        if random.random() < 0.2:
            rate = random.uniform(0.9, 1.1)
            audio = librosa.effects.time_stretch(audio, rate=rate)
            audio = self._fit_length(audio, target)
        return audio

    def process_audio(self, audio, sr):
        """Convert a waveform sampled at *sr* Hz into an N_MELS x MAX_LEN array."""
        # Resample to target rate
        if sr != SAMPLE_RATE:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)

        # Trim silence
        audio, _ = librosa.effects.trim(audio, top_db=20)

        # Peak-normalise. The size check avoids np.max raising on an empty
        # array, e.g. when nothing is left after trimming.
        max_val = np.max(np.abs(audio)) if audio.size else 0
        if max_val > 0:
            audio = audio / max_val

        # Fix length to DURATION seconds
        target = int(DURATION * SAMPLE_RATE)
        audio = self._fit_length(audio, target, random_crop=self.augment)

        # Data augmentation
        if self.augment:
            audio = self._augment_waveform(audio, target)

        # Mel spectrogram in dB
        mel = librosa.feature.melspectrogram(
            y=audio, sr=SAMPLE_RATE, n_mels=N_MELS,
            n_fft=N_FFT, hop_length=HOP_LENGTH
        )
        spec = librosa.power_to_db(mel, ref=np.max)

        # Normalize spectrogram (epsilon guards against a constant input)
        spec = (spec - spec.mean()) / (spec.std() + 1e-8)

        # Fix time dimension
        if spec.shape[1] < MAX_LEN:
            spec = np.pad(spec, ((0, 0), (0, MAX_LEN - spec.shape[1])))
        else:
            spec = spec[:, :MAX_LEN]

        return spec

    def __getitem__(self, idx):
        item = self.data[idx]
        label = item['label']
        if 'audio' in item:
            # In-memory samples: failures here are genuine bugs, so let them raise.
            audio = np.array(item['audio'], dtype=np.float32)
            spec = self.process_audio(audio, item['sample_rate'])
        else:
            # File-backed item. Decoding a clip can fail, so stay best-effort,
            # but report the failure instead of hiding it.
            try:
                audio, sr = librosa.load(
                    item['audio_path'], sr=SAMPLE_RATE, duration=DURATION + 1
                )
                spec = self.process_audio(audio, sr)
            except Exception as exc:
                print(f"AccentDataset: could not process {item.get('audio_path')!r}: {exc}")
                spec = np.zeros((N_MELS, MAX_LEN))
        # Add a leading channel axis for the CNN.
        return torch.FloatTensor(spec).unsqueeze(0), label

print("‚úÖ AccentDataset class defined (with augmentation)")

## 7️⃣ CNN-BiLSTM Model


In [None]:
class CNN_BiLSTM_Accent(nn.Module):
    """CNN feature extractor + 2-layer BiLSTM + attention pooling classifier.

    Args:
        num_classes: Number of output classes (size of the final logits).
        n_mels: Height (mel bins) of the input spectrograms. The default of
            128 reproduces the previously hard-coded LSTM input size of
            ``128 * 16``.

    Input:  ``(batch, 1, n_mels, time)``; *time* must be at least 8 so that
            one frame survives the three 2x poolings.
    Output: ``(batch, num_classes)`` unnormalised logits.

    Submodule names and ``Sequential`` layouts are unchanged, so state dicts
    saved from the earlier definition still load.
    """

    CNN_CHANNELS = 128  # channels produced by the last conv block
    POOL_FACTOR = 8     # three MaxPool2d(2) stages: 2 * 2 * 2

    def __init__(self, num_classes=16, n_mels=128):
        super().__init__()
        if n_mels < self.POOL_FACTOR:
            raise ValueError(f"n_mels must be >= {self.POOL_FACTOR}, got {n_mels}")

        self.conv1 = self._conv_block(1, 32, 0.2)
        self.conv2 = self._conv_block(32, 64, 0.2)
        self.conv3 = self._conv_block(64, self.CNN_CHANNELS, 0.3)

        # Per-frame feature size = channels * remaining mel bins after pooling.
        lstm_in = self.CNN_CHANNELS * (n_mels // self.POOL_FACTOR)
        self.lstm = nn.LSTM(lstm_in, 128, 2, batch_first=True, bidirectional=True, dropout=0.3)
        # 256 = 2 directions * 128 hidden units.
        self.attention = nn.Sequential(nn.Linear(256, 64), nn.Tanh(), nn.Linear(64, 1))
        self.fc = nn.Sequential(nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.4), nn.Linear(128, num_classes))

    @staticmethod
    def _conv_block(in_ch, out_ch, p_drop):
        """3x3 conv (size-preserving) -> BatchNorm -> ReLU -> 2x max-pool -> dropout."""
        return nn.Sequential(
            nn.Conv2d(in_ch, out_ch, 3, 1, 1),
            nn.BatchNorm2d(out_ch),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Dropout(p_drop),
        )

    def forward(self, x):
        # (B, 1, n_mels, T) -> (B, 128, n_mels // 8, T // 8)
        x = self.conv3(self.conv2(self.conv1(x)))
        # Time becomes the sequence axis; channels and mel bins are merged
        # into one feature vector per frame: (B, T // 8, 128 * n_mels // 8).
        x = x.permute(0, 3, 1, 2).flatten(2)
        lstm_out, _ = self.lstm(x)
        # Softmax over the time axis yields one attention weight per frame.
        attn = F.softmax(self.attention(lstm_out), dim=1)
        # Attention-weighted sum over time, then classify.
        return self.fc(torch.sum(attn * lstm_out, dim=1))

# Build the network for the final class count and move it to the chosen device.
model = CNN_BiLSTM_Accent(FINAL_NUM_CLASSES).to(device)

# Tally parameter counts in one pass: (element count, trainable?) per tensor.
_param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(n for n, _ in _param_info)
trainable_params = sum(n for n, is_trainable in _param_info if is_trainable)

print(f"‚úÖ Model created for {FINAL_NUM_CLASSES} accent classes")
print(f"   Total parameters: {total_params:,}")
print(f"   Trainable: {trainable_params:,}")

## 8️⃣ Prepare Data Loaders


In [None]:
# Split into train / validation WITHOUT leaking clips between the two.
#
# data_list has already been balanced by oversampling, so it contains repeated
# references to the same clip. Splitting it directly can put copies of one
# clip on both sides and inflate validation accuracy. Instead, split the
# unique clips and let each clip's duplicates follow it into the train side.
from collections import defaultdict


def _clip_key(item):
    """Identify the underlying clip: its path if present, else the dict itself."""
    return item.get('audio_path', id(item))


_clips = defaultdict(list)
for item in data_list:
    _clips[_clip_key(item)].append(item)

_clip_groups = list(_clips.values())
_train_idx, _val_idx = train_test_split(
    list(range(len(_clip_groups))), test_size=0.2, random_state=42,
    stratify=[group[0]['label'] for group in _clip_groups]
)

# Train keeps the oversampled duplicates (that is what balances the classes);
# validation uses each clip exactly once so that metrics are not skewed.
train_data = [item for i in _train_idx for item in _clip_groups[i]]
val_data = [_clip_groups[i][0] for i in _val_idx]
print(f"Training: {len(train_data)}, Validation: {len(val_data)}")

train_dataset = AccentDataset(train_data, augment=True)
val_dataset = AccentDataset(val_data, augment=False)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

print(f"Train batches: {len(train_loader)}, Val batches: {len(val_loader)}")

## 9️⃣ Training with Early Stopping


In [None]:
# Class weights for imbalanced data: inverse class frequency, scaled so that a
# perfectly balanced training set gives every class a weight of 1.
from collections import Counter
label_counts = Counter(d['label'] for d in train_data)
n_train = sum(label_counts.values())
class_weights = torch.FloatTensor([
    # .get(i, 1) avoids a division by zero for a class absent from train_data.
    n_train / (FINAL_NUM_CLASSES * label_counts.get(i, 1))
    for i in range(FINAL_NUM_CLASSES)
]).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Halve the learning rate after 5 epochs without a validation-loss improvement.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

PATIENCE = 8
best_val_loss = float('inf')  # summed over validation batches, like val_loss
# Initialised up front so the final summary cannot hit a NameError when the
# validation loss never improves (e.g. it is NaN from the first epoch).
best_val_acc = 0.0
patience_counter = 0
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

print(f"Starting training for up to {EPOCHS} epochs (patience={PATIENCE})...")
print("=" * 70)

for epoch in range(EPOCHS):
    # Train
    model.train()
    train_loss, correct, seen = 0, 0, 0
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # Clip the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()
        _, pred = outputs.max(1)
        seen += labels.size(0)
        correct += pred.eq(labels).sum().item()
    train_acc = correct / seen

    # Validate
    model.eval()
    val_loss, correct, seen = 0, 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            _, pred = outputs.max(1)
            seen += labels.size(0)
            correct += pred.eq(labels).sum().item()
    val_acc = correct / seen
    scheduler.step(val_loss)

    # History stores per-batch mean losses so the two curves are comparable.
    history['train_loss'].append(train_loss / len(train_loader))
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss / len(val_loader))
    history['val_acc'].append(val_acc)

    # Early stopping on val_loss; the best weights are checkpointed to disk.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'accent_model.pth')
        print(f"Epoch {epoch+1}: Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} ‚≠ê BEST (loss: {val_loss/len(val_loader):.4f})")
        patience_counter = 0
    else:
        patience_counter += 1
        print(f"Epoch {epoch+1}: Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f} (no improvement {patience_counter}/{PATIENCE})")
        if patience_counter >= PATIENCE:
            print(f"\nüõë Early stopping at epoch {epoch+1}!")
            break

print("=" * 70)
print(f"üéâ Training complete! Best Val Loss: {best_val_loss/len(val_loader):.4f} | Best Val Acc: {best_val_acc:.4f}")
print(f"   Total epochs: {len(history['train_loss'])}")

## 🔟 Visualize Training


In [None]:
# Two side-by-side panels: loss on the left, accuracy (in percent) on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

_panels = (
    (axes[0], 'Loss', history['train_loss'], history['val_loss']),
    (
        axes[1], 'Accuracy (%)',
        [a*100 for a in history['train_acc']],
        [a*100 for a in history['val_acc']],
    ),
)
for ax, title, train_curve, val_curve in _panels:
    ax.plot(train_curve, label='Train')
    ax.plot(val_curve, label='Val')
    ax.set_title(title)
    ax.set_xlabel('Epoch')
    ax.legend()
    ax.grid(True)

plt.tight_layout()
# Save before show().
plt.savefig('accent_training_curves.png', dpi=150)
plt.show()

## 1️⃣1️⃣ Evaluation & Confusion Matrix


In [None]:
# Load best model
# map_location keeps the load working when the checkpoint was written on a
# different device than the one in use now.
model.load_state_dict(torch.load('accent_model.pth', map_location=device))
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs.to(device))
        _, pred = outputs.max(1)
        all_preds.extend(pred.cpu().numpy())
        all_labels.extend(labels.numpy())

# Pass the full label set explicitly so the report and the matrix always have
# one row per class, even for a class that never occurs in the validation data.
class_ids = list(range(FINAL_NUM_CLASSES))
accent_names = [FINAL_IDX_TO_ACCENT[i] for i in class_ids]

print("üìä Classification Report:")
print(classification_report(
    all_labels, all_preds, labels=class_ids,
    target_names=accent_names, zero_division=0
))

# Confusion matrices (normalized + raw)
cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
# Row-normalise; a class with no validation samples keeps an all-zero row
# instead of producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'), row_totals,
    out=np.zeros(cm.shape, dtype=float), where=row_totals > 0
)

fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# Normalized
sns.heatmap(cm_normalized, annot=True, fmt='.1%', cmap='Blues',
            xticklabels=accent_names, yticklabels=accent_names, ax=axes[0],
            vmin=0, vmax=1)
axes[0].set_xlabel('Predicted'); axes[0].set_ylabel('Actual')
axes[0].set_title('Normalized Confusion Matrix (Recall)')
axes[0].tick_params(axis='x', rotation=45)

# Raw
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=accent_names, yticklabels=accent_names, ax=axes[1])
axes[1].set_xlabel('Predicted'); axes[1].set_ylabel('Actual')
axes[1].set_title('Raw Counts')
axes[1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('accent_confusion_matrix.png', dpi=150)
plt.show()

# Per-class accuracy: the diagonal of the row-normalised matrix is the recall.
print("\nüéØ Per-class accuracy (recall):")
for i, name in enumerate(accent_names):
    recall = cm_normalized[i, i]
    print(f"  {name}: {recall:.1%}")
n_evaluated = cm.sum()
overall = np.trace(cm) / n_evaluated if n_evaluated else 0.0
print(f"  Overall: {overall:.1%}")

## 1️⃣2️⃣ Save & Download


In [None]:
# Save model with metadata
# Class names in index order; the same list is used in the checkpoint and in
# the snippet printed for the backend.
accents_list = [FINAL_IDX_TO_ACCENT[i] for i in range(FINAL_NUM_CLASSES)]

checkpoint = dict(
    model_state_dict=model.state_dict(),
    accent_labels=FINAL_IDX_TO_ACCENT,
    num_classes=FINAL_NUM_CLASSES,
    accents_list=accents_list,
    sample_rate=SAMPLE_RATE,
    n_mels=N_MELS,
    n_fft=N_FFT,
    hop_length=HOP_LENGTH,
    best_val_acc=best_val_acc,
)
torch.save(checkpoint, 'accent_model_full.pth')

# Report the size of the weights-only file in MiB.
bytes_per_mb = 1024 * 1024
model_size = os.path.getsize('accent_model.pth') / bytes_per_mb
print(f"‚úÖ accent_model.pth ({model_size:.1f} MB)")
print(f"‚úÖ accent_model_full.pth (with metadata)")
print(f"\nüéØ Accent classes ({FINAL_NUM_CLASSES}):")
for i, accent_name in enumerate(accents_list):
    print(f"  [{i}] {accent_name}")

print(f"\n‚ö†Ô∏è IMPORTANT: Update backend ACCENTS list to match this order!")
print(f"Copy this to app.py:")
print(f"ACCENTS = {accents_list}")

In [None]:
from google.colab import files

# Hand each training artifact to the Colab file-download helper, in order.
for _artifact in (
    'accent_model.pth',
    'accent_model_full.pth',
    'accent_training_curves.png',
    'accent_confusion_matrix.png',
):
    files.download(_artifact)

## 📝 Next Steps

1. Download `accent_model.pth`
2. Place in `AccentModel/` folder
3. Update `ACCENTS` list in `app.py` and `video_handler.py`
4. Update `ResultModal.tsx` accent flags in frontend
5. Deploy to HF Space
6. Test all accent predictions
