# 🎯 AEMER - Accent Detection Model Training (up to 9 Accent Classes)

**Architecture:** CNN-BiLSTM with Attention

**Dataset:** Kaggle Speech Accent Archive (2,138 real recordings)

**Author:** Sanjula Sunath | w1999522


## 1️⃣ Setup & Dependencies


In [None]:
!pip install torch torchaudio librosa pandas numpy scikit-learn tqdm matplotlib seaborn kagglehub --quiet
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import librosa
import pandas as pd
import os
import random
import glob
from tqdm import tqdm
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

## 2️⃣ Configuration


In [None]:
# === Accent classes (class index -> accent name) ===
# The label set is declared once here; the reverse lookup and the class count
# are derived from it so the three can never disagree.
ACCENT_LABELS = {
    0: 'american',     # United States
    1: 'british',      # England
    2: 'australian',   # Australia
    3: 'indian',       # South Asia
    4: 'canadian',     # Canada
    5: 'irish',        # Ireland
    6: 'african',      # Africa
    7: 'filipino',     # Philippines
    8: 'hongkong',     # Hong Kong / East Asia
}
ACCENT_TO_IDX = {name: idx for idx, name in ACCENT_LABELS.items()}
NUM_CLASSES = len(ACCENT_LABELS)

# === Audio / feature extraction ===
SAMPLE_RATE = 16000   # sampling rate passed to the audio loader
DURATION = 5          # clip length in seconds
N_MELS = 128          # mel bands per spectrogram frame
N_FFT = 1024          # FFT window size in samples
HOP_LENGTH = 160      # samples between consecutive frames
# Spectrogram frames kept per clip: (samples per clip // hop) + 1.
MAX_LEN = int(SAMPLE_RATE * DURATION / HOP_LENGTH) + 1

# === Optimisation ===
BATCH_SIZE = 32
EPOCHS = 100          # upper bound; training may stop earlier
LEARNING_RATE = 0.001

print(f"üéØ {NUM_CLASSES} accent classes, {DURATION}s clips, {EPOCHS} epochs")

## 3️⃣ Download Speech Accent Archive


In [None]:
import kagglehub

print("üì• Downloading Speech Accent Archive from Kaggle...")
dataset_path = kagglehub.dataset_download("rtatman/speech-accent-archive")
print(f"‚úÖ Dataset downloaded to: {dataset_path}")

# Find recordings directory (nested: recordings/recordings/)
recordings_dir = os.path.join(dataset_path, 'recordings', 'recordings')
if not os.path.isdir(recordings_dir):
    # Fallback: use the first directory anywhere in the download that holds MP3s.
    recordings_dir = next(
        (
            root
            for root, _dirs, files in os.walk(dataset_path)
            if any(name.lower().endswith('.mp3') for name in files)
        ),
        None,
    )
    if recordings_dir is None:
        # Fail here with a clear message instead of a confusing os.listdir error.
        raise FileNotFoundError(f"No .mp3 recordings found under {dataset_path}")

# Sorted so the listing is stable across runs and file systems.
audio_files = sorted(
    f for f in os.listdir(recordings_dir) if f.lower().endswith('.mp3')
)
print(f"üìÇ Recordings: {recordings_dir}")
print(f"   Found {len(audio_files)} audio files")

## 4️⃣ Build Accent Dataset


In [None]:
# Load metadata CSV.
# Sorting makes the choice of csv_files[0] deterministic; glob order is not.
csv_files = sorted(glob.glob(os.path.join(dataset_path, '**', '*.csv'), recursive=True))
if not csv_files:
    raise FileNotFoundError(f"No metadata CSV found under {dataset_path}")

df = pd.read_csv(csv_files[0])

# The labelling step reads these columns with defaults, so a wrong CSV would
# otherwise be skipped row by row without any error.
missing_columns = {'native_language', 'filename'} - set(df.columns)
if missing_columns:
    raise ValueError(
        f"{csv_files[0]} is missing expected column(s): {sorted(missing_columns)}"
    )

print(f"Metadata: {df.shape[0]} rows")
print(f"\nTop 20 native languages:")
print(df['native_language'].value_counts().head(20))

# Native language -> accent class. Only languages listed here are labelled
# directly; everything else falls through to the country table.
LANGUAGE_TO_ACCENT = {
    **dict.fromkeys(
        (
            'hindi', 'urdu', 'bengali', 'tamil', 'telugu', 'gujarati',
            'punjabi', 'marathi', 'kannada', 'malayalam', 'nepali',
            'sinhala', 'sinhalese', 'dari',
        ),
        'indian',
    ),
    **dict.fromkeys(
        ('mandarin', 'cantonese', 'chinese', 'japanese', 'korean'),
        'hongkong',
    ),
    **dict.fromkeys(('tagalog', 'cebuano', 'ilocano'), 'filipino'),
    **dict.fromkeys(
        (
            'amharic', 'swahili', 'yoruba', 'igbo', 'hausa', 'zulu',
            'twi', 'shona', 'akan', 'luganda', 'wolof', 'somali',
            'kinyarwanda', 'xhosa', 'afrikaans', 'arabic',
        ),
        'african',
    ),
    **dict.fromkeys(('irish', 'gaelic'), 'irish'),
}

# Country keyword -> accent class. Insertion order is significant: the lookup
# that consumes this table stops at the first keyword that matches.
COUNTRY_TO_ACCENT = {
    **dict.fromkeys(('usa', 'united states'), 'american'),
    **dict.fromkeys(('uk', 'england', 'united kingdom'), 'british'),
    'australia': 'australian',
    'canada': 'canadian',
    'scotland': 'british',
    'ireland': 'irish',
    **dict.fromkeys(('south africa', 'nigeria', 'kenya', 'ghana'), 'african'),
    **dict.fromkeys(('india', 'pakistan', 'bangladesh', 'sri lanka'), 'indian'),
    'philippines': 'filipino',
    'hong kong': 'hongkong',
}

import re

# Whole-word patterns for the country keywords. Plain substring tests would
# match 'uk' inside 'ukraine' or 'india' inside 'indiana'. The list keeps
# COUNTRY_TO_ACCENT order because the first matching keyword wins.
_COUNTRY_PATTERNS = [
    (re.compile(r'\b' + re.escape(key) + r'\b'), acc)
    for key, acc in COUNTRY_TO_ACCENT.items()
]


def _accent_from_location(*fields):
    """Return the accent of the first country keyword found in any field, else None."""
    for pattern, acc in _COUNTRY_PATTERNS:
        if any(pattern.search(field) for field in fields):
            return acc
    return None


def _resolve_accent(lang, country, birth_place):
    """Pick an accent name for one speaker, or None when nothing applies.

    Priority: native-language table, then country/birthplace keywords.
    Native English speakers with no recognised location default to 'american'.
    """
    if lang in LANGUAGE_TO_ACCENT:
        return LANGUAGE_TO_ACCENT[lang]
    accent = _accent_from_location(country, birth_place)
    if accent is None and lang == 'english':
        return 'american'
    return accent


def _find_audio(filename):
    """Return the path of the recording for `filename`, trying known extensions, else None."""
    for ext in ('.mp3', '.wav', ''):
        candidate = os.path.join(recordings_dir, filename + ext)
        if os.path.isfile(candidate):
            return candidate
    return None


data_list = []
accent_counts = Counter()
skipped = 0

print("\nüîÑ Processing speakers...")
for _, row in tqdm(df.iterrows(), total=len(df), desc="Processing"):
    try:
        lang = str(row.get('native_language', '')).strip().lower()
        country = str(row.get('country', '')).strip().lower()
        birth_place = str(row.get('birthplace', '')).strip().lower()
        filename = str(row.get('filename', ''))

        # A missing filename comes back as NaN, which str() renders as 'nan'.
        if not filename or filename == 'nan':
            skipped += 1
            continue

        accent = _resolve_accent(lang, country, birth_place)
        if accent is None or accent not in ACCENT_TO_IDX:
            skipped += 1
            continue

        audio_path = _find_audio(filename)
        if audio_path is None:
            skipped += 1
            continue

        data_list.append({
            'audio_path': audio_path,
            'label': ACCENT_TO_IDX[accent],
            'accent': accent,
        })
        accent_counts[accent] += 1
    except Exception:
        # Best effort: one malformed row must not abort the scan. Unlike a bare
        # `except:`, this still lets KeyboardInterrupt stop the cell.
        skipped += 1

print(f"\n‚úÖ Processed: {len(data_list)} samples (skipped: {skipped})")
print(f"\nüìä Samples per accent:")
for accent, count in sorted(accent_counts.items(), key=lambda x: -x[1]):
    print(f"  {accent}: {count}")

## 5️⃣ Balance Dataset


In [None]:
# Filter accents with enough samples and balance
MIN_SAMPLES = 10
# Private, seeded generator: balancing is reproducible and leaves the global
# `random` state (used later for augmentation) untouched.
_balance_rng = random.Random(42)

# Group by the accent *name* rather than the current 'label' value, so that
# re-running this cell after labels were remapped still resolves names correctly.
samples_by_class = defaultdict(list)
for d in data_list:
    samples_by_class[ACCENT_TO_IDX[d['accent']]].append(d)

valid_classes = {label for label, samples in samples_by_class.items() if len(samples) >= MIN_SAMPLES}
if not valid_classes:
    raise ValueError(f"No accent has at least {MIN_SAMPLES} samples; nothing to balance")

print(f"Accents with >= {MIN_SAMPLES} samples:")
for label in sorted(valid_classes):
    print(f"  [{label}] {ACCENT_LABELS[label]}: {len(samples_by_class[label])}")

# Remap labels to be contiguous
sorted_valid = sorted(valid_classes)
old_to_new = {old: new for new, old in enumerate(sorted_valid)}

FINAL_ACCENTS = [ACCENT_LABELS[old] for old in sorted_valid]
FINAL_ACCENT_TO_IDX = {a: i for i, a in enumerate(FINAL_ACCENTS)}
FINAL_IDX_TO_ACCENT = {i: a for i, a in enumerate(FINAL_ACCENTS)}
FINAL_NUM_CLASSES = len(FINAL_ACCENTS)

# Build relabelled copies instead of mutating the dicts collected earlier.
new_by_class = defaultdict(list)
for old_label in sorted_valid:
    new_label = old_to_new[old_label]
    new_by_class[new_label] = [{**d, 'label': new_label} for d in samples_by_class[old_label]]

# Balance with oversampling: every class ends up with exactly target_size items.
target_size = int(np.median([len(samples_by_class[c]) for c in valid_classes]))
target_size = max(target_size, MIN_SAMPLES * 2)

balanced_data = []
for label in sorted(new_by_class):
    samples = new_by_class[label]
    if len(samples) >= target_size:
        # Large class: keep a random subset without replacement.
        balanced_data.extend(_balance_rng.sample(samples, target_size))
    else:
        # Small class: keep everything, then top up with repeats.
        balanced_data.extend(samples)
        balanced_data.extend(_balance_rng.choices(samples, k=target_size - len(samples)))

_balance_rng.shuffle(balanced_data)
data_list = balanced_data

print(f"\n‚úÖ Final: {FINAL_NUM_CLASSES} accent classes, {len(data_list)} total samples")
print(f"\nüìä Balanced distribution:")
final_counts = Counter(d['label'] for d in data_list)
for label in sorted(final_counts):
    print(f"  [{label}] {FINAL_IDX_TO_ACCENT[label]}: {final_counts[label]}")

## 6️⃣ Dataset Class


In [None]:
class AccentDataset(Dataset):
    """Map-style dataset that turns audio files into normalised log-mel spectrograms.

    Args:
        data_list: sequence of dicts carrying at least 'audio_path' and 'label'.
        augment: when True, apply random crop / noise / pitch / tempo
            augmentation while loading.

    Each item is a ``(tensor, label)`` pair where the tensor has shape
    ``(1, N_MELS, MAX_LEN)``.
    """

    def __init__(self, data_list, augment=False):
        self.data = data_list
        self.augment = augment
        # Paths already reported as unreadable, so each is logged once per process.
        self._reported_failures = set()

    def __len__(self):
        return len(self.data)

    @staticmethod
    def _fit_length(audio, target):
        """Zero-pad at the end or truncate so `audio` has exactly `target` samples."""
        if len(audio) < target:
            return np.pad(audio, (0, target - len(audio)))
        return audio[:target]

    def process_audio(self, audio_path):
        """Load one file and return its standardised log-mel spectrogram.

        Returns:
            Array of shape (N_MELS, MAX_LEN).

        Raises:
            ValueError: if nothing is left after silence trimming.
            Exception: whatever the audio loader raises for unreadable files.
        """
        target = int(DURATION * SAMPLE_RATE)

        # Reads at most DURATION + 1 seconds, then strips leading/trailing silence.
        audio, _ = librosa.load(audio_path, sr=SAMPLE_RATE, duration=DURATION + 1)
        audio, _ = librosa.effects.trim(audio, top_db=20)
        if audio.size == 0:
            raise ValueError(f"no audio left in {audio_path} after silence trimming")

        # Peak-normalise; an all-zero signal is left untouched.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        if len(audio) < target:
            audio = np.pad(audio, (0, target - len(audio)))
        elif self.augment:
            # Random crop window when augmenting, otherwise the clip head is used.
            start = random.randint(0, len(audio) - target)
            audio = audio[start:start + target]
        else:
            audio = audio[:target]

        if self.augment:
            if random.random() < 0.3:
                audio = audio + np.random.randn(len(audio)) * 0.005
            if random.random() < 0.2:
                audio = librosa.effects.pitch_shift(audio, sr=SAMPLE_RATE, n_steps=random.uniform(-1, 1))
            if random.random() < 0.2:
                audio = librosa.effects.time_stretch(audio, rate=random.uniform(0.9, 1.1))
                # Stretching changes the length; restore the fixed clip size.
                audio = self._fit_length(audio, target)

        mel = librosa.feature.melspectrogram(y=audio, sr=SAMPLE_RATE, n_mels=N_MELS, n_fft=N_FFT, hop_length=HOP_LENGTH)
        spec = librosa.power_to_db(mel, ref=np.max)
        # Per-clip standardisation; the epsilon guards against a constant spectrogram.
        spec = (spec - spec.mean()) / (spec.std() + 1e-8)

        # Force a fixed number of frames so clips can be batched.
        if spec.shape[1] < MAX_LEN:
            spec = np.pad(spec, ((0, 0), (0, MAX_LEN - spec.shape[1])))
        else:
            spec = spec[:, :MAX_LEN]
        return spec

    def __getitem__(self, idx):
        item = self.data[idx]
        path = item['audio_path']
        try:
            spec = self.process_audio(path)
        except Exception as exc:
            # Best effort: keep the batch shape intact, but report which file was
            # replaced by an all-zero spectrogram instead of hiding the failure.
            if path not in self._reported_failures:
                self._reported_failures.add(path)
                print(f"[AccentDataset] could not process {path}: {exc!r}; using zeros")
            spec = np.zeros((N_MELS, MAX_LEN))
        return torch.FloatTensor(spec).unsqueeze(0), item['label']

print("‚úÖ AccentDataset class defined")

## 7️⃣ CNN-BiLSTM Model


In [None]:
class CNN_BiLSTM_Accent(nn.Module):
    """Three conv blocks, a 2-layer BiLSTM and attention pooling over time.

    Args:
        num_classes: number of output classes.
        n_mels: height (mel bands) of the input spectrograms. The default of
            128 reproduces the original fixed-size network.

    Input:  tensor of shape (batch, 1, n_mels, frames).
    Output: tensor of shape (batch, num_classes) with unnormalised scores.
    """

    def __init__(self, num_classes=16, n_mels=128):
        super().__init__()
        if n_mels < 8:
            raise ValueError("n_mels must be at least 8 (three 2x2 poolings)")
        # Each block halves both spatial axes, so the mel axis shrinks by 8 overall.
        self.feature_dim = 128 * (n_mels // 8)

        self.conv1 = nn.Sequential(nn.Conv2d(1, 32, 3, 1, 1), nn.BatchNorm2d(32), nn.ReLU(), nn.MaxPool2d(2), nn.Dropout(0.2))
        self.conv2 = nn.Sequential(nn.Conv2d(32, 64, 3, 1, 1), nn.BatchNorm2d(64), nn.ReLU(), nn.MaxPool2d(2), nn.Dropout(0.2))
        self.conv3 = nn.Sequential(nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128), nn.ReLU(), nn.MaxPool2d(2), nn.Dropout(0.3))
        self.lstm = nn.LSTM(self.feature_dim, 128, 2, batch_first=True, bidirectional=True, dropout=0.3)
        # 256 = 2 directions x 128 hidden units.
        self.attention = nn.Sequential(nn.Linear(256, 64), nn.Tanh(), nn.Linear(64, 1))
        self.fc = nn.Sequential(nn.Linear(256, 128), nn.ReLU(), nn.Dropout(0.4), nn.Linear(128, num_classes))

    def forward(self, x):
        x = self.conv3(self.conv2(self.conv1(x)))
        batch, channels, freq, frames = x.shape
        # (batch, channels, freq, frames) -> (batch, frames, channels * freq).
        # Explicit sizes: a mel height that disagrees with n_mels now fails in
        # the LSTM instead of being silently folded into the time axis.
        x = x.permute(0, 3, 1, 2).reshape(batch, frames, channels * freq)
        lstm_out, _ = self.lstm(x)
        # One weight per time step, normalised across the sequence.
        attn = F.softmax(self.attention(lstm_out), dim=1)
        return self.fc(torch.sum(attn * lstm_out, dim=1))

# Build the network for the classes that survived filtering and move it to the
# selected device before reporting its size.
model = CNN_BiLSTM_Accent(FINAL_NUM_CLASSES)
model = model.to(device)
param_total = sum(weights.numel() for weights in model.parameters())
print(f"‚úÖ Model: {FINAL_NUM_CLASSES} classes, {param_total:,} params")

## 8️⃣ Data Loaders


In [None]:
# Split on unique recordings, not on the oversampled list. Oversampling repeats
# files, so splitting the raw list would put copies of one recording in both
# train and validation and inflate the validation scores.
label_by_path = {}
for d in data_list:
    label_by_path.setdefault(d['audio_path'], d['label'])

# Sorted so the split does not depend on the shuffle order of data_list.
unique_paths = sorted(label_by_path)
train_paths, val_paths = train_test_split(
    unique_paths, test_size=0.2, random_state=42,
    stratify=[label_by_path[p] for p in unique_paths]
)
val_paths = set(val_paths)

# Training keeps the oversampled copies of its own recordings only.
train_data = [d for d in data_list if d['audio_path'] not in val_paths]

# Validation scores every held-out recording exactly once.
val_data = []
seen_val_paths = set()
for d in data_list:
    path = d['audio_path']
    if path in val_paths and path not in seen_val_paths:
        seen_val_paths.add(path)
        val_data.append(d)

train_dataset = AccentDataset(train_data, augment=True)
val_dataset = AccentDataset(val_data, augment=False)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2)

print(f"Train: {len(train_data)} ({len(train_loader)} batches)")
print(f"Val: {len(val_data)} ({len(val_loader)} batches)")

## 9️⃣ Training


In [None]:
# Inverse-frequency class weights from the training split; a class with no
# training samples is treated as having one, to avoid dividing by zero.
label_counts = Counter(d['label'] for d in train_data)
total = sum(label_counts.values())
class_weights = torch.FloatTensor([total / (FINAL_NUM_CLASSES * label_counts.get(i, 1)) for i in range(FINAL_NUM_CLASSES)]).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Halve the learning rate after 5 epochs without a lower validation loss.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=5, factor=0.5)

PATIENCE = 12
best_val_loss = float('inf')
# Defined up front: without this the summary below (and the checkpoint cell)
# hit a NameError when no epoch ever improves, e.g. if the loss is NaN.
best_val_acc = 0.0
patience_counter = 0
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

print(f"Training for up to {EPOCHS} epochs (patience={PATIENCE})...")
print("=" * 60)

for epoch in range(EPOCHS):
    # ---- training pass ----
    model.train()
    train_loss, correct, total_n = 0, 0, 0
    for inputs, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}', leave=False):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        # Cap the gradient norm before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        train_loss += loss.item()
        _, pred = outputs.max(1)
        total_n += labels.size(0)
        correct += pred.eq(labels).sum().item()
    train_acc = correct / total_n

    # ---- validation pass ----
    model.eval()
    val_loss, correct, total_n = 0, 0, 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            val_loss += criterion(outputs, labels).item()
            _, pred = outputs.max(1)
            total_n += labels.size(0)
            correct += pred.eq(labels).sum().item()
    val_acc = correct / total_n
    # val_loss is the sum over batches; the batch count is constant, so it
    # orders epochs exactly like the mean does.
    scheduler.step(val_loss)

    history['train_loss'].append(train_loss / len(train_loader))
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss / len(val_loader))
    history['val_acc'].append(val_acc)

    if val_loss < best_val_loss:
        # New best: remember it and persist the weights.
        best_val_loss = val_loss
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'accent_model.pth')
        print(f"Epoch {epoch+1}: Train {train_acc:.3f} | Val {val_acc:.3f} ‚≠ê BEST")
        patience_counter = 0
    else:
        patience_counter += 1
        print(f"Epoch {epoch+1}: Train {train_acc:.3f} | Val {val_acc:.3f} ({patience_counter}/{PATIENCE})")
        if patience_counter >= PATIENCE:
            print(f"\nüõë Early stopping at epoch {epoch+1}!")
            break

print("=" * 60)
print(f"Best Val Acc: {best_val_acc:.3f}")

## 🔟 Training Curves


In [None]:
# Two panels side by side: loss on the left, accuracy in percent on the right.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
loss_ax, acc_ax = axes

for split, legend_name in (('train', 'Train'), ('val', 'Val')):
    loss_ax.plot(history[f'{split}_loss'], label=legend_name)
    acc_ax.plot([fraction * 100 for fraction in history[f'{split}_acc']], label=legend_name)

for panel, title in ((loss_ax, 'Loss'), (acc_ax, 'Accuracy (%)')):
    panel.set_title(title)
    panel.legend()
    panel.grid(True)

plt.tight_layout()
plt.savefig('accent_training_curves.png', dpi=150)
plt.show()

## 1️⃣1️⃣ Evaluation


In [None]:
# Restore the best weights saved during training; map_location lets a
# checkpoint written on a GPU load on a CPU-only runtime as well.
model.load_state_dict(torch.load('accent_model.pth', map_location=device))
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for inputs, labels in val_loader:
        outputs = model(inputs.to(device))
        _, pred = outputs.max(1)
        all_preds.extend(pred.cpu().tolist())
        all_labels.extend(labels.tolist())

class_ids = list(range(FINAL_NUM_CLASSES))
accent_names = [FINAL_IDX_TO_ACCENT[i] for i in class_ids]

# Passing `labels` pins the report and the matrix to every class, so a class
# missing from the validation data cannot misalign rows and names.
print("üìä Classification Report:")
print(classification_report(all_labels, all_preds, labels=class_ids,
                            target_names=accent_names, zero_division=0))

cm = confusion_matrix(all_labels, all_preds, labels=class_ids)
# Row-normalise; rows with no true samples stay at zero instead of becoming NaN.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(cm.astype('float'), row_totals,
                    out=np.zeros(cm.shape, dtype=float), where=row_totals > 0)

fig, axes = plt.subplots(1, 2, figsize=(20, 8))
sns.heatmap(cm_norm, annot=True, fmt='.1%', cmap='Blues',
            xticklabels=accent_names, yticklabels=accent_names, ax=axes[0], vmin=0, vmax=1)
axes[0].set_xlabel('Predicted'); axes[0].set_ylabel('Actual')
axes[0].set_title('Normalized (Recall)'); axes[0].tick_params(axis='x', rotation=45)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=accent_names, yticklabels=accent_names, ax=axes[1])
axes[1].set_xlabel('Predicted'); axes[1].set_ylabel('Actual')
axes[1].set_title('Raw Counts'); axes[1].tick_params(axis='x', rotation=45)
plt.tight_layout()
plt.savefig('accent_confusion_matrix.png', dpi=150)
plt.show()

print("\nüéØ Per-class recall:")
for i, name in enumerate(accent_names):
    print(f"  {name}: {cm_norm[i, i]:.1%}")
print(f"  Overall: {np.trace(cm)/cm.sum():.1%}")

## 1️⃣2️⃣ Save & Download


In [None]:
# Save with metadata.
# Take the weights from the best-epoch file rather than from the live model, so
# they match 'best_val_acc' even if the evaluation cell was skipped. Loading to
# CPU keeps the checkpoint usable on machines without a GPU.
best_state_dict = torch.load('accent_model.pth', map_location='cpu')

checkpoint = {
    'model_state_dict': best_state_dict,
    'accent_labels': FINAL_IDX_TO_ACCENT,
    'num_classes': FINAL_NUM_CLASSES,
    'accents_list': FINAL_ACCENTS,
    'sample_rate': SAMPLE_RATE,
    'n_mels': N_MELS,
    'n_fft': N_FFT,
    'hop_length': HOP_LENGTH,
    # Clip length and frame count, needed to reproduce the preprocessing.
    'duration': DURATION,
    'max_len': MAX_LEN,
    'best_val_acc': best_val_acc
}
torch.save(checkpoint, 'accent_model_full.pth')

# Name the file that was actually written by this cell.
print(f"‚úÖ Saved accent_model_full.pth")
print(f"\nüéØ Final {FINAL_NUM_CLASSES} accent classes:")
for i in range(FINAL_NUM_CLASSES):
    print(f"  [{i}] {FINAL_IDX_TO_ACCENT[i]}")

print(f"\n‚ö†Ô∏è COPY THIS TO app.py:")
print(f"ACCENTS = {FINAL_ACCENTS}")

In [None]:
from google.colab import files

# Hand each artefact produced by the notebook to the browser, weights first.
for artefact in (
    'accent_model.pth',
    'accent_model_full.pth',
    'accent_training_curves.png',
    'accent_confusion_matrix.png',
):
    files.download(artefact)