In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# Running this cell (by clicking run or pressing Shift+Enter) lists all files already saved under the output directory /kaggle/working

import os
print("Output Dir:")
# Recursively print the full path of every file found under the working directory.
for folder, _subdirs, names in os.walk('/kaggle/working'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# main.py - Combined ECAPA-TDNN training skeleton (Python 3.10); the model in this cell is a single linear layer placeholder

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import numpy as np
from typing import List, Tuple

# ========================
# dataLoader.py contents
# ========================
class CustomDataset(data.Dataset):
    """
    In-memory dataset that pairs row ``i`` of a feature tensor with entry ``i``
    of a label tensor.

    Both tensors are copied (and detached from any autograd graph) on
    construction, and every lookup hands back fresh copies, so callers can
    never modify the stored data through a returned sample.
    """

    def __init__(self, data_list: torch.Tensor, labels: torch.Tensor):
        # Private, graph-free snapshots of the caller's tensors.
        self.data_list, self.labels = (
            source.clone().detach() for source in (data_list, labels)
        )

    def __len__(self) -> int:
        # Number of samples == size of the first dimension of the features.
        return len(self.data_list)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        sample = self.data_list[idx]
        target = self.labels[idx]
        return sample.clone().detach(), target.clone().detach()

# ========================
# model.py contents
# ========================
class ECAPA_TDNN(nn.Module):
    """
    Placeholder embedding model: a single affine projection from
    ``input_dim`` features to ``output_dim`` features.

    Despite the name, this version contains no convolutional or pooling
    layers; it only wraps one ``nn.Linear``.
    """

    def __init__(self, input_dim: int, output_dim: int):
        super().__init__()
        self.fc = nn.Linear(in_features=input_dim, out_features=output_dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Project ``x`` (last dimension ``input_dim``) to ``output_dim``."""
        projected = self.fc(x)
        return projected

# ========================
# loss.py contents
# ========================
class SoftmaxLoss(nn.Module):
    """
    Classification head plus cross-entropy in one module.

    Holds its own trainable linear layer that maps an embedding of size
    ``embedding_dim`` to ``num_classes`` logits, then scores those logits
    against integer class labels with ``nn.CrossEntropyLoss``.
    """

    def __init__(self, num_classes: int, embedding_dim: int):
        super().__init__()
        self.fc = nn.Linear(in_features=embedding_dim, out_features=num_classes)
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, embeddings: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Return the cross-entropy between the projected embeddings and ``labels``."""
        class_scores = self.fc(embeddings)
        loss = self.loss_fn(class_scores, labels)
        return loss

# ========================
# tools.py contents
# ========================
def compute_accuracy(predictions: torch.Tensor, labels: torch.Tensor) -> float:
    """
    Fraction of samples whose highest-scoring class equals the label.

    Args:
        predictions: per-class scores, indexed as (sample, class).
        labels: integer class index for each sample.

    Returns:
        Accuracy in [0, 1]. An empty batch yields 0.0 instead of raising
        ZeroDivisionError.
    """
    num_samples = labels.size(0)
    if num_samples == 0:
        # Nothing to score; avoid dividing by zero.
        return 0.0
    predicted = predictions.argmax(dim=1)
    return (predicted == labels).sum().item() / num_samples

# ========================
# ECAPAModel.py & trainECAPAModel.py contents
# ========================
def train_model(model: nn.Module, train_loader: data.DataLoader, criterion: nn.Module, optimizer: optim.Optimizer, epochs: int = 10) -> None:
    """
    Run a plain supervised training loop and print the mean batch loss per epoch.

    Args:
        model: network called as ``model(inputs)``; its output is passed to ``criterion``.
        train_loader: iterable yielding ``(inputs, labels)`` batches.
        criterion: callable as ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        epochs: number of passes over ``train_loader``.

    Batches are cast to float32 / int64 and moved to the device of the model's
    first parameter, so a model placed on an accelerator works without changes
    to the caller. If the loader yields no batches the epoch loss is reported
    as ``nan`` rather than raising ZeroDivisionError.
    """
    model.train()
    # Follow the model's device; a parameter-free model leaves batches where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for inputs, labels in train_loader:
            inputs = inputs.to(device=device, dtype=torch.float32)
            labels = labels.to(device=device, dtype=torch.long)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Count batches ourselves so an empty loader cannot trigger a division by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")

# Main execution
# Main execution: smoke-test the training loop on random data.
if __name__ == "__main__":
    input_dim, output_dim, num_classes = 256, 512, 10
    # Synthetic features and labels; there is no real signal to learn here.
    train_data = torch.randn(100, input_dim)
    train_labels = torch.randint(0, num_classes, (100,))
    train_dataset = CustomDataset(train_data, train_labels)
    train_loader = data.DataLoader(train_dataset, batch_size=32, shuffle=True)

    model = ECAPA_TDNN(input_dim, output_dim)
    criterion = SoftmaxLoss(num_classes, output_dim)
    # SoftmaxLoss owns the linear classification layer, so its weights must be
    # handed to the optimizer too; with model.parameters() alone the classifier
    # would stay at its random initialisation for the whole run.
    trainable_params = list(model.parameters()) + list(criterion.parameters())
    optimizer = optim.Adam(trainable_params, lr=0.001)

    train_model(model, train_loader, criterion, optimizer)

In [None]:
# Prints a literal completion marker; it does not check that earlier cells succeeded.
print("Done")

In [None]:
# main.py - ECAPA-TDNN for VoxCeleb on Kaggle (Python 3.10)

import os
from glob import glob
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as data
import torchaudio
import torchaudio.transforms as T
import numpy as np

# =======================
# Custom VoxCeleb Dataset
# =======================
class VoxCelebDataset(data.Dataset):
    """
    Custom dataset loader for VoxCeleb data.

    Assumes a directory structure such as:
      root/
         id00001/
             some_subfolder/
                 *.wav
         id00002/
             some_subfolder/
                 *.wav

    Only ``.wav`` files exactly two levels below ``root`` are indexed. The name
    of the first-level folder is used as the speaker ID, and speaker IDs are
    mapped to integer labels in sorted order.

    Args:
        root: directory containing one folder per speaker.
        transform: optional callable applied to the mono waveform
            (e.g. a mel-spectrogram module). ``None`` returns the raw waveform.

    Attributes set on construction: ``file_list`` (wav paths),
    ``speaker_list`` (speaker ID for each path), ``speaker_set`` (sorted unique
    speaker IDs) and ``speaker_to_label`` (speaker ID -> integer label).
    """
    def __init__(self, root: str, transform=None):
        self.transform = transform
        self.file_list = []
        self.speaker_list = []
        # Walk through each speaker folder
        for speaker in sorted(os.listdir(root)):
            speaker_path = os.path.join(root, speaker)
            if not os.path.isdir(speaker_path):
                continue
            # Each speaker folder may have multiple subfolders
            for subfolder in sorted(os.listdir(speaker_path)):
                subfolder_path = os.path.join(speaker_path, subfolder)
                if not os.path.isdir(subfolder_path):
                    continue
                # glob() returns files in filesystem order; sort so that sample
                # indices are identical on every machine and every run.
                for wav_path in sorted(glob(os.path.join(subfolder_path, "*.wav"))):
                    self.file_list.append(wav_path)
                    self.speaker_list.append(speaker)
        # Create mapping from speaker ID (string) to a numeric label
        self.speaker_set = sorted(set(self.speaker_list))
        self.speaker_to_label = {spk: idx for idx, spk in enumerate(self.speaker_set)}

    def __len__(self) -> int:
        """Number of indexed wav files."""
        return len(self.file_list)

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        """
        Load one utterance and return ``(features, label)``.

        ``features`` is time-major: a 3-D transform output has its leading
        dimension squeezed and the remaining two swapped; a 2-D result
        (for instance the raw waveform when no transform is set) is transposed.
        """
        wav_path = self.file_list[idx]
        # NOTE(review): the file's sample rate is not compared with the rate the
        # transform was configured for -- confirm all audio uses the expected rate.
        waveform, _sample_rate = torchaudio.load(wav_path)
        # If multi-channel, convert to mono by averaging
        if waveform.shape[0] > 1:
            waveform = waveform.mean(dim=0, keepdim=True)
        # Apply transform (e.g., MelSpectrogram) if provided. Compare against
        # None explicitly: a callable that defines __len__/__bool__ could be
        # falsy and would otherwise be skipped silently.
        if self.transform is not None:
            features = self.transform(waveform)
        else:
            features = waveform
        # features shape expected: (channel, n_mels, time)
        # For further processing, squeeze channel dim and transpose to (time, n_mels)
        if features.dim() == 3:
            features = features.squeeze(0).transpose(0, 1)
        else:
            features = features.transpose(0, 1)
        label = self.speaker_to_label[self.speaker_list[idx]]
        return features, label

def collate_fn(batch: List[Tuple[torch.Tensor, int]]) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Batch variable-length (time x n_mels) feature matrices by right-padding
    each one with zeros up to the longest sequence in the batch.

    Returns:
      - padded features: (batch, max_time, n_mels)
      - labels: (batch,) as int64
    """
    sequences = []
    targets = []
    for sample in batch:
        sequences.append(sample[0])
        targets.append(sample[1])

    longest = max(seq.shape[0] for seq in sequences)

    def _right_pad(seq: torch.Tensor) -> torch.Tensor:
        # Zero canvas of the target length; real frames fill the leading rows.
        canvas = torch.zeros(longest, seq.shape[1])
        canvas[:seq.shape[0], :] = seq
        return canvas

    stacked = torch.stack([_right_pad(seq) for seq in sequences])  # (batch, max_time, n_mels)
    return stacked, torch.tensor(targets, dtype=torch.long)

# =======================
# ECAPA-TDNN Implementation
# =======================

class SE_Res2Block(nn.Module):
    """
    Squeeze-Excitation Res2Block used in ECAPA-TDNN.
    Splits channels into several segments, processes them with convolutions and residual connections,
    then applies a squeeze-excitation module.

    Args:
        channels: number of input and output channels; must be divisible by ``scale``.
        scale: number of channel groups the feature map is split into.
        kernel_size: kernel size of the per-group dilated convolutions; must be odd
            so that the time length can be preserved.
        dilation: dilation of the per-group convolutions.

    Input and output are both shaped (batch, channels, time).

    Raises:
        ValueError: if ``channels`` is not divisible by ``scale`` or
            ``kernel_size`` is even.
    """
    def __init__(self, channels: int, scale: int = 8, kernel_size: int = 3, dilation: int = 1):
        super().__init__()
        if channels % scale != 0:
            # Otherwise the re-concatenated groups would not add up to `channels`.
            raise ValueError(f"channels ({channels}) must be divisible by scale ({scale})")
        if kernel_size % 2 == 0:
            raise ValueError(f"kernel_size ({kernel_size}) must be odd to preserve the time length")
        self.scale = scale
        self.width = channels // scale
        self.conv1 = nn.Conv1d(channels, channels, kernel_size=1)
        self.bn1 = nn.BatchNorm1d(channels)
        self.relu = nn.ReLU()
        # "Same" padding for a dilated conv. Equals `dilation` when kernel_size == 3;
        # for any other odd kernel the groups keep the input length so they can be
        # concatenated with the untouched first group.
        same_padding = dilation * (kernel_size - 1) // 2
        self.convs = nn.ModuleList([
            nn.Conv1d(self.width, self.width, kernel_size=kernel_size,
                      padding=same_padding, dilation=dilation)
            for _ in range(scale - 1)
        ])
        self.bn2 = nn.BatchNorm1d(channels)
        self.conv3 = nn.Conv1d(channels, channels, kernel_size=1)
        self.bn3 = nn.BatchNorm1d(channels)
        # Squeeze-excitation bottleneck: channels // 8, but never fewer than one channel.
        se_channels = max(channels // 8, 1)
        self.se = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Conv1d(channels, se_channels, kernel_size=1),
            nn.ReLU(),
            nn.Conv1d(se_channels, channels, kernel_size=1),
            nn.Sigmoid()
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply the block to ``x`` of shape (batch, channels, time); the shape is preserved."""
        residual = x
        out = self.relu(self.bn1(self.conv1(x)))
        splits = torch.split(out, self.width, dim=1)
        # The first group passes through untouched; the second is convolved on its own;
        # every later group is added to the previous group's output before its conv.
        out_splits = [splits[0]]
        for i in range(1, self.scale):
            if i == 1:
                sp = self.convs[i-1](splits[i])
            else:
                sp = self.convs[i-1](splits[i] + out_splits[i-1])
            out_splits.append(sp)
        out = torch.cat(out_splits, dim=1)
        out = self.relu(self.bn2(out))
        out = self.conv3(out)
        out = self.bn3(out)
        # Channel-wise gates in (0, 1), computed from the time-averaged features.
        w = self.se(out)
        out = out * w
        return out + residual

class AttentiveStatsPooling(nn.Module):
    """
    Attentive Statistics Pooling layer computes weighted mean and standard deviation over time.

    Args:
        in_channels: number of channels of the incoming feature map.
        hidden_channels: width of the bottleneck that produces the attention scores.

    ``forward`` maps (batch, in_channels, time) to (batch, 2 * in_channels):
    the attention-weighted means followed by the attention-weighted standard
    deviations.
    """
    def __init__(self, in_channels: int, hidden_channels: int):
        super().__init__()
        self.tanh = nn.Tanh()
        self.linear = nn.Conv1d(in_channels, hidden_channels, kernel_size=1)
        self.attention = nn.Conv1d(hidden_channels, in_channels, kernel_size=1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x shape: (batch, channels, time)
        scores = self.attention(self.tanh(self.linear(x)))
        # Per-channel attention weights that sum to one along the time axis.
        alpha = torch.softmax(scores, dim=2)
        mean = torch.sum(x * alpha, dim=2)
        variance = torch.sum((x ** 2) * alpha, dim=2) - mean ** 2
        # E[x^2] - E[x]^2 can come out slightly negative through floating-point
        # cancellation (e.g. near-constant inputs); clamp first so sqrt never
        # produces NaN. The epsilon keeps the gradient finite at zero variance.
        std = torch.sqrt(variance.clamp(min=0.0) + 1e-9)
        return torch.cat([mean, std], dim=1)

class ECAPA_TDNN(nn.Module):
    """
    Speaker-recognition network: a convolutional front end, three dilated
    SE-Res2 blocks, attentive statistics pooling, an embedding layer and a
    linear classifier.

    Takes features shaped (batch, time, n_mels) and returns the pair
    ``(embedding, logits)``.
    """
    def __init__(self, input_dim: int = 80, channels: int = 512, emb_dim: int = 192, num_classes: int = 1000):
        super().__init__()
        front_end = [
            nn.Conv1d(input_dim, channels, kernel_size=5, padding=2),
            nn.BatchNorm1d(channels),
            nn.ReLU(),
        ]
        self.layer1 = nn.Sequential(*front_end)
        # layer2..layer4 differ only in dilation (2, 3, 4).
        for position, dilation in enumerate((2, 3, 4), start=2):
            setattr(
                self,
                f"layer{position}",
                SE_Res2Block(channels, scale=8, kernel_size=3, dilation=dilation),
            )
        self.pooling = AttentiveStatsPooling(channels, channels)
        # Pooling concatenates mean and std, hence twice the channel count.
        self.fc = nn.Linear(2 * channels, emb_dim)
        self.classifier = nn.Linear(emb_dim, num_classes)

    def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        # (batch, time, feature) -> (batch, feature, time), as Conv1d expects.
        hidden = x.transpose(1, 2)
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            hidden = stage(hidden)
        pooled = self.pooling(hidden)  # (batch, channels*2)
        emb = self.fc(pooled)
        return emb, self.classifier(emb)

# =======================
# Loss and Training Functions
# =======================
class SoftmaxLoss(nn.Module):
    """
    Thin module wrapper that delegates to ``nn.CrossEntropyLoss``.

    It has no parameters of its own: it expects ready-made logits and integer
    class labels.
    """
    def __init__(self):
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss()

    def forward(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        """Return the cross-entropy of ``logits`` against ``labels``."""
        loss = self.loss_fn(logits, labels)
        return loss

def train_model(model: nn.Module, train_loader: data.DataLoader, criterion: nn.Module,
                optimizer: optim.Optimizer, epochs: int = 10) -> None:
    """
    Train a model whose forward pass returns ``(embedding, logits)`` and print
    the mean batch loss after every epoch.

    Args:
        model: network returning a 2-tuple; only the second element (logits) is scored.
        train_loader: iterable yielding ``(features, labels)`` batches.
        criterion: callable as ``criterion(logits, labels)`` returning a scalar loss.
        optimizer: optimizer stepped once per batch.
        epochs: number of passes over ``train_loader``.

    Batches are cast to float32 / int64 and moved to the device of the model's
    first parameter, so a model placed on an accelerator works without changes
    to the caller. If the loader yields no batches the epoch loss is reported
    as ``nan`` rather than raising ZeroDivisionError.
    """
    model.train()
    # Follow the model's device; a parameter-free model leaves batches where they are.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None
    for epoch in range(epochs):
        total_loss = 0.0
        num_batches = 0
        for features, labels in train_loader:
            features = features.to(device=device, dtype=torch.float32)
            labels = labels.to(device=device, dtype=torch.long)
            optimizer.zero_grad()
            # The embedding is not needed for the classification loss.
            _, logits = model(features)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1
        # Count batches ourselves so an empty loader cannot trigger a division by zero.
        mean_loss = total_loss / num_batches if num_batches else float("nan")
        print(f"Epoch {epoch+1}/{epochs}, Loss: {mean_loss:.4f}")

def test_model(model: nn.Module, test_loader: data.DataLoader) -> None:
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(torch.float32), labels.to(torch.long)
            emb, logits = model(features)
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f"Test Accuracy: {100 * correct / total:.2f}%")

# =======================
# Main Execution
# =======================
if __name__ == "__main__":
    # Define a transform to extract mel-spectrogram features
    # Adjust the sample_rate if needed; here, we assume 16kHz audio.
    transform = T.MelSpectrogram(sample_rate=16000, n_mels=80)

    # Define dataset roots (update these paths according to your Kaggle dataset structure)
    train_root = "/kaggle/input/audiodataset10percent/VoxCeleb/vox1_dev_wav"
    test_root = "/kaggle/input/audiodataset10percent/VoxCeleb/vox1_test_wav"

    # Initialize the training and testing datasets
    train_dataset = VoxCelebDataset(root=train_root, transform=transform)
    test_dataset = VoxCelebDataset(root=test_root, transform=transform)

    # Fail early with a readable message instead of an obscure sampler error.
    if len(train_dataset) == 0:
        raise RuntimeError(
            f"No .wav files found under {train_root}; "
            "expected <speaker>/<subfolder>/*.wav"
        )

    # Each VoxCelebDataset builds its own speaker -> index table from the folders
    # it sees, so index k in the test set would not mean the same speaker as
    # index k in training. Reuse the training table for the test set and evaluate
    # only on utterances whose speaker the classifier has a class for.
    label_map = train_dataset.speaker_to_label
    test_dataset.speaker_to_label = label_map
    known_indices = [
        position for position, speaker in enumerate(test_dataset.speaker_list)
        if speaker in label_map
    ]
    num_skipped = len(test_dataset) - len(known_indices)
    if num_skipped:
        print(f"Skipping {num_skipped} test utterances from speakers not seen in training")
    eval_dataset = data.Subset(test_dataset, known_indices)

    # Create DataLoaders with the custom collate function
    train_loader = data.DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
    test_loader = data.DataLoader(eval_dataset, batch_size=16, shuffle=False, collate_fn=collate_fn)

    # Determine the number of speakers from the training dataset
    num_classes = len(train_dataset.speaker_set)
    print(f"Number of speakers (classes): {num_classes}")

    # Initialize model, loss, and optimizer
    model = ECAPA_TDNN(input_dim=80, channels=512, emb_dim=192, num_classes=num_classes)
    criterion = SoftmaxLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    print("Starting training...")
    train_model(model, train_loader, criterion, optimizer, epochs=10)

    print("Evaluating on test set...")
    if len(eval_dataset) == 0:
        # NOTE(review): if the dev and test folders hold disjoint speakers (as in
        # the official VoxCeleb1 verification split -- confirm for this dataset),
        # closed-set accuracy is undefined and a verification metric is needed.
        print("No test utterances belong to training speakers; skipping closed-set evaluation.")
    else:
        test_model(model, test_loader)


In [None]:
# Prints a literal completion marker; it does not check that earlier cells succeeded.
print("Done")