In [6]:
!pip install librosa torchaudio torch numpy tqdm scikit-learn matplotlib




[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
import os, torch, librosa, torchaudio
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, confusion_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt

In [8]:
class RawNet2(nn.Module):
    """Small two-stage 1-D CNN mapping a single-channel waveform to 2 class logits.

    Args:
        input_length: Number of samples in every input waveform. It is only
            used to size the first fully connected layer, so the tensors passed
            to ``forward`` must have exactly this many samples. Defaults to
            2000.

    Raises:
        ValueError: If ``input_length`` is too short to survive both
            convolution + pooling stages.
    """

    def __init__(self, input_length=2000):
        super().__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1)
        self.pool1 = nn.MaxPool1d(4)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3)
        self.pool2 = nn.MaxPool1d(4)

        # Each unpadded kernel-3 convolution drops 2 samples and each pool
        # floor-divides the length by 4. For 2000 samples that is
        # 2000 -> 1998 -> 499 -> 497 -> 124; the previously hard-coded 123 did
        # not match and made the first forward pass fail with a shape error.
        pooled_length = ((input_length - 2) // 4 - 2) // 4
        if pooled_length < 1:
            raise ValueError(
                f"input_length={input_length} is too short for two conv/pool stages"
            )

        self.fc1 = nn.Linear(32 * pooled_length, 64)
        self.fc2 = nn.Linear(64, 2)

    def forward(self, x):
        """Return logits of shape (batch, 2) for ``x`` of shape (batch, 1, input_length)."""
        x = self.pool1(F.relu(self.conv1(x)))
        x = self.pool2(F.relu(self.conv2(x)))
        # Flatten (batch, 32, pooled_length) to (batch, 32 * pooled_length).
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

In [9]:
class AudioDataset(Dataset):
    """Fixed-length raw-waveform dataset read from ``bonafide``/``spoof`` folders.

    Files under ``root_dir/bonafide`` get label 0 and files under
    ``root_dir/spoof`` get label 1.

    Args:
        root_dir: Directory containing the ``bonafide`` and ``spoof`` folders.
        num_samples: Length every clip is trimmed or zero-padded to.
            Defaults to 2000.

    Raises:
        ValueError: If ``num_samples`` is not positive.
    """

    def __init__(self, root_dir, num_samples=2000):
        if num_samples <= 0:
            raise ValueError(f"num_samples must be positive, got {num_samples}")
        self.num_samples = num_samples
        self.files = []
        self.labels = []
        for label, folder in enumerate(["bonafide", "spoof"]):
            folder_path = os.path.join(root_dir, folder)
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # index -> file mapping identical across runs and machines.
            for file in sorted(os.listdir(folder_path)):
                path = os.path.join(folder_path, file)
                # Skip sub-directories and dot-files (e.g. .DS_Store), which
                # the audio loader cannot decode.
                if file.startswith(".") or not os.path.isfile(path):
                    continue
                self.files.append(path)
                self.labels.append(label)

    def __len__(self):
        """Number of audio files found."""
        return len(self.files)

    def __getitem__(self, idx):
        """Return ``(waveform, label)`` with waveform of shape (1, num_samples)."""
        path = self.files[idx]
        # Loaded at a fixed 16 kHz rate; the returned rate is therefore unused.
        y, _ = librosa.load(path, sr=16000)
        y = torch.tensor(y[:self.num_samples])  # keep only the leading samples
        missing = self.num_samples - y.shape[0]
        if missing > 0:
            # Right-pad short clips with zeros so every item has equal length.
            y = F.pad(y, (0, missing))
        return y.unsqueeze(0), torch.tensor(self.labels[idx])

In [10]:
import torch
from torch.utils.data import Dataset, DataLoader
import torchaudio
import os

class AudioDataset(Dataset):
    """Waveform dataset read from ``bonafide``/``spoof`` folders via torchaudio.

    Files under ``root_dir/bonafide`` get label 1 and files under
    ``root_dir/spoof`` get label 0. Only ``.wav`` and ``.flac`` files are used
    (extension matched case-insensitively).

    Args:
        root_dir: Directory containing the ``bonafide`` and ``spoof`` folders.
        transform: Optional callable applied to the waveform before returning.
        sample_rate: Target sample rate; files at another rate are resampled.
        num_samples: If given, every waveform is cropped / zero-padded along
            its last dimension to exactly this many samples (before
            ``transform``), so items can be stacked into batches. ``None``
            (default) keeps each file's own length.

    Raises:
        ValueError: If ``num_samples`` is given but not positive.
    """

    AUDIO_EXTENSIONS = ('.wav', '.flac')

    def __init__(self, root_dir, transform=None, sample_rate=16000, num_samples=None):
        if num_samples is not None and num_samples <= 0:
            raise ValueError(f"num_samples must be positive, got {num_samples}")
        self.root_dir = root_dir
        self.transform = transform
        self.sample_rate = sample_rate
        self.num_samples = num_samples
        self.audio_files = []
        self.labels = []
        # One Resample module per source rate, built lazily and reused instead
        # of being re-created for every item.
        self._resamplers = {}

        for label_folder in ['bonafide', 'spoof']:
            full_path = os.path.join(root_dir, label_folder)
            label = 1 if label_folder == 'bonafide' else 0
            # Sorted so the index -> file mapping is reproducible;
            # os.listdir gives no ordering guarantee.
            for file in sorted(os.listdir(full_path)):
                if file.lower().endswith(self.AUDIO_EXTENSIONS):
                    self.audio_files.append(os.path.join(full_path, file))
                    self.labels.append(label)

    def __len__(self):
        """Number of audio files found."""
        return len(self.audio_files)

    def _fit_length(self, waveform):
        """Crop or right-pad with zeros so the last dimension equals ``num_samples``."""
        waveform = waveform[..., :self.num_samples]
        missing = self.num_samples - waveform.shape[-1]
        if missing > 0:
            waveform = torch.nn.functional.pad(waveform, (0, missing))
        return waveform

    def __getitem__(self, idx):
        """Return ``(waveform, label)`` for item ``idx``; label is a plain int."""
        audio_path = self.audio_files[idx]
        waveform, sr = torchaudio.load(audio_path)

        # Resample if needed
        if sr != self.sample_rate:
            resampler = self._resamplers.get(sr)
            if resampler is None:
                resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
                self._resamplers[sr] = resampler
            waveform = resampler(waveform)

        if self.num_samples is not None:
            waveform = self._fit_length(waveform)

        if self.transform:
            waveform = self.transform(waveform)

        return waveform, self.labels[idx]


In [11]:
# Training split: batches of 8, reshuffled on every pass over the data.
train_dataset = AudioDataset(root_dir="data/train")
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)

# Test split: same batch size, iterated in dataset order.
test_dataset = AudioDataset(root_dir="data/test")
test_loader = DataLoader(dataset=test_dataset, shuffle=False, batch_size=8)


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class RawNet2(nn.Module):
    """1-D conv stem, one residual block, global average pool, linear classifier.

    Args:
        input_dim: Number of input channels of the waveform tensor.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim=1, num_classes=2):
        super().__init__()
        width = 128

        # Stem: length-preserving convolution (kernel 3, padding 1) + batch norm.
        self.conv1 = nn.Conv1d(input_dim, width, kernel_size=3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm1d(width)
        self.relu = nn.ReLU()

        # Residual branch: conv-bn-relu-conv-bn, all at constant width and length.
        branch_layers = [
            nn.Conv1d(width, width, kernel_size=3, padding=1),
            nn.BatchNorm1d(width),
            nn.ReLU(),
            nn.Conv1d(width, width, kernel_size=3, padding=1),
            nn.BatchNorm1d(width),
        ]
        self.resblock = nn.Sequential(*branch_layers)

        self.fc = nn.Linear(width, num_classes)

    def forward(self, x):
        """Map ``x`` of shape (batch, input_dim, time) to logits (batch, num_classes)."""
        stem = self.relu(self.bn1(self.conv1(x)))

        # Skip connection: add the stem output back onto the branch output.
        merged = F.relu(self.resblock(stem) + stem)

        # Average over the time axis so any input length gives a 128-d vector.
        pooled = merged.mean(dim=2)
        return self.fc(pooled)


In [13]:
# Seed the global torch RNG before the model is built so that weight
# initialisation is repeatable from run to run. The DataLoader shuffle is
# expected to follow this seed too when no generator is passed.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RawNet2().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

for epoch in range(5):
    model.train()
    running_loss = 0.0
    num_batches = 0
    for inputs, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
        inputs, labels = inputs.to(device), labels.to(device)

        # Clear gradients left over from the previous step before this one.
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean batch loss: the raw sum grows with the number of batches,
    # so it cannot be compared across dataset sizes or batch sizes.
    # max(..., 1) keeps an empty loader from dividing by zero.
    epoch_loss = running_loss / max(num_batches, 1)
    print(f"Epoch {epoch+1} - Loss: {epoch_loss:.4f}")


Epoch 1:   3%|███▎                                                                                                                      | 48/1745 [09:00<5:18:24, 11.26s/it]


KeyboardInterrupt: 

In [14]:
# Score the model on the test loader with gradient tracking switched off.
model.eval()
y_true = []
y_pred = []
with torch.no_grad():
    for inputs, labels in test_loader:
        outputs = model(inputs.to(device))
        # Predicted class = index of the largest logit in each row.
        y_pred.extend(outputs.argmax(dim=1).cpu().numpy())
        y_true.extend(labels.numpy())

accuracy = accuracy_score(y_true, y_pred)
print("Accuracy:", accuracy)
matrix = confusion_matrix(y_true, y_pred)
print("Confusion Matrix:\n", matrix)

Accuracy: 0.6672794117647058
Confusion Matrix:
 [[538   6]
 [356 188]]
