In [1]:
import os
import torch
import torchaudio
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch.nn.functional as F
from sklearn.metrics import accuracy_score


In [2]:

# AudioDataset class remains the same
class AudioDataset(Dataset):
    def __init__(self, file_paths, labels, processor, model, min_length=16000):
        self.file_paths = file_paths
        self.labels = labels
        self.processor = processor
        self.model = model
        self.min_length = min_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.file_paths[idx])
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        if waveform.size(1) < self.min_length:
            padding_size = self.min_length - waveform.size(1)
            waveform = F.pad(waveform, (0, padding_size), "constant")
        input_values = self.processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values
        with torch.no_grad():
            features = self.model(input_values).last_hidden_state.mean(dim=1)
        return features.squeeze(0), self.labels[idx]

# Define the AudioCNN class
class AudioCNN(nn.Module):
    def __init__(self, num_classes):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        # We'll determine the correct size in a moment
        self.fc1 = nn.Linear(64, 128)  # This 64 will be replaced
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Add a channel dimension
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten the features
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def initialize_fc1(self, input_size):
        # Temporarily create a random tensor to determine output size
        sample_input = torch.rand(1, 1, input_size)
        output = self.pool(F.relu(self.conv3(self.pool(F.relu(self.conv2(self.pool(F.relu(self.conv1(sample_input)))))))))
        output_size = output.view(-1).shape[0]
        # Now initialize fc1 with the correct input size
        self.fc1 = nn.Linear(output_size, 128)


# Load the processor and model for feature extraction
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
model.eval()

# Prepare the dataset
recordings_path = 'free-spoken-digit-dataset/recordings'
file_paths = [os.path.join(recordings_path, f) for f in os.listdir(recordings_path) if f.endswith('.wav')]
labels = [int(f.split('_')[0]) for f in os.listdir(recordings_path) if f.endswith('.wav')]

file_paths_train, file_paths_test, labels_train, labels_test = train_test_split(file_paths, labels, test_size=0.2, random_state=42)

train_dataset = AudioDataset(file_paths_train, labels_train, processor, model)
test_dataset = AudioDataset(file_paths_test, labels_test, processor, model)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize the CNN
net = AudioCNN(num_classes=10)
net.initialize_fc1(1024)  # Replace 1024 with the actual size of the input feature vector

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Training the model
num_epochs = 10
net.train()
for epoch in range(num_epochs):
    for features, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        optimizer.zero_grad()
        outputs = net(features)
        loss = criterion(outputs, torch.tensor(labels, dtype=torch.long))
        loss.backward()
        optimizer.step()

# Evaluating the model
net.eval()  # Switch the network to evaluation mode
predictions = []
true_labels = []
with torch.no_grad():  # No need to track gradients for evaluation
    for features, labels in test_loader:
        outputs = net(features)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.tolist())
        true_labels.extend(labels)

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  loss = criterion(outputs, torch.tensor(labels, dtype=torch.long))
Epoch 1/10: 100%|██████████| 75/75 [09:12<00:00,  7.37s/it]
Epoch 2/10: 100%|██████████| 75/75 [15:59<00:00, 12.79s/it]
Epoch 3/10: 100%|██████████| 75/75 [09:30<00:00,  7.60s/it]
Epoch 4/10: 100%|██████████| 75/75 [09:22<00:00,  7.50s/it]
Epoch 5/10: 100%|██████████| 75/75 [09:00<00:00,  7.21s/it]
Epoch 6/10: 100%|██████████| 75/75 [11:12<00:00,  8.97s/it]
Epoch 7/10: 100%|██████████| 75/75 [11:23<00:00,  9.12s/it]
Epoch 8/10: 100%|██████████| 75/75 [11:03<00:00,  8.85s/it]
Epoch 9/10: 100%|█████████

Accuracy on test set: 92.33%
