In [1]:
!pip install torchaudio
!pip install scikit-learn transformers tqdm


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
import os
import torch
import torchaudio
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch.nn.functional as F
from sklearn.metrics import accuracy_score


  from .autonotebook import tqdm as notebook_tqdm


In [7]:
class AudioDataset(Dataset):
    def __init__(self, file_paths, labels, processor, model, min_length=16000):
        self.file_paths = file_paths
        self.labels = labels
        self.processor = processor
        self.model = model
        self.min_length = min_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.file_paths[idx])
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        if waveform.size(1) < self.min_length:
            padding_size = self.min_length - waveform.size(1)
            waveform = F.pad(waveform, (0, padding_size), "constant")
        input_values = self.processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values
        with torch.no_grad():
            features = self.model(input_values).last_hidden_state.mean(dim=1)
        return features.squeeze(0), self.labels[idx]

class AudioCNN(nn.Module):
    def __init__(self, num_classes):
        super(AudioCNN, self).__init__()
        # Assuming features is a 1D tensor with 1024 features
        self.conv1 = nn.Conv1d(1024, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2, padding=1)
        
        # The linear layer's input size will be dynamically calculated later
        self.fc1 = nn.Linear(64, 128)  
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        # No need to unsqueeze since x will already have batch size as the first dimension
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten the features
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


    def initialize_fc1(self, input_size):
        sample_input = torch.rand(1, 1, input_size)
        output = self.pool(F.relu(self.conv3(self.pool(F.relu(self.conv2(self.pool(F.relu(self.conv1(sample_input)))))))))
        output_size = output.view(-1).shape[0]
        self.fc1 = nn.Linear(output_size, 128)

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
model.eval()

recordings_path = 'free-spoken-digit-dataset/recordings'
file_paths = [os.path.join(recordings_path, f) for f in os.listdir(recordings_path) if f.endswith('.wav')]
labels = [int(f.split('_')[0]) for f in os.listdir(recordings_path) if f.endswith('.wav')]

file_paths_train, file_paths_temp, labels_train, labels_temp = train_test_split(file_paths, labels, test_size=0.2, random_state=42)
file_paths_test, file_paths_val, labels_test, labels_val = train_test_split(file_paths_temp, labels_temp, test_size=0.5, random_state=42)
print("TR")
train_dataset = AudioDataset(file_paths_train, labels_train, processor, model)
print("V")
val_dataset = AudioDataset(file_paths_val, labels_val, processor, model)
print("T")
test_dataset = AudioDataset(file_paths_test, labels_test, processor, model)
print("TR")
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
print("V")
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
print("T")
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)



net = AudioCNN(num_classes=10)
net.initialize_fc1(1024)  # Adjust the input size based on your feature extractor's output
net.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

num_epochs = 10
net.train()
for epoch in range(num_epochs):
    for features, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        features, labels = features.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = net(features)
        loss = criterion(outputs, torch.tensor(labels, dtype=torch.long).to(device))
        loss.backward()
        optimizer.step()

    # Validation step at the end of each epoch
    net.eval()
    val_losses = []
    with torch.no_grad():
        for features, labels in val_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = net(features)
            loss = criterion(outputs, torch.tensor(labels, dtype=torch.long).to(device))
            val_losses.append(loss.item())
    print(f"Epoch {epoch+1}, Validation Loss: {sum(val_losses)/len(val_losses):.4f}")
    net.train()

# Evaluate the model on the test set
net.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for features, labels in test_loader:
        features, labels = features.to(device), labels.to(device)
        outputs = net(features)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.cpu().tolist())
        true_labels.extend(labels.cpu())

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")

# Save the model
torch.save({
    'model_state_dict': net.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'model_checkpoint.pth')

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TR
V
T
TR
V
T


RuntimeError: Given groups=1, weight of size [16, 1024, 3], expected input[1, 1, 1024] to have 1024 channels, but got 1 channels instead

  from .autonotebook import tqdm as notebook_tqdm
