In [1]:
!pip install torchaudio
!pip install scikit-learn transformers tqdm


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:

import os
import torch
import torchaudio
import pickle
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import torch.nn.functional as F
from sklearn.metrics import accuracy_score



  from .autonotebook import tqdm as notebook_tqdm


In [17]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import torchaudio
from transformers import Wav2Vec2Processor, Wav2Vec2Model
from tqdm.auto import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class AudioDataset(Dataset):
    def __init__(self, file_paths, labels, processor, model, min_length=16000):
        self.file_paths = file_paths
        self.labels = labels
        self.processor = processor
        self.model = model.to(device)
        self.min_length = min_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.file_paths[idx])
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        if waveform.size(1) < self.min_length:
            padding_size = self.min_length - waveform.size(1)
            waveform = F.pad(waveform, (0, padding_size), "constant")
        input_values = self.processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values.to(device)
        with torch.no_grad():
            features = self.model(input_values).last_hidden_state.mean(dim=1)
        return features.squeeze(0), torch.tensor(self.labels[idx], device=device)

class AudioCNN(nn.Module):
    def __init__(self, num_classes, input_channels):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(input_channels, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)

        # Using a dummy input to determine the size after convolutions and pooling
        dummy_input = torch.rand(1, input_channels, 16000)
        dummy_output = self.forward_features(dummy_input)
        fc1_input_size = dummy_output.shape[1]

        self.fc1 = nn.Linear(fc1_input_size, 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward_features(self, x):
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return x

    def forward(self, x):
        x = self.forward_features(x)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x




        

# Initialize and prepare the CNN
net = AudioCNN(num_classes=10, input_channels=768).to(device)

# Load the processor and model for feature extraction
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h").to(device)
model.eval()

# Prepare the dataset
recordings_path = 'free-spoken-digit-dataset/recordings'
file_paths = [os.path.join(recordings_path, f) for f in os.listdir(recordings_path) if f.endswith('.wav')]
labels = [int(f.split('_')[0]) for f in os.listdir(recordings_path) if f.endswith('.wav')]

file_paths_train, file_paths_temp, labels_train, labels_temp = train_test_split(
    file_paths, labels, test_size=0.4, random_state=42)
file_paths_val, file_paths_test, labels_val, labels_test = train_test_split(
    file_paths_temp, labels_temp, test_size=0.5, random_state=42)

train_dataset = AudioDataset(file_paths_train, labels_train, processor, model)
val_dataset = AudioDataset(file_paths_val, labels_val, processor, model)
test_dataset = AudioDataset(file_paths_test, labels_test, processor, model)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Training the model with validation
num_epochs = 30
for epoch in range(num_epochs):
    net.train()
    for features, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        features = features.unsqueeze(1)
        optimizer.zero_grad()
        outputs = net(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    net.eval()
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for features, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            outputs = net(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_accuracy += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy /= len(val_loader.dataset)
    print(f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_accuracy * 100:.2f}%")

# Testing the model
net.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for features, labels in test_loader:
        outputs = net(features)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/30 - Training:   0%|          | 0/57 [00:00<?, ?it/s]


RuntimeError: Given groups=1, weight of size [16, 768, 3], expected input[32, 1, 1024] to have 768 channels, but got 1 channels instead

In [7]:
# Save the trained model
model_save_path = 'model_directory/audio_cnn_model.pth'
torch.save(net.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")

Model saved to model_directory/audio_cnn_model.pth


Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [26]:

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda")

class AudioDataset(Dataset):
    def __init__(self, file_paths, labels, processor, model, min_length=16000):
        self.file_paths = file_paths
        self.labels = labels
        self.processor = processor
        self.model = model.to(device)
        self.min_length = min_length

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, idx):
        waveform, sample_rate = torchaudio.load(self.file_paths[idx])
        if sample_rate != 16000:
            resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
            waveform = resampler(waveform)
        if waveform.size(1) < self.min_length:
            padding_size = self.min_length - waveform.size(1)
            waveform = F.pad(waveform, (0, padding_size), "constant")
        input_values = self.processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=16000).input_values.to(device)
        with torch.no_grad():
            features = self.model(input_values).last_hidden_state.mean(dim=1)
        return features.squeeze(0), torch.tensor(self.labels[idx], device=device)

class AudioCNN(nn.Module):
    def __init__(self, num_classes):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64, 128)  # Temporary initialization
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)  # Ensure this is the only place we unsqueeze
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def initialize_fc1(self, sample_feature):
        # Pass a sample feature tensor through the convolutional layers to get the output size
        output = self.forward_features(sample_feature.unsqueeze(0))  # Unsqueeze to add a batch dimension
        output_size = output.size(-1)
        self.fc1 = nn.Linear(output_size, 128).to(device)





    def forward_features(self, x):
        # Remove the unsqueeze from here
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        return x


  

# Load the processor and model for feature extraction
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h").to(device)
model.eval()

# Prepare the dataset
recordings_path = 'free-spoken-digit-dataset/recordings'
file_paths = [os.path.join(recordings_path, f) for f in os.listdir(recordings_path) if f.endswith('.wav')]
labels = [int(f.split('_')[0]) for f in os.listdir(recordings_path) if f.endswith('.wav')]

file_paths_train, file_paths_temp, labels_train, labels_temp = train_test_split(
    file_paths, labels, test_size=0.4, random_state=42)
file_paths_val, file_paths_test, labels_val, labels_test = train_test_split(
    file_paths_temp, labels_temp, test_size=0.5, random_state=42)

train_dataset = AudioDataset(file_paths_train, labels_train, processor, model)
val_dataset = AudioDataset(file_paths_val, labels_val, processor, model)
test_dataset = AudioDataset(file_paths_test, labels_test, processor, model)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Initialize and prepare the CNN
net = AudioCNN(num_classes=10).to(device)
#net.initialize_fc1(16000)  # Initialize with the correct input size, here it's 16000 for the audio length
# Get a sample feature from the dataset (adjust as necessary to get the correct feature tensor)
sample_feature, _ = train_dataset[0]
net.initialize_fc1(sample_feature)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Training the model with validation
num_epochs = 30
for epoch in range(num_epochs):
    net.train()
    for features, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training"):
        optimizer.zero_grad()
        outputs = net(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        
    net.eval()
    val_loss = 0
    val_accuracy = 0
    with torch.no_grad():
        for features, labels in tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation"):
            outputs = net(features)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            val_accuracy += (predicted == labels).sum().item()
    val_loss /= len(val_loader)
    val_accuracy /= len(val_loader.dataset)
    print(f"Validation loss: {val_loss:.4f}, Validation accuracy: {val_accuracy * 100:.2f}%")

# Testing the model
net.eval()
predictions = []
true_labels = []
with torch.no_grad():
    for features, labels in test_loader:
        outputs = net(features)
        _, predicted = torch.max(outputs.data, 1)
        predictions.extend(predicted.tolist())
        true_labels.extend(labels.tolist())

accuracy = accuracy_score(true_labels, predictions)
print(f"Accuracy on test set: {accuracy * 100:.2f}%")


Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/30 - Training:   0%|          | 0/57 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (32x8192 and 128x128)

In [62]:
import os

model_path = 'model_directory/audio_cnn_model.pth'  # Update this to the correct path

if os.path.exists(model_path) and model_path.endswith('.pth'):
    net.load_state_dict(torch.load(model_path, map_location=device))
else:
    raise FileNotFoundError("Model file not found or invalid. Please check the path and file extension.")


RuntimeError: Error(s) in loading state_dict for AudioCNN:
	size mismatch for fc1.weight: copying a param with shape torch.Size([128, 8192]) from checkpoint, the shape in current model is torch.Size([128, 4096]).