In [2]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from transformers import Wav2Vec2Processor, Wav2Vec2Model
import os
import pickle
from torch.nn.utils.rnn import pad_sequence
import torchaudio

In [7]:

# Define training parameters
num_epochs = 10
batch_size = 32
learning_rate = 0.001
num_classes = 10  # Number of classes in the FSDD dataset

# Check if GPU is available and set the device accordingly
device = torch.device('cuda')
# Define the classifier model using the Wav2Vec model for feature extraction
# Define the classifier model using a CNN and Wav2Vec model for feature extraction
class Wav2VecCNNClassifier(nn.Module):
    def __init__(self, num_classes):
        super(Wav2VecCNNClassifier, self).__init__()
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
        self.wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h").to(device)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc = nn.Linear(32 * 16, num_classes)

    def forward(self, x):
        with torch.no_grad():
            features = self.wav2vec(x.input_values).last_hidden_state.squeeze().mean(dim=1)
        features = features.unsqueeze(1)  # Add channel dimension for CNN
        x = F.relu(self.conv1(features))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)  # Flatten the features
        return self.fc(x)

# Initialize the model and move it to the device (GPU or CPU)
# Initialize the Wav2VecCNN classifier model
classifier = Wav2VecCNNClassifier(num_classes).to(device)


# Load and preprocess the FSDD dataset
def load_fsdd_data(dataset_path):
    data = []
    labels = []
    for filename in os.listdir(dataset_path):
        if filename.endswith(".wav"):
            filepath = os.path.join(dataset_path, filename)
            waveform, sample_rate = torchaudio.load(filepath)
            label = int(filename[0])  # Extracting the label from the filename
            data.append(waveform.squeeze(0))  # Remove the channel dimension for padding
            labels.append(label)
    # Pad the sequences to have the same length
    data = pad_sequence(data, batch_first=True).unsqueeze(1)  # Add the channel dimension back
    return data, torch.tensor(labels)  # Directly return the padded data tensor and label tensor


# Load and preprocess the EMI FSDD dataset
def load_emi_fsdd_data(dataset_path):
    data = []
    labels = []
    for filename in os.listdir(dataset_path):
        if filename.endswith(".pkl"):
            with open(os.path.join(dataset_path, filename), 'rb') as file:
                file_data = pickle.load(file)
                for waveform, label_info in file_data:
                    waveform = waveform.squeeze(0)  # Remove the channel dimension for padding, if necessary
                    label = label_info[0]  # Assuming the first element of label_info is the label tensor
                    data.append(waveform)
                    labels.append(label.item())  # Convert label tensor to integer
    # Pad the sequences to have the same length
    data = pad_sequence(data, batch_first=True).unsqueeze(1)  # Add the channel dimension back
    return data, torch.tensor(labels)  # Directly return the padded data tensor and label tensor




# Define paths to FSDD and EMI FSDD datasets
fsdd_dataset_path = "free-spoken-digit-dataset/recordings/"
emi_fsdd_dataset_path = "emi_dataset/"

# Load FSDD dataset
fsdd_data, fsdd_labels = load_fsdd_data(fsdd_dataset_path)
fsdd_dataset = TensorDataset(fsdd_data, fsdd_labels)
fsdd_loader = DataLoader(fsdd_dataset, batch_size=batch_size, shuffle=True)

# Load EMI FSDD dataset
emi_fsdd_data, emi_fsdd_labels = load_emi_fsdd_data(emi_fsdd_dataset_path)
emi_fsdd_dataset = TensorDataset(emi_fsdd_data, emi_fsdd_labels)
emi_fsdd_loader = DataLoader(emi_fsdd_dataset, batch_size=batch_size)

# Initialize the Wav2Vec classifier model
classifier = Wav2VecClassifier(num_classes)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    classifier.train()
    for batch_idx, (data, labels) in enumerate(fsdd_loader):
        data, labels = data.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = classifier(data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        print(f"Epoch [{epoch + 1}/{num_epochs}], Step [{batch_idx + 1}/{len(fsdd_loader)}], Loss: {loss.item():.4f}")

# Save the trained model
torch.save(classifier.state_dict(), 'model_directory/fsdd_model.pth')

# Load the saved model
classifier.load_state_dict(torch.load('model_directory/fsdd_model.pth'))

# Evaluation on EMI FSDD dataset
classifier.eval()
correct = 0
total = 0
with torch.no_grad():
    for data, labels in emi_fsdd_loader:
        data, labels = data.to(device), labels.to(device)
        outputs = classifier(data)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Accuracy on EMI FSDD dataset: {accuracy:.2f}%")

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


NameError: name 'Wav2VecClassifier' is not defined

In [None]:
driver = webdriver.Chrome('/content/drive/MyDrive/chromedriver_mac64/chromedriver')
