In [2]:
pip install accelerate -U

Defaulting to user installation because normal site-packages is not writeable
Collecting accelerate
  Downloading accelerate-0.29.1-py3-none-any.whl.metadata (18 kB)
Downloading accelerate-0.29.1-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.3/297.3 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: accelerate
Successfully installed accelerate-0.29.1
Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset, Subset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor, TrainingArguments, Trainer
from sklearn.model_selection import KFold

# AudioCNN definition
class AudioCNN(nn.Module):
    def __init__(self, num_classes, input_size):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * (input_size // 8), 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# AudioData for Wav2Vec2
class AudioData(Dataset):
    def __init__(self, waveforms, labels, processor, sampling_rate=16000):
        self.processor = processor
        self.waveforms = waveforms
        self.labels = labels
        self.sampling_rate = sampling_rate
        self.label_to_index = {label: index for index, label in enumerate(sorted(set(labels)))}

    def __len__(self):
        return len(self.waveforms)

    def __getitem__(self, idx):
        waveform = self.waveforms[idx]
        label = self.labels[idx]
        label_index = self.label_to_index[label]  # Convert string label to integer index
        return {
            "input_values": self.processor(waveform, sampling_rate=self.sampling_rate).input_values[0],
            "labels": torch.tensor(label_index)  # Use the integer index here
        }


# Function to load data
def load_raw_audio_data(dataset_path, data_type):
    data = []
    labels = []
    for file_number in range(1, 51):
        filename = f"data_{file_number}_{data_type}.pkl"
        file_path = os.path.join(dataset_path, filename)
        if os.path.exists(file_path):
            with open(file_path, 'rb') as file:
                file_data = pickle.load(file)
                for waveform, label in file_data:
                    data.append(waveform.squeeze().numpy())
                    labels.append(label)
    return data, labels

# Load data
dataset_path = 'emi_dataset/'
waveforms, labels = load_raw_audio_data(dataset_path, "train")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
audio_data = AudioData(waveforms, labels, processor)

# Initialize Wav2Vec2
model_wav2vec2 = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h")

# Fine-tune Wav2Vec2
training_args = TrainingArguments(
    output_dir="./wav2vec2_finetuned",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
trainer = Trainer(
    model=model_wav2vec2,
    args=training_args,
    train_dataset=audio_data,
)
trainer.train()

# Cross-validation for AudioCNN
num_folds = 5
kfold = KFold(n_splits=num_folds, shuffle=True)
num_classes = 10
input_size = 1024  # Adjust based on Wav2Vec2 feature size

for fold, (train_ids, valid_ids) in enumerate(kfold.split(waveforms)):
    print(f"FOLD {fold}")

    # Creating train and validation subsets
    train_subsampler = Subset(audio_data, train_ids)
    valid_subsampler = Subset(audio_data, valid_ids)

    # Data loaders for the subsets
    train_loader = DataLoader(train_subsampler, batch_size=16, shuffle=True)
    valid_loader = DataLoader(valid_subsampler, batch_size=16)

    # Initializing the AudioCNN model for this fold
    audio_cnn = AudioCNN(num_classes=num_classes, input_size=input_size)
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(audio_cnn.parameters(), lr=0.001)

    # Training loop for AudioCNN
    for epoch in range(num_epochs):
        audio_cnn.train()
        for batch in train_loader:
            # Extract features using the fine-tuned Wav2Vec2 model
            with torch.no_grad():
                inputs = processor(batch['input_values'], return_tensors="pt", padding=True, sampling_rate=16000)
                features = model_wav2vec2(inputs.input_values).last_hidden_state.mean(dim=1).squeeze()

            # Train AudioCNN
            optimizer.zero_grad()
            outputs = audio_cnn(features)
            loss = criterion(outputs, batch['labels'])
            loss.backward()
            optimizer.step()

        # Validation loop
        audio_cnn.eval()
        val_loss = 0
        correct = 0
        total = 0
        for batch in valid_loader:
            with torch.no_grad():
                inputs = processor(batch['input_values'], return_tensors="pt", padding=True, sampling_rate=16000)
                features = model_wav2vec2(inputs.input_values).last_hidden_state.mean(dim=1).squeeze()

            outputs = audio_cnn(features)
            loss = criterion(outputs, batch['labels'])
            val_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            total += batch['labels'].size(0)
            correct += (predicted == batch['labels']).sum().item()

        val_loss /= len(valid_loader)
        print(f'Fold {fold}, Epoch {epoch+1}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {(100 * correct / total):.2f}%')


# Load test data
test_waveforms, test_labels = load_raw_audio_data(dataset_path, "test")
test_audio_data = AudioData(test_waveforms, test_labels, processor)

# Create a DataLoader for the test data
test_loader = DataLoader(test_audio_data, batch_size=16)

# Evaluate the AudioCNN on the test set
audio_cnn.eval()
test_loss = 0
test_correct = 0
test_total = 0

with torch.no_grad():
    for batch in test_loader:
        # Extract features for the test data using the fine-tuned Wav2Vec2 model
        inputs = processor(batch['input_values'], return_tensors="pt", padding=True, sampling_rate=16000)
        features = model_wav2vec2(inputs.input_values).last_hidden_state.mean(dim=1).squeeze()

        # Forward pass through the AudioCNN
        outputs = audio_cnn(features)
        loss = criterion(outputs, batch['labels'])
        test_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        test_total += batch['labels'].size(0)
        test_correct += (predicted == batch['labels']).sum().item()

test_loss /= len(test_loader)
test_accuracy = 100 * test_correct / test_total
print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')


Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

ValueError: Label values must be <= vocab_size: 32