In [1]:
import os
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from transformers import Wav2Vec2Processor, Wav2Vec2Model, Wav2Vec2ForCTC, AdamW
from torch.utils.data import Dataset, DataLoader
import torchaudio


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import os
import glob

audio_dir = 'emi_dataset/'
audio_paths = glob.glob(os.path.join(audio_dir, '*.pkl'))  # Adjust the pattern if your files have a different extension

# Check if we found any audio files
if not audio_paths:
    raise RuntimeError(f"No audio files found in directory {audio_dir}")

# Now create the dataset
audio_dataset = AudioDataset(audio_paths, labels, processor)


RuntimeError: No audio files found in directory emi_dataset/

In [5]:


# Define the AudioCNN class
class AudioCNN(nn.Module):
    def __init__(self, num_classes, input_size):
        super(AudioCNN, self).__init__()
        self.conv1 = nn.Conv1d(1, 16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv1d(32, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool1d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * (input_size // 8), 128)
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = x.unsqueeze(1)
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = F.relu(self.conv3(x))
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Function to load data and extract features using Wav2Vec2
def load_data(dataset_path, data_type, sampling_rate=16000):
    data = []
    labels = []
    processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
    model_wav2vec2 = Wav2Vec2Model.from_pretrained("fine_tuned_wav2vec2")

    for file_number in range(1, 51):
        filename = f"data_{file_number}_{data_type}.pkl"
        file_path = os.path.join(dataset_path, filename)
        if os.path.exists(file_path):
            with open(file_path, 'rb') as file:
                file_data = pickle.load(file)
                for waveform, label in file_data:
                    input_values = processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=sampling_rate).input_values
                    with torch.no_grad():
                        features = model_wav2vec2(input_values).last_hidden_state.mean(dim=1).squeeze()
                    data.append(features)
                    labels.append(label)

    labels = [x[0] for x in labels]
    return torch.stack(data), torch.tensor(labels)

# Fine-tuning Wav2Vec2 - Placeholder function
# Implement this based on your dataset specifics
def fine_tune_wav2vec2(train_loader, learning_rate=1e-4, num_epochs=3):
    model_wav2vec2 = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-large-960h").to('cuda')
    model_wav2vec2.train()
    optimizer = AdamW(model_wav2vec2.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        for batch_idx, (input_values, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            input_values = input_values.to('cuda')
            labels = labels.to('cuda')

            outputs = model_wav2vec2(input_values, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            if batch_idx % 10 == 0:
                print(f"Epoch {epoch+1}/{num_epochs}, Step {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    model_wav2vec2.save_pretrained("fine_tuned_wav2vec2")



class AudioDataset(Dataset):
    
    def __init__(self, audio_paths, labels, processor):
        self.audio_paths = audio_paths
        self.labels = labels
        self.processor = processor

    def __len__(self):
        return len(self.audio_paths)

    def __getitem__(self, idx):
        audio_path = self.audio_paths[idx]
        label = self.labels[idx]

        waveform, sr = torchaudio.load(audio_path)  # Load the audio file
        input_values = self.processor(waveform.squeeze().numpy(), return_tensors="pt", sampling_rate=sr).input_values  # Process the audio file

        return input_values.squeeze(), torch.tensor(label)

# Assuming you have `audio_paths` and `labels` lists prepared
labels = [0,1,2,3,4,5,6,7,8,9]
audio_paths = 'emi_dataset/'
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-large-960h")
audio_dataset = AudioDataset(audio_paths, labels, processor)
wav2vec2_train_loader = DataLoader(audio_dataset, batch_size=4, shuffle=True)

# Now you can fine-tune the Wav2Vec2 model
fine_tune_wav2vec2(wav2vec2_train_loader)


# Load and prepare data for AudioCNN training
dataset_path = 'emi_dataset/'
train_data, train_labels = load_data(dataset_path, "train")
train_loader = DataLoader(TensorDataset(train_data, train_labels), batch_size=32, shuffle=True)

validate_data, validate_labels = load_data(dataset_path, "valid")
validate_loader = DataLoader(TensorDataset(validate_data, validate_labels), batch_size=32)

test_data, test_labels = load_data(dataset_path, "test")
test_loader = DataLoader(TensorDataset(test_data, test_labels), batch_size=32)

# Initialize the AudioCNN model
num_classes = 10  # Update based on your dataset
input_size = 1024  # Update if necessary
audio_cnn = AudioCNN(num_classes=num_classes, input_size=input_size)

# AudioCNN training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(audio_cnn.parameters(), lr=0.001)

for epoch in range(num_epochs):
    audio_cnn.train()
    for batch_idx, (features, labels) in enumerate(train_loader):
        optimizer.zero_grad()
        outputs = audio_cnn(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if batch_idx % 10 == 0:
            print(f"Epoch {epoch+1}/{num_epochs}, Step {batch_idx+1}/{len(train_loader)}, Loss: {loss.item():.4f}")

    # Validation loop
    audio_cnn.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    for features, labels in validate_loader:
        outputs = audio_cnn(features)
        loss = criterion(outputs, labels)
        val_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    val_loss /= len(validate_loader)
    print(f'Epoch {epoch+1}, Validation Loss: {val_loss:.4f}, Validation Accuracy: {(100 * correct / total):.2f}%')

# Add a similar test loop to evaluate the model on the test set


# Test loop for AudioCNN
audio_cnn.eval()  # Set the model to evaluation mode
test_loss = 0.0
correct = 0
total = 0

with torch.no_grad():  # No need to track gradients
    for features, labels in test_loader:
        outputs = audio_cnn(features)  # Forward pass
        loss = criterion(outputs, labels)  # Compute the loss
        test_loss += loss.item()
        _, predicted = torch.max(outputs.data, 1)  # Get the class with the highest output as the prediction
        total += labels.size(0)  # Total number of labels
        correct += (predicted == labels).sum().item()  # Count correct predictions

test_loss /= len(test_loader)  # Average loss
test_accuracy = 100 * correct / total  # Calculate accuracy

print(f'Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%')


Some weights of the model checkpoint at facebook/wav2vec2-large-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.masked_spec_embed']
You s

RuntimeError: Failed to open the input "a" (No such file or directory).
Exception raised from get_input_format_context at /__w/_temp/conda_environment_7505053768/conda-bld/torchaudio_1705078604992/work/src/libtorio/ffmpeg/stream_reader/stream_reader.cpp:42 (most recent call first):
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x57 (0x151bd33a9d87 in /apps/pytorch/2.2.0/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #1: c10::detail::torchCheckFail(char const*, char const*, unsigned int, std::string const&) + 0x64 (0x151bd335a75f in /apps/pytorch/2.2.0/lib/python3.10/site-packages/torch/lib/libc10.so)
frame #2: <unknown function> + 0x42904 (0x151b6facb904 in /apps/pytorch/2.2.0/lib/python3.10/site-packages/torio/lib/libtorio_ffmpeg6.so)
frame #3: torio::io::StreamingMediaDecoder::StreamingMediaDecoder(std::string const&, std::optional<std::string> const&, std::optional<std::map<std::string, std::string, std::less<std::string>, std::allocator<std::pair<std::string const, std::string> > > > const&) + 0x14 (0x151b6face304 in /apps/pytorch/2.2.0/lib/python3.10/site-packages/torio/lib/libtorio_ffmpeg6.so)
frame #4: <unknown function> + 0x3a58e (0x151b65c9f58e in /apps/pytorch/2.2.0/lib/python3.10/site-packages/torio/lib/_torio_ffmpeg6.so)
frame #5: <unknown function> + 0x32147 (0x151b65c97147 in /apps/pytorch/2.2.0/lib/python3.10/site-packages/torio/lib/_torio_ffmpeg6.so)
<omitting python frames>
frame #11: <unknown function> + 0xf244 (0x151b748d8244 in /apps/pytorch/2.2.0/lib/python3.10/site-packages/torchaudio/lib/_torchaudio.so)
