<a href="https://colab.research.google.com/github/Sari275/my-deep-learning-projects/blob/main/Program_3_Transformer_Based_Audio_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import Libraries

In [None]:
from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
import torch
import torchaudio
from glob import glob
import os
from sklearn.metrics import f1_score
from tqdm import tqdm
import pandas as pd

Define Parameters

In [None]:
# Number of target classes; passed as num_labels when the classifier is built.
output_size = 25
# NOTE(review): not referenced by any code visible in this notebook — confirm it is still needed.
input_size = 1
# Samples per batch for the training, validation and test DataLoaders.
batch_size = 8
# Number of passes over the training set in the training loop.
epochs = 100

Define the feature extractor using Meta's wav2vec2

In [None]:
# Feature extractor matching the 'facebook/wav2vec2-base' checkpoint; AudioData
# calls it on every resampled waveform to produce the model's input values.
feature_extractor = AutoFeatureExtractor.from_pretrained('facebook/wav2vec2-base')

Create dataset class using PyTorch

In [None]:
class AudioData(torch.utils.data.Dataset):
    """In-memory dataset of wav2vec2 input features for one data split.

    Every ``*.wav`` file in ``{root}/{split}`` is loaded, resampled to 16 kHz
    and passed through the module-level ``feature_extractor``; only the first
    channel of each recording is kept.  If ``{root}/{split}/labels.txt``
    exists, it is read as one 1-based integer label per non-blank line and
    stored 0-based.

    Files are processed in natural filename order (``2.wav`` before
    ``10.wav``) so that the pairing with ``labels.txt`` and the order of the
    returned samples is deterministic.
    NOTE(review): this assumes the i-th label belongs to the i-th file in
    natural filename order — confirm against how labels.txt was produced.

    Args:
        split: Name of the split sub-directory (e.g. ``'train'``).
        root: Directory that contains the split sub-directories.

    Raises:
        ValueError: If labels were read but their count differs from the
            number of audio files found.
    """

    SAMPLE_RATE = 16000

    def __init__(self, split, root='/WAVE/projects/CSEN-342-Wi24/data/pr3'):
        # glob() returns matches in arbitrary, filesystem-dependent order, so
        # sort explicitly to get a reproducible file sequence.
        files = sorted(glob(f'{root}/{split}/*.wav'), key=self._natural_key)

        label_path = f'{root}/{split}/labels.txt'
        if os.path.isfile(label_path):
            with open(label_path, 'r') as f:
                # Labels are 1-based in the file; shift to 0-based class ids.
                labels = [int(line) - 1 for line in f if line.strip()]
        else:
            labels = []

        # Fail early instead of silently mis-pairing audio and labels (or
        # hitting an IndexError later inside __getitem__).
        if labels and len(labels) != len(files):
            raise ValueError(
                f'{label_path} has {len(labels)} labels '
                f'but {len(files)} .wav files were found'
            )

        audios = []
        for file in files:
            waveform, samplerate = torchaudio.load(file)
            waveform = torchaudio.functional.resample(waveform, samplerate, self.SAMPLE_RATE)
            extracted = feature_extractor(waveform, sampling_rate=self.SAMPLE_RATE)
            # [0] selects the single item passed in, the second [0] its first channel.
            audios.append(torch.tensor(extracted['input_values'][0][0]))

        self.audios = audios
        self.labels = labels

    @staticmethod
    def _natural_key(path):
        """Return a sort key that orders digit runs in the file name numerically."""
        import re

        name = os.path.basename(path)
        # Splitting on a capturing group alternates text / digits / text / ...,
        # so odd positions are always digit runs and the key lists of any two
        # paths compare element-wise without mixing str and int.
        parts = re.split(r'(\d+)', name)
        key = [int(tok) if i % 2 else tok.lower() for i, tok in enumerate(parts)]
        # The raw name breaks ties between names that differ only in case.
        return key, name

    def __len__(self):
        """Return the number of audio samples held in memory."""
        return len(self.audios)

    def __getitem__(self, idx):
        """Return ``(features, label)`` when labels were loaded, else ``features`` only."""
        if self.labels:
            label = torch.tensor(self.labels[idx])
            return self.audios[idx], label
        return self.audios[idx]

Load Training Set

In [None]:
# Builds the training split; AudioData.__init__ loads and featurizes every
# .wav file of the split up front, so this cell does all the audio I/O.
train_data = AudioData('train')

Load Validation Set

In [None]:
# Builds the validation split the same way (all .wav files of 'val' are
# loaded and featurized inside AudioData.__init__).
valid_data = AudioData('val')

Examine one sample from training set

In [None]:
# Take the first item the dataset yields; a labelled split returns a
# (features, label) pair, which is what the unpacking below relies on.
audio, label = next(iter(train_data))
# Sanity check of the tensor shapes that will be fed to the DataLoader.
print(audio.shape, label.shape)

The DataLoaders wrap the training and validation audio datasets and yield batches of samples.

In [None]:
# Training batches are reshuffled; validation batches keep dataset order.
# NOTE(review): no collate_fn is given, so default batching is used —
# presumably all clips yield feature tensors of equal length; confirm.
train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_data, batch_size=batch_size, shuffle=False)

Load the pretrained wav2vec2 model with a classification head; internally it converts the audio into latent features over chunks of about 25 milliseconds

In [None]:
# Audio classifier built on the 'facebook/wav2vec2-base' checkpoint;
# num_labels=output_size sets the number of output classes.
model = AutoModelForAudioClassification.from_pretrained('facebook/wav2vec2-base', num_labels=output_size)

Move the model (all of its parameters) to the GPU

In [None]:
# Move the model to the GPU; the training loop moves each batch there as well.
model = model.cuda()

Define Loss Function

In [None]:
# Cross-entropy loss; the loops apply it to the model's logits and the
# integer class labels of each batch.
criterion = torch.nn.CrossEntropyLoss()

Define Optimizer

In [None]:
# Adam over every model parameter with a learning rate of 3e-5.
optimizer = torch.optim.Adam(model.parameters(), lr=3e-5)

Initialize Best F1 Score

In [None]:
# Best validation F1 seen so far. Start below every attainable score so the
# strict `valid_f1 > best_f1` check in the training loop saves 'best.pth' on
# the first epoch even when the validation F1 is exactly 0.0; otherwise the
# test cell's torch.load('best.pth') could fail on a missing file.
best_f1 = float('-inf')

Train the classification model on the training set

In [None]:
# Train for `epochs` epochs. Each epoch runs one optimisation pass over the
# training set and one evaluation pass over the validation set, prints the
# per-sample losses and micro-F1 scores, and checkpoints the weights with the
# best validation F1 to 'best.pth'. The final weights go to 'last.pth'.
for epoch in range(epochs):
    train_loss = 0
    train_preds = []
    train_trues = []
    model.train()
    for audios, labels in tqdm(train_loader, total=len(train_loader)):
        audios = audios.cuda()
        labels = labels.cuda()
        optimizer.zero_grad()
        outputs = model(audios).logits
        # The predicted class is the index of the largest logit.
        _, preds = torch.max(outputs, 1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # Weight the batch loss by the batch size so that dividing by the
        # dataset size below yields the average loss per sample.
        train_loss += loss.item() * audios.size(0)
        train_preds.append(preds)
        train_trues.append(labels)
    train_preds = torch.cat(train_preds)
    train_trues = torch.cat(train_trues)

    # Go into evaluation mode to do the prediction for the validation set.
    model.eval()
    valid_loss = 0
    valid_preds = []
    valid_trues = []
    # No parameters are updated here, so skip gradient tracking: this avoids
    # building an autograd graph for every validation forward pass.
    with torch.no_grad():
        for audios, labels in valid_loader:
            audios = audios.cuda()
            labels = labels.cuda()
            outputs = model(audios).logits
            _, preds = torch.max(outputs, 1)
            loss = criterion(outputs, labels)
            valid_loss += loss.item() * audios.size(0)  # accumulate the validation loss
            valid_preds.append(preds)
            valid_trues.append(labels)
    valid_preds = torch.cat(valid_preds)
    valid_trues = torch.cat(valid_trues)

    train_loss = train_loss / len(train_loader.dataset)  # average loss per sample
    valid_loss = valid_loss / len(valid_loader.dataset)

    # scikit-learn works on CPU arrays.
    valid_preds = valid_preds.detach().cpu().numpy()
    valid_trues = valid_trues.detach().cpu().numpy()
    train_preds = train_preds.detach().cpu().numpy()
    train_trues = train_trues.detach().cpu().numpy()

    # Micro-averaged F1 computed over the whole split.
    valid_f1 = f1_score(valid_trues, valid_preds, average='micro')
    train_f1 = f1_score(train_trues, train_preds, average='micro')

    print(f'Epoch = {epoch}, train_loss = {train_loss:.3f}, train_f1 = {train_f1:.3f}, valid_loss = {valid_loss:.3f}, valid_f1 = {valid_f1:.3f}')
    if valid_f1 > best_f1:
        best_f1 = valid_f1  # keep track of the best validation F1 score
        torch.save(model.state_dict(), 'best.pth')  # checkpoint the best weights
torch.save(model.state_dict(), 'last.pth')  # weights after the final epoch

Make a Prediction for the Test Set

In [None]:
# Predict a class for every audio file of the 'test' split with the best
# checkpoint and write one 1-based label per line to 'predictions.txt'.
test_data = AudioData('test')  # the test split; its items are features only when no labels were loaded
test_loader = torch.utils.data.DataLoader(test_data, batch_size=batch_size, shuffle=False)  # keep dataset order
# Rebuild the architecture, then overwrite its weights with the checkpoint
# that achieved the best validation F1 during training.
model = AutoModelForAudioClassification.from_pretrained('facebook/wav2vec2-base', num_labels=output_size)
state = torch.load('best.pth')
model.load_state_dict(state)
model = model.cuda()
model.eval()
test_preds = []  # per-batch predicted class indices
# Inference only: disable gradient tracking so no autograd graph is built.
with torch.no_grad():
    for audios in test_loader:  # go over each batch of audios
        audios = audios.cuda()  # move the batch to the GPU
        outputs = model(audios).logits  # forward pass
        _, preds = torch.max(outputs, 1)  # class index with the largest logit
        test_preds.append(preds)
# Join all batches, bring them back to the CPU, and add 1 to undo the
# 0-based shift applied to the labels when they were read.
test_preds = torch.cat(test_preds).detach().cpu().numpy() + 1
pd.Series(test_preds).to_csv('predictions.txt', index=False, header=False)  # one prediction per line, no header