In [1]:
# !pip3 install accelerate -U
# !pip3 install datasets transformers[sentencepiece]
# !pip3 install --upgrade torchvision
# !pip3 install --upgrade transformers
# !pip3 install --upgrade regex

In [2]:
import os
import glob
import librosa
import io
import datasets

import torch.utils.data
from torch.utils.data import Dataset, DataLoader

import pandas as pd
import numpy as np

import torch
import torch.nn as nn

from datasets import load_dataset, DatasetDict,  Audio

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score

from transformers import WhisperModel, WhisperFeatureExtractor, AdamW

In [3]:
audio_df = pd.read_csv(r"/home/l083319/Cough_Related/Dataset/urbansound8k/UrbanSound8K.csv")
audio_df = audio_df.sample(n=2000, random_state=42)
audio_df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
6770,54898-8-0-2.wav,54898,47.992301,51.992301,2,3,8,siren
3534,172338-9-0-7.wav,172338,91.76048,95.76048,1,4,9,street_music
8556,95562-4-3-0.wav,95562,8.795241,12.795241,1,3,4,drilling
7870,75490-8-0-2.wav,75490,1.0,5.0,1,6,8,siren
1226,128891-3-0-4.wav,128891,2.0,6.0,1,6,3,dog_bark


In [4]:
audio_df["class"].value_counts()

street_music        251
jackhammer          238
drilling            235
air_conditioner     234
dog_bark            224
engine_idling       218
children_playing    215
siren               204
car_horn             97
gun_shot             84
Name: class, dtype: int64

In [5]:
def get_all_full_paths(parent_directory):
    # List to store file paths
    audio_file_paths = []

    # Iterate through audio folders (assuming they are named fold1, fold2, ..., fold10)
    for folder_name in range(1, 11):
        folder_path = os.path.join(parent_directory, 'fold{}'.format(folder_name))
        # Iterate through files in the current folder and add their paths to the list
        for filename in os.listdir(folder_path):
            if filename.endswith('.wav'):  # Assuming your audio files have .wav extension
                file_path = os.path.join(folder_path, filename)
                audio_file_paths.append(file_path)

    # Create a dictionary to map base name to full_path
    file_path_dict = {os.path.basename(path): path for path in audio_file_paths}
    return file_path_dict

audio_files_directory = r"/home/l083319/Cough_Related/Dataset/urbansound8k"
file_path_dict = get_all_full_paths(audio_files_directory)

In [6]:
def get_single_full_path(slice_file_name):
    return file_path_dict.get(slice_file_name)

# Add 'full_path' column to the DataFrame
audio_df['full_path'] = audio_df['slice_file_name'].apply(get_single_full_path)
audio_df.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class,full_path
6770,54898-8-0-2.wav,54898,47.992301,51.992301,2,3,8,siren,/home/l083319/Cough_Related/Dataset/urbansound...
3534,172338-9-0-7.wav,172338,91.76048,95.76048,1,4,9,street_music,/home/l083319/Cough_Related/Dataset/urbansound...
8556,95562-4-3-0.wav,95562,8.795241,12.795241,1,3,4,drilling,/home/l083319/Cough_Related/Dataset/urbansound...
7870,75490-8-0-2.wav,75490,1.0,5.0,1,6,8,siren,/home/l083319/Cough_Related/Dataset/urbansound...
1226,128891-3-0-4.wav,128891,2.0,6.0,1,6,3,dog_bark,/home/l083319/Cough_Related/Dataset/urbansound...


In [7]:
train_df, temp_df = train_test_split(audio_df, test_size=0.3, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [8]:
print('Train:', len(train_df))
print('Val  :', len(val_df))
print('Test :', len(test_df))

Train: 1400
Val  : 300
Test : 300


In [9]:
train_audio_dataset = datasets.Dataset.from_dict({
    "audio": train_df["full_path"].tolist(),
    "labels": train_df["classID"].tolist()    
    }).cast_column("audio", Audio(sampling_rate=16000))

test_audio_dataset = datasets.Dataset.from_dict({
    "audio": test_df["full_path"].tolist(),
    "labels": test_df["classID"].tolist()
    }).cast_column("audio", Audio(sampling_rate=16000))

val_audio_dataset = datasets.Dataset.from_dict({
    "audio": val_df["full_path"].tolist(),
    "labels": val_df["classID"].tolist()
    }).cast_column("audio", Audio(sampling_rate=16000))

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/290M [00:00<?, ?B/s]

  return torch._C._cuda_getDeviceCount() > 0


In [13]:
class SpeechClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, audio_data,  text_processor):
        self.audio_data = audio_data
        self.text_processor = text_processor

    def __len__(self):
        return len(self.audio_data)

    def __getitem__(self, index):

        inputs = self.text_processor(
            self.audio_data[index]["audio"]["array"],
            return_tensors="pt",
            sampling_rate=self.audio_data[index]["audio"]["sampling_rate"]
        )

        input_features = inputs.input_features
        decoder_input_ids = torch.tensor([[1, 1]]) * encoder.config.decoder_start_token_id

        labels = np.array(self.audio_data[index]['labels'])

        return input_features, decoder_input_ids, torch.tensor(labels)


In [14]:
train_dataset = SpeechClassificationDataset(train_audio_dataset, feature_extractor)
test_dataset = SpeechClassificationDataset(test_audio_dataset, feature_extractor)
val_dataset = SpeechClassificationDataset(val_audio_dataset, feature_extractor)

batch_size = 8

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [15]:
class SpeechClassifier(nn.Module):
    def __init__(self, num_labels, encoder):
        super(SpeechClassifier, self).__init__()
        self.encoder = encoder
        self.classifier = nn.Sequential(
            nn.Linear(self.encoder.config.hidden_size, 4096),
            nn.ReLU(),
            nn.Linear(4096, 2048),
            nn.ReLU(),
            nn.Linear(2048, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, num_labels)
        )

    def forward(self, input_features, decoder_input_ids):
        outputs = self.encoder(input_features, decoder_input_ids=decoder_input_ids)
        pooled_output = outputs['last_hidden_state'][:, 0, :]
        logits = self.classifier(pooled_output)
        return logits



In [16]:
num_labels = 10

model = SpeechClassifier(num_labels, encoder).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5, betas=(0.9, 0.999), eps=1e-08)
criterion = nn.CrossEntropyLoss()



In [17]:

# Define the training function
def train(model, train_loader, val_loader, optimizer,  criterion, device, num_epochs):

    best_accuracy = 0.0

    for epoch in range(num_epochs):

        model.train()

        for i, batch in enumerate(train_loader):

            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze()
            input_features = input_features.to(device)

            decoder_input_ids = decoder_input_ids.squeeze()
            decoder_input_ids = decoder_input_ids.to(device)

            labels = labels.view(-1)
            labels = labels.to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)

            loss = criterion(logits, labels)
            loss.backward()

            optimizer.step()

            if (i+1) % 8 == 0:
                print(f'Epoch {epoch+1}/{num_epochs}, Batch {i+1}/{len(train_loader)}, Train Loss: {loss.item() :.4f}')
                train_loss = 0.0

        val_loss, val_accuracy, val_f1, _ , _ = evaluate(model, val_loader, device)

        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            torch.save(model.state_dict(), '/home/l083319/Cough_Related/Dataset/urbansound8k/best_model.pt')

        print("========================================================================================")
        print(f'Epoch {epoch+1}/{num_epochs}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}, Val F1: {val_f1:.4f}, Best Accuracy: {best_accuracy:.4f}')
        print("========================================================================================")



In [18]:
def evaluate(model, data_loader,  device):

    all_labels = []
    all_preds = []
    total_loss = 0.0

    with torch.no_grad():

        for i, batch in enumerate(data_loader):

            input_features, decoder_input_ids, labels = batch

            input_features = input_features.squeeze()
            input_features = input_features.to(device)

            decoder_input_ids = decoder_input_ids.squeeze()
            decoder_input_ids = decoder_input_ids.to(device)

            labels = labels.view(-1)
            labels = labels.to(device)

            optimizer.zero_grad()

            logits = model(input_features, decoder_input_ids)

            loss = criterion(logits, labels)
            total_loss += loss.item()

            _, preds = torch.max(logits, 1)
            all_labels.append(labels.cpu().numpy())
            all_preds.append(preds.cpu().numpy())

    all_labels = np.concatenate(all_labels, axis=0)
    all_preds = np.concatenate(all_preds, axis=0)

    loss = total_loss / len(data_loader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='macro')
    return loss, accuracy, f1, all_labels, all_preds


In [19]:
num_epochs = 5
train(model, train_loader, val_loader, optimizer, criterion, device, num_epochs)

Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.43.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Epoch 1/5, Batch 8/175, Train Loss: 2.3303
Epoch 1/5, Batch 16/175, Train Loss: 2.1506
Epoch 1/5, Batch 24/175, Train Loss: 1.9492
Epoch 1/5, Batch 32/175, Train Loss: 1.9016
Epoch 1/5, Batch 40/175, Train Loss: 1.7862
Epoch 1/5, Batch 48/175, Train Loss: 0.9539
Epoch 1/5, Batch 56/175, Train Loss: 1.5466
Epoch 1/5, Batch 64/175, Train Loss: 1.2771
Epoch 1/5, Batch 72/175, Train Loss: 1.7923
Epoch 1/5, Batch 80/175, Train Loss: 0.7236
Epoch 1/5, Batch 88/175, Train Loss: 0.8296
Epoch 1/5, Batch 96/175, Train Loss: 1.7375
Epoch 1/5, Batch 104/175, Train Loss: 1.3535
Epoch 1/5, Batch 112/175, Train Loss: 0.5792
Epoch 1/5, Batch 120/175, Train Loss: 1.0513
Epoch 1/5, Batch 128/175, Train Loss: 0.9317
Epoch 1/5, Batch 136/175, Train Loss: 0.6085
Epoch 1/5, Batch 144/175, Train Loss: 0.7902
Epoch 1/5, Batch 152/175, Train Loss: 0.7334
Epoch 1/5, Batch 160/175, Train Loss: 1.4406
Epoch 1/5, Batch 168/175, Train Loss: 0.4313
Epoch 1/5, Val Loss: 0.9922, Val Accuracy: 0.6433, Val F1: 0.6445, B

In [20]:
# state_dict = torch.load('best_model.pt')
state_dict = torch.load('/home/l083319/Cough_Related/Dataset/urbansound8k/best_model.pt')

# Create a new instance of the model and load the state dictionary
num_labels = 10
model = SpeechClassifier(num_labels, encoder).to(device)
model.load_state_dict(state_dict)

_, _, _, all_labels, all_preds = evaluate(model, test_loader, device)


print(classification_report(all_labels, all_preds))
print(accuracy_score(all_labels, all_preds))


  state_dict = torch.load('/home/l083319/Cough_Related/Dataset/urbansound8k/best_model.pt')


              precision    recall  f1-score   support

           0       0.65      0.82      0.73        39
           1       0.93      0.72      0.81        18
           2       0.81      0.76      0.79        34
           3       1.00      0.97      0.98        33
           4       0.86      0.76      0.81        25
           5       0.72      0.78      0.75        27
           6       1.00      1.00      1.00        18
           7       0.83      0.78      0.81        37
           8       0.87      0.90      0.89        30
           9       0.79      0.77      0.78        39

    accuracy                           0.82       300
   macro avg       0.85      0.83      0.83       300
weighted avg       0.83      0.82      0.82       300

0.8233333333333334
