<h2>Variables</h2>

In [1]:
# Directory containing the dataset
dataset_dir = './12K-Sorted'

# Label map
label_map = {
    'Block': 0,
    'Interjection': 1,
    'NoStutter': 2,
    'Prolongation': 3,
    'SoundRepetition': 4,
    'WordRepetition': 5
}

# Preprocessing
duration = 3.0  # seconds
sampling_rate = 16000  # Hz
n_mels = 128
n_mfcc = 13

<h2>Retrieving audio files and their labels</h2>

In [2]:
import os

def get_audio_files_and_labels(dataset_dir):
    audio_files = []
    for label_dir in os.listdir(dataset_dir):
        if label_dir in label_map:
            label = label_map[label_dir]
            class_dir = os.path.join(dataset_dir, label_dir)
            for audio_file in os.listdir(class_dir):
                audio_path = os.path.join(class_dir, audio_file)
                audio_files.append((audio_path, label))
    return audio_files

<h2>Preprocessing</h2>

In [3]:
import librosa
import numpy as np

def preprocess_audio(audio_path):
    try:
        y, sr = librosa.load(audio_path, sr=sampling_rate)

        # Check if the length of the audio is exactly 3 seconds.
        # If not, return None, None immediately.
        if len(y) != int(duration * sampling_rate):
            print(f"Audio file {audio_path} is not exactly {duration} seconds.")
            return None, None

        # Log-Mel Spectrogram
        log_mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
        log_mel_spectrogram = librosa.power_to_db(log_mel_spectrogram, ref=np.max)

        # MFCC
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

        return log_mel_spectrogram.T, mfcc.T  # (time_steps, features)
    except Exception as e:
        print(f"Error processing file {audio_path}: {e}")
        return None, None

<h2>Load and preprocess dataset</h2>

In [4]:
def load_and_preprocess_dataset(audio_files):
    x = []
    y = []

    for audio_path, label in audio_files:
        log_mel, mfcc = preprocess_audio(audio_path)
        if log_mel is not None and mfcc is not None:
            combined_features = np.concatenate((log_mel, mfcc), axis=-1) # (time_steps, log_mel_features + mfcc_features)
            x.append(combined_features)
            y.append(label)

    x = np.array(x)
    y = np.array(y)

    return x, y

In [5]:
import torch
import torch.nn as nn
import math

class Conv1DBlock(nn.Module):
    def __init__(self, feature_size: int, kernel_size: int, dropout: float) -> None:
        super().__init__()
        # First Conv1D Layer
        self.conv1 = nn.Conv1d(in_channels=feature_size, out_channels=feature_size*2, kernel_size=kernel_size, padding='same')
        self.norm = nn.LayerNorm(feature_size*2)
        self.activation = nn.GELU()
        self.dropout = nn.Dropout(dropout)

        # Second Conv1D Layer
        self.conv2 = nn.Conv1d(in_channels=feature_size*2, out_channels=512, kernel_size=kernel_size, padding='same')
        self.norm2 = nn.LayerNorm(512)
        
    def forward(self, x):
        # Conv1D -> LayerNorm -> GELU -> Dropout
        x = self.conv1(x.transpose(1, 2))  # Conv1D expects (batch, channels, timesteps), so transpose
        x = self.norm(x.transpose(1, 2))  # LayerNorm expects (batch, timesteps, features), so transpose back
        x = self.activation(x)
        x = self.dropout(x)

        # Repeat for second Conv1D layer
        x = self.conv2(x.transpose(1, 2))
        x = self.norm2(x.transpose(1, 2))
        x = self.activation(x)
        return self.dropout(x)

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, timesteps: int, d_model: int, dropout: float) -> None:
        super().__init__()
        self.timesteps = timesteps
        self.d_model = d_model
        self.dropout = nn.Dropout(dropout)

        # Create a matrix of shape (timesteps, d_model)
        pe = torch.zeros(timesteps, d_model)
        # Create a vector of shape (timesteps, 1)
        position = torch.arange(0, timesteps, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(1000.0) / d_model))
        # Apply the sin to even positions
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)

        pe = pe.unsqueeze(0) # (1, timesteps, d_model)

        # Register as buffer so that the tensor will be saved to the file but not as a learned params
        self.register_buffer('pe', pe)

    def forward(self, x):
        # self.pe = self.pe.to(x.device)  
        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
        return self.dropout(x)

In [7]:
class LayerNormalization(nn.Module):
    def __init__(self, features: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.eps = eps
        self.alpha = nn.Parameter(torch.ones(features))
        self.bias = nn.Parameter(torch.zeros(features))

    def forward(self, x):
        alpha = self.alpha.to(x.device)
        bias = self.bias.to(x.device)

        mean = x.mean(dim=-1, keepdim=True)
        std = x.std(dim=-1, keepdim=True)
        return alpha * (x - mean) / (std + self.eps) + bias

In [8]:
class FeedForwardBlock(nn.Module):
    def __init__(self, d_model: int, d_ff: int, dropout: float) -> None:
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff) # W1 and B1
        self.dropout = nn.Dropout(dropout)
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and B2

    def forward(self, x):
        # (batch, timesteps, d_model) --> (batch, timesteps, d_ff) --> (batch, timesteps, d_model)
        return self.linear_2(self.dropout(torch.relu(self.linear_1(x))))

In [9]:
class MultiHeadAttentionBlock(nn.Module):
    def __init__(self, d_model: int, h: int, dropout: float) -> None:
        super().__init__()
        self.d_model = d_model
        self.h = h
        assert d_model % h == 0, "d_model is not divisible by h"
        
        self.d_k = d_model // h
        self.w_q = nn.Linear(d_model, d_model) #Wq
        self.w_k = nn.Linear(d_model, d_model) #Wk
        self.w_v = nn.Linear(d_model, d_model) #Wv

        self.w_o = nn.Linear(d_model, d_model) #Wo
        self.dropout = nn.Dropout(dropout)

    @staticmethod
    def attention(query, key, value, mask, dropout: nn.Dropout):
        d_k = query.shape[-1]

        # (batch, h, timesteps, d_k) --> (batch, h, timesteps, timesteps)
        attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(d_k)
        if mask is not None:
            attention_scores.masked_fill_(mask == 0, -1e9)
        attention_scores = attention_scores.softmax(dim = -1) # (batch, h, timesteps, timesteps)
        if dropout is not None:
            attention_scores = dropout(attention_scores)

        return (attention_scores @ value), attention_scores

    def forward(self, q, k, v, mask):
        query = self.w_q(q) # (batch, timesteps, d_model) --> (batch, timesteps, d_model)
        key = self.w_k(k) # (batch, timesteps, d_model) --> (batch, timesteps, d_model)
        value = self.w_v(v) # (batch, timesteps, d_model) --> (batch, timesteps, d_model)

        # (batch, timesteps, d_model) --> (batch, timesteps, h, d_k) --> (batch, h, timesteps, d_k)
        query = query.view(query.shape[0], query.shape[1], self.h, self.d_k).transpose(1, 2)
        key = key.view(key.shape[0], key.shape[1], self.h, self.d_k).transpose(1, 2)
        value = value.view(value.shape[0], value.shape[1], self.h, self.d_k).transpose(1, 2)

        x, self.attention_scores = MultiHeadAttentionBlock.attention(query, key, value, mask, self.dropout)

        # (batch, h, timesteps, d_k) --> (batch, timesteps, h, d_k) --> (batch, timesteps, d_model)
        x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.h * self.d_k)

        # (batch, timesteps, d_model) --> (batch, timesteps, d_model)
        return self.w_o(x)

In [10]:
class ResidualConnection(nn.Module):
    def __init__(self, features: int, dropout: float) -> None:
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.norm = LayerNormalization(features)

    def forward(self, x, sublayer):
        return x + self.dropout(sublayer(self.norm(x)))

In [11]:
class EncoderBlock(nn.Module):
    def __init__(self, features: int, self_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.feed_foward_block = feed_forward_block
        self.residual_connections = nn.ModuleList([ResidualConnection(features, dropout) for _ in range(2)])
    
    def forward(self, x, src_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, src_mask))
        x = self.residual_connections[1](x, self.feed_foward_block)
        return x
    
class Encoder(nn.Module):
    def __init__(self, features: int, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization(features)

    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return self.norm(x)

In [12]:
class DecoderBlock(nn.Module):
    def __init__(self, self_attention_block: MultiHeadAttentionBlock, cross_attention_block: MultiHeadAttentionBlock, feed_forward_block: FeedForwardBlock, dropout: float) -> None:
        super().__init__()
        self.self_attention_block = self_attention_block
        self.cross_attention_block = cross_attention_block
        self.feed_forward_block = feed_forward_block
        self.residual_connections = nn.ModuleList(ResidualConnection(dropout) for _ in range(3))

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        x = self.residual_connections[0](x, lambda x: self.self_attention_block(x, x, x, tgt_mask))
        x = self.residual_connections[1](x, lambda x: self.self_attention_block(x, encoder_output, encoder_output, src_mask))
        x = self.residual_connections[2](x, self.feed_forward_block)
        return x
    
class Decoder(nn.Module):
    def __init__(self, layers: nn.ModuleList) -> None:
        super().__init__()
        self.layers = layers
        self.norm = LayerNormalization()

    def forward(self, x, encoder_output, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoder_output, src_mask, tgt_mask)
        return self.norm(x)

In [13]:
class ClassificationHead(nn.Module):
    def __init__(self, d_model: int, num_classes: int) -> None:
        super().__init__()
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        # Take the mean of the encoder outputs over the sequence dimension
        x = x.mean(dim=1)
        return self.fc(x)

In [14]:
class TransformerBlock(nn.Module):
    def __init__(self, encoder: Encoder, pos_enc: PositionalEncoding, classifier: ClassificationHead) -> None:
        super().__init__()
        self.encoder = encoder
        self.pos_enc = pos_enc
        self.classifier = classifier

    def encode(self, inputs, mask=None):
        x = self.pos_enc(inputs)
        return self.encoder(x, mask)
    
    def classify(self, x):
        return self.classifier(x)

In [15]:
class AudioClassificationModel(nn.Module):
    def __init__(self, input_dim: int, num_classes: int, num_heads: int, num_layers: int, kernel_size: int, dropout: float) -> None:
        super().__init__()
        self.conv_block = Conv1DBlock(input_dim, kernel_size, dropout)

        self.pos_enc = None

        self.encoder_blocks = nn.ModuleList()
        self.num_classes = num_classes
        self.num_heads = num_heads
        self.num_layers = num_layers
        self.dropout = dropout

        self.classifier = None

    def forward(self, x):
        # Apply Conv1D block
        x = self.conv_block(x)  # Output shape: (batch, timesteps, features)

        # Dynamically compute d_model from the output shape of Conv1D
        batch_size, timesteps, d_model = x.shape

        # Positional encoding
        if self.pos_enc is None:
            self.pos_enc = PositionalEncoding(timesteps, d_model, self.dropout).to(x.device)

        if len(self.encoder_blocks) == 0:
            for _ in range(self.num_layers):
                encoder_self_attention_block = MultiHeadAttentionBlock(d_model, self.num_heads, self.dropout)
                feed_forward_block = FeedForwardBlock(d_model, d_model*4, self.dropout)
                encoder_block = EncoderBlock(d_model, encoder_self_attention_block, feed_forward_block, self.dropout)
                self.encoder_blocks.append(encoder_block)

            # Encoder
            self.encoder = Encoder(d_model, self.encoder_blocks).to(x.device)

            # Classification head
            self.classifier = ClassificationHead(d_model, self.num_classes).to(x.device)

        # Freeze specific encoder blocks: 1st, 2nd, 3rd, 7th, 8th, and 9th layers
        freezed_layers = [0, 1, 2, 6, 7, 8]

        for i in freezed_layers:
            for param in self.encoder_blocks[i].parameters():
                param.requires_grad = False

        # Apply positional encoding
        x = self.pos_enc(x)

        # Encoder
        x = self.encoder(x, mask=None)

        # Classification head
        return self.classifier(x)

In [16]:
from config import get_config, get_weights_file_path, latest_weights_file_path

def get_model(config, input_dim):
    model = AudioClassificationModel(input_dim, config['num_classes'], config['num_heads'], config['num_layers'], config['kernel_size'], config['dropout'])
    return model

In [17]:
from tqdm import tqdm

def validate_model(model, val_loader, loss_fn, device, writer, global_step):
    model.eval()  # Set the model to evaluation mode
    val_loss = 0.0
    correct_predictions = 0
    total_predictions = 0

    with torch.no_grad():  # Disable gradient computation for validation
        batch_iterator = tqdm(val_loader, desc="Validating")
        for batch in batch_iterator:
            inputs = batch[0].to(device)
            labels = batch[1].to(device)

            # Forward pass
            outputs = model(inputs)

            # Compute validation loss
            loss = loss_fn(outputs, labels)
            val_loss += loss.item()  # Accumulate the validation loss

            # Log validation loss to the batch iterator for display
            batch_iterator.set_postfix({"val_loss": f"{loss.item():6.3f}"})

            # Calculate accuracy
            _, predicted_labels = torch.max(outputs, 1)  # Get predicted class
            correct_predictions += (predicted_labels == labels).sum().item()
            total_predictions += labels.size(0)

    # Calculate the average validation loss and accuracy for this epoch
    avg_val_loss = val_loss / len(val_loader)
    accuracy = correct_predictions / total_predictions

    print(f"Validation Loss: {avg_val_loss:.4f}, Accuracy: {accuracy:.4f}")

    # Log validation loss and accuracy to TensorBoard
    writer.add_scalar('val loss', avg_val_loss, global_step)
    writer.add_scalar('val accuracy', accuracy, global_step)
    writer.flush()

    return avg_val_loss, accuracy


In [18]:
from pathlib import Path
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import torch.optim as optim
from torch.utils.tensorboard import SummaryWriter
from sklearn.utils import class_weight

def train_model(config):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device {device}")
    if (device == "cuda"):
        print(f"Device name: {torch.cuda.get_device_name(device.index)}")
        print(f"Device memory: {torch.cuda.get_device_properties(device.index).total_memory / 1024 ** 3} GB")
    device = torch.device(device)

    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)

    audio_files = get_audio_files_and_labels(dataset_dir)
    x, y = load_and_preprocess_dataset(audio_files)

    # # Checking the dataset size
    # assert len(x) == len(y) == 12730, "Dataset size mismatch!"

    # Train/Test split
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

    # Convert to PyTorch tensors
    x_train = torch.from_numpy(x_train).float()
    x_val = torch.from_numpy(x_val).float()
    y_train = torch.from_numpy(np.array(y_train)).long()
    y_val = torch.from_numpy(np.array(y_val)).long()

    # Move the tensors to device specified
    x_train = x_train.to(device)
    y_train = y_train.to(device)
    x_val = x_val.to(device)
    y_val = y_val.to(device)
    
    # Create Dataset and DataLoader
    train_dataset = TensorDataset(x_train, y_train)
    val_dataset = TensorDataset(x_val, y_val)
    train_loader = DataLoader(train_dataset, config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, config['batch_size'])

    input_shape = x_train.shape[-1]

    model = get_model(config, input_shape).to(device)
    # Tensorboard
    writer = SummaryWriter(config['experiment_name'])

    optimizer = optim.Adam(model.parameters(), lr=config['lr'])

    initial_epoch = 0
    global_step = 0
    preload = config['preload']
    model_filename = latest_weights_file_path(config) if preload == 'latest' else get_weights_file_path(config, preload) if preload else None
    if model_filename:
        print(f'Preloading model {model_filename}')
        state = torch.load(model_filename)
        model.load_state_dict(state['model_state_dict'])
        initial_epoch = state['epoch'] + 1
        optimizer.load_state_dict(state['optimizer_state_dict'])
        global_step = state['global_step']
    else:
        print('No model to preload, starting from scratch')

    class_weights = class_weight.compute_class_weight(
        'balanced',
        classes=np.unique(y_train),
        y=y_train
    )

    class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

    loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1, weight=class_weights_tensor).to(device)

    for epoch in range(initial_epoch, config['epochs']):
        torch.cuda.empty_cache()
        model.train()
        batch_iterator = tqdm(train_loader, desc=f"Processing Epoch {epoch:02d}")
        for batch in batch_iterator:

            inputs = batch[0].to(device) 
            labels = batch[1].to(device) 

            outputs = model(inputs)

            # Compute the loss using a simple cross entropy
            loss = loss_fn(outputs, labels)
            batch_iterator.set_postfix({"loss": f"{loss.item():6.3f}"})

            # Log the loss
            writer.add_scalar('train loss', loss.item(), global_step)
            writer.flush()

            # Backpropagate the loss
            loss.backward()

            # Update the weights
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            global_step += 1

        # Run validation at the end of every epoch
        validate_model(model, val_loader, loss_fn, device, writer, global_step)

        # Save the model every 10 epochs
        if (epoch + 1) % 10 == 0:
            model_filename = get_weights_file_path(config, f"{epoch:02d}")
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'global_step': global_step
            }, model_filename)

<h2>Evaluating the Model</h2>

In [19]:
import warnings

warnings.filterwarnings("ignore")
config = get_config()
train_model(config)

Using device cuda
Device name: NVIDIA GeForce RTX 3060 Laptop GPU
Device memory: 5.99951171875 GB
Audio file ./12K-Sorted\Block\HeStutters_1_75.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Block\StrongVoices_25_3.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Block\WomenWhoStutter_0_46.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\HeStutters_1_24.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\HeStutters_1_36.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\HeStutters_1_43.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\HeStutters_1_70.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\HeStutters_1_76.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\StrongVoices_25_18.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\StrongVoices_25_31.wav is not exactly 3.0 seconds.
Audio file ./12K-Sorted\Interjection\StutterTalk_59_33.wav i

Processing Epoch 00: 100%|██████████| 316/316 [00:57<00:00,  5.46it/s, loss=1.323]
Validating: 100%|██████████| 79/79 [00:04<00:00, 17.48it/s, val_loss=1.211]


Validation Loss: 1.3834, Accuracy: 0.5778


Processing Epoch 01: 100%|██████████| 316/316 [01:00<00:00,  5.26it/s, loss=1.347]
Validating: 100%|██████████| 79/79 [00:05<00:00, 15.04it/s, val_loss=1.223]


Validation Loss: 1.4203, Accuracy: 0.5778


Processing Epoch 02: 100%|██████████| 316/316 [01:05<00:00,  4.81it/s, loss=1.352]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.43it/s, val_loss=1.178]


Validation Loss: 1.3576, Accuracy: 0.5718


Processing Epoch 03: 100%|██████████| 316/316 [01:07<00:00,  4.70it/s, loss=1.306]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.16it/s, val_loss=1.169]


Validation Loss: 1.3616, Accuracy: 0.5778


Processing Epoch 04: 100%|██████████| 316/316 [01:08<00:00,  4.61it/s, loss=1.205]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.95it/s, val_loss=1.123]


Validation Loss: 1.3331, Accuracy: 0.5805


Processing Epoch 05: 100%|██████████| 316/316 [01:09<00:00,  4.57it/s, loss=1.430]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.83it/s, val_loss=1.144]


Validation Loss: 1.3571, Accuracy: 0.5813


Processing Epoch 06: 100%|██████████| 316/316 [01:09<00:00,  4.56it/s, loss=1.365]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.68it/s, val_loss=1.100]


Validation Loss: 1.2988, Accuracy: 0.6110


Processing Epoch 07: 100%|██████████| 316/316 [01:09<00:00,  4.54it/s, loss=1.133]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.69it/s, val_loss=1.028]


Validation Loss: 1.3013, Accuracy: 0.6023


Processing Epoch 08: 100%|██████████| 316/316 [01:08<00:00,  4.59it/s, loss=1.414]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.80it/s, val_loss=1.054]


Validation Loss: 1.2871, Accuracy: 0.6055


Processing Epoch 09: 100%|██████████| 316/316 [01:09<00:00,  4.56it/s, loss=1.208]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.72it/s, val_loss=1.003]


Validation Loss: 1.2763, Accuracy: 0.6106


Processing Epoch 10: 100%|██████████| 316/316 [01:09<00:00,  4.53it/s, loss=1.159]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.64it/s, val_loss=0.988]


Validation Loss: 1.2630, Accuracy: 0.6122


Processing Epoch 11: 100%|██████████| 316/316 [01:09<00:00,  4.54it/s, loss=1.094]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.78it/s, val_loss=1.005]


Validation Loss: 1.2683, Accuracy: 0.6130


Processing Epoch 12: 100%|██████████| 316/316 [01:10<00:00,  4.51it/s, loss=1.145]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.62it/s, val_loss=1.023]


Validation Loss: 1.2766, Accuracy: 0.6161


Processing Epoch 13: 100%|██████████| 316/316 [01:10<00:00,  4.51it/s, loss=1.109]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.45it/s, val_loss=1.019]


Validation Loss: 1.2353, Accuracy: 0.6359


Processing Epoch 14: 100%|██████████| 316/316 [01:07<00:00,  4.69it/s, loss=1.136]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.71it/s, val_loss=1.040]


Validation Loss: 1.2847, Accuracy: 0.6181


Processing Epoch 15: 100%|██████████| 316/316 [01:06<00:00,  4.78it/s, loss=1.220]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.41it/s, val_loss=0.991]


Validation Loss: 1.2210, Accuracy: 0.6407


Processing Epoch 16: 100%|██████████| 316/316 [01:04<00:00,  4.91it/s, loss=1.309]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.88it/s, val_loss=1.010]


Validation Loss: 1.2180, Accuracy: 0.6399


Processing Epoch 17: 100%|██████████| 316/316 [01:05<00:00,  4.83it/s, loss=1.106]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.99it/s, val_loss=1.001]


Validation Loss: 1.2297, Accuracy: 0.6383


Processing Epoch 18: 100%|██████████| 316/316 [01:05<00:00,  4.83it/s, loss=1.339]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.27it/s, val_loss=1.004]


Validation Loss: 1.2214, Accuracy: 0.6399


Processing Epoch 19: 100%|██████████| 316/316 [01:07<00:00,  4.66it/s, loss=0.984]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.82it/s, val_loss=1.011]


Validation Loss: 1.2850, Accuracy: 0.6284


Processing Epoch 20: 100%|██████████| 316/316 [01:06<00:00,  4.74it/s, loss=1.310]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.15it/s, val_loss=0.986]


Validation Loss: 1.2075, Accuracy: 0.6431


Processing Epoch 21: 100%|██████████| 316/316 [01:05<00:00,  4.85it/s, loss=1.247]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.26it/s, val_loss=0.982]


Validation Loss: 1.2405, Accuracy: 0.6324


Processing Epoch 22: 100%|██████████| 316/316 [01:05<00:00,  4.86it/s, loss=1.156]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.84it/s, val_loss=0.951]


Validation Loss: 1.2095, Accuracy: 0.6419


Processing Epoch 23: 100%|██████████| 316/316 [01:05<00:00,  4.86it/s, loss=1.019]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.91it/s, val_loss=1.001]


Validation Loss: 1.2156, Accuracy: 0.6470


Processing Epoch 24: 100%|██████████| 316/316 [01:06<00:00,  4.76it/s, loss=1.286]
Validating: 100%|██████████| 79/79 [00:05<00:00, 15.12it/s, val_loss=1.063]


Validation Loss: 1.2320, Accuracy: 0.6395


Processing Epoch 25: 100%|██████████| 316/316 [01:02<00:00,  5.08it/s, loss=1.075]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.86it/s, val_loss=0.991]


Validation Loss: 1.2256, Accuracy: 0.6450


Processing Epoch 26: 100%|██████████| 316/316 [01:06<00:00,  4.72it/s, loss=1.149]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.33it/s, val_loss=0.985]


Validation Loss: 1.2233, Accuracy: 0.6446


Processing Epoch 27: 100%|██████████| 316/316 [01:06<00:00,  4.76it/s, loss=1.486]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.52it/s, val_loss=1.026]


Validation Loss: 1.2742, Accuracy: 0.6359


Processing Epoch 28: 100%|██████████| 316/316 [01:06<00:00,  4.76it/s, loss=1.072]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.59it/s, val_loss=0.980]


Validation Loss: 1.1954, Accuracy: 0.6427


Processing Epoch 29: 100%|██████████| 316/316 [01:05<00:00,  4.80it/s, loss=1.243]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.86it/s, val_loss=0.962]


Validation Loss: 1.1880, Accuracy: 0.6458


Processing Epoch 30: 100%|██████████| 316/316 [01:05<00:00,  4.82it/s, loss=1.178]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.57it/s, val_loss=1.027]


Validation Loss: 1.2429, Accuracy: 0.6419


Processing Epoch 31: 100%|██████████| 316/316 [01:05<00:00,  4.84it/s, loss=1.298]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.55it/s, val_loss=0.986]


Validation Loss: 1.1870, Accuracy: 0.6553


Processing Epoch 32: 100%|██████████| 316/316 [01:05<00:00,  4.82it/s, loss=1.050]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.43it/s, val_loss=0.982]


Validation Loss: 1.2091, Accuracy: 0.6407


Processing Epoch 33: 100%|██████████| 316/316 [01:05<00:00,  4.84it/s, loss=0.875]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.57it/s, val_loss=1.011]


Validation Loss: 1.2464, Accuracy: 0.6419


Processing Epoch 34: 100%|██████████| 316/316 [01:05<00:00,  4.81it/s, loss=1.018]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.72it/s, val_loss=1.000]


Validation Loss: 1.1883, Accuracy: 0.6498


Processing Epoch 35: 100%|██████████| 316/316 [01:06<00:00,  4.75it/s, loss=1.192]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.92it/s, val_loss=1.027]


Validation Loss: 1.1932, Accuracy: 0.6522


Processing Epoch 36: 100%|██████████| 316/316 [01:08<00:00,  4.63it/s, loss=1.212]
Validating: 100%|██████████| 79/79 [00:05<00:00, 13.96it/s, val_loss=1.004]


Validation Loss: 1.2442, Accuracy: 0.6514


Processing Epoch 37: 100%|██████████| 316/316 [01:08<00:00,  4.63it/s, loss=1.228]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.05it/s, val_loss=0.976]


Validation Loss: 1.1798, Accuracy: 0.6514


Processing Epoch 38: 100%|██████████| 316/316 [01:07<00:00,  4.67it/s, loss=1.160]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.25it/s, val_loss=1.019]


Validation Loss: 1.1850, Accuracy: 0.6486


Processing Epoch 39: 100%|██████████| 316/316 [01:07<00:00,  4.67it/s, loss=1.085]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.23it/s, val_loss=0.954]


Validation Loss: 1.1905, Accuracy: 0.6557


Processing Epoch 40: 100%|██████████| 316/316 [01:07<00:00,  4.67it/s, loss=1.184]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.17it/s, val_loss=0.974]


Validation Loss: 1.1809, Accuracy: 0.6514


Processing Epoch 41: 100%|██████████| 316/316 [01:07<00:00,  4.68it/s, loss=1.051]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.16it/s, val_loss=0.983]


Validation Loss: 1.2163, Accuracy: 0.6545


Processing Epoch 42: 100%|██████████| 316/316 [01:06<00:00,  4.75it/s, loss=1.014]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.00it/s, val_loss=0.995]


Validation Loss: 1.1804, Accuracy: 0.6601


Processing Epoch 43: 100%|██████████| 316/316 [01:03<00:00,  4.94it/s, loss=1.170]
Validating: 100%|██████████| 79/79 [00:05<00:00, 15.05it/s, val_loss=0.987]


Validation Loss: 1.1943, Accuracy: 0.6486


Processing Epoch 44: 100%|██████████| 316/316 [01:05<00:00,  4.84it/s, loss=1.388]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.70it/s, val_loss=1.007]


Validation Loss: 1.1810, Accuracy: 0.6617


Processing Epoch 45: 100%|██████████| 316/316 [01:05<00:00,  4.86it/s, loss=1.008]
Validating: 100%|██████████| 79/79 [00:05<00:00, 15.02it/s, val_loss=1.029]


Validation Loss: 1.2419, Accuracy: 0.6454


Processing Epoch 46: 100%|██████████| 316/316 [01:06<00:00,  4.76it/s, loss=1.150]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.79it/s, val_loss=0.989]


Validation Loss: 1.2010, Accuracy: 0.6529


Processing Epoch 47: 100%|██████████| 316/316 [01:04<00:00,  4.93it/s, loss=1.038]
Validating: 100%|██████████| 79/79 [00:05<00:00, 15.31it/s, val_loss=0.968]


Validation Loss: 1.1844, Accuracy: 0.6593


Processing Epoch 48: 100%|██████████| 316/316 [01:01<00:00,  5.11it/s, loss=1.442]
Validating: 100%|██████████| 79/79 [00:05<00:00, 15.71it/s, val_loss=0.981]


Validation Loss: 1.1762, Accuracy: 0.6565


Processing Epoch 49: 100%|██████████| 316/316 [01:05<00:00,  4.81it/s, loss=1.206]
Validating: 100%|██████████| 79/79 [00:05<00:00, 14.54it/s, val_loss=0.965]


Validation Loss: 1.1688, Accuracy: 0.6628


In [20]:
# Define the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
config = get_config()

# Preprocessing
audio_files = get_audio_files_and_labels(dataset_dir)
x, y = load_and_preprocess_dataset(audio_files)

# Train/Test split
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

# Convert to PyTorch tensors
x_train = torch.from_numpy(x_train).float()
x_val = torch.from_numpy(x_val).float()
y_train = torch.from_numpy(np.array(y_train)).long()
y_val = torch.from_numpy(np.array(y_val)).long()

# Move the tensors to device specified
x_train = x_train.to(device)
y_train = y_train.to(device)
x_val = x_val.to(device)
y_val = y_val.to(device)

# Create Dataset and DataLoader
train_dataset = TensorDataset(x_train, y_train)
val_dataset = TensorDataset(x_val, y_val)
train_loader = DataLoader(train_dataset, config['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, config['batch_size'])

input_shape = x_train.shape[-1]

model = get_model(config, input_shape).to(device)

# Load the pretrained weights
model_filename = latest_weights_file_path(config)
state = torch.load(model_filename)
model.load_state_dict(state['model_state_dict'])

class_weights = class_weight.compute_class_weight(
    'balanced',
    classes=np.unique(y_train),
    y=y_train
)

class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32).to(device)

loss_fn = nn.CrossEntropyLoss(label_smoothing=0.1, weight=class_weights_tensor).to(device)

validate_model(model, val_loader, loss_fn, device, None, 0)

Using device: cuda


NameError: name 'x' is not defined