# In [None]:
import os
import torchaudio

# Select the torchaudio I/O backend: prefer 'sox_io', fall back to 'soundfile'.
# `set_audio_backend` is deprecated in torchaudio 2.x (the backend is chosen
# per call there), so only invoke it when the installed version exposes it.
_set_audio_backend = getattr(torchaudio, "set_audio_backend", None)
if _set_audio_backend is not None:
    try:
        _set_audio_backend("sox_io")
    except RuntimeError:
        # 'sox_io' is not among the available backends on this installation.
        _set_audio_backend("soundfile")

import torch
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from torch import nn
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when CUDA is usable, otherwise stay on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device used:", device)

# Locations of the training and test audio folders
train_audio_path = "C:/Users/User/Desktop/deep_learning_project/train/audio"
test_audio_path = "C:/Users/User/Desktop/deep_learning_project/test/audio"

# Spoken-word classes (used as labels)
commands = [
    'happy', 'cat', 'two', 'dog', 'house', 'bird', 'no', 'three', 'bed', 'right',
    'six', 'nine', 'seven', 'sheila', 'marvin', 'zero', 'wow', 'up', 'one', 'go',
    'left', 'off', 'yes', 'stop', 'five', 'on', 'eight', 'down', 'four', 'tree',
]
sample_rate = 16000  # Audio sampling rate

# Function to create spectrogram from audio file
def load_and_process_audio(file_path):
    """Load an audio file and return its standardized spectrogram, time axis first.

    The clip is down-mixed to mono by averaging its channels, resampled to the
    module-level ``sample_rate`` when the file was stored at a different rate
    (so every clip shares the same time/frequency resolution), converted to a
    spectrogram and standardized to zero mean and unit standard deviation
    over the whole clip.

    Args:
        file_path: Path of an audio file readable by ``torchaudio.load``.

    Returns:
        A 2-D tensor whose first axis is time (spectrogram frames) and whose
        second axis holds the ``n_fft // 2 + 1 = 193`` frequency bins.
    """
    waveform, sr = torchaudio.load(file_path)
    waveform = waveform.mean(dim=0)  # average the channels -> 1-D mono signal

    # Bring every clip to the common sampling rate before the STFT; otherwise
    # the fixed window/hop sizes below would cover different durations.
    if sr != sample_rate:
        resample = torchaudio.transforms.Resample(orig_freq=sr, new_freq=sample_rate)
        waveform = resample(waveform)

    transform = torchaudio.transforms.Spectrogram(
        n_fft=384,
        win_length=256,
        hop_length=160,
        power=0.5
    )
    spec = transform(waveform)
    # Per-clip standardization; the epsilon guards against a zero std (silence).
    spec = (spec - spec.mean()) / (spec.std() + 1e-10)
    return spec.T  # (freq, time) -> (time, freq)

# Pad spectrogram to have consistent size
def pad_spectrogram(spec, max_len):
    """Return ``spec`` with exactly ``max_len`` entries along its first axis.

    Shorter inputs are zero-padded at the end; longer (or equal-length) inputs
    are truncated to their first ``max_len`` rows. The padding is allocated
    with the same dtype and device as ``spec`` so the result never changes
    precision and the function also works for tensors that live on a GPU.

    Args:
        spec: Tensor whose first axis is the one to pad/truncate
            (time, for the spectrograms produced in this file).
        max_len: Target length of the first axis.

    Returns:
        A tensor of shape ``(max_len, *spec.shape[1:])``.
    """
    missing = max_len - spec.shape[0]
    if missing <= 0:
        return spec[:max_len]
    # new_zeros inherits dtype and device from `spec`.
    filler = spec.new_zeros((missing, *spec.shape[1:]))
    return torch.cat((spec, filler), dim=0)

# Custom dataset class for audio files
class AudioDataset(Dataset):
    """In-memory dataset of equally sized spectrograms with integer labels.

    ``audio_path`` must contain one sub-directory per entry of ``commands``;
    every ``.wav`` file inside is converted with ``load_and_process_audio``
    at construction time and padded/truncated to a common number of frames.

    Attributes:
        data: List of spectrogram tensors, all with ``max_len`` rows.
        labels: Encoded labels as returned by ``LabelEncoder.transform``,
            aligned with ``data``.
        commands: The class names passed to the constructor.
        le: ``LabelEncoder`` fitted on ``commands``.
        max_len: Number of frames every spectrogram is padded/truncated to.
    """

    def __init__(self, audio_path, commands, max_len=None):
        """Load and pre-process every ``.wav`` file below ``audio_path``.

        Args:
            audio_path: Directory holding one sub-directory per command.
            commands: Class names; each must be a sub-directory name.
            max_len: Target number of frames. When ``None`` the longest
                spectrogram found in the data determines it.
        """
        self.data = []
        self.labels = []
        self.commands = commands
        self.le = LabelEncoder()
        self.le.fit(commands)
        self.max_len = 0

        for label in commands:
            path = os.path.join(audio_path, label)
            # os.listdir returns entries in arbitrary order; sorting keeps the
            # sample order (and therefore any seeded index split) reproducible.
            for file in sorted(os.listdir(path)):
                if not file.endswith(".wav"):
                    continue
                spec = load_and_process_audio(os.path.join(path, file))
                self.max_len = max(self.max_len, spec.shape[0])
                self.data.append(spec)
                self.labels.append(label)

        # An explicit max_len overrides the length observed in the data.
        if max_len is not None:
            self.max_len = max_len

        self.data = [pad_spectrogram(spec, self.max_len) for spec in self.data]
        self.labels = self.le.transform(self.labels)

    def __len__(self):
        """Return the number of loaded clips."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(spectrogram, encoded_label)`` pair at position ``idx``."""
        return self.data[idx], self.labels[idx]

# Load every training clip, then hold out 20% of the sample indices
# (fixed seed) as the validation set
dataset = AudioDataset(audio_path=train_audio_path, commands=commands)
indices = list(range(len(dataset)))
_split = train_test_split(indices, dataset.labels, test_size=0.2, random_state=42)
X_train, X_val, y_train_idx, y_val_idx = _split
train_ds, val_ds = (
    torch.utils.data.Subset(dataset, _part) for _part in (X_train, X_val)
)

# Batch both subsets; only the training batches are reshuffled each epoch
train_loader = DataLoader(dataset=train_ds, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_ds, batch_size=32, shuffle=False)

# Model class with convolutional and GRU layers
class AudioModel(nn.Module):
    """Two strided 2-D convolutions followed by a bidirectional GRU classifier.

    Input to ``forward`` is a batch of single-channel spectrograms shaped
    ``(batch, 1, time, freq)``; the output is ``(batch, num_classes)`` logits.
    """

    def __init__(self, input_shape, num_classes):
        """Build the layers.

        Args:
            input_shape: ``(time, freq)`` size of one input spectrogram; used
                only to infer the feature width fed to the GRU.
            num_classes: Number of output logits.
        """
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(11, 41), stride=(2, 2), padding=(5, 20))
        self.bn1 = nn.BatchNorm2d(32)
        self.conv2 = nn.Conv2d(32, 32, kernel_size=(11, 21), stride=(1, 2), padding=(5, 10))
        self.bn2 = nn.BatchNorm2d(32)

        # Infer the GRU input width with a dry run. Do it in eval mode and
        # without autograd so the all-zero probe neither updates the BatchNorm
        # running statistics nor builds a graph.
        self.eval()
        with torch.no_grad():
            probe = self.forward_features(torch.zeros(1, 1, *input_shape))
        self.train()
        gru_input_size = probe.shape[-1]

        hidden_size = 256
        self.gru = nn.GRU(
            gru_input_size,
            hidden_size,
            batch_first=True,
            bidirectional=True,
            num_layers=2
        )

        # Both GRU directions are concatenated, hence 2 * hidden_size inputs.
        self.fc1 = nn.Linear(2 * hidden_size, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, num_classes)

    def forward_features(self, x):
        """Apply the convolutional front end.

        Args:
            x: Tensor of shape ``(batch, 1, time, freq)``.

        Returns:
            Tensor of shape ``(batch, time', channels * freq')`` where the
            primed sizes are those left after the strided convolutions.
        """
        x = torch.relu(self.bn1(self.conv1(x)))
        x = torch.relu(self.bn2(self.conv2(x)))
        # (batch, channels, time, freq) -> (batch, time, channels, freq)
        x = x.permute(0, 2, 1, 3).contiguous()
        # Merge channels and frequency into one feature vector per time step.
        x = x.view(x.size(0), x.size(1), -1)
        return x

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.forward_features(x)
        _, h_n = self.gru(x)
        # Summarize the sequence with the final hidden state of each direction
        # of the top layer. Slicing the output at the last time step would give
        # a backward-direction state that has processed only a single frame.
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

# Build the network for the padded spectrogram size and move it to the device
input_shape = (dataset.max_len, dataset.data[0].shape[1])
model = AudioModel(input_shape=input_shape, num_classes=len(commands))
model = model.to(device)

# Cross-entropy objective, optimized with Adam
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Training utilities
def _run_epoch(model, loader, criterion, device, optimizer=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    The model is trained when ``optimizer`` is given and only evaluated
    (eval mode, autograd disabled) otherwise. The loss is averaged per
    sample, which assumes ``criterion`` returns the batch mean. Both values
    are ``nan`` when the loader yields no samples.
    """
    training = optimizer is not None
    model.train(training)
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            # Add the channel axis expected by the convolutional front end.
            inputs = inputs.unsqueeze(1).to(device)
            # as_tensor reuses an existing tensor instead of copy-constructing
            # it (torch.tensor on a tensor warns and copies).
            labels = torch.as_tensor(labels).to(device)

            if training:
                optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            if training:
                loss.backward()
                optimizer.step()

            batch_size = labels.size(0)
            # Weight by batch size so a smaller final batch is not over-counted.
            loss_sum += loss.item() * batch_size
            n_correct += (outputs.argmax(dim=1) == labels).sum().item()
            n_seen += batch_size

    if n_seen == 0:
        return float("nan"), float("nan")
    return loss_sum / n_seen, n_correct / n_seen


def train(model, train_loader, val_loader, criterion, optimizer, epochs=20):
    """Train ``model`` and validate it after every epoch.

    Batches are moved to the device the model's parameters live on (CPU when
    the model has no parameters).

    Args:
        model: Network taking ``(batch, 1, time, freq)`` inputs.
        train_loader: Iterable of ``(inputs, labels)`` training batches.
        val_loader: Iterable of ``(inputs, labels)`` validation batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        optimizer: Optimizer updating ``model``'s parameters.
        epochs: Number of passes over ``train_loader``.

    Returns:
        Dict with the per-epoch lists ``'loss'``, ``'val_loss'``,
        ``'accuracy'`` and ``'val_accuracy'``.
    """
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    history = {'loss': [], 'val_loss': [], 'accuracy': [], 'val_accuracy': []}
    for epoch in range(epochs):
        train_loss, train_acc = _run_epoch(model, train_loader, criterion, device, optimizer)
        val_loss, val_acc = _run_epoch(model, val_loader, criterion, device)

        history['loss'].append(train_loss)
        history['val_loss'].append(val_loss)
        history['accuracy'].append(train_acc)
        history['val_accuracy'].append(val_acc)

        print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, Accuracy: {train_acc:.4f}, "
              f"Val Loss: {val_loss:.4f}, Val Accuracy: {val_acc:.4f}")
    return history

# Run the training loop
history = train(model, train_loader, val_loader, criterion, optimizer, epochs=20)

# Draw the loss and accuracy curves side by side, one panel per metric
plt.figure(figsize=(12, 5))
_panels = (
    ('loss', 'val_loss', 'Loss'),
    ('accuracy', 'val_accuracy', 'Accuracy'),
)
for _position, (_train_key, _val_key, _metric) in enumerate(_panels, start=1):
    plt.subplot(1, 2, _position)
    plt.plot(history[_train_key], label=f'Train {_metric}')
    plt.plot(history[_val_key], label=f'Validation {_metric}')
    plt.xlabel('Epoch')
    plt.ylabel(_metric)
    plt.title(f'{_metric} over Epochs')
    plt.legend()

plt.tight_layout()
plt.show()