In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class SpeechCommandClassifier(nn.Module):
    def __init__(self, num_classes):
        super(SpeechCommandClassifier, self).__init__()
        self.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1)  # Input channels = 1 (spectrogram), output channels = 16
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(2, 2)  # Max pooling layer
        self.fc1 = nn.Linear(32 * 32 * 32, 128)  # Fully connected layer
        self.fc2 = nn.Linear(128, num_classes)   # Output layer for the number of classes (commands)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # First convolution + ReLU + MaxPooling
        x = self.pool(F.relu(self.conv2(x)))  # Second convolution + ReLU + MaxPooling
        x = x.view(-1, 32 * 32 * 32)  # Flatten the features
        x = F.relu(self.fc1(x))  # Fully connected layer with ReLU activation
        x = self.fc2(x)  # Output layer (logits for classification)
        return x


In [2]:
import torchaudio
import torch
from torch.utils.data import DataLoader
from torchaudio.datasets import SPEECHCOMMANDS

# Path to the dataset
data_path = '/content/sample_data/Recording.m4a'

# Load the dataset
train_dataset = SPEECHCOMMANDS(root=data_path, download=True, subset='training')
test_dataset = SPEECHCOMMANDS(root=data_path, download=True, subset='testing')

# Create a DataLoader for batch processing
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Function to extract Mel Spectrogram features from the audio
transform = torchaudio.transforms.MelSpectrogram()

def preprocess_batch(batch):
    inputs = []
    labels = []
    for waveform, sample_rate, label, *_ in batch:
        mel_spec = transform(waveform)  # Convert to Mel Spectrogram
        inputs.append(mel_spec)
        labels.append(label)  # Append the corresponding label

    inputs = torch.stack(inputs)  # Stack all tensors into a batch
    labels = torch.tensor([int(label) for label in labels])  # Convert labels to tensor
    return inputs, labels


NotADirectoryError: [Errno 20] Not a directory: '/content/sample_data/Recording.m4a/speech_commands_v0.02.tar.gz.f640a53b7d244c31b4cfd9fd5830a154.partial'

In [1]:
import torchaudio
import torch
from torch.utils.data import DataLoader
from torchaudio.datasets import SPEECHCOMMANDS

# Path to the dataset
data_path = "/Recording (2).m4a"

# Load the dataset
train_dataset = SPEECHCOMMANDS(root=data_path, download=True, subset='training')
test_dataset = SPEECHCOMMANDS(root=data_path, download=True, subset='testing')

# Create a DataLoader for batch processing
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Function to extract Mel Spectrogram features from the audio
transform = torchaudio.transforms.MelSpectrogram()

def preprocess_batch(batch):
    inputs = []
    labels = []
    for waveform, sample_rate, label, *_ in batch:
        mel_spec = transform(waveform)  # Convert to Mel Spectrogram
        inputs.append(mel_spec)
        labels.append(label)  # Append the corresponding label

    inputs = torch.stack(inputs)  # Stack all tensors into a batch
    labels = torch.tensor([int(label) for label in labels])  # Convert labels to tensor
    return inputs, labels


NotADirectoryError: [Errno 20] Not a directory: '/Recording (2).m4a/speech_commands_v0.02.tar.gz.79d721fd3ba54cc3b38aafa82e156e9b.partial'

In [None]:
import torch.optim as optim

# Initialize the model
num_classes = len(train_dataset.labels)  # Total number of voice commands
model = SpeechCommandClassifier(num_classes).to(device)

# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, criterion, optimizer, train_loader, epochs=10):
    model.train()  # Set the model to training mode
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            inputs, labels = preprocess_batch(inputs)  # Preprocess batch data
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()  # Zero out gradients
            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, labels)  # Compute loss
            loss.backward()  # Backpropagation
            optimizer.step()  # Update weights

            running_loss += loss.item()

        print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")

# Start training
train_model(model, criterion, optimizer, train_loader)


In [None]:
def evaluate_model(model, test_loader):
    model.eval()  # Set the model to evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient computation
        for inputs, labels in test_loader:
            inputs, labels = preprocess_batch(inputs)
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)  # Get the predicted class
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f'Accuracy: {accuracy:.2f}%')

# Evaluate the trained model
evaluate_model(model, test_loader)
