# Spoken Digit Classification with Audio

This notebook implements a lightweight solution for classifying spoken digits (0-9) using the Free Spoken Digit Dataset (FSDD).

## Setup and Imports

In [22]:
import torch
import torchaudio
import librosa
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from torch import nn
from torch.utils.data import DataLoader, Dataset
from tqdm.notebook import tqdm
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

## 1. Data Loading and Exploration

Load the FSDD dataset from Hugging Face and explore its characteristics.

In [23]:
# Load FSDD from Hugging Face and report how many training clips it has
dataset = load_dataset("mteb/free-spoken-digit-dataset")
print(f"Dataset size: {len(dataset['train'])}")

# Inspect the first training example: its fields, waveform shape,
# sampling rate and digit label
sample = dataset['train'][0]
print(f"Sample keys: {sample.keys()}")
sample_audio = sample['audio']
print(f"Audio shape: {sample_audio['array'].shape}")
print(f"Sample rate: {sample_audio['sampling_rate']}Hz")
print(f"Label: {sample['label']}")

Dataset size: 2700


ModuleNotFoundError: No module named 'torchcodec.decoders'

## 2. Audio Preprocessing

Create a function that resamples each clip to 8 kHz, pads or truncates it to a fixed length of 8000 samples (one second), and peak-normalizes it.

In [None]:
def preprocess_audio(waveform, sample_rate, target_sample_rate=8000, target_length=8000):
    """Resample, length-normalize and peak-normalize a mono waveform.

    Args:
        waveform: Audio samples as a NumPy array or plain sequence. Integer
            PCM input is converted to float32 first. Arrays whose extra axes
            all have length 1 (e.g. shape ``(n, 1)``) are flattened to 1-D.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        target_sample_rate: Rate to resample to when it differs from
            ``sample_rate``.
        target_length: Number of samples in the returned array.

    Returns:
        A 1-D floating-point array of exactly ``target_length`` samples whose
        largest absolute value is scaled to 1. All-zero input is returned as
        zeros.

    Raises:
        ValueError: If ``waveform`` has more than one axis longer than 1;
            genuinely multi-channel audio must be down-mixed by the caller.
    """
    waveform = np.atleast_1d(np.asarray(waveform))

    # Work in floating point: np.abs() overflows on the most negative integer
    # PCM value (e.g. int16 -32768), which would corrupt the peak estimate.
    if not np.issubdtype(waveform.dtype, np.floating):
        waveform = waveform.astype(np.float32)

    # len() below must measure samples, so collapse singleton axes and refuse
    # real multi-channel data instead of silently padding the wrong axis.
    if waveform.ndim > 1:
        if sum(dim != 1 for dim in waveform.shape) > 1:
            raise ValueError(
                f"Expected mono audio, got array with shape {waveform.shape}"
            )
        waveform = waveform.reshape(-1)

    # Resample if necessary
    if sample_rate != target_sample_rate:
        waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=target_sample_rate)

    # Pad (with trailing zeros) or truncate to target length
    if len(waveform) > target_length:
        waveform = waveform[:target_length]
    else:
        waveform = np.pad(waveform, (0, target_length - len(waveform)))

    # Peak-normalize; skip silent clips to avoid division by zero
    max_val = np.max(np.abs(waveform))
    if max_val > 0:
        waveform = waveform / max_val
    return waveform

## 3. Feature Extraction

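Extract decibel-scaled Mel spectrograms from the audio data, and wrap the dataset in a PyTorch `Dataset` that applies preprocessing and feature extraction on the fly.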

In [None]:
def extract_features(waveform, sample_rate=8000, n_mels=64, n_fft=1024, hop_length=512):
    """Compute a decibel-scaled mel spectrogram of ``waveform``.

    Args:
        waveform: Audio samples passed to librosa as ``y``.
        sample_rate: Sampling rate of ``waveform`` in Hz.
        n_mels: Number of mel bands.
        n_fft: FFT window size.
        hop_length: Number of samples between successive frames.

    Returns:
        The mel power spectrogram converted to dB, using the spectrogram's
        own maximum as the reference.
    """
    power_spectrogram = librosa.feature.melspectrogram(
        y=waveform, sr=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
    )
    return librosa.power_to_db(power_spectrogram, ref=np.max)

# Create custom dataset
class SpokenDigitDataset(Dataset):
    """Dataset that turns raw audio records into ``(features, label)`` tensors.

    Each record of the wrapped collection is read through ``item['audio']``
    (with ``'array'`` and ``'sampling_rate'`` entries) and ``item['label']``.
    Preprocessing and feature extraction run on every access.
    """

    def __init__(self, dataset):
        # Indexable collection of records; only len() and [] are used on it
        self.dataset = dataset

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the feature tensor and integer label tensor for record ``idx``."""
        record = self.dataset[idx]
        audio = record['audio']

        # Raw waveform -> fixed-length normalized clip -> dB mel spectrogram
        clip = preprocess_audio(audio['array'], audio['sampling_rate'])
        spectrogram = extract_features(clip)

        # Add a leading channel axis so the 2-D convolutions accept the tensor
        feature_tensor = torch.FloatTensor(spectrogram).unsqueeze(0)
        label_tensor = torch.tensor(int(record['label']))
        return feature_tensor, label_tensor

## 4. Model Architecture

Define a lightweight CNN for audio classification.

In [None]:
class AudioCNN(nn.Module):
    """Small three-stage CNN that classifies single-channel spectrograms.

    Args:
        input_size: ``(height, width)`` of the spectrogram fed to the network.
            The default ``(64, 16)`` corresponds to the 64 mel bands and
            16 frames that ``extract_features`` yields for an 8000-sample clip
            with ``hop_length=512``.
        num_classes: Number of output logits (10 digits by default).

    The classifier's input width is inferred from ``input_size`` rather than
    hard-coded: three 2x max-pools reduce a 64x16 input to 8x2, so the
    flattened size is 64 * 8 * 2 = 1024 (not 64 * 8 * 4).
    """

    def __init__(self, input_size=(64, 16), num_classes=10):
        super().__init__()

        # Three conv -> ReLU -> 2x max-pool stages; each halves height and width
        self.features = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        # Push a dummy batch through the conv stack to learn the flattened
        # feature count, so the first Linear layer always matches the input.
        with torch.no_grad():
            flat_features = self.features(torch.zeros(1, 1, *input_size)).numel()

        self.classifier = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, 128),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return class logits for a batch shaped ``(batch, 1, height, width)``."""
        x = self.features(x)
        x = self.classifier(x)
        return x

## 5. Training Pipeline

Build the data loader, model, loss function, and optimizer, then run the training loop.

In [None]:
# Seed PyTorch so weight initialization, dropout and batch shuffling are
# repeatable across runs (some CUDA kernels may still be nondeterministic)
SEED = 42
torch.manual_seed(SEED)

# Create dataloaders
train_dataset = SpokenDigitDataset(dataset['train'])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Initialize model, loss, and optimizer
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AudioCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0  # running sum of per-sample losses
    correct = 0
    total = 0

    for features, labels in tqdm(train_loader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        features, labels = features.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so a
        # smaller final batch does not skew the epoch average
        batch_size = labels.size(0)
        total_loss += loss.item() * batch_size
        predicted = outputs.detach().argmax(dim=1)
        total += batch_size
        correct += (predicted == labels).sum().item()

    epoch_loss = total_loss / total
    epoch_acc = 100 * correct / total
    print(f'Epoch {epoch+1}, Loss: {epoch_loss:.4f}, Accuracy: {epoch_acc:.2f}%')

## 6. Model Evaluation

Evaluate model performance using a confusion matrix and a per-class classification report.

In [None]:
def evaluate_model(model, dataset):
    """Plot a confusion matrix and print a classification report.

    Args:
        model: Trained network; it is switched to eval mode and run without
            gradient tracking.
        dataset: Dataset yielding ``(features, label)`` pairs, batched here
            with a ``DataLoader`` of batch size 32.
    """
    model.eval()
    # Run on whatever device the model's weights live on rather than relying
    # on a notebook-level global
    model_device = next(model.parameters()).device
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for features, labels in DataLoader(dataset, batch_size=32):
            outputs = model(features.to(model_device))
            predicted = outputs.argmax(dim=1)

            all_preds.extend(predicted.cpu().tolist())
            all_labels.extend(labels.tolist())

    # Use the classes actually present as both matrix order and tick labels,
    # so the axes stay correct even if some digit never appears
    class_labels = sorted(set(all_labels) | set(all_preds))

    # Plot confusion matrix
    cm = confusion_matrix(all_labels, all_preds, labels=class_labels)
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    plt.show()

    # Print classification report
    print('\nClassification Report:')
    print(classification_report(all_labels, all_preds))


# Scoring the model on the data it was trained on overstates its accuracy, so
# prefer a held-out 'test' split when the loaded dataset provides one.
if 'test' in dataset:
    eval_dataset = SpokenDigitDataset(dataset['test'])
else:
    print("No 'test' split found; evaluating on the training data (metrics will be optimistic).")
    eval_dataset = train_dataset

evaluate_model(model, eval_dataset)

## 7. Real-time Audio Processing

Create a function that preprocesses a new audio clip, extracts its features, and returns the digit predicted by the model.

In [None]:
def predict_audio(model, waveform, sample_rate):
    """Predict the digit spoken in a single raw audio clip.

    Args:
        model: Network used for inference; it is switched to eval mode.
        waveform: Raw audio samples.
        sample_rate: Sampling rate of ``waveform`` in Hz.

    Returns:
        Index of the highest-scoring class as a Python number.
    """
    model.eval()
    with torch.no_grad():
        # Same pipeline as training: normalize the clip, then mel spectrogram
        clip = preprocess_audio(waveform, sample_rate)
        spectrogram = extract_features(clip)

        # Prepend batch and channel axes before moving to the compute device
        batch = torch.FloatTensor(spectrogram)[None, None].to(device)

        logits = model(batch)
        return torch.max(logits.data, 1).indices.item()

## 8. Microphone Integration

Add live microphone input support.

In [None]:
import sounddevice as sd

def record_audio(duration=1, sample_rate=8000):
    """Capture a single-channel clip from the microphone.

    Args:
        duration: Recording length in seconds.
        sample_rate: Sampling rate requested from the audio device, in Hz.

    Returns:
        The recorded samples flattened to a 1-D array.
    """
    print("Recording...")
    n_frames = int(duration * sample_rate)
    recording = sd.rec(n_frames, samplerate=sample_rate, channels=1)
    # Wait for the capture started by sd.rec() before reading the buffer
    sd.wait()
    return recording.flatten()

def live_prediction():
    """Repeatedly record one-second clips from the microphone and print the predicted digit.

    Each iteration waits for Enter, records a clip, and classifies it with the
    notebook-level ``model``. The loop ends cleanly on Ctrl+C / kernel
    interrupt (or when input is closed) instead of surfacing a traceback.
    """
    # Single source of truth so recording and prediction agree on the rate
    sample_rate = 8000
    try:
        while True:
            input("Press Enter to record a 1-second audio clip (or Ctrl+C to exit)...")
            waveform = record_audio(duration=1, sample_rate=sample_rate)
            prediction = predict_audio(model, waveform, sample_rate)
            print(f"\nPredicted digit: {prediction}\n")
    except (KeyboardInterrupt, EOFError):
        # The prompt advertises Ctrl+C as the way out, so treat it as a normal exit
        print("\nStopped live prediction.")

# Uncomment to test live predictions
# live_prediction()

In [None]:
# Install required packages if missing
!python -m pip install sounddevice datasets librosa

In [None]:
# Sanity-check the inference path on one clip from the training split
test_sample = dataset['train'][10]
test_audio = test_sample['audio']
waveform, sample_rate = test_audio['array'], test_audio['sampling_rate']
predicted_digit = predict_audio(model, waveform, sample_rate)
print(f"Predicted digit: {predicted_digit}, True label: {test_sample['label']}")