Install Dependencies

In [None]:
!pip install librosa soundfile scikit-learn


1. Setup & Imports

In [None]:
import os
import csv
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm
import pandas as pd


2. Download and Extract Dataset

In [6]:

# Dataset download details
dataset_url = "http://download.tensorflow.org/data/speech_commands_v0.02.tar.gz"
dataset_tar = "speech_commands_v0.02.tar.gz"
extract_path = "speech_commands"

# Download if not already downloaded
if not os.path.exists(dataset_tar):
    print("Downloading dataset...")
    urllib.request.urlretrieve(dataset_url, dataset_tar)
    print("Download complete.")

# Extract if not already extracted
if not os.path.exists(extract_path):
    print("Extracting dataset...")
    with tarfile.open(dataset_tar, 'r:gz') as tar:
        tar.extractall(extract_path)
    print("Extraction complete.")


Downloading dataset...
Download complete.
Extracting dataset...
Extraction complete.


3. Prepare Keyword List and Dataset

In [19]:
keywords = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
output_dir = 'splits'
os.makedirs(output_dir, exist_ok=True)

# Gather all labeled .wav files
all_samples = []
for word in keywords:
    class_path = os.path.join(extract_path, word)
    files = [f for f in os.listdir(class_path) if f.endswith('.wav')]
    all_samples.extend([(os.path.join(class_path, f), word) for f in files])

# Split
train_set, temp_set = train_test_split(all_samples, test_size=0.2, stratify=[lbl for _, lbl in all_samples], random_state=42)
val_set, test_set = train_test_split(temp_set, test_size=0.5, stratify=[lbl for _, lbl in temp_set], random_state=42)

# Save to CSV
def write_csv(data, filename):
    with open(os.path.join(output_dir, filename), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['filepath', 'label'])
        writer.writerows(data)

write_csv(train_set, 'train.csv')
write_csv(val_set, 'val.csv')
write_csv(test_set, 'test.csv')

print(f"Train: {len(train_set)}, Val: {len(val_set)}, Test: {len(test_set)}")


Train: 30836, Val: 3855, Test: 3855


4. Feature Extraction

In [20]:
keywords = ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go']
label_map = {label: idx for idx, label in enumerate(keywords)}

def extract_features(path, sr=16000, feature_type='mfcc', win_ms=25, overlap_perc=0.5):
    y, sr = librosa.load(path, sr=sr)
    win_len = int(sr * win_ms / 1000)
    hop_len = int(win_len * (1 - overlap_perc))

    if feature_type == 'mfcc':
        features = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_fft=win_len, hop_length=hop_len)
    elif feature_type == 'mel':
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=win_len, hop_length=hop_len)
        features = librosa.power_to_db(mel, ref=np.max)
    else:
        raise ValueError("Unknown feature type")

    return features

def load_split(csv_path, feature_type='mfcc', win_ms=25, overlap=0.5, max_frames=100):
    X, y = [], []
    with open(csv_path, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            try:
                features = extract_features(row['filepath'], feature_type=feature_type, win_ms=win_ms, overlap_perc=overlap)
                if features.shape[1] > max_frames:
                    features = features[:, :max_frames]  # Truncate
                elif features.shape[1] < max_frames:
                    pad_width = max_frames - features.shape[1]
                    features = np.pad(features, ((0, 0), (0, pad_width)), mode='constant')  # Pad time axis

                X.append(features)
                y.append(label_map[row['label']])
            except Exception as e:
                print(f"Skipping {row['filepath']} due to error: {e}")
                continue

    X = np.array(X, dtype='float32')
    y = to_categorical(y, num_classes=len(keywords))
    return np.expand_dims(X, -1), y

5. Define CNN Model, Train and Evaluate

In [21]:
def build_model(input_shape):
    model = models.Sequential([
        layers.Conv2D(32, (3,3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2,2)),
        layers.Conv2D(64, (3,3), activation='relu'),
        layers.MaxPooling2D((2,2)),
        layers.Flatten(),
        layers.Dense(128, activation='relu'),
        layers.Dense(len(keywords), activation='softmax')
    ])
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

def train_and_evaluate(feature_type, win_ms, overlap):
    print(f"\nTraining with {feature_type}, window {win_ms}ms, overlap {int(overlap * 100)}%")
    X_train, y_train = load_split('splits/train.csv', feature_type, win_ms, overlap)
    X_val, y_val = load_split('splits/val.csv', feature_type, win_ms, overlap)
    X_test, y_test = load_split('splits/test.csv', feature_type, win_ms, overlap)

    model = build_model(X_train.shape[1:])
    model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val), verbose=0)
    _, test_acc = model.evaluate(X_test, y_test, verbose=0)

    print(f"Test Accuracy: {test_acc:.3f}")
    return test_acc


6. Run Experiments with Varying Parameters

In [None]:
# Define feature types and parameter ranges
feature_types = ['mfcc', 'mel']
window_sizes = list(range(10, 101, 10))  # 10ms to 100ms
overlap_percs = [0.0, 0.25, 0.5, 0.75]
min_win_ms = 10

results = []

for ft in feature_types:
    for win in window_sizes:
        if win < min_win_ms:
            continue
        for ovlp in overlap_percs:
            print(f"Running: Feature={ft}, Window={win}ms, Overlap={int(ovlp * 100)}%")
            try:
                acc = train_and_evaluate(ft, win_ms=win, overlap=ovlp)
                results.append({
                    'feature': ft,
                    'window_ms': win,
                    'overlap': int(ovlp * 100),
                    'accuracy': round(acc, 4)
                })
            except Exception as e:
                print(f"Error with {ft}, win={win}, overlap={ovlp}: {e}")


Running: Feature=mfcc, Window=10ms, Overlap=0%

Training with mfcc, window 10ms, overlap 0%


  mel_basis = filters.mel(sr=sr, n_fft=n_fft, **kwargs)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Test Accuracy: 0.827
Running: Feature=mfcc, Window=10ms, Overlap=25%

Training with mfcc, window 10ms, overlap 25%
Test Accuracy: 0.755
Running: Feature=mfcc, Window=10ms, Overlap=50%

Training with mfcc, window 10ms, overlap 50%
Test Accuracy: 0.619
Running: Feature=mfcc, Window=10ms, Overlap=75%

Training with mfcc, window 10ms, overlap 75%
Test Accuracy: 0.282
Running: Feature=mfcc, Window=20ms, Overlap=0%

Training with mfcc, window 20ms, overlap 0%
Test Accuracy: 0.859
Running: Feature=mfcc, Window=20ms, Overlap=25%

Training with mfcc, window 20ms, overlap 25%
Test Accuracy: 0.867
Running: Feature=mfcc, Window=20ms, Overlap=50%

Training with mfcc, window 20ms, overlap 50%
Test Accuracy: 0.832
Running: Feature=mfcc, Window=20ms, Overlap=75%

Training with mfcc, window 20ms, overlap 75%
Test Accuracy: 0.627
Running: Feature=mfcc, Window=30ms, Overlap=0%

Training with mfcc, window 30ms, overlap 0%
Test Accuracy: 0.874
Running: Feature=mfcc, Window=30ms, Overlap=25%

Training with 

7.  Display Results as DataFrame

In [1]:
df_results = pd.DataFrame(results)
df_results = df_results.sort_values(by='accuracy', ascending=False)
df_results


NameError: name 'pd' is not defined

8. Plot Results

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
sns.lineplot(data=df_results, x='window_ms', y='accuracy', hue='feature', style='overlap', markers=True)
plt.title("Accuracy vs Window Size for MFCC and Mel Features")
plt.ylabel("Accuracy")
plt.xlabel("Window Size (ms)")
plt.grid(True)
plt.show()


NameError: name 'df_results' is not defined

<Figure size 1200x600 with 0 Axes>