# download the data

In [1]:
!wget -O mini_speech_commands.zip https://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip
!unzip -q mini_speech_commands.zip -d mini_speech_commands

--2025-03-15 23:16:35--  https://storage.googleapis.com/download.tensorflow.org/data/mini_speech_commands.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.197.207, 74.125.135.207, 172.253.117.207, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.197.207|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 182082353 (174M) [application/zip]
Saving to: ‘mini_speech_commands.zip’


2025-03-15 23:16:36 (110 MB/s) - ‘mini_speech_commands.zip’ saved [182082353/182082353]



# import the libraries

In [2]:
import os
import librosa
import IPython.display as ipd
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import librosa.util
import seaborn as sns
from scipy.stats import multivariate_normal
from sklearn.model_selection import train_test_split
from scipy.stats import multivariate_normal
from scipy.special import logsumexp

In [3]:

down_dir = "/content/mini_speech_commands/mini_speech_commands/left"

audio_files = sorted(os.listdir(down_dir))

wav_file = os.path.join(down_dir, audio_files[0])

print("First audio file path:", wav_file)

ipd.Audio(wav_file)

First audio file path: /content/mini_speech_commands/mini_speech_commands/left/00b01445_nohash_0.wav


In [4]:
y, sr = librosa.load(wav_file, sr=None)


mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13, n_mels = 26, n_fft = 512, hop_length = 160).T
print("MFCC shape:", mfccs.shape)
print(sr)

MFCC shape: (101, 13)
16000


# splitting the dataset into train-test and compute mfcc for each class

In [5]:
import os
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
import librosa.display

# Path to the dataset
DATASET_PATH = "/content/mini_speech_commands/mini_speech_commands"

# MFCC parameters
SAMPLE_RATE = 16000  # Assuming 16kHz audio
N_MFCC = 13          # Number of MFCC coefficients
N_MELS = 26          # Number of Mel filters
FFT_WINDOW = 512     # FFT window size
HOP_SIZE = 160       # Hop size (10ms)

# Dictionary to store MFCC features
X_train = {}
X_test = {}

# Iterate through each class directory
for label in sorted(os.listdir(DATASET_PATH)):
    class_path = os.path.join(DATASET_PATH, label)
    if not os.path.isdir(class_path):
        continue  # Skip non-directory files

    print(f"Processing class: {label}")

    # Collect all audio file paths
    file_paths = [os.path.join(class_path, f) for f in os.listdir(class_path) if f.endswith('.wav')]

    # Split into train (80%) and test (20%)
    train_files, test_files = train_test_split(file_paths, test_size=0.2, random_state=42)

    # Function to compute MFCCs for a list of files
    def extract_mfccs(file_list):
        mfccs = []
        max_frames = 101  # Set a fixed number of time frames based on your data
        for file in file_list:
            y, sr = librosa.load(file, sr=SAMPLE_RATE)
            mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC, n_fft=FFT_WINDOW, hop_length=HOP_SIZE, n_mels=N_MELS)

            # Pad or truncate MFCCs to max_frames
            if mfcc.shape[1] < max_frames:
                pad_width = max_frames - mfcc.shape[1]
                mfcc = np.pad(mfcc, ((0, 0), (0, pad_width)), mode='constant')
            else:
                mfcc = mfcc[:, :max_frames]  # Truncate if longer

            mfccs.append(mfcc)

        return np.array(mfccs)  # Now all have the same shape

    # Compute MFCCs for train and test sets
    X_train[label] = extract_mfccs(train_files)
    X_test[label] = extract_mfccs(test_files)

# Print dataset structure
print("\nDataset Structure:")
for label in X_train:
    print(f"{label}: Train -> {X_train[label].shape}, Test -> {X_test[label].shape}")


Processing class: down
Processing class: go
Processing class: left
Processing class: no
Processing class: right
Processing class: stop
Processing class: up
Processing class: yes

Dataset Structure:
down: Train -> (800, 13, 101), Test -> (200, 13, 101)
go: Train -> (800, 13, 101), Test -> (200, 13, 101)
left: Train -> (800, 13, 101), Test -> (200, 13, 101)
no: Train -> (800, 13, 101), Test -> (200, 13, 101)
right: Train -> (800, 13, 101), Test -> (200, 13, 101)
stop: Train -> (800, 13, 101), Test -> (200, 13, 101)
up: Train -> (800, 13, 101), Test -> (200, 13, 101)
yes: Train -> (800, 13, 101), Test -> (200, 13, 101)


In [6]:
for key in X_train.keys():
    print(f"Before {key}: Train Shape {X_train[key].shape}, Test Shape {X_test[key].shape}")

    X_train[key] = X_train[key].transpose(0, 2, 1)  # Swap last two axes
    X_test[key] = X_test[key].transpose(0, 2, 1)

    print(f"After {key}: Train Shape {X_train[key].shape}, Test Shape {X_test[key].shape}")


Before down: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After down: Train Shape (800, 101, 13), Test Shape (200, 101, 13)
Before go: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After go: Train Shape (800, 101, 13), Test Shape (200, 101, 13)
Before left: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After left: Train Shape (800, 101, 13), Test Shape (200, 101, 13)
Before no: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After no: Train Shape (800, 101, 13), Test Shape (200, 101, 13)
Before right: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After right: Train Shape (800, 101, 13), Test Shape (200, 101, 13)
Before stop: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After stop: Train Shape (800, 101, 13), Test Shape (200, 101, 13)
Before up: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After up: Train Shape (800, 101, 13), Test Shape (200, 101, 13)
Before yes: Train Shape (800, 13, 101), Test Shape (200, 13, 101)
After yes: Tr

# HMM_guassian class


In [14]:
X_train['stop'][0][0].shape

(13,)

In [76]:
import numpy as np
from scipy.stats import multivariate_normal
from scipy.special import logsumexp

class HMM_gaussian:
    def __init__(self, n_hidden_states, n_observations, size_observation):
        self.n_hidden_states = n_hidden_states
        self.n_observations = n_observations
        self.size_observation = size_observation
        self.init_transitions()
        self.init_emissions()
        self.init_prob()

    def init_transitions(self):
        self.transition_matrix = np.zeros((self.n_hidden_states, self.n_hidden_states))
        for i in range(self.n_hidden_states - 1):
            self.transition_matrix[i, i] = 0.5
            self.transition_matrix[i, i + 1] = 0.5
        self.transition_matrix[-1, -1] = 1
        return self.transition_matrix

    def init_emissions(self):
        self.mean = np.random.rand(self.n_hidden_states, self.size_observation)
        self.covariance = np.array([np.eye(self.size_observation) for _ in range(self.n_hidden_states)])
        print("covariance shape", self.covariance.shape)
        return self.mean, self.covariance

    def init_prob(self):
        self.initial_prob = np.zeros((self.n_hidden_states))
        self.initial_prob[0] = 1  # First state has probability 1
        return self.initial_prob

    def get_data(self, train, test):
        self.train = train
        self.test = test

    def compute_alpha(self):
        alphas = np.full((self.n_observations, self.n_hidden_states), -np.inf)
        alphas[0, 0] = multivariate_normal.logpdf(self.train[0,0], mean=self.mean[0], cov=self.covariance[0])


        for t in range(1, self.n_observations):
            for j in range(self.n_hidden_states):
                log_transition_probs = np.log(np.maximum(self.transition_matrix[:, j], 1e-10))
                log_emission_prob = multivariate_normal.logpdf(self.train[t,0], mean=self.mean[j], cov=self.covariance[j])
                alphas[t, j] = logsumexp(alphas[t - 1, :] + log_transition_probs) + log_emission_prob

        likelihood = logsumexp(alphas[-1, :])
        return alphas, likelihood

    def compute_beta(self):
        betas = np.full((self.n_observations, self.n_hidden_states), -np.inf)
        betas[-1, :] = 0

        for t in range(self.n_observations - 2, -1, -1):
            for i in range(self.n_hidden_states):
                log_transition_probs = np.log(np.maximum(self.transition_matrix[i, :], 1e-10))
                log_emission_probs = np.array([
                    multivariate_normal.logpdf(self.train[t + 1,0], mean=self.mean[j], cov=self.covariance[j])
                    for j in range(self.n_hidden_states)
                ])
                betas[t, i] = logsumexp(log_transition_probs + log_emission_probs + betas[t + 1, :])

        return betas

    def compute_posterior(self, alphas, betas):
        log_px = logsumexp(alphas[-1, :])
        log_gamma = alphas + betas - log_px
        gamma = np.exp(log_gamma)
        return gamma

    def compute_xi(self, alphas, betas):
        xi = np.full((self.n_observations - 1, self.n_hidden_states, self.n_hidden_states), -np.inf)
        for t in range(self.n_observations - 1):
            log_transition_probs = np.log(np.maximum(self.transition_matrix, 1e-10))
            log_emission_probs = np.array([
                multivariate_normal.logpdf(self.train[t + 1,0], mean=self.mean[j], cov=self.covariance[j])
                for j in range(self.n_hidden_states)
            ])
            log_xi_t = alphas[t, :, None] + log_transition_probs + log_emission_probs + betas[t + 1, :]
            log_xi_t -= logsumexp(log_xi_t, axis=(0, 1))  # Normalize over all states
            xi[t] = np.exp(log_xi_t)

        return xi

    def fit(self, n_iterations):
        for _ in range(n_iterations):
            alphas, likelihood = self.compute_alpha()
            betas = self.compute_beta()

            gamma = self.compute_posterior(alphas, betas)
            xi = self.compute_xi(alphas, betas)

            # Update transition matrix
            self.transition_matrix = np.sum(xi, axis=0)
            self.transition_matrix /= (np.sum(self.transition_matrix, axis=1, keepdims=True) + 1e-10)  # Stability

            # Update means
            self.mean = np.array([
                np.sum(gamma[:, i, None] * self.train, axis=(0, 1)) / (np.sum(gamma[:, i]) + 1e-10)
                for i in range(self.n_hidden_states)
            ])

            # Update covariances
            epsilon = 1e-6
            for i in range(self.n_hidden_states):
                self.covariance[i] = (self.covariance[i] + self.covariance[i].T) / 2  # Enforce symmetry
                self.covariance[i] += epsilon * np.eye(self.covariance.shape[-1])  # Add small value to diagonal

    # Check determinant
            if np.linalg.det(self.covariance[i]) < 1e-10:
               print(f"Covariance matrix for state {i} is near singular, applying correction.")
               self.covariance[i] = nearest_positive_definite(self.covariance[i])  # Fix near-singular matrix










# train for all classes

In [79]:
hmm_models = {}

for key in X_train.keys():
    hmm = HMM_gaussian(6, 101, 13)
    hmm.get_data(X_train[key], X_test[key])
    hmm.fit(100)
    hmm_models[key] = hmm  # Store the trained model in the dictionary


covariance shape (6, 13, 13)
covariance shape (6, 13, 13)
covariance shape (6, 13, 13)
covariance shape (6, 13, 13)
covariance shape (6, 13, 13)
covariance shape (6, 13, 13)
covariance shape (6, 13, 13)
covariance shape (6, 13, 13)


In [82]:
X_train.keys()

dict_keys(['down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes'])

In [86]:
print(hmm_models['stop'])

<__main__.HMM_gaussian object at 0x7cc329dff5d0>


#

# implementing the viterbi algorithm

In [102]:


def viterbi_algorithm(hmm, observations):


    T = observations.shape[0]  # Number of time steps
    N = hmm.n_hidden_states  # Number of hidden states

    # Viterbi matrix
    delta = np.zeros((T, N))

    # Compute emission probabilities for all states at all time steps
    emission_probs = np.zeros((T, N))
    for j in range(N):
        for t in range(T):
            emission_probs[t, j] = multivariate_normal.pdf(
                observations[t], mean=hmm.mean[j], cov=hmm.covariance[j]
            )

    # Step 1: Initialization
    delta[0] = np.log(hmm.initial_prob + 1e-10) + np.log(emission_probs[0] + 1e-10)  # Log for stability

    # Step 2: Recursion
    for t in range(1, T):
        for j in range(N):
            delta[t, j] = np.max(delta[t - 1] + np.log(hmm.transition_matrix[:, j] + 1e-10)) + np.log(emission_probs[t, j] + 1e-10)

    # The best log-probability is the max value in the last row of delta
    best_log_prob = np.max(delta[-1])

    return best_log_prob


# select the best one

In [103]:
def select_best_hmm_viterbi(hmm_list, observations):

    best_hmm = None
    best_log_prob = -np.inf

    for hmm in hmm_list:
        log_prob = viterbi_algorithm(hmm, observations)
        if log_prob > best_log_prob:
            best_log_prob = log_prob
            best_hmm = hmm

    return best_hmm, best_log_prob


In [104]:
best_hmm, best_log_prob = select_best_hmm_viterbi(list(hmm_models.values()), X_test['stop'][0][0])
best_class = [key for key, model in hmm_models.items() if model is best_hmm][0]
print("Predicted class:", best_class)


Predicted class: right


<__main__.HMM_gaussian at 0x7cc3329f7350>

#accuracey

In [105]:
from collections import defaultdict

# Initialize counters for each class
correct_counts = defaultdict(int)
total_counts = defaultdict(int)

# Loop through each test sample
for true_class, samples in X_test.items():
    for sample in samples:
        # Get the predicted HMM
        best_hmm, _ = select_best_hmm_viterbi(list(hmm_models.values()), np.array(sample[0]))

        # Get the predicted class name
        predicted_class = [key for key, model in hmm_models.items() if model is best_hmm][0]

        # Update counts
        total_counts[true_class] += 1
        if predicted_class == true_class:
            correct_counts[true_class] += 1

# Compute accuracy per class
accuracy_per_class = {cls: correct_counts[cls] / total_counts[cls] for cls in total_counts}

# Print results
for cls, acc in accuracy_per_class.items():
    print(f"Accuracy for class '{cls}': {acc:.2%}")


Accuracy for class 'down': 0.00%
Accuracy for class 'go': 0.00%
Accuracy for class 'left': 0.00%
Accuracy for class 'no': 0.00%
Accuracy for class 'right': 100.00%
Accuracy for class 'stop': 0.00%
Accuracy for class 'up': 0.00%
Accuracy for class 'yes': 0.00%
