# Required Imports

In [2]:
import numpy as np
import soundfile as sf
import matplotlib.pyplot as plt
import sounddevice as sd
import scipy.signal
import os
from MFCC import concatenate_mfcc_deltas,compute_mfcc, compute_delta
import librosa
import logging
from datetime import datetime

# Loading Audio files

In [39]:
def load_audio_names(name):
    audio_files = []
    for i in range(1,31):
        filename = f'{name}_{i}.wav'
        pathname = f'./{name}/{name}_{i}.wav'
        if os.path.isfile(pathname):
            audio, sr = librosa.load(pathname, sr=None)  # Load audio file with original sampling rate
            audio_files.append((filename, audio, sr))
        else:
            print(f'File not found: {filename}')
    return audio_files

odessa_files = load_audio_names('Odessa')
lights_off_files = load_audio_names('turn_off_the_lights')
lights_on_files = load_audio_names('turn_on_the_lights')
time_files = load_audio_names('what_time_is_it')
play_music_files = load_audio_names('play_music')
stop_music_files = load_audio_names('stop_music')

all_files = odessa_files + lights_off_files + lights_on_files + time_files + play_music_files + stop_music_files

def load_audio_filenames_from_txt(file_name):
    if not os.path.isfile(file_name):
        raise FileNotFoundError(f"The file {file_name} does not exist in the directory.")
    
    with open(file_name, 'r') as file:
        audio_filenames = [line.strip() for line in file if line.strip()]
    return audio_filenames

def find_matching_files(train_filenames, all_files):
    matching_files = []
    for train_filename in train_filenames:
        for filename, audio, sr in all_files:
            if train_filename == filename:
                matching_files.append((filename, audio, sr))
    return matching_files

train = load_audio_filenames_from_txt('train.txt')
train_files  = find_matching_files(train, all_files)

data_train = { "Odessa": [], "turn_on":[], "turn_off": [], "play_music": [], "stop_music": [], "time": []}
odessa = []
turn_on = []
turn_off = []
play_music = []
stop_music = []
time = []
M = 2
fs = 16000

for i in train_files:
    if i[0].startswith("Odessa"):
        mfcc_features = compute_mfcc(np.array(i[1]), sample_rate=fs)
        delta_mfcc = compute_delta(mfcc_features, M)
        concatenate_features = concatenate_mfcc_deltas(mfcc_features, delta_mfcc, M)
        odessa.append(concatenate_features)
    elif i[0].startswith("what"):
        mfcc_features = compute_mfcc(np.array(i[1]), sample_rate=fs)
        delta_mfcc = compute_delta(mfcc_features, M)
        concatenate_features = concatenate_mfcc_deltas(mfcc_features, delta_mfcc, M)
        time.append(concatenate_features)
    elif i[0].startswith("turn_on"):
        mfcc_features = compute_mfcc(np.array(i[1]), sample_rate=fs)
        delta_mfcc = compute_delta(mfcc_features, M)
        concatenate_features = concatenate_mfcc_deltas(mfcc_features, delta_mfcc, M)
        turn_on.append(concatenate_features)
    elif i[0].startswith("turn_off"):
        mfcc_features = compute_mfcc(np.array(i[1]), sample_rate=fs)
        delta_mfcc = compute_delta(mfcc_features, M)
        concatenate_features = concatenate_mfcc_deltas(mfcc_features, delta_mfcc, M)
        turn_off.append(concatenate_features)
    elif i[0].startswith("play_music"):
        mfcc_features = compute_mfcc(np.array(i[1]), sample_rate=fs)
        delta_mfcc = compute_delta(mfcc_features, M)
        concatenate_features = concatenate_mfcc_deltas(mfcc_features, delta_mfcc, M)
        play_music.append(concatenate_features)
    elif i[0].startswith("stop_music"):
        mfcc_features = compute_mfcc(np.array(i[1]), sample_rate=fs)
        delta_mfcc = compute_delta(mfcc_features, M)
        concatenate_features = concatenate_mfcc_deltas(mfcc_features, delta_mfcc, M)
        stop_music.append(concatenate_features)

        
data_train["Odessa"] = np.array(odessa)
data_train["turn_off"] = np.array(turn_off)
data_train["turn_on"] = np.array(turn_on)
data_train["play_music"] = np.array(play_music)
data_train["stop_music"] = np.array(stop_music)
data_train["time"] = np.array(time)

File not found: turn_off_the_lights_30.wav
File not found: what_time_is_it_30.wav
File not found: stop_music_30.wav


# Helper functions

In [21]:
def compute_log_likelihood(model, data):
    _, _, _, loglik, _ = model.forwardbackward(model.pi, model.A, model.obs_probs(data.T, model.means.T, model.vars.T))
    return loglik

# def train_models(data_train, global_mu, global_var, num_states, pi):

#     models = {}
#     for label, states in num_states.items():
#         mu_label = global_mu[label]
#         var_label = global_var[label]
#         model = HMM_new(num_states=states, num_features=len(mu_label), global_mean=mu_label, global_variance=var_label)
#         prior, a_ij, mu, sigma = model.hmm_train(data_train[label], max_iter=100, pi = pi[label])
#         model.pi = prior
#         model.A = a_ij
#         model.means = mu
#         model.vars = sigma
#         models[label] = model
#         print(f'Trained model for {label}')
    
#     return models


def evaluate_models(models, data_val):
    results = {}
    accuracy_per_label = {}

    for true_label, sequences in data_val.items():
        correct_predictions = 0
        total_sequences = len(sequences)
        
        for sequence in sequences:
            loglikelihoods = {model_label: compute_log_likelihood(model, sequence) for model_label, model in models.items()}
            predicted_label = max(loglikelihoods, key=loglikelihoods.get)
            
            if predicted_label == true_label:
                correct_predictions += 1
            else:
                print(f"{true_label}  {predicted_label}")

        accuracy_per_label[true_label] = correct_predictions / total_sequences if total_sequences > 0 else 0

    # Print results and accuracy per label
    for true_label, accuracy in accuracy_per_label.items():
        print(f"Label: {true_label}, Accuracy: {accuracy * 100:.2f}%")

    return accuracy_per_label


# GMM-HMM class


In [41]:
class HMM_GMM:
    def __init__(self, num_states, num_features, num_mix):
        self.num_states = num_states
        self.num_features = num_features
        self.num_mix = num_mix

        self.A = self.make_leftright_transmat(num_states, 0.6)
        self.pi = np.random.rand(num_states)
        self.pi = self.pi / np.sum(self.pi)
        self.gmms = [GMM(num_mix, 2*num_features) for _ in range(self.num_states)]
        
    def make_leftright_transmat(self, M, p):
        a_ij = np.zeros((M, M))
        for i in range(M - 1):
            a_ij[i, i] = p
            a_ij[i, i + 1] = 1 - p
        a_ij[M - 1, M - 1] = 1.0
        return a_ij
    
    def train_gmms(self, data):
        state_lengths = len(data) // self.num_states
        for i in range(self.num_states):
            state_data = data[i * state_lengths:(i + 1) * state_lengths, :]
            self.gmms[i].fit(state_data)
    
    def obs_probs(self, x):
        B = np.zeros((self.num_states, x.shape[1]))
        for i in range(self.num_states):
            B[i, :] = np.exp(self.gmms[i].score_samples(x.T))
        return B
    
    def forward_backward(self, prior, a_ij, B):
        M, T = B.shape
        scale = np.ones(T)
        loglik = 0

        alpha = np.zeros((M, T))
        beta = np.zeros((M, T))
        gamma = np.zeros((M, T))
        xiSum = np.zeros((M, M))

        t = 0
        alpha[:, t], scale[t] = self.normalise(prior * B[:, t])
        for t in range(1, T):
            m = a_ij.T @ alpha[:, t - 1]
            alpha[:, t], scale[t] = self.normalise(m * B[:, t])

        if np.any(scale == 0):
            loglik = -np.inf
        else:
            loglik = np.sum(np.log(scale))

        beta[:, T - 1] = 1
        gamma[:, T - 1], _ = self.normalise(alpha[:, T - 1] * beta[:, T - 1])

        for t in range(T - 2, -1, -1):
            b = beta[:, t + 1] * B[:, t + 1]
            beta[:, t], _ = self.normalise(a_ij @ b)
            gamma[:, t], _ = self.normalise(alpha[:, t] * beta[:, t])
            xiSum_, _ = self.normalise(a_ij * np.outer(alpha[:, t], b))
            xiSum += xiSum_

        return alpha, beta, gamma, loglik, xiSum
    
    def normalise(self, A, dim=None):
        if dim is None:
            z = np.sum(A)
            s = z if z != 0 else 1
            M = A / s
        elif dim == 1:
            z = np.sum(A, axis=0)
            s = np.where(z == 0, 1, z)
            M = A / s
        else:
            z = np.sum(A, axis=dim)
            s = np.where(z == 0, 1, z)
            shape = np.ones(A.ndim, int)
            shape[dim] = A.shape[dim]
            M = A / s.reshape(shape)
        return M, z
    
    def train_hmm(self, data, pi, max_iter=200, thresh=5e-4):
        n_train, T, N = data.shape
        M = self.num_states

        exp_num_trans = np.zeros((M, M))
        exp_num_visits = np.zeros(M)

        loglik_sum = 0
        prev_loglik = -np.inf
        converged = False
        iter_count = 1
        prior0, _ = self.normalise(pi)
        a_ij0 = self.make_leftright_transmat(M, 0.5)
        
        self.train_gmms(data.reshape(-1, data.shape[-1]))  # Train GMMs
        
        while not converged and iter_count <= max_iter:
            loglik_sum = 0
            for i in range(n_train):
                x = data[i].T 
                B = self.obs_probs(x)
                alpha, beta, gamma, cur_loglik, xi_sum = self.forward_backward(prior0, a_ij0, B)
                loglik_sum += cur_loglik
                exp_num_trans += xi_sum
                exp_num_visits += gamma[:, 0]
            
            prior0, _ = self.normalise(exp_num_visits)
            a_ij0 = self.mk_stochastic(exp_num_trans)
            
            d_loglik = abs(loglik_sum - prev_loglik)
            avg_loglik = (abs(loglik_sum) + abs(prev_loglik)) / 2

            if (d_loglik / avg_loglik) < thresh:
                converged = True

            prev_loglik = loglik_sum
            if iter_count % 10 == 0:
                print(f'Iteration {iter_count}, loglik = {loglik_sum}, threshCheck = {d_loglik / avg_loglik}')
            iter_count += 1

        self.pi = prior0
        self.A = a_ij0

        return prior0, a_ij0

    def mk_stochastic(self, M):
        M = np.array(M)
        row_sums = M.sum(axis=1)
        row_sums[row_sums == 0] = 1
        S = M / row_sums[:, np.newaxis]
        return S


class GMM:
    def __init__(self, n_components, n_features):
        np.random.seed(seed=52)
        self.n_components = n_components
        self.n_features = n_features
        self.weights = np.ones(n_components) / n_components
        self.means = np.random.randn(n_components, n_features)
        self.covariances = np.array([np.eye(n_features)] * n_components)
        self.epsilon = 1e-6  # Small value to avoid division by zero

    def fit(self, X, max_iter=100, tol=1e-6):
        n_samples, _ = X.shape
        log_likelihood = -np.inf
        for _ in range(max_iter):
            # E-step
            resp = self._estimate_responsibilities(X)
            Nk = resp.sum(axis=0) + self.epsilon  # Add epsilon to avoid division by zero
            
            # M-step
            self.weights = Nk / n_samples
            self.means = np.dot(resp.T, X) / Nk[:, np.newaxis]
            for k in range(self.n_components):
                diff = X - self.means[k]
                self.covariances[k] = np.dot(resp[:, k] * diff.T, diff) / Nk[k]
                self.covariances[k].flat[::self.n_features + 1] += self.epsilon
            
            # Check for convergence
            new_log_likelihood = self._compute_log_likelihood(X)
            if abs(new_log_likelihood - log_likelihood) < tol:
                break
            log_likelihood = new_log_likelihood

    def _estimate_responsibilities(self, X):
        weighted_log_prob = self._estimate_weighted_log_prob(X)
        log_prob_norm = logsumexp(weighted_log_prob, axis=1)
        return np.exp(weighted_log_prob - log_prob_norm[:, np.newaxis])

    def _estimate_weighted_log_prob(self, X):
        return self._estimate_log_prob(X) + np.log(self.weights + self.epsilon)  # Add epsilon to avoid log(0)

    def _estimate_log_prob(self, X):
        log_prob = np.empty((X.shape[0], self.n_components))
        for k in range(self.n_components):
            log_prob[:, k] = self._gaussian_log_prob(X, self.means[k], self.covariances[k])
        return log_prob

    def _gaussian_log_prob(self, X, mean, cov):
        n_features = X.shape[1]
        log_det = np.linalg.slogdet(cov)[1]
        inv_cov = np.linalg.inv(cov)
        diff = X - mean
        return -0.5 * (np.sum(np.dot(diff, inv_cov) * diff, axis=1) + n_features * np.log(2 * np.pi) + log_det)
    
    def _compute_log_likelihood(self, X):
        return np.sum(logsumexp(self._estimate_weighted_log_prob(X), axis=1))

    def score_samples(self, X):
        return logsumexp(self._estimate_weighted_log_prob(X), axis=1)

def logsumexp(a, axis=None):
    a_max = np.max(a, axis=axis, keepdims=True)
    a_max[~np.isfinite(a_max)] = 0
    temp = np.exp(a - a_max)
    s = np.sum(temp, axis=axis, keepdims=True)
    out = np.log(s)
    out += a_max
    return out.squeeze(axis)

def compute_log_likelihood(model, data):
    _, _, _, loglik, _ = model.forward_backward(model.pi, model.A, model.obs_probs(data.T))
    return loglik

def train_models(data_train, num_states, num_features, num_mix, pi):
    models = {}
    for label, states in num_states.items():
        model = HMM_GMM(num_states=states, num_features=num_features, num_mix=num_mix)
        prior, a_ij = model.train_hmm(data_train[label], pi[label], max_iter=100)
        model.pi = prior
        model.A = a_ij
        models[label] = model
        print(f'Trained model for {label}')
    return models

# Training GMM-HMM

In [42]:
# Adjust these parameters accordingly
num_features = 13  # #MFCC features
num_mix = 3  # #Gaussian mixtures


global_mu_5 = { "Odessa": [], "turn_on":[], "turn_off": [], "play_music": [], "stop_music": [], "time": []}
global_var_5 = { "Odessa": [], "turn_on":[], "turn_off": [], "play_music": [], "stop_music": [], "time": []}
for label, values in data_train.items():
    features = np.array([instances for instances_list in values for instances in instances_list])
    global_mu_5[label] = np.mean(features, axis=0)
    global_var_5[label] = np.var(features, axis=0)

num_states = {
            "Odessa": 14,
            "turn_off": 14,
            "turn_on": 15,
            "time": 15,
            "play_music": 15,
            "stop_music": 15
        }

pi = {
            "Odessa": np.array([0.82311034, 0.02611798, 0.21077064, 0.61842177, 0.09828447, 0.62013131, 0.05389022, 0.96065406, 0.98042937, 0.52112765, 0.63655334, 0.76475695, 0.76495529, 0.41768558]), #np.ones(num_states["Odessa"]),
            "turn_off": np.array([0.82311034, 0.02611798, 0.21077064, 0.61842177, 0.09828447, 0.62013131, 0.05389022, 0.96065406, 0.98042937, 0.52112765,0.63655334, 0.76475695, 0.76495529, 0.41768558]),#, 0.76880531,0.42320175, 0.92610357, 0.68192648]),#np.random.rand(num_states["turn_off"]), #np.ones(num_states["turn_off"]),
            "turn_on": np.array([0.82311034, 0.02611798, 0.21077064, 0.61842177, 0.09828447,0.62013131, 0.05389022, 0.96065406, 0.98042937, 0.52112765,0.63655334, 0.76475695, 0.76495529, 0.41768558, 0.76880531]),#np.random.rand(num_states["turn_on"]),#np.array([0.82311034, 0.02611798, 0.21077064, 0.61842177, 0.09828447, 0.62013131, 0.05389022, 0.96065406, 0.98042937, 0.52112765,0.63655334, 0.76475695, 0.76495529, 0.41768558, 0.76880531]),#np.random.rand(num_states["turn_off"]), #np.ones(num_states["turn_off"]),np.ones(num_states["turn_on"]),
            "time": np.array([0.36845559, 0.85890986, 0.38049568, 0.09495426, 0.32489071, 0.41511219, 0.74227395, 0.65790887, 0.20131683, 0.80848791,0.78640244, 0.39493964, 0.51061623, 0.79615954, 0.4453775 ]),#np.random.rand(num_states["time"]),#np.ones(num_states["time"]),
            "play_music": np.array([0.74306691, 0.07874907, 0.48764526, 0.43438864, 0.24605795, 0.86164072, 0.02002256, 0.45082671, 0.04742287, 0.4977275 ,0.858774, 0.33481566, 0.90159003, 0.12288755, 0.15743375]),
            "stop_music": np.array([0.74306691, 0.07874907, 0.48764526, 0.43438864, 0.24605795, 0.86164072, 0.02002256, 0.45082671, 0.04742287, 0.4977275 ,0.858774, 0.33481566, 0.90159003, 0.12288755, 0.15743375]),#np.ones(num_states["stop_music"]),
        }

models = train_models(data_train, num_states, num_features, num_mix, pi)
accuracy = evaluate_models(models, data_train)


  if (d_loglik / avg_loglik) < thresh:


Trained model for Odessa
Trained model for turn_off
Trained model for turn_on
Trained model for time
Trained model for play_music
Trained model for stop_music
Label: Odessa, Accuracy: 100.00%
Label: turn_on, Accuracy: 100.00%
Label: turn_off, Accuracy: 100.00%
Label: play_music, Accuracy: 100.00%
Label: stop_music, Accuracy: 100.00%
Label: time, Accuracy: 100.00%


# Start and end point detector

In [43]:
def compute_energy(signal, frame_size, method='absolute'):
    energy = np.zeros((len(signal) - frame_size + 1,))
    for i in range(len(energy)):
        frame = signal[i:i+frame_size]
        if method == 'absolute':
            energy[i] = np.sum(np.abs(frame))
        elif method == 'square':
            energy[i] = np.sum(frame**2)
    #log_energy = np.log10(energy + 1e-10)
    return energy

# Not using this
def estimate_noise_level(log_energy):
    Emin = np.min(log_energy)
    Emax = Emin + 1 
    hist, bin_edges = np.histogram(log_energy, bins=100, range=(Emin, Emax))
    most_frequent_bin = np.argmax(hist)
    Q = (bin_edges[most_frequent_bin] + bin_edges[most_frequent_bin + 1]) / 2
    return Q

# Not using this
def adjust_log_energy(log_energy, Q):
    adjusted_log_energy = log_energy - Q
    return adjusted_log_energy

def compute_zero_crossings(signal, frame_size):
    zero_crossings = np.zeros((len(signal) - frame_size + 1,))
    for i in range(len(zero_crossings)):
        frame = signal[i:i+frame_size+1]
        zero_crossings[i] = 0.5 * np.sum(np.abs(np.sign(frame[1:]) - np.sign(frame[:-1])))
    return zero_crossings

def set_thresholds(energy, zero_crossings):
    IMX = np.max(energy)
    IMN = np.min(energy)
    ITL = min(0.03 * (IMX - IMN) + IMN, 4 * IMN)
    ITU = 5 * ITL

    IZC = np.mean(zero_crossings[:100])  # First 100ms for mean ZC
    sigma_IZC = np.std(zero_crossings[:100])  # Standard deviation for ZC
    IZCT = min(25, IZC + 2 * sigma_IZC)

    return ITL, ITU, IZCT

def detect_speech_segments(signal, frame_size, sample_rate, k4):
    
    energy = compute_energy(signal, frame_size, method='absolute')
    zero_crossings = compute_zero_crossings(signal, frame_size)
    ITL, ITU, IZCT = set_thresholds(energy, zero_crossings)

    state = 'silence'
    speech_segments = []
    start_point = None
    min_duration_frames = int(0.075 * sample_rate)

    for i in range(len(energy)):
        if state == 'silence' and energy[i] > ITL:
            state = 'possible_start'
            possible_start = i
        elif state == 'possible_start' and energy[i] < ITU:
            possible_start = i
        elif state == 'possible_start' and energy[i] > ITU:
            # Start of speech segment
            N1 = possible_start  # Potential start point
            count_zc = 0
            for j in range(max(N1 - 25, 0), N1):
                if zero_crossings[j] > IZCT:
                    count_zc += 1
                    if count_zc == 1:
                        N1_prime = j
            start_point = N1_prime if count_zc >= 3 else N1
            state = 'speech'
        elif state == 'speech' and energy[i] < ITU:
            state = 'possible_end'
        elif state == 'possible_end' and energy[i] < ITL:
            # End of speech segment
            N2 = i  # Potential end point
            count_zc = 0
            for j in range(N2, min(N2 + 25, len(energy))):
                if zero_crossings[j] > IZCT:
                    count_zc += 1
                    if count_zc == 1:
                        N2_prime = j
            end_point = N2_prime if count_zc >= 3 else N2
            if start_point is not None:
                duration = end_point - start_point
                if duration >= min_duration_frames:
                    segment = signal[start_point : end_point]
                    peak_amplitude = np.max(np.abs(segment))
                    if peak_amplitude > k4:
                        speech_segments.append((start_point/sample_rate, end_point/sample_rate))
            start_point = None
            state = 'silence'
        elif state == 'speech' and energy[i] > ITL:
            # Remain in speech state
            continue
    
    return speech_segments

In [35]:
def add_white_noise(signal, snr_db):
    # Calculate signal power and required noise power for desired SNR
    sig_power = np.mean(signal ** 2)
    snr_linear = 10 ** (snr_db / 10)
    noise_power = sig_power / snr_linear

    # Generate white noise
    white_noise = np.random.normal(0, np.sqrt(noise_power), signal.shape)

    # Add noise to the signal
    noisy_signal = signal + white_noise
    return noisy_signal

def predict_label(models, sequence):
    
    loglikelihoods = {model_label: compute_log_likelihood(model, sequence) for model_label, model in models.items()}
    predicted_label = max(loglikelihoods, key=loglikelihoods.get)
    
    return predicted_label

# Real time detection method

In [49]:
def real_time_speech_detection(frame_size, sample_rate, k4):
    

    # Initialize variables and state
    state = 'silence'
    start_point = None
    frames = []
    speech_segments = []
    buffer_length = 5 * sample_rate  # 5 seconds * sample rate samples/second
    audio_buffer = np.zeros(buffer_length, dtype='float32')  # Initialize buffer
    buffer_index = 0  # Index to keep track of where to insert new audio data
    odessa_on = False

    def audio_callback(indata, frames, time, status):
        nonlocal buffer_index
        # Flatten incoming data and add it to the buffer
        data_flat = indata.flatten()
        # Check if there is enough space left in the buffer; if not, process the current buffer
        remaining_buffer_space = buffer_length - buffer_index
        if len(data_flat) > remaining_buffer_space:
            # Fill the buffer
            audio_buffer[buffer_index:buffer_index + remaining_buffer_space] = data_flat[:remaining_buffer_space]
            process_buffer(audio_buffer)
            # Start filling next buffer
            buffer_index = len(data_flat) - remaining_buffer_space
            audio_buffer[:buffer_index] = data_flat[remaining_buffer_space:]
        else:
            # Add incoming data to the buffer
            audio_buffer[buffer_index:buffer_index + len(data_flat)] = data_flat
            buffer_index += len(data_flat)

    def process_buffer(buffer):
        # Reset buffer_index for new data accumulation
        nonlocal buffer_index, odessa_on, state, start_point, speech_segments
        buffer_index = 0
        # Here you can process the buffer to detect speech segments
        signal = buffer.flatten()
        signal = add_white_noise(signal, 30)
        print("Entered Buffer")
        #np.savetxt('./signal.txt', signal) #saving for debugging

        energy = compute_energy(signal, frame_size, method='absolute')
        zero_crossings = compute_zero_crossings(signal, frame_size)
        ITL, ITU, IZCT = set_thresholds(energy, zero_crossings)

        # Similar speech detection logic as before
        state = 'silence'
        speech_segments = []
        start_point = None

        min_duration_frames = int(0.075 * sample_rate)

        for i in range(len(energy)):
            if state == 'silence' and energy[i] > ITL:
                state = 'possible_start'
                possible_start = i

            elif state == 'possible_start' and energy[i] < ITU:
                possible_start = i
            elif state == 'possible_start' and energy[i] > ITU:
                # Start of speech segment
                N1 = possible_start  # Potential start point
                count_zc = 0
                for j in range(max(N1 - 25, 0), N1):
                    if zero_crossings[j] > IZCT:
                        count_zc += 1
                        if count_zc == 1:
                            N1_prime = j
                start_point = N1_prime if count_zc >= 3 else N1
                start_point = max(0, start_point - int(0.3 * sample_rate))  # Adjust start point
                state = 'speech'
            elif state == 'speech' and energy[i] < ITU:
                state = 'possible_end'
            elif state == 'possible_end' and energy[i] < ITL:
                # End of speech segment
                N2 = i  # Potential end point
                count_zc = 0
                for j in range(N2, min(N2 + 25, len(energy))):
                    if zero_crossings[j] > IZCT:
                        count_zc += 1
                        if count_zc == 1:
                            N2_prime = j
                end_point = N2_prime if count_zc >= 3 else N2
                if start_point is not None:
                    duration = end_point - start_point
                    if duration >= min_duration_frames:
                        segment = signal[start_point : end_point]
                        peak_amplitude = np.max(np.abs(segment))
                        if peak_amplitude > k4:
                            if speech_segments and start_point - speech_segments[-1][1] < 0.4 * sample_rate:
                                # Merge with the previous segment
                                speech_segments[-1] = (speech_segments[-1][0], end_point)
                            else:
                                speech_segments.append((start_point, end_point))

                start_point = None
                state = 'silence'
            elif state == 'speech' and energy[i] > ITL:
                # Remain in speech state
                continue

        # Process each finalized speech segment
        for (start_point, end_point) in speech_segments:
            segment = signal[start_point:end_point]
            #np.savetxt('./segment.txt', segment) #saving for debugging
            mfcc_features = compute_mfcc(np.array(segment), sample_rate=fs)
            delta_mfcc = compute_delta(mfcc_features, 2)
            concatenate_features = concatenate_mfcc_deltas(mfcc_features, delta_mfcc, 2)
            concatenate_features = np.array(concatenate_features)
            pl = predict_label(models, concatenate_features)
            print(pl)
            if pl == "Odessa":
                print("Listening...")
                sd.play(s/5, sr)
                sd.wait()
                odessa_on = True
            elif odessa_on:
                if pl == "time":
                    current_time = datetime.now().strftime("%H:%M:%S")
                    print(f"The current time is {current_time}.")
                elif pl == "turn_on":
                    print("Glowing light.")
                    light_on_image.show()
                elif pl == "turn_off":
                    print("Turning off the light.")
                    light_off_image.show()
                elif pl == "play_music":
                    sd.play(concatenated_music/15, sr_m)
                    print("Playing low volume music.")
                elif pl == "stop_music":
                    sd.stop()
                    print("Stopping the music.")
                odessa_on = False
        

    # Open a stream with sounddevice
    with sd.InputStream(callback=audio_callback, blocksize=frame_size, dtype='float32', samplerate=sample_rate, channels=1):
        print("Starting real-time speech detection...")
        # Keep the program running to process incoming audio
        while True:
            pass

# Odessa responses build-up

In [11]:
from PIL import Image

s, sr = librosa.load("hello-made-with-Voicemod.mp3", sr=None)
sd.play(s/5, sr)
sd.wait()
light_on_image = Image.open("image_on.jpg")
light_off_image = Image.open("image_off.jpg")

music, sr_m= librosa.load("music.mp3", sr=None)

sd.play(music/5, sr_m)
#sd.wait()
sd.stop()
concatenated_music = np.concatenate((music/5, music/5))

# Real time detection

In [61]:
# Parameters
sample_rate = 16000  
frame_size = int(0.01 * sample_rate)  # 10 ms frame size
k4 = 0.05
# Start the real-time detection
real_time_speech_detection(frame_size, sample_rate, k4)
#real_time_speech_detection_overlap(frame_size, sample_rate, k4)

Starting real-time speech detection...
Entered Buffer
turn_off
Entered Buffer
turn_on
Entered Buffer
time
Entered Buffer
play_music
Entered Buffer
Odessa
Listening...
Entered Buffer
stop_music
Stopping the music.
Entered Buffer
time


KeyboardInterrupt: 