# Keyword Spotting Inference

In [8]:
import tensorflow.keras as keras
import numpy as np
import librosa
import sounddevice as sd
from IPython.display import display, Audio

# Raw string literal: the Windows path is full of backslashes, and sequences
# such as "\C" or "\P" are invalid escapes in a normal string literal
# (they trigger a warning today and are slated to become an error).
# NOTE(review): this is an absolute, machine-specific path - consider making
# it relative to a configurable directory.
MODEL_PATH = r"D:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\Tensorflow\saved_models\mfcc_Classification_Model.keras"
# A better way of storing constants for the ML system
# is an external config file, for consistency purposes.
NUM_SAMPLES_TO_CONSIDER = 22050  # 1 second of audio at 22050 Hz (librosa's default sample rate)
CONFIDENCE_THRESHOLD = 0.6  # streaming predictions must score above this to be reported

In [10]:
# Singleton class - a class that can only have one instance in the service.
class _Keyword_Spotting_Service:
    """Keyword-spotting inference on MFCC features using a Keras classifier.

    Meant to be used as a singleton: obtain the shared instance through the
    ``Keyword_Spotting_Service()`` factory, which also loads ``model``.

    NOTE(review): the MFCC layout produced here must match what the model was
    trained on. The saved output of this notebook shows the loaded model
    expecting inputs of shape (None, 40, 63, 1) while this code produced
    (None, 44, 13, 1), so the defaults below (n_mfcc=13, one-second window)
    presumably differ from the training configuration - confirm against the
    training code before relying on the predictions.
    """

    # Keras model shared by the service; assigned by the factory function.
    model = None
    # Mapping taken from the dataset json file.
    # NOTE(review): the index order must match the label encoding used during
    # training - verify against that json file.
    _mappings = [
        "five",
        "four",
        "go",
        "no",
        "off",
        "on",
        "one",
        "stop",
        "three",
        "tree",
        "two",
        "wow",
        "yes",
        "_background_noise_"
    ]

    # Instance of the class - Python does not enforce singletons, so we need to do it manually.
    _instance = None

    def predict(self, file_path):
        """Classify the audio file at ``file_path``.

        :param file_path: path of the audio file to classify
        :return: the keyword from ``_mappings`` with the highest model score
            (no confidence threshold is applied here)
        :raises RuntimeError: if no model has been loaded
        """
        # MFCCs come back as (# segments, # coefficients)
        features = self.preprocess(file_path)
        predicted_index, _ = self._classify(features)
        return self._mappings[predicted_index]

    def preprocess(self, file_path, n_mfcc=13, n_fft=2048, hop_length=512):
        """Load an audio file and turn it into a fixed-size MFCC matrix.

        The signal is truncated, or zero-padded at the end, to exactly
        ``NUM_SAMPLES_TO_CONSIDER`` samples so every file yields the same
        number of frames.

        :param file_path: path of the audio file to load
        :param n_mfcc: number of MFCC coefficients to extract
        :param n_fft: FFT window size in samples
        :param hop_length: hop between successive frames in samples
        :return: 2d array of shape (# segments, # coefficients)
        """
        signal, sr = librosa.load(file_path)
        signal = self._fit_length(signal)
        return self._extract_mfccs(signal, sr, n_mfcc, n_fft, hop_length)

    def listen_and_predict(self, duration=1, sr=22050, overlap=0.5):
        """Classify microphone audio continuously until Ctrl+C is pressed.

        Keeps a rolling window of ``duration`` seconds. Every iteration reads
        ``overlap`` seconds of new audio, appends it to the window (dropping
        the oldest samples) and prints the keyword when one is detected.

        :param duration: length of the rolling analysis window, in seconds
        :param sr: sample rate requested from the input device
        :param overlap: amount of new audio read per iteration, in seconds
        :raises ValueError: if ``duration`` or ``overlap`` covers less than one sample
        """
        window_size = int(sr * duration)
        frames_per_read = int(sr * overlap)
        if window_size < 1 or frames_per_read < 1:
            # A zero-frame read would make the loop spin on an unchanged buffer.
            raise ValueError("duration and overlap must each cover at least one sample")

        buffer = np.zeros(window_size)
        try:
            with sd.InputStream(samplerate=sr, channels=1) as stream:
                print("Listening... Press Ctrl+C to stop.")
                while True:
                    audio_chunk, overflowed = stream.read(frames_per_read)
                    if overflowed:
                        # Samples were dropped by the audio backend; surface it
                        # instead of silently classifying audio with gaps.
                        print("Warning: audio input overflowed; some samples were dropped.")
                    # Append the new samples and keep only the newest window.
                    buffer = np.concatenate((buffer, audio_chunk.flatten()))[-window_size:]
                    keyword = self.predict_chunk(buffer, sr)
                    if keyword:
                        print(f"Predicted Keyword: {keyword}")
        except KeyboardInterrupt:
            print("Stopped listening.")

    def predict_chunk(self, audio_chunk, sr, n_mfcc=13, n_fft=2048, hop_length=512):
        """Classify an in-memory audio buffer.

        The most recent ``NUM_SAMPLES_TO_CONSIDER`` samples are used; a
        shorter buffer is zero-padded at the front so the feature matrix
        always has the same shape.

        :param audio_chunk: 1d sequence of audio samples, newest sample last
        :param sr: sample rate of ``audio_chunk``
        :param n_mfcc: number of MFCC coefficients to extract
        :param n_fft: FFT window size in samples
        :param hop_length: hop between successive frames in samples
        :return: the predicted keyword when its score exceeds
            ``CONFIDENCE_THRESHOLD``, otherwise ``None``
        :raises RuntimeError: if no model has been loaded
        """
        signal = self._fit_length(np.asarray(audio_chunk), keep_tail=True)
        features = self._extract_mfccs(signal, sr, n_mfcc, n_fft, hop_length)
        predicted_index, confidence = self._classify(features)
        if confidence > CONFIDENCE_THRESHOLD:
            return self._mappings[predicted_index]
        return None

    @staticmethod
    def _fit_length(signal, keep_tail=False):
        """Return ``signal`` cut or zero-padded to ``NUM_SAMPLES_TO_CONSIDER`` samples.

        With ``keep_tail`` the newest (last) samples are kept and padding goes
        in front; otherwise the first samples are kept and padding goes at the end.
        """
        target = NUM_SAMPLES_TO_CONSIDER
        if len(signal) > target:
            return signal[-target:] if keep_tail else signal[:target]
        missing = target - len(signal)
        if missing:
            pad_width = (missing, 0) if keep_tail else (0, missing)
            signal = np.pad(signal, pad_width, mode="constant")
        return signal

    @staticmethod
    def _extract_mfccs(signal, sr, n_mfcc, n_fft, hop_length):
        """Compute MFCCs and transpose them to (# segments, # coefficients)."""
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft, hop_length=hop_length)
        return mfccs.T

    def _classify(self, features):
        """Run the model on one MFCC matrix and return (best index, its score)."""
        if self.model is None:
            raise RuntimeError(
                "No model loaded; obtain the service through Keyword_Spotting_Service()."
            )
        # Convert 2d MFCCs array into 4d array -> (# samples, # segments, # coefficients, # channels)
        batch = features[np.newaxis, ..., np.newaxis]
        # verbose=0: avoid printing a progress bar for every chunk in the streaming loop.
        scores = self.model.predict(batch, verbose=0)[0]  # e.g. [0.1, 0.6, 0.1, 0.2]
        best_index = int(np.argmax(scores))  # e.g. 1
        return best_index, float(scores[best_index])

In [11]:
def Keyword_Spotting_Service():
    """Return the shared keyword-spotting service, creating it on first use.

    On the first call the Keras model at ``MODEL_PATH`` is loaded and stored
    on the service class; later calls return the same instance without
    reloading.

    :return: the single ``_Keyword_Spotting_Service`` instance
    """
    # Ensure only one instance of KSS is created
    if _Keyword_Spotting_Service._instance is None:
        # Load the model BEFORE publishing the instance: if loading raises
        # (e.g. wrong path), the singleton stays unset and the next call
        # retries instead of handing out a service whose model is None.
        _Keyword_Spotting_Service.model = keras.models.load_model(MODEL_PATH)
        _Keyword_Spotting_Service._instance = _Keyword_Spotting_Service()

    return _Keyword_Spotting_Service._instance

if __name__ == "__main__":
    # Get the shared service (the model is loaded on first use), then
    # classify microphone audio until the user interrupts with Ctrl+C.
    kss = Keyword_Spotting_Service()
    kss.listen_and_predict()

Listening... Press Ctrl+C to stop.


ValueError: in user code:

    File "d:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\.venv\lib\site-packages\keras\engine\training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "d:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\.venv\lib\site-packages\keras\engine\training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "d:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\.venv\lib\site-packages\keras\engine\training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "d:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\.venv\lib\site-packages\keras\engine\training.py", line 2079, in predict_step
        return self(x, training=False)
    File "d:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\.venv\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "d:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\.venv\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential" is incompatible with the layer: expected shape=(None, 40, 63, 1), found shape=(None, 44, 13, 1)
