In [6]:
import numpy as np
import librosa
import sounddevice as sd
from keras.models import load_model
from pathlib import Path

# Configuration for audio processing and model.
# SAMPLE_RATE and NUM_MFCC are named once so the values derived from them
# (n_fft, hop_length) cannot drift out of sync with the dictionary entries.
SAMPLE_RATE = 16000  # samples per second
NUM_MFCC = 40  # MFCC coefficients requested per frame

CONFIG = {
    "model_path": Path("saved_models/mfcc_Classification_Model.keras"),
    # Class labels; looked up by the index of the model's highest-scoring output.
    "mappings": [
        "dog_bark", "children_playing", "air_conditioner",
        "street_music", "engine_idling", "jackhammer",
        "drilling", "siren", "car_horn", "gun_shot"
    ],
    "num_mfcc": NUM_MFCC,
    "num_frames": 63,  # MFCC frames kept per window (input is padded/truncated to this)
    "num_channels": 1,
    "sample_rate": SAMPLE_RATE,
    "n_fft": 4 * NUM_MFCC,  # 4 times the num_mfcc -> 160 samples
    "hop_length": int(SAMPLE_RATE * 0.01),  # 10 ms hop: 1% of the sample rate -> 160 samples
    "confidence_threshold": 0.5  # predictions at or below this score are discarded
}

class SoundClassificationService:
    """Classify rolling windows of live microphone audio with a Keras model.

    The service loads the model once, converts audio to a fixed-size MFCC
    matrix, and maps the model's highest-scoring output to a label from
    ``config["mappings"]``. Use :meth:`get_instance` to share one instance
    (and one loaded model) per process.
    """

    _instance = None

    def __init__(self, config):
        """Store the configuration and load the model it points to.

        Args:
            config: Mapping providing the keys read by this class:
                ``model_path``, ``mappings``, ``num_mfcc``, ``num_frames``,
                ``sample_rate``, ``n_fft``, ``hop_length`` and
                ``confidence_threshold``.
        """
        self.config = config
        self.model = load_model(config["model_path"])

    @classmethod
    def get_instance(cls, config=CONFIG):
        """Return the shared instance, creating it on first use.

        Args:
            config: Configuration used only by the call that creates the
                instance. Later calls return the existing instance and
                ignore this argument.

        Returns:
            The single cached instance of this class.
        """
        if cls._instance is None:
            cls._instance = cls(config)
        return cls._instance

    def preprocess(self, audio):
        """Convert raw audio into an MFCC matrix with a fixed number of frames.

        Args:
            audio: Audio samples passed to ``librosa.feature.mfcc`` as ``y``.

        Returns:
            The MFCC array with its second axis truncated, or zero-padded at
            the end, to exactly ``config["num_frames"]`` columns.
        """
        mfccs = librosa.feature.mfcc(
            y=audio,
            sr=self.config["sample_rate"],
            n_mfcc=self.config["num_mfcc"],
            n_fft=self.config["n_fft"],
            hop_length=self.config["hop_length"],
        )
        # Force the frame axis to the configured length.
        num_frames = self.config["num_frames"]
        missing_frames = num_frames - mfccs.shape[1]
        if missing_frames < 0:
            return mfccs[:, :num_frames]
        return np.pad(mfccs, pad_width=((0, 0), (0, missing_frames)), mode='constant')

    def predict(self, MFCCs):
        """Classify one preprocessed MFCC matrix.

        Args:
            MFCCs: MFCC matrix as returned by :meth:`preprocess`.

        Returns:
            ``(label, confidence)`` when the top score is strictly greater
            than ``config["confidence_threshold"]``; otherwise ``None``.
        """
        # Add a leading batch axis and a trailing axis before calling the model.
        batch = MFCCs[np.newaxis, ..., np.newaxis]
        predictions = self.model.predict(batch)
        predicted_index = np.argmax(predictions)
        confidence = predictions[0][predicted_index]
        if confidence > self.config["confidence_threshold"]:
            return self.config["mappings"][predicted_index], confidence
        return None

    def listen_and_predict(self, duration=1, overlap=0.5):
        """Listen to live audio and print predictions until Ctrl+C is pressed.

        A rolling buffer of ``duration`` seconds is kept. Each iteration reads
        ``overlap`` seconds of new audio, shifts it into the end of the buffer,
        and classifies the whole buffer. Despite its name, ``overlap`` is
        therefore the step between consecutive predictions. The buffer starts
        as silence (zeros), so early windows are only partly filled.

        Args:
            duration: Length of the analysis window in seconds.
            overlap: Seconds of new audio read per iteration; must yield at
                least one sample and must not exceed ``duration``.

        Raises:
            ValueError: If ``duration`` or ``overlap`` yields an empty window
                or chunk, or if the chunk would be longer than the window.
        """
        sample_rate = self.config["sample_rate"]
        buffer_length = int(sample_rate * duration)
        # Loop-invariant: number of samples read from the stream per iteration.
        chunk_length = int(sample_rate * overlap)

        # Validate before opening the audio device. Without these checks the
        # slice assignment below fails with an opaque broadcasting error
        # (a zero-length chunk makes ``buffer[-0:]`` select the whole buffer;
        # an oversized chunk does not fit into it).
        if buffer_length <= 0:
            raise ValueError(
                f"duration={duration!r} gives an empty window at {sample_rate} Hz"
            )
        if chunk_length <= 0:
            raise ValueError(
                f"overlap={overlap!r} gives an empty chunk at {sample_rate} Hz"
            )
        if chunk_length > buffer_length:
            raise ValueError(
                f"overlap={overlap!r} must not exceed duration={duration!r}"
            )

        buffer = np.zeros(buffer_length)
        try:
            with sd.InputStream(samplerate=sample_rate, channels=1) as stream:
                print("Listening... Press Ctrl+C to stop.")
                while True:
                    # The second value returned by read() (sounddevice's
                    # overflow flag) is ignored here.
                    audio_chunk, _ = stream.read(chunk_length)
                    # Drop the oldest samples and append the new chunk.
                    buffer = np.roll(buffer, -len(audio_chunk))
                    buffer[-len(audio_chunk):] = audio_chunk.flatten()
                    MFCCs = self.preprocess(buffer)
                    keyword = self.predict(MFCCs)
                    if keyword:
                        print(f"Predicted Keyword: {keyword[0]}, at: {keyword[1]*100}%")
        except KeyboardInterrupt:
            print("Stopped listening.")


In [7]:
def main():
    """Fetch the shared classification service and run live prediction."""
    # One-second analysis window; the step between predictions uses the default.
    service = SoundClassificationService.get_instance()
    service.listen_and_predict(duration=1)

if __name__ == "__main__":
    main()

Listening... Press Ctrl+C to stop.
Predicted Keyword: street_music, at: 51.2614369392395%
Predicted Keyword: street_music, at: 53.25827598571777%
Stopped listening.
