In [2]:
import sys
sys.path.append("../src")
import importlib
from IPython.display import Audio
import pyaudio
import wave
import time
from scipy.signal import butter, filtfilt
import numpy as np
import matplotlib.pyplot as plt


In [3]:
import logic.utils.io_access as io

In [None]:
# force reload imports
#importlib.reload(at)

In [4]:
def find_seed_device_index(target_description="seeed-2mic-voicecard"):
    """Return the PyAudio index of the first matching input device.

    A device matches when it reports at least one input channel and its name
    contains ``target_description``.

    Args:
        target_description: Substring to look for in the device name.
            Defaults to the name used by this notebook's sound card.

    Returns:
        The integer device index as enumerated by PyAudio.

    Raises:
        Exception: If no input device name contains ``target_description``.
    """
    p = pyaudio.PyAudio()
    try:
        for i in range(p.get_device_count()):
            device_info = p.get_device_info_by_index(i)
            # Skip devices that cannot be recorded from.
            if (device_info.get('maxInputChannels') or 0) <= 0:
                continue
            if target_description in (device_info.get('name') or ''):
                return i
    finally:
        # Always release the PyAudio instance, including when a device query
        # raises part-way through the enumeration.
        p.terminate()

    raise Exception(
        f"Device with description containing '{target_description}' not found")
# Resolve the capture device once when this cell runs; the stream class defined
# further down reads this module-level value.
device_index = find_seed_device_index()

ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.front
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround21
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround21
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround40
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround41
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround50
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround51
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround71
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.iec958
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.iec958
ALSA lib pcm.c:2660:(snd

In [5]:
def bytes_to_float32(bytes_data):
    """Interpret a raw byte buffer as a 1-D array of float32 samples."""
    samples = np.frombuffer(bytes_data, dtype=np.float32)
    return samples

def bytes_to_int16(bytes_data):
    """Interpret a raw byte buffer as a 1-D array of int16 samples."""
    samples = np.frombuffer(bytes_data, dtype=np.int16)
    return samples

def float32_to_bytes(np_data):
    """Cast an array to float32 and return its raw bytes."""
    as_float32 = np_data.astype(np.float32)
    return as_float32.tobytes()

def int16_to_bytes(np_data):
    """Cast an array to int16 and return its raw bytes."""
    as_int16 = np_data.astype(np.int16)
    return as_int16.tobytes()

def float32_to_int16(float32_array):
    """Convert normalised float samples to int16 values.

    Samples are limited to [-1.0, 1.0], multiplied by 32767, rounded with
    ``np.round`` and finally cast to ``np.int16``.
    """
    int16_peak = 32767

    # Limit first so out-of-range samples saturate instead of wrapping.
    limited = np.clip(float32_array, -1.0, 1.0)
    scaled = np.round(limited * int16_peak)
    return scaled.astype(np.int16)

def filter_human_speech_only(np_channel, sample_rate, low_hz=300.0,
                             high_hz=3400.0, order=5):
    """Band-pass a single audio channel to the speech band.

    The signal is cast to float32, scaled down to a peak of 1 if its peak
    exceeds 1, and then filtered forward and backward (``filtfilt``) with a
    Butterworth band-pass.

    Args:
        np_channel: 1-D array of samples for one channel.
        sample_rate: Sampling rate in Hz; ``high_hz`` must be below half of it.
        low_hz: Lower band edge in Hz (default 300.0).
        high_hz: Upper band edge in Hz (default 3400.0).
        order: Butterworth order passed to ``butter`` (default 5).

    Returns:
        A float32 array of the same length as the input. An empty input
        yields an empty float32 array.

    Raises:
        ValueError: Propagated from scipy, e.g. when the band edges are not
            inside (0, Nyquist) or the input is too short for ``filtfilt``'s
            default edge padding.
    """
    # Ensure the data type (always a fresh array, the input is not modified)
    np_channel = np.asarray(np_channel).astype(np.float32)

    # Nothing to filter; np.max would raise on a zero-size array.
    if np_channel.size == 0:
        return np_channel

    # Normalize the audio data if it is not already between -1 and 1
    peak = np.max(np.abs(np_channel))
    if peak > 1:
        np_channel = np_channel / peak

    # butter() expects band edges as a fraction of the Nyquist frequency
    nyquist = 0.5 * sample_rate
    low = low_hz / nyquist
    high = high_hz / nyquist

    # Design and apply the filter (zero-phase: forward + backward pass)
    b, a = butter(order, [low, high], btype='band')
    y = filtfilt(b, a, np_channel)

    # filtfilt computes in float64; hand back float32 like the input
    return y.astype(np.float32)

def volume_boost(np_channel, volume_ratio):
    """Scale a channel by ``volume_ratio`` and limit the result to [-1, 1]."""
    amplified = np_channel * volume_ratio
    # Saturate rather than let boosted samples leave the valid range.
    return np.clip(amplified, -1, 1)


def seperate_channels(np_data):
    """Split an interleaved sequence into its even- and odd-indexed samples.

    Returns:
        ``(left_channel, right_channel)`` where left holds indices 0, 2, 4, ...
        and right holds indices 1, 3, 5, ...
    """
    even_positions = slice(None, None, 2)
    odd_positions = slice(1, None, 2)
    return np_data[even_positions], np_data[odd_positions]


def merge_two_channels(np_left_channel, np_right_channel):
    """Mix two channels into one by summing half of each."""
    # Halve each side before adding so the sum stays within the input range.
    half_left = np_left_channel / 2
    half_right = np_right_channel / 2
    return half_left + half_right

def clip_float32(np_data):
    """Limit every sample to the closed interval [-1, 1]."""
    lower_bound, upper_bound = -1, 1
    return np.clip(np_data, lower_bound, upper_bound)


In [6]:
# --- Recording configuration, read as module-level globals by the cells below ---

# Path produced by the project's io helper from 'data' and 'test.wav'.
testfile_path = io.get_path('data', 'test.wav')

# Sampling rate in Hz and length of the test recording in seconds.
sample_rate, record_seconds = 44100, 5

# Two channels are captured; one channel is handed on after merging.
input_channels_count, output_channels_count = 2, 1

In [7]:
class SpeechAudioStreamObservable:
    """Callback-driven PyAudio input stream that forwards audio to observers.

    Each captured buffer is read as interleaved float32 samples, split into
    two channels, mixed down to one channel, converted to int16 bytes and
    passed to ``on_received`` of every registered observer.

    Note: the channel split in ``_callback`` assumes exactly two interleaved
    channels.
    """

    def __init__(self, rate=None, channels=None, input_device_index=None,
                 frames_per_buffer=1024):
        """Open the input stream without starting it.

        Args:
            rate: Sampling rate in Hz. ``None`` uses the notebook-level
                ``sample_rate``.
            channels: Number of input channels. ``None`` uses the
                notebook-level ``input_channels_count``.
            input_device_index: PyAudio device index. ``None`` uses the
                notebook-level ``device_index``.
            frames_per_buffer: Frames delivered per callback (default 1024).

        Raises:
            Exception: Whatever ``PyAudio.open`` raises; the PyAudio instance
                is terminated before the error is re-raised.
        """
        self.rate = sample_rate if rate is None else rate
        self.channels = input_channels_count if channels is None else channels
        self.format = pyaudio.paFloat32
        self.frames_per_buffer = frames_per_buffer
        self.input_device_index = (
            device_index if input_device_index is None else input_device_index)
        self.observers = []

        self.p = pyaudio.PyAudio()
        try:
            self.stream = self.p.open(
                format=self.format,
                channels=self.channels,
                rate=self.rate,
                input_device_index=self.input_device_index,
                input=True,
                frames_per_buffer=self.frames_per_buffer,
                stream_callback=self._callback,
                # PyAudio starts a stream on open by default; defer capture
                # to start() so nothing is recorded before observers attach.
                start=False,
            )
        except Exception:
            # Do not leak the PortAudio handle when the device cannot be opened.
            self.p.terminate()
            raise

    def _callback(self, in_data, frame_count, time_info, status):
        """PyAudio stream callback: process one buffer and notify observers.

        Returns:
            ``(None, pyaudio.paContinue)`` - the stream is input-only, so no
            output data is supplied.
        """
        # Convert to np & seperate stereo channels
        np_f32 = bytes_to_float32(in_data)
        left, right = seperate_channels(np_f32)

        # Filter to human speech frequencies
        # WIP: We need fix whitenoise first, then test again
        #left = filter_human_speech_only(left, self.rate)
        #right = filter_human_speech_only(right, self.rate)

        # Volume boost
        # left = volume_boost(left, volume_rate)
        # right = volume_boost(right, volume_rate)

        # Merge channels to single mono channel
        np_f32 = merge_two_channels(left, right)

        # Clip to prevent overload
        np_f32 = clip_float32(np_f32)

        # Convert to int16 binary for output
        np_i16 = float32_to_int16(np_f32)
        out_binary_i16 = int16_to_bytes(np_i16)

        # Report data to observer
        for observer in self.observers:
            observer.on_received(out_binary_i16)

        # Input-only stream: there is no playback buffer to fill.
        return (None, pyaudio.paContinue)

    def start(self):
        """Start capturing; errors are printed rather than raised."""
        try:
            self.stream.start_stream()
        except Exception as ex:
            print(f"An error occurred while starting the stream: {ex}")

    def stop(self):
        """Stop and close the stream and terminate PyAudio.

        Every cleanup step is attempted even if an earlier one fails; the
        first error is printed rather than raised. The instance cannot be
        restarted afterwards.
        """
        try:
            try:
                self.stream.stop_stream()
            finally:
                try:
                    self.stream.close()
                finally:
                    self.p.terminate()
        except Exception as ex:
            print(f"An error occurred while stopping the stream: {ex}")

    def add_observer(self, observer):
        """Register an object exposing ``on_received(audio_bytes)``."""
        self.observers.append(observer)



    
class AudioDataObserver:
    """Observer that accumulates received audio bytes in memory."""

    def __init__(self, duration):
        """Create an empty buffer.

        Args:
            duration: Intended recording length in seconds; used only to
                derive ``self.frames`` from the notebook-level ``sample_rate``.
        """
        self.filename = testfile_path
        self.rate = sample_rate
        self.channels = output_channels_count
        self.duration = duration
        self.binary_audio_data = bytearray()
        self.frames = int(sample_rate * duration)
        self.frame_count = 0

    def on_received(self, audio_data):
        """Append one chunk of audio bytes to the buffer."""
        self.binary_audio_data.extend(audio_data)

    # this is binary int16 data
    def get_binary_audio_data(self):
        """Return the internal ``bytearray`` holding everything received so far."""
        return self.binary_audio_data

    def clear_audio_data(self):
        """Discard all buffered audio and reset the frame counter."""
        self.binary_audio_data = bytearray()
        self.frame_count = 0

In [8]:
# Record `record_seconds` seconds of audio from the microphone
audio_stream_observable = SpeechAudioStreamObservable()
audio_stream_observer = AudioDataObserver(record_seconds)
audio_stream_observable.add_observer(audio_stream_observer)

# Start the audio stream
audio_stream_observable.start()
try:
    # Report the configured duration instead of a hard-coded number so the
    # message cannot drift from `record_seconds`.
    print(f"Recording audio for {record_seconds} seconds...")
    time.sleep(record_seconds)
finally:
    # Always release the audio device, even if the wait is interrupted.
    print("Done recording.")
    audio_stream_observable.stop()

# Get recording binary
binary_recording_data = audio_stream_observer.get_binary_audio_data()

ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.front
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround21
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround21
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround40
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround41
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround50
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround51
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.surround71
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.iec958
ALSA lib pcm.c:2660:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.iec958
ALSA lib pcm.c:2660:(snd

Recording audio for 3 seconds...
Done recording.


In [9]:
# Decode the recorded bytes into an int16 sample array
np_input = bytes_to_int16(binary_recording_data)

# Two-channel view: de-interleave and offer one player per half.
# NOTE(review): the stream callback already merges to a single channel, so
# these halves look like even/odd samples of a mono signal rather than true
# left/right channels - confirm before relying on them.
np_left, np_right = seperate_channels(np_input)
display(Audio(np_left, rate=sample_rate), Audio(np_right, rate=sample_rate))

# One-channel view: play the recording as-is
display(Audio(np_input, rate=sample_rate))


In [None]:
# Visualize recording: two stacked panels, one per de-interleaved channel
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(10, 6))

# Top panel: left channel
axs[0].plot(np_left)
axs[0].set(title='Left Channel', xlabel='Sample number', ylabel='Amplitude')

# Bottom panel: right channel
axs[1].plot(np_right)
axs[1].set(title='Right Channel', xlabel='Sample number', ylabel='Amplitude')

plt.tight_layout()
plt.show()

In [10]:
# Sanity check: value range and dtype of the decoded recording
print("Min after processing:", np_input.min())
print("Max after processing:", np_input.max())
print("Input data type:", np_input.dtype)



Min after processing: -32756
Max after processing: 5114
Input data type: int16
