In [1]:
# Install vosk into the kernel's environment (used below for speech-to-text).
# NOTE(review): version is unpinned — consider pinning for reproducibility.
%pip install vosk





[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Install sounddevice into the kernel's environment (used below for microphone capture).
# NOTE(review): version is unpinned — consider pinning for reproducibility.
%pip install sounddevice

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# Install transformers into the kernel's environment (provides the NER `pipeline` used below).
# NOTE(review): version is unpinned — consider pinning for reproducibility.
%pip install transformers

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import sounddevice as sd
import vosk
from vosk import Model, KaldiRecognizer
import json
import numpy as np
import queue
from transformers import pipeline
import time

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the Vosk model and build the shared recognizer used by record_and_transcribe().
# The model is selected by language code here, not by an explicit filesystem path.
# NOTE(review): presumably Vosk locates (or fetches) a model for this language — confirm.
vosk_model = Model(lang="en-us")  # English (US) model, chosen via the `lang` argument
# 16000 is the audio sample rate in Hz; record_and_transcribe() records at the same rate,
# so the two values must be kept in sync.
recognizer = KaldiRecognizer(vosk_model, 16000)

In [12]:
# Build a token-classification (NER) pipeline from the "blaze999/Medical-NER" model.
# The pipeline is called with the transcribed text in record_and_transcribe() and, as the
# recorded output below shows, yields one dict per tagged token ('entity', 'score',
# 'index', 'word', 'start', 'end').
ner_pipeline = pipeline("token-classification", model="blaze999/Medical-NER")

Device set to use cpu


In [6]:
# Module-level FIFO queue.
# NOTE(review): not referenced by any cell visible in this notebook — possibly a leftover
# from a streaming (queue-fed) version of the recorder; confirm before removing.
q = queue.Queue()

In [23]:
NUM_SECONDS_TO_RECORD = 10  # Default recording length, in seconds


def record_and_transcribe(duration=None):
    """Record microphone audio, transcribe it with Vosk, and apply medical NER.

    Depends on two notebook-level objects defined in earlier cells:
    ``recognizer`` (a Vosk ``KaldiRecognizer``) and ``ner_pipeline``
    (a transformers token-classification pipeline).

    Args:
        duration: Number of seconds to record. When ``None`` (the default),
            ``NUM_SECONDS_TO_RECORD`` is read at call time.

    Returns:
        A ``(text, ner_results)`` tuple: the transcription string and the list
        of entities produced by ``ner_pipeline`` (an empty list when nothing
        was transcribed). Returns ``(None, None)`` if no audio was captured.
    """
    if duration is None:
        duration = NUM_SECONDS_TO_RECORD

    samplerate = 16000  # Hz; must match the rate the recognizer was created with
    blocksize = int(samplerate * 0.03)  # 30 ms of audio per callback block

    print("Listening...")
    recorded_audio = []
    start_time = time.time()

    def callback(indata, frames, time_info, status):
        """Collect one block of audio; stop the stream once `duration` has elapsed."""
        if status:
            print(status)

        # Keep a copy: the buffer handed to the callback should not be retained
        # by reference after the callback returns.
        recorded_audio.append(indata.copy())

        # Raising CallbackStop inside the callback tells sounddevice to stop the
        # stream; the exception is consumed there and never reaches the caller.
        if time.time() - start_time >= duration:
            raise sd.CallbackStop

    # Capture 16-bit signed PCM directly (the format fed to Vosk), which avoids
    # a lossy float32 -> int16 rescale that can wrap around on out-of-range samples.
    with sd.InputStream(
        callback=callback,
        samplerate=samplerate,
        channels=1,
        blocksize=blocksize,
        dtype="int16",
    ):
        # Block this thread while the callback gathers audio in the background.
        sd.sleep(int(duration * 1000))

    if not recorded_audio:
        print("No audio recorded.")
        return None, None

    print("Processing...")

    # Join the per-block arrays and serialise to raw PCM bytes.
    audio_bytes = np.concatenate(recorded_audio, axis=0).tobytes()

    # Feed the whole recording, then use FinalResult() because the utterance is
    # complete: it flushes any buffered audio before returning the transcription.
    recognizer.AcceptWaveform(audio_bytes)
    result = recognizer.FinalResult()
    text = json.loads(result).get("text", "")

    print("Transcription:", text)

    print("Doing NER")
    # Nothing to tag when the transcription is empty, so skip the model call.
    ner_results = ner_pipeline(text) if text else []
    print("Medical Entities:", ner_results)

    return text, ner_results

In [24]:
# Record from the microphone, transcribe, and run NER. The function prints its progress,
# and the returned (text, ner_results) tuple is shown as this cell's output.
record_and_transcribe()

Listening...
Processing...
Transcription: morning mister smith i see have been experiencing some chest discomfort
Doing NER
Medical Entities: [{'entity': 'B-BIOLOGICAL_STRUCTURE', 'score': 0.29082862, 'index': 10, 'word': '▁chest', 'start': 54, 'end': 60}, {'entity': 'B-SIGN_SYMPTOM', 'score': 0.21618494, 'index': 11, 'word': '▁discomfort', 'start': 60, 'end': 71}]


('morning mister smith i see have been experiencing some chest discomfort',
 [{'entity': 'B-BIOLOGICAL_STRUCTURE',
   'score': 0.29082862,
   'index': 10,
   'word': '▁chest',
   'start': 54,
   'end': 60},
  {'entity': 'B-SIGN_SYMPTOM',
   'score': 0.21618494,
   'index': 11,
   'word': '▁discomfort',
   'start': 60,
   'end': 71}])