In [12]:
"""
1. Create Jupyter widgets to create buttons
2. Install pyaudio to record microphone
3. vosk does speech recognition

"""

'\n1. Create Jupyter widgets to create buttons\n2. Install pyaudio to record microphone\n3. vosk does speech recognition\n\n'

In [13]:
# ! pip install ipywidgets

In [22]:
# 1. creating widgets
import ipywidgets as widgets #helps create the toggle buttons
from IPython.display import display #helps to display the button
from threading import Thread #to run processes in the background
from queue import Queue #passes messages between threads

# Queues shared with the background threads:
#   messages   - holds a flag while recording/transcribing should keep going
#   recordings - carries captured audio from the recorder to the transcriber
messages = Queue()
recordings = Queue()

# "Record" button: success (green) style with a microphone icon.
record_button = widgets.Button(description="Record", disabled=False, button_style="success", icon="microphone")

# "Stop" button: warning style with a stop icon.
stop_button = widgets.Button(description="Stop", disabled=False, button_style="warning", icon="stop")

# Output widget instance in which status messages and the transcript are shown.
output = widgets.Output()


# Threads are needed so that more than one task can run at the same time:
# here, recording the audio and transcribing it must happen concurrently.
# Queues are used to pass messages to those threads.


def start_recording(data):
    """Start background recording and transcription when "Record" is clicked.

    Puts a flag on the ``messages`` queue (the worker loops run while that
    queue is non-empty) and launches one thread running ``record_microphone``
    and one running ``speech_recognition``.

    Args:
        data: Argument supplied by the button's click callback; unused.
    """
    with output:  # anything displayed here is routed into the output widget
        # A non-empty queue means a session is already running. Starting again
        # would launch a second recorder/transcriber pair and leave an extra
        # flag on the queue that a single "Stop" click could not clear.
        if not messages.empty():
            display('Already recording.')
            return

        messages.put(True)  # signal the workers to keep going
        display('Starting...')

        # Background thread that captures microphone audio.
        record = Thread(target = record_microphone)
        record.start()

        # Background thread that turns captured audio into text.
        transcribe = Thread(target = speech_recognition, args=(output,))
        transcribe.start()

def stop_recording(data):
    """Signal the background threads to stop when "Stop" is clicked.

    Removes the flag from the ``messages`` queue; the worker loops run only
    while that queue is non-empty.

    Args:
        data: Argument supplied by the button's click callback; unused.
    """
    # Imported locally because only `Queue` is imported at file level.
    from queue import Empty

    with output:
        try:
            # Non-blocking: a plain get() would wait forever (and hang this
            # click handler) if "Stop" is pressed while nothing is recording.
            messages.get_nowait()
        except Empty:
            display('Not recording.')
            return
        display('Stopped.')
        
# Attach the click handlers to their buttons.
for button, handler in ((record_button, start_recording), (stop_button, stop_recording)):
    button.on_click(handler)
del button, handler  # keep the loop helpers out of the notebook namespace

# Render the two buttons followed by the transcript area.
display(record_button, stop_button, output)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle())



Output()

    

In [15]:
# ! pip install pyaudio
#records system audio

In [16]:
import pyaudio

p = pyaudio.PyAudio() #connects to the system sound devices
try:
    # Print the info of every connected sound device; the 'index' field of
    # the microphone entry is what the recording code needs.
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i))
finally:
    p.terminate() #terminates the pyaudio connection even if a device query fails

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Microphone (2- High Definition ', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Speakers (2- High Definition Au', 'hostApi': 0, 'maxInputChann

In [17]:
# 2.listening to the audio
# Record from a single channel only.
CHANNELS = 1
# Sampling rate: how many audio samples are taken per second.
FRAME_RATE = 16000
# Length, in seconds, of each audio segment handed over for transcription.
RECORD_SECONDS = 20
# Sample format passed to PyAudio when opening the stream.
AUDIO_FORMAT = pyaudio.paInt16
# Presumably bytes per sample for the 16-bit format above - TODO confirm.
SAMPLE_SIZE = 2
 
def record_microphone(chunk=1024, input_device_index=2):
    """Capture microphone audio and queue it in segments for transcription.

    Reads from the input device while the ``messages`` queue is non-empty.
    Every time roughly ``RECORD_SECONDS`` of audio has accumulated, the list
    of raw chunks is put on the ``recordings`` queue. When recording stops,
    any remaining partial segment is queued as well so the tail of the audio
    is not lost.

    Args:
        chunk: Number of frames read from the stream per call.
        input_device_index: PyAudio index of the input device to record from.
            NOTE(review): the device listing captured in this notebook shows
            index 2 as an output device and index 1 as the microphone -
            confirm the right index for the target machine.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=AUDIO_FORMAT,
                        channels = CHANNELS,
                        rate = FRAME_RATE,
                        input = True,
                        input_device_index = input_device_index,
                        frames_per_buffer = chunk)
        try:
            frames = [] #chunks of the segment currently being recorded
            # Number of chunks that make up one RECORD_SECONDS-long segment.
            chunks_per_segment = (FRAME_RATE * RECORD_SECONDS) / chunk

            while not messages.empty(): #record until the stop flag is removed
                frames.append(stream.read(chunk))

                if len(frames) >= chunks_per_segment:
                    # Hand the full segment over and start a fresh list; the
                    # queued list is never touched again, so no copy is needed.
                    recordings.put(frames)
                    frames = []

            # Flush whatever was captured since the last full segment;
            # otherwise up to RECORD_SECONDS of speech is silently discarded.
            if frames:
                recordings.put(frames)
        finally:
            # Always release the stream, even if a read fails.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release the PyAudio handle, even if opening the stream fails.
        p.terminate()
             

In [18]:
# 3.recognizing live speech into text
# !pip install vosk 
# note: the vosk transcript contains no punctuation

In [19]:
# !pip install transformers 
# used to add punctuation to the transcript

In [20]:
# !pip install torch 
#to use recasepunc, to add punctuations into the transcript

In [21]:
import subprocess #separate process to add punctuations by calling the command on the cmd (from transformers)
import json 
from vosk import Model, KaldiRecognizer 
import time

# Speech recognition model, looked up by name.
model = Model(model_name="vosk-model-small-en-us-0.15")

# Recognizer that uses the model above on audio sampled at FRAME_RATE.
rec = KaldiRecognizer(model, FRAME_RATE)

# Request word-level detail in the results (e.g. how confident the
# recognizer is about each word).
rec.SetWords(True)

def speech_recognition(output):
    """Transcribe queued audio segments and append the text to ``output``.

    Runs while the ``messages`` queue is non-empty. Each segment taken from
    the ``recordings`` queue is fed to the vosk recognizer, the resulting
    text is piped through the recasepunc script to restore casing and
    punctuation, and the result is appended to the output widget.

    Args:
        output: Output widget whose ``append_stdout`` receives the transcript.
    """
    # Imported locally because only `Queue` is imported at file level.
    from queue import Empty

    while not messages.empty(): # until the stop button has been used
        try:
            # Wait with a timeout: a plain get() blocks indefinitely, so the
            # thread would never notice the stop flag and would hang forever
            # once recording has stopped with nothing left in the queue.
            frames = recordings.get(timeout=1)
        except Empty:
            continue

        # Join all audio chunks of the segment into one byte string.
        rec.AcceptWaveform(b''.join(frames))
        result = rec.Result()
        text = json.loads(result)["text"] # vosk returns JSON; keep only the text

        # Restore casing and punctuation by running recasepunc in a separate
        # process via the shell, with the transcript passed on stdin.
        # `recasepunc/checkpoint` is the trained model used for prediction.
        # This reloads the model for every segment, which is not efficient.
        cased = subprocess.check_output("python recasepunc/recasepunc.py predict recasepunc/checkpoint",shell = True, text = True, input = text)

        output.append_stdout(cased) # add the result to the output widget
        time.sleep(1)