In [3]:
# # !pip install webrtcvad
# !pip install pygame

In [9]:
# app.py
from flask import Flask, render_template, jsonify
import whisper
import pyaudio
import wave
import webrtcvad
import collections
import threading
import time
import pygame

# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)

# Load Whisper model
# Loaded once at import time so every request reuses the same model object.
model = whisper.load_model("base")

# PyAudio settings
FORMAT = pyaudio.paInt16  # Sample format passed to audio.open() and used for the WAV sample width
CHANNELS = 1  # Channel count for both the input stream and the WAV file
RATE = 16000  # Sample rate (Hz) for the input stream, the VAD calls and the WAV file
CHUNK = 1024  # 1024 samples per chunk
RECORD_SECONDS = 10  # Maximum record duration

# WebRTC VAD settings
vad = webrtcvad.Vad(3)  # Set aggressiveness from 0 to 3

# Shared PyAudio instance; record_audio() opens its input stream from it.
audio = pyaudio.PyAudio()

# While True, the /talk route answers "Muted" without recording.
# Toggled by the /mute and /unmute routes.
mute = False



def record_audio():
    """Capture one spoken utterance from a PyAudio input stream into ``output.wav``.

    Reads at most ``RECORD_SECONDS`` of audio in ``CHUNK``-sample buffers.
    Each buffer is cut into 320-byte sub-frames that are classified by the
    module-level WebRTC ``vad``; the last 10 classifications are kept in a
    sliding window.  Capture begins once more than 60% of the window is
    voiced and ends once fewer than 20% is voiced, or when the time limit
    is reached.  Whatever was captured (possibly nothing) is then written to
    ``output.wav`` using the module-level audio settings.

    Returns:
        None.  Any exception is printed and swallowed (best-effort); in that
        case ``output.wav`` may not have been rewritten.
    """
    # Size in bytes of one sub-frame handed to the VAD.
    # NOTE(review): with 16-bit mono audio at 16 kHz this is 10 ms, one of
    # the frame lengths webrtcvad accepts -- confirm if RATE/FORMAT change.
    vad_frame_bytes = 320

    try:
        stream = audio.open(format=FORMAT, channels=CHANNELS,
                            rate=RATE, input=True,
                            frames_per_buffer=CHUNK)

        ring_buffer = collections.deque(maxlen=10)  # Sliding window of (sub-frame, is_speech)
        triggered = False
        voiced_frames = []

        try:
            for _ in range(int(RATE / CHUNK * RECORD_SECONDS)):
                data = stream.read(CHUNK)

                # Split the chunk into smaller frames for VAD processing;
                # trailing bytes that do not fill a whole sub-frame are not classified.
                usable_bytes = (len(data) // vad_frame_bytes) * vad_frame_bytes
                for start in range(0, usable_bytes, vad_frame_bytes):
                    subframe = data[start:start + vad_frame_bytes]
                    ring_buffer.append((subframe, vad.is_speech(subframe, RATE)))

                num_voiced = sum(1 for _, speech in ring_buffer if speech)

                if not triggered:
                    if num_voiced > 0.6 * ring_buffer.maxlen:
                        triggered = True
                        # Keep the sub-frames that led up to the trigger so the
                        # start of the utterance is not lost.
                        voiced_frames.extend(f for f, _ in ring_buffer)
                        ring_buffer.clear()
                else:
                    voiced_frames.append(data)
                    if num_voiced < 0.2 * ring_buffer.maxlen:
                        # Mostly silence again: the utterance is over.
                        break
        finally:
            # Release the input stream even if reading or the VAD raised.
            stream.stop_stream()
            stream.close()

        # The context manager closes the file even if a write fails.
        with wave.open("output.wav", 'wb') as wf:
            wf.setnchannels(CHANNELS)
            wf.setsampwidth(audio.get_sample_size(FORMAT))
            wf.setframerate(RATE)
            wf.writeframes(b''.join(voiced_frames))
    except Exception as e:
        print(f"An error occurred while recording audio: {e}")

def transcribe_audio():
    """Transcribe ``output.wav`` with the module-level Whisper model.

    Returns:
        The value stored under the ``'text'`` key of the transcription result.
    """
    return model.transcribe("output.wav")['text']

@app.route('/')
def index():
    """Serve the landing page rendered from the ``index.html`` template."""
    page = render_template('index.html')
    return page

@app.route('/talk', methods=['POST'])
def talk():
    """Handle ``POST /talk``: record audio, transcribe it, return the text as JSON.

    While the module-level ``mute`` flag is set, nothing is recorded and the
    JSON body is ``{'response': "Muted"}``.  Otherwise the body carries the
    transcription under the ``'response'`` key.
    """
    if not mute:
        # The recording runs on a worker thread, but the handler still
        # blocks until it has finished before transcribing.
        recorder = threading.Thread(target=record_audio)
        recorder.start()
        recorder.join()

        # Transcribe whatever the recorder left in output.wav
        return jsonify({'response': transcribe_audio()})

    return jsonify({'response': "Muted"})

@app.route('/mute', methods=['POST'])
def mute_audio():
    """Handle ``POST /mute``: set the module-level ``mute`` flag and confirm."""
    global mute
    mute = True
    payload = {'status': 'Muted'}
    return jsonify(payload)

@app.route('/unmute', methods=['POST'])
def unmute_audio():
    """Handle ``POST /unmute``: clear the module-level ``mute`` flag and confirm."""
    global mute
    mute = False
    payload = {'status': 'Unmuted'}
    return jsonify(payload)

# Start the Flask server with its default settings, but only when this file
# is executed directly rather than imported.
if __name__ == '__main__':
    app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
Press CTRL+C to quit
127.0.0.1 - - [26/May/2024 15:34:00] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/May/2024 15:34:03] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [26/May/2024 15:34:13] "POST /talk HTTP/1.1" 200 -
127.0.0.1 - - [26/May/2024 15:34:27] "POST /talk HTTP/1.1" 200 -
