-
Notifications
You must be signed in to change notification settings - Fork 71
/
live.py
170 lines (144 loc) · 5.22 KB
/
live.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
# Based on code from https://github.com/Nikorasu/LiveWhisper/blob/main/livewhisper.py
import numpy as np
from .transcribe import Transcribe, TranscriptionOptions
from typing import Union, List
SampleRate = 16000  # Stream device recording frequency (samples per second)
BlockSize = 30  # Stream block size in milliseconds (so 33 blocks ~= 1 second)
Vocals = [50, 1000]  # Frequency range (Hz) within which sounds may be speech
EndBlocks = 33 * 2  # Silent blocks (~2 s) to wait before closing a speech segment
FlushBlocks = 33 * 10  # Speech blocks (~10 s) after which the buffer is force-flushed
# sounddevice requires a working PortAudio backend; record availability instead
# of failing at import time, and keep the exception so
# Live.force_not_available_exception() can re-raise it with full context later.
try:
    import sounddevice as sd

    sounddevice_available = True
except Exception as e:
    sounddevice_available = False
    sounddevice_exception = e
class Live:
    """Live microphone transcription.

    Audio is captured through a ``sounddevice`` input stream; the stream
    callback accumulates blocks into a buffer while a simple
    voice-activity heuristic (RMS volume above ``threshold`` and dominant
    frequency inside ``Vocals``) says someone is speaking.  Finished
    segments are queued and transcribed by ``process()`` on the main
    thread, so the audio callback never blocks on model inference.
    """

    def __init__(
        self,
        model_path: str,
        cache_directory: str,
        local_files_only: bool,
        task: str,
        language: str,
        threads: int,
        device: str,
        device_index: Union[int, List[int]],
        compute_type: str,
        verbose: bool,
        threshold: float,
        input_device: int,
        options: TranscriptionOptions,
    ):
        self.model_path = model_path
        self.cache_directory = cache_directory
        self.local_files_only = local_files_only
        self.task = task
        self.language = language
        self.threads = threads
        self.device = device
        self.device_index = device_index
        self.compute_type = compute_type
        self.verbose = verbose
        self.threshold = threshold
        self.input_device = input_device
        self.options = options

        # Mutable state shared between the stream callback and process().
        self.running = True
        self.waiting = 0  # silent blocks remaining before the segment is closed
        self.prevblock = self.buffer = np.zeros((0, 1))
        self.speaking = False
        self.blocks_speaking = 0  # blocks remaining before a forced flush
        self.buffers_to_process = []  # finished segments awaiting transcription
        self.transcribe = None  # Transcribe instance, created lazily in process()

    @staticmethod
    def is_available():
        """Return True when the sounddevice backend imported successfully."""
        return sounddevice_available

    @staticmethod
    def force_not_available_exception():
        """Re-raise the exception captured while importing sounddevice."""
        raise sounddevice_exception

    def _is_there_voice(self, indata, frames):
        """Heuristic voice-activity check for one audio block.

        A block counts as voice when its RMS volume exceeds the configured
        threshold AND its dominant FFT frequency lies inside ``Vocals``.
        """
        freq = np.argmax(np.abs(np.fft.rfft(indata[:, 0]))) * SampleRate / frames
        volume = np.sqrt(np.mean(indata**2))
        return volume > self.threshold and Vocals[0] <= freq <= Vocals[1]

    def _save_to_process(self):
        """Queue the current buffer for transcription and reset segment state."""
        self.buffers_to_process.append(self.buffer.copy())
        self.buffer = np.zeros((0, 1))
        self.speaking = False

    def callback(self, indata, frames, _time, status):
        """sounddevice stream callback: accumulate speech into self.buffer."""
        if not np.any(indata):  # all-zero block, nothing to analyze
            return
        voice = self._is_there_voice(indata, frames)
        # Silence and nobody has started speaking yet.
        if not voice and not self.speaking:
            # Remember the latest silent block so the next utterance can be
            # prepended with a little pre-speech audio.  Bug fix: prevblock
            # was never updated, so the copy made below was always empty and
            # the intended pre-roll was silently lost.
            self.prevblock = indata.copy()
            return
        if voice:  # User speaking
            if self.verbose:
                print(".", end="", flush=True)
            if self.waiting < 1:
                # New utterance: seed the buffer with the saved pre-speech block.
                self.buffer = self.prevblock.copy()
            self.buffer = np.concatenate((self.buffer, indata))
            self.waiting = EndBlocks
            if not self.speaking:
                self.blocks_speaking = FlushBlocks
            self.speaking = True
        else:  # Silence after user has spoken
            self.waiting -= 1
            if self.waiting < 1:
                self._save_to_process()
                return
            else:
                # Keep trailing audio — the pause may be mid-sentence.
                self.buffer = np.concatenate((self.buffer, indata))
        self.blocks_speaking -= 1
        # User spoke for a long time and we need to flush.
        if self.blocks_speaking < 1:
            self._save_to_process()

    def process(self):
        """Transcribe and print the oldest queued segment, if any."""
        if self.buffers_to_process:
            _buffer = self.buffers_to_process.pop(0)
            if self.verbose:
                print("\n\033[90mTranscribing..\033[0m")
            if not self.transcribe:
                # Load the model lazily so start-up cost is paid only once
                # speech has actually been captured.
                self.transcribe = Transcribe(
                    self.model_path,
                    self.device,
                    self.device_index,
                    self.compute_type,
                    self.threads,
                    self.cache_directory,
                    self.local_files_only,
                )
            result = self.transcribe.inference(
                audio=_buffer.flatten().astype("float32"),
                task=self.task,
                language=self.language,
                verbose=self.verbose,
                live=True,
                options=self.options,
            )
            # Move cursor up, clear the progress-dots line, then print the text.
            print(f"\033[1A\033[2K\033[0G{result['text']}")
            if not self.verbose:
                print("")

    def listen(self):
        """Open the input stream and process segments until running is False."""
        show_device = (
            self.input_device if self.input_device is not None else sd.default.device[0]
        )
        print(
            f"\033[32mLive stream device: \033[37m{sd.query_devices(device=show_device)['name']}\033[0m"
        )
        print("\033[32mListening.. \033[37m(Ctrl+C to Quit)\033[0m")
        with sd.InputStream(
            channels=1,
            callback=self.callback,
            blocksize=int(SampleRate * BlockSize / 1000),
            samplerate=SampleRate,
            device=self.input_device,
        ):
            while self.running:
                self.process()

    def inference(self):
        """Run the live loop until interrupted (Ctrl+C / SystemExit)."""
        try:
            self.listen()
        except (KeyboardInterrupt, SystemExit):
            pass
        finally:
            print("\n\033[93mQuitting..\033[0m")