In [1]:
import torch

# Report the CUDA devices PyTorch can see. The device name is only queried
# when CUDA is usable: torch.cuda.get_device_name() raises on a machine
# without a CUDA device, which would abort the cell before the CPU fallback
# below is ever reached.
print("Number of GPU: ", torch.cuda.device_count())
if torch.cuda.is_available():
    print("GPU Name: ", torch.cuda.get_device_name())
else:
    print("GPU Name: ", "none (CUDA unavailable)")


# Prefer the GPU when present, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

Number of GPU:  1
GPU Name:  NVIDIA GeForce RTX 4050 Laptop GPU
Using device: cuda


In [3]:
import wave
import pyaudio
import os
from faster_whisper import WhisperModel

# ANSI escape sequences wrapped around each transcription when it is printed.
NEON_GREEN = "\033[92m"  # start bright-green foreground text
RESET_COLOR = "\033[0m"  # reset text attributes back to the terminal default

def list_input_devices(p):
    """Print every input-capable audio device and ask the user to pick one.

    The prompt is repeated until the user types the index of one of the
    listed devices, so a typo no longer aborts the session.

    Args:
        p: A ``pyaudio.PyAudio`` instance (any object exposing
            ``get_device_count()`` and ``get_device_info_by_index(i)``).

    Returns:
        int: Index of the chosen device; always one of the indices printed.

    Raises:
        RuntimeError: If no device reports at least one input channel.
    """
    print("Available audio input devices:")
    input_indices = []
    for i in range(p.get_device_count()):
        device_info = p.get_device_info_by_index(i)
        # A missing channel count is treated as "not an input device".
        if (device_info.get("maxInputChannels") or 0) > 0:
            print(f"{i}: {device_info.get('name')}")
            input_indices.append(i)

    if not input_indices:
        # Without this guard the prompt below could never be satisfied.
        raise RuntimeError("No audio input devices found.")

    while True:
        raw = input("Enter the device index of your preferred microphone: ")
        try:
            device_index = int(raw)
        except ValueError:
            print(f"'{raw}' is not a number; enter one of the indices listed above.")
            continue
        if device_index in input_indices:
            return device_index
        print(f"{device_index} is not a listed input device; try again.")

def record_chunk(p, stream, chunk_file, sample_rate=16000, chunk_size=1024, record_seconds=4):
    """Record a short mono 16-bit clip from ``stream`` and save it as a WAV file.

    Args:
        p: ``pyaudio.PyAudio`` instance, used to look up the byte width of
            ``pyaudio.paInt16`` samples.
        stream: Open input stream exposing
            ``read(num_frames, exception_on_overflow=...)``.
        chunk_file: Path of the WAV file to write.
        sample_rate: Sampling rate in Hz written to the WAV header. It must
            match the rate the stream was opened with.
        chunk_size: Number of frames requested per ``stream.read`` call.
        record_seconds: Approximate clip duration in seconds. The number of
            reads is truncated to an integer, so the clip can be slightly
            shorter than requested.

    Returns:
        bool: ``True`` when audio was captured and written, ``False`` when
        the requested duration yields no reads (nothing is written).
    """
    num_reads = int(sample_rate / chunk_size * record_seconds)
    # Overflow errors are suppressed so a slow consumer drops audio instead
    # of raising in the middle of a recording.
    frames = [
        stream.read(chunk_size, exception_on_overflow=False)
        for _ in range(num_reads)
    ]

    if not frames:
        return False

    with wave.open(chunk_file, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))
    return True

def transcribe_chunk(model, file_path):
    """Transcribe one audio file and return its text as a single string.

    Args:
        model: Object whose ``transcribe(path)`` returns a pair of
            ``(segments, info)``; each segment exposes a ``text`` attribute.
        file_path: Path of the audio file to transcribe.

    Returns:
        str: The segment texts joined with single spaces, or an empty
        string when there are no segments.
    """
    pieces, _info = model.transcribe(file_path)
    texts = []
    for piece in pieces:
        texts.append(piece.text)
    return " ".join(texts)

def initialize_model(model_size="medium.en", device=None, compute_type=None):
    """Create the faster-whisper model used for transcription.

    Called without arguments this still loads ``medium.en`` on CUDA with
    ``float16`` when a GPU is available. Without a GPU it now falls back to
    the CPU instead of failing.

    Args:
        model_size: Model identifier passed to ``WhisperModel``.
        device: Device string passed to ``WhisperModel``. ``None`` selects
            ``"cuda"`` when ``torch.cuda.is_available()`` and ``"cpu"``
            otherwise.
        compute_type: Compute type passed to ``WhisperModel``. ``None``
            selects ``"float16"`` for CUDA and ``"int8"`` for anything else,
            following the CPU example in the faster-whisper README.

    Returns:
        WhisperModel: The constructed model.
    """
    if device is None:
        # Local import keeps this function usable on its own; torch is only
        # needed for auto-detection.
        import torch
        device = "cuda" if torch.cuda.is_available() else "cpu"
    if compute_type is None:
        compute_type = "float16" if device == "cuda" else "int8"
    return WhisperModel(model_size, device=device, compute_type=compute_type)

# Live transcription loop: record short clips from the chosen microphone,
# transcribe each one, print it in green, and save the running transcript to
# log.txt when the user interrupts the kernel (Ctrl-C / "Interrupt").
p = pyaudio.PyAudio()
stream = None
# Bound before the try block so the KeyboardInterrupt handler can always read
# it, even when the interrupt arrives during device selection or model loading.
accumulated_transcription = ""
chunk_file = "temp_chunk.wav"

try:
    device_index = list_input_devices(p)
    model = initialize_model()

    # PyAudio uses the system default device when the index is None. The
    # "default" entry is detected by name, because its numeric index differs
    # from machine to machine.
    selected_name = p.get_device_info_by_index(device_index).get("name")
    stream = p.open(
        format=pyaudio.paInt16,
        channels=1,
        rate=16000,
        input=True,
        frames_per_buffer=1024,
        input_device_index=None if selected_name == "default" else device_index
    )

    while True:
        if record_chunk(p, stream, chunk_file):
            transcription = transcribe_chunk(model, chunk_file)
            print(NEON_GREEN + transcription + RESET_COLOR)
            accumulated_transcription += transcription + " "
            os.remove(chunk_file)

except KeyboardInterrupt:
    print("Stopping...")
    # Explicit encoding so non-ASCII text does not depend on the platform's
    # default codec.
    with open("log.txt", "w", encoding="utf-8") as log_file:
        log_file.write(accumulated_transcription)

except Exception as e:
    print(f"Error: {e}")

finally:
    if stream is not None:
        stream.stop_stream()
        stream.close()
    p.terminate()
    # An interrupt or error in the middle of an iteration can leave the
    # temporary recording behind; remove it if it is still there.
    if os.path.exists(chunk_file):
        os.remove(chunk_file)


ALSA lib pcm_dsnoop.c:567:(snd_pcm_dsnoop_open) unable to open slave
ALSA lib pcm_dmix.c:1000:(snd_pcm_dmix_open) unable to open slave
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2722:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_dmix.c:1000:(snd_pcm_dmix_open) unable to open slave


Available audio input devices:
7: pipewire
8: default
9: Family 17h/19h HD Audio Controller Analog Stereo
10: soundcore R50i-83
11: Google Chrome
12: Google Chrome-78
13: soundcore R50i
14: Bluetooth internal capture stream for soundcore R50i
15: Google Chrome input
[92m You[0m
[92m You[0m
[92m[0m
Stopping...
