In [1]:
import sounddevice as sd
import numpy as np
from pydub import AudioSegment

# Step 2: Record Audio
duration = 5  # seconds
samplerate = 44100  # Most common sample rate
channels = 2  # 1 for mono, 2 for stereo
filename_wav = "recorded_audio.wav"
filename_mp3 = "recorded_audio.mp3"

# Record the audio
print("Recording...")
audio_data = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=channels, dtype='int16')
sd.wait()

# Step 3: Save as WAV
print("Saving as WAV...")
sd.write(filename_wav, audio_data, samplerate)

# Step 4: Convert to MP3
print("Converting to MP3...")
audio = AudioSegment.from_wav(filename_wav)
audio.export(filename_mp3, format="mp3")
print(f"Saved as {filename_mp3}")


Recording...
Saving as WAV...


AttributeError: module 'sounddevice' has no attribute 'write'

In [2]:
import sounddevice as sd
import numpy as np
from pydub import AudioSegment
from io import BytesIO

# Record Audio
duration = 5  # seconds
samplerate = 44100  # Most common sample rate
channels = 2  # 1 for mono, 2 for stereo

# Record the audio
print("Recording...")
audio_data = sd.rec(int(samplerate * duration), samplerate=samplerate, channels=channels, dtype='float32')
sd.wait()

# Convert the audio data to mono and normalize
audio_data = audio_data.mean(axis=1)  # Convert to mono

# Convert to MP3
print("Converting to MP3...")
audio = AudioSegment(
    # raw audio data (bytes)
    data=BytesIO(audio_data.astype("int16")),
    # 2 byte (16 bit) samples
    sample_width=2,
    # 44.1 kHz sample rate
    frame_rate=44100,
    # stereo
    channels=1
)

# Export MP3
mp3_buffer = BytesIO()
audio.export(mp3_buffer, format="mp3")

# Optionally convert buffer to bytes array or save
mp3_data = mp3_buffer.getvalue()

print("Audio recorded and converted to MP3.")


Recording...
Converting to MP3...


TypeError: object of type '_io.BytesIO' has no len()

In [None]:
import whisperx
import gc 

device = "cuda" 
audio_file = "audio.mp3"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("large-v2", device, compute_type=compute_type)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code=result["language"], device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token=YOUR_HF_TOKEN, device=device)

# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs