In [1]:
import os
import io
import re
import torch
from dotenv import load_dotenv
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from pydub import AudioSegment
import torchaudio

# === Input Audio File ===
audio_file = "data/2 personal_loan.wav"
if not os.path.exists(audio_file):
    raise FileNotFoundError(f"The audio file was not found at: {audio_file}")

# === Clean Thai Text ===
def clean_thai_text(text):
    if text == "[Transcription Error]":
        return text
    cleaned_text = re.sub(r'(?<=[\u0E00-\u0E7F])\s+(?=[\u0E00-\u0E7F])', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

# === Load Environment and HF Token ===
load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

# === Device Configuration ===
if torch.backends.mps.is_available():
    device_asr = torch.device("mps")
elif torch.cuda.is_available():
    device_asr = torch.device("cuda")
else:
    device_asr = torch.device("cpu")

# === Load ASR Model ===
print("Loading biodatlab Whisper model...")
from transformers import logging
logging.set_verbosity_error()

model_name = "biodatlab/distill-whisper-th-large-v3"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.to(device_asr)

# === Load Entire Audio File ===
audio = AudioSegment.from_wav(audio_file)

# Save to in-memory buffer
buffer = io.BytesIO()
audio.export(buffer, format="wav")
buffer.seek(0)

try:
    # Load and preprocess audio
    waveform, sample_rate = torchaudio.load(buffer)

    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)

    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    input_features = processor(
        waveform.squeeze().numpy(),
        sampling_rate=16000,
        return_tensors="pt"
    ).input_features.to(device_asr)

    # Generate transcription
    with torch.no_grad():
        predicted_ids = model.generate(input_features)
    transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    cleaned_text = clean_thai_text(transcribed_text)

except Exception as e:
    print(f"Error during transcription: {e}")
    cleaned_text = "[Transcription Error]"

# === Save and Print Result ===
os.makedirs("transcript", exist_ok=True)
with open("transcript/transcript.txt", "w", encoding="utf-8") as f:
    f.write(cleaned_text)

print("\n=== Final Transcript ===")
print(cleaned_text)


  from .autonotebook import tqdm as notebook_tqdm


Loading biodatlab Whisper model...

=== Final Transcript ===
คุณพี่ติดต่ออะไรคะเรื่องอะไรอ่าอยากได้เงินหมุนเวียนไว้ใช้จ่ายส่วนตัวเนอะเดี๋ยวหนูขออนุญาตแนะนำตัวก่อนนะคะชื่อหาไทยพัฒน์ไทยยงค์เป็นผู้มีใบนุญาตในป้ายนี้มันจะยังไม่อัพเดทเนอะพี่มันหมดอายุตัวนึงแต่ว่ามันเป็นแบบออนไลน์แต่ประกันทีวิตน่าจะยังเหลืออยู่นะตอนนี้คืออีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอีกอ
