**pythaiasr + pyannote**

In [1]:
import os
import io
from dotenv import load_dotenv
import torch
from pyannote.audio import Pipeline
import pandas as pd
from pythaiasr import ASR
from pydub import AudioSegment
import re

def clean_thai_text(text):
    if text == "[Transcription Error]":
        return text
    cleaned_text = re.sub(r'(?<=[\u0E00-\u0E7F])\s+(?=[\u0E00-\u0E7F])', '', text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text

load_dotenv()
hf_token = os.getenv("HF_TOKEN")
if hf_token is None:
    raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

audio_file = "output.wav"
if not os.path.exists(audio_file):
    raise FileNotFoundError(f"The audio file was not found at: {audio_file}")

# Set devices
if torch.backends.mps.is_available():
    device_pyannote = "mps"
    compute_type_pyannote = "float16"
    device_thaiasr = "cpu"  # pythaiasr doesn't support MPS
elif torch.cuda.is_available():
    device_pyannote = "cuda"
    compute_type_pyannote = "float16"
    device_thaiasr = "cuda"
else:
    device_pyannote = "cpu"
    compute_type_pyannote = "float32"
    device_thaiasr = "cpu"

print("Starting speaker diarization...")
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token
)
diarization_pipeline.to(torch.device(device_pyannote))
diarization = diarization_pipeline(audio_file)

# Convert diarization to DataFrame
data = [{
    'start': segment.start,
    'end': segment.end,
    'speaker': speaker
} for segment, _, speaker in diarization.itertracks(yield_label=True)]
diarization_df = pd.DataFrame(data)

print("Loading ASR model...")
try:
    asr_model = ASR(model="airesearch/wav2vec2-large-xlsr-53-th", device=device_thaiasr)
except TypeError:
    asr_model = ASR(model="airesearch/wav2vec2-large-xlsr-53-th")

# Load audio
full_audio = AudioSegment.from_wav(audio_file)

transcribed_segments = []
for i, row in diarization_df.iterrows():
    start_time_ms = int(row['start'] * 1000)
    end_time_ms = int(row['end'] * 1000)
    segment_audio = full_audio[start_time_ms:end_time_ms]

    # Save to RAM buffer
    buffer = io.BytesIO()
    segment_audio.export(buffer, format="wav")
    buffer.seek(0)

    try:
        transcribed_text = asr_model(buffer)
        cleaned_text = clean_thai_text(transcribed_text)
    except Exception as e:
        print(f"Error in segment {i}: {e}")
        cleaned_text = "[Transcription Error]"

    transcribed_segments.append({
        'start': row['start'],
        'end': row['end'],
        'speaker': row['speaker'],
        'text': cleaned_text
    })

# Save final transcript
final_transcript_df = pd.DataFrame(transcribed_segments)
final_transcript_df.to_csv("transcript v4.csv", index=False, encoding='utf-8')

# Print to console
for i, row in final_transcript_df.iterrows():
    print(f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']}: {row['text']}")


  from .autonotebook import tqdm as notebook_tqdm


Starting speaker diarization...
Loading ASR model...




Error in segment 1: Calculated padded input size per channel: (1). Kernel size: (2). Kernel size can't be greater than actual input size
Error in segment 12: Calculated padded input size per channel: (1). Kernel size: (2). Kernel size can't be greater than actual input size
Error in segment 35: Calculated padded input size per channel: (1). Kernel size: (2). Kernel size can't be greater than actual input size
[39.70s - 40.35s] SPEAKER_01: ตา
[40.73s - 40.75s] SPEAKER_01: [Transcription Error]
[40.75s - 42.84s] SPEAKER_00: สะบาดเรคาเรีอสายคนนัสซินีนะคะ
[43.23s - 43.70s] SPEAKER_01: ต่า
[43.75s - 48.19s] SPEAKER_00: คารัสนีใคยเป็นพนักงานของพนัคารกรุงไทยตีต่อลูกค้าแนะนมของพนัคารกรุ๊งไทยค่า
[48.65s - 53.85s] SPEAKER_00: เต๋าเข้ามาประชาสัมพันธ์โครงกันสินเชื้อกรุงไทยอเนีกประสงค์สุดอกแรงแสงสักครูะอยู่นะคา
[48.68s - 49.29s] SPEAKER_01: ค้าตะ
[54.00s - 54.23s] SPEAKER_00: คา
[54.03s - 54.60s] SPEAKER_01: คา
[54.60s - 63.08s] SPEAKER_00: คารขาว่าคนค่ะเจ้าหน้าที่คาดเชื้อรัสนิ์สิทธิจารย์รหัสพนักงา