In [None]:
import os
import re
import tempfile

import pandas as pd
import torch
from dotenv import load_dotenv
from pyannote.audio import Pipeline
from pydub import AudioSegment
from pythaiasr import ASR

def clean_thai_text(text):
    """Tidy the whitespace of a transcribed string.

    Whitespace runs that sit directly between two characters of the Thai
    Unicode block (U+0E00-U+0E7F) are deleted. Every remaining whitespace
    run is then collapsed to a single space and the result is trimmed.

    The placeholder "[Transcription Error]" is returned unchanged.
    """
    if text != "[Transcription Error]":
        # Step 1: glue Thai characters back together.
        thai_joined = re.sub(r'(?<=[\u0E00-\u0E7F])\s+(?=[\u0E00-\u0E7F])', '', text)
        # Step 2: normalise whatever whitespace is left.
        single_spaced = re.sub(r'\s+', ' ', thai_joined)
        return single_spaced.strip()
    return text

# Recording to process, resolved relative to the current working directory.
audio_file = "output.wav"

# Load variables from a .env file (if one is found) before reading the token.
load_dotenv()
hf_token = os.environ.get("HF_TOKEN")

# Fail fast on missing prerequisites, before any model is loaded.
if hf_token is None:
    raise ValueError("Hugging Face token not found. Please set the HF_TOKEN environment variable.")

if not os.path.exists(audio_file):
    raise FileNotFoundError(f"The audio file was not found at: {audio_file}")

# Choose where each model runs. Each branch sets, in order:
#   device_pyannote, compute_type_pyannote, device_thaiasr
# Apple MPS is checked first, then CUDA, then plain CPU. On MPS the ASR
# model is still placed on the CPU.
if torch.backends.mps.is_available():
    device_pyannote, compute_type_pyannote, device_thaiasr = "mps", "float16", "cpu"
elif torch.cuda.is_available():
    device_pyannote, compute_type_pyannote, device_thaiasr = "cuda", "float16", "cuda"
else:
    device_pyannote, compute_type_pyannote, device_thaiasr = "cpu", "float32", "cpu"

# Speaker diarization: find who speaks when in `audio_file`.
print("Starting speaker diarization...")
# NOTE(review): newer pyannote.audio releases may expect `token=` instead of
# `use_auth_token=` - confirm against the installed version.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token
)
# pyannote.audio 3.x can report a failed download (bad token, gated model
# whose conditions were not accepted) by returning None instead of raising.
# Stop here with a clear message rather than an AttributeError below.
if diarization_pipeline is None:
    raise RuntimeError(
        "Could not load 'pyannote/speaker-diarization-3.1'. Check that HF_TOKEN "
        "is valid and that the model's user conditions have been accepted on "
        "Hugging Face."
    )
diarization_pipeline.to(torch.device(device_pyannote))
diarization = diarization_pipeline(audio_file)

# One record per speaker turn; the track identifier is not needed.
data = [
    {'start': segment.start, 'end': segment.end, 'speaker': speaker}
    for segment, _track, speaker in diarization.itertracks(yield_label=True)
]
# Explicit columns keep the schema intact even when no speech was detected.
diarization_df = pd.DataFrame(data, columns=['start', 'end', 'speaker'])

print("Loading ASR model...")
asr_model_name = "airesearch/wav2vec2-large-xlsr-53-th"
try:
    # First try with an explicit device.
    asr_model = ASR(model=asr_model_name, device=device_thaiasr)
except TypeError:
    # Retry without the `device` keyword if the first call raised TypeError.
    asr_model = ASR(model=asr_model_name)

# Load the whole recording once; segments are sliced from it later.
full_audio = AudioSegment.from_wav(audio_file)

# Transcribe every diarized segment and collect one record per segment.
transcribed_segments = []
total_segments = len(diarization_df)

# Clips are written to a private scratch directory instead of the working
# directory: they cannot overwrite (and then delete) an existing
# "temp_segment_N.wav", and the directory is removed even if the loop is
# interrupted or raises.
with tempfile.TemporaryDirectory() as temp_dir:
    for i, row in diarization_df.iterrows():
        # pydub slices by milliseconds; the DataFrame stores seconds.
        start_time_ms = int(row['start'] * 1000)
        end_time_ms = int(row['end'] * 1000)

        segment_audio = full_audio[start_time_ms:end_time_ms]
        temp_audio_path = os.path.join(temp_dir, f"temp_segment_{i}.wav")
        segment_audio.export(temp_audio_path, format="wav")

        try:
            transcribed_text = asr_model(temp_audio_path)
            cleaned_text = clean_thai_text(transcribed_text)
        except Exception as e:
            # Best effort: keep going, but report why the segment failed
            # instead of silently dropping the exception.
            print(
                f"Segment {i} [{row['start']:.2f}s - {row['end']:.2f}s] "
                f"could not be transcribed: {e!r}"
            )
            cleaned_text = "[Transcription Error]"

        # Delete each clip right away so disk usage stays bounded.
        os.remove(temp_audio_path)

        transcribed_segments.append({
            'start': row['start'],
            'end': row['end'],
            'speaker': row['speaker'],
            'text': cleaned_text
        })

# Collect the per-segment records into a table.
final_transcript_df = pd.DataFrame(transcribed_segments)

# Print the transcript, one line per segment.
for row in final_transcript_df.to_dict(orient="records"):
    print(f"[{row['start']:.2f}s - {row['end']:.2f}s] {row['speaker']}: {row['text']}")

# Save the transcript as UTF-8 CSV without the index column.
final_transcript_df.to_csv("cleaned_transcript.csv", index=False, encoding='utf-8')

  from .autonotebook import tqdm as notebook_tqdm


Starting speaker diarization...
Loading ASR model...




[0.03s - 14.19s] SPEAKER_00: เดวันของภู้พี่แล้วก็ไล่มาจนเกิดการเปลี่ยนแปลงจนผ่านยุคต่างต่าหางยุทต่างต่างโด้ยเห็นกวันที่พี่เศร้าผมได้เห็นภายในช่วงเวลาไม่ไม่กี่เดือนในขานีทึมผมหมือนผมสรวบผ้าชีวิตพีภายใน
[5.75s - 6.24s] SPEAKER_01: ห่างยุท
[12.65s - 13.38s] SPEAKER_01: สรวบผ้าชีวิต
[14.93s - 16.97s] SPEAKER_00: วิขาซยมีมาซักแมาณ์สิบ
[16.20s - 16.59s] SPEAKER_01: มา
[16.97s - 18.34s] SPEAKER_01: สามสิบสี่เปียร
[18.34s - 18.37s] SPEAKER_00: 
[18.37s - 18.51s] SPEAKER_01: 
[18.80s - 19.20s] SPEAKER_01: 
[19.20s - 19.22s] SPEAKER_00: [Transcription Error]
[19.22s - 19.44s] SPEAKER_01: กามา
[19.44s - 19.47s] SPEAKER_00: 
[19.47s - 19.64s] SPEAKER_01: นัน
[19.64s - 19.96s] SPEAKER_00: นับ
[19.96s - 20.96s] SPEAKER_01: สิบหนามมี่น่าจะออมา
[19.98s - 20.67s] SPEAKER_00: สิบสามมี่
[20.92s - 27.27s] SPEAKER_00: สิบสามปีพี่ในเวลาหกเดือนฮะเราก็ได้ได้เห็นความเป็นไปหรือว่าเป็นแฟน
[27.98s - 31.18s] SPEAKER_00: มากมากถือแม้จะมาที่หลังก็ตาย
[29.70s - 30.57s] SPEAKER_01: เขาัด
[31.23s - 31.72s] SPEAKER_00: 