In [1]:
# Fetch the project repository (research_pipeline/, notebooks/, requirements.txt).
!git clone https://github.com/VuThanhLam124/Capstone-NLUS-VDD.git

Cloning into 'Capstone-NLUS-VDD'...
remote: Enumerating objects: 88, done.[K
remote: Counting objects: 100% (88/88), done.[K
remote: Compressing objects: 100% (64/64), done.[K
remote: Total 88 (delta 33), reused 72 (delta 20), pack-reused 0 (from 0)[K
Receiving objects: 100% (88/88), 331.23 KiB | 9.74 MiB/s, done.
Resolving deltas: 100% (33/33), done.


In [2]:
# Work from the repository root so the relative paths used below resolve.
%cd Capstone-NLUS-VDD

/kaggle/working/Capstone-NLUS-VDD


In [3]:
# List the checkout to confirm the clone produced the expected layout.
%ls

[0m[01;34mdocs[0m/    LICENSE     README.md         [01;34mresearch_pipeline[0m/
[01;34mimages[0m/  [01;34mnotebooks[0m/  requirements.txt


In [4]:
# %pip installs into the environment of the running kernel, whereas !pip uses
# whichever pip happens to be first on the shell PATH.
%pip install -r requirements.txt

Collecting duckdb==1.1.3 (from -r requirements.txt (line 1))
  Downloading duckdb-1.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (762 bytes)
Collecting openai-whisper (from -r requirements.txt (line 2))
  Downloading openai_whisper-20250625.tar.gz (803 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m803.2/803.2 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting jiwer (from -r requirements.txt (line 3))
  Downloading jiwer-4.0.0-py3-none-any.whl.metadata (3.3 kB)
Collecting bitsandbytes (from -r requirements.txt (line 4))
  Downloading bitsandbytes-0.49.0-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting edge-tts (from -r requirements.txt (line 15))
  Downloading edge_tts-7.2.7-py3-none-any.whl.metadata (5.5 kB)
Collecting rapidfuz

In [5]:
# Extra packages used by the benchmark cell. Versions are pinned to the ones
# this run resolved so that a re-run installs the same code.
%pip install chunkformer==1.2.2 num2words==0.5.14

Collecting chunkformer
  Downloading chunkformer-1.2.2-py3-none-any.whl.metadata (10 kB)
Collecting num2words
  Downloading num2words-0.5.14-py3-none-any.whl.metadata (13 kB)
Collecting tensorboardX (from chunkformer)
  Downloading tensorboardx-2.6.4-py3-none-any.whl.metadata (6.2 kB)
Collecting textgrid (from chunkformer)
  Downloading TextGrid-1.6.1.tar.gz (9.4 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting deepspeed>=0.14.0 (from chunkformer)
  Downloading deepspeed-0.18.3.tar.gz (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting docopt>=0.6.2 (from num2words)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting hjson (from deepspeed>=0.14.0->chunkformer)
  Downloading hjson-3.1.0-py3-none-any.whl.metadata (2.6 kB)
Downloading chunkformer-1.2.2-py3-

In [6]:
# !python research_pipeline/generate_large_dataset.py

In [7]:
# !python research_pipeline/kaggle_asr_pipeline.py

In [None]:
import os
import time
import gc
import torch
import jiwer
import pandas as pd
import numpy as np
import soundfile as sf
import tempfile
import re
from num2words import num2words
from transformers import pipeline

# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Dataset root. metadata.csv is iterated row by row below and supplies the
# "id", "text" and "tensor_filename" columns.
DATA_PATH = "/kaggle/input/capstone-speech-to-sql/data"
df = pd.read_csv(os.path.join(DATA_PATH, "metadata.csv"))
print(f"Found {len(df)} samples")

def cleanup_gpu():
    """Run the garbage collector, then empty PyTorch's CUDA cache.

    The CUDA cache is only touched when a CUDA device is available, so the
    call is safe on CPU-only machines.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

def normalize_audio(arr):
    """Peak-normalize audio samples so the largest magnitude becomes 1.

    Args:
        arr: Array-like of samples; converted to a float32 NumPy array.

    Returns:
        A float32 ``np.ndarray`` holding the samples divided by their maximum
        absolute value. Empty input and all-zero input are returned unscaled.
    """
    arr = np.asarray(arr, dtype=np.float32)
    if arr.size == 0:
        # max() of a zero-size array raises ValueError; there is nothing to scale.
        return arr
    peak = np.abs(arr).max()
    if peak > 0:
        arr = arr / peak
    return arr

# Punctuation characters that are replaced by a space before scoring.
_PUNCT_RE = re.compile(r"[.,?!;:\"'()\[\]-]")
# Runs of digits, which get spelled out in Vietnamese.
_NUM_RE = re.compile(r"\d+")


def _spell_number_vi(match):
    """Return the matched digit run as Vietnamese words, or unchanged on error."""
    digits = match.group()
    try:
        return num2words(int(digits), lang="vi")
    except Exception:
        # Keep the original digits when num2words cannot convert them.
        return digits


def normalize_text(text):
    """Canonicalize a transcript before it is scored.

    The input is stringified, lower-cased and trimmed; punctuation becomes
    spaces; digit runs are spelled out in Vietnamese; and every run of
    whitespace collapses to a single space.

    Args:
        text: Any value; it is passed through ``str()`` first.

    Returns:
        The normalized string (possibly empty).
    """
    lowered = str(text).lower().strip()
    without_punct = _PUNCT_RE.sub(" ", lowered)
    spelled_out = _NUM_RE.sub(_spell_number_vi, without_punct)
    collapsed = re.sub(r"\s+", " ", spelled_out)
    return collapsed.strip()

def load_audio_array(path):
    """Load a saved waveform from ``path`` and return it peak-normalized.

    The file is deserialized onto the CPU with ``torch.load``. A
    ``torch.Tensor`` payload is detached and converted to NumPy; any other
    payload is handed to ``normalize_audio`` unchanged.

    Args:
        path: Location of the file written by ``torch.save``.

    Returns:
        Whatever ``normalize_audio`` returns for the loaded samples.
    """
    # NOTE(review): torch.load relies on pickle - confirm these files come
    # from a trusted source.
    payload = torch.load(path, map_location="cpu")
    if not isinstance(payload, torch.Tensor):
        return normalize_audio(payload)
    return normalize_audio(payload.detach().cpu().numpy())

def compute_wer(gt_text, hyp_text):
    """Score a hypothesis against its reference with word error rate.

    Both texts go through ``normalize_text`` before scoring.

    Args:
        gt_text: Reference (ground-truth) transcript.
        hyp_text: Transcript produced by the model.

    Returns:
        A ``(wer, gt_norm, hyp_norm)`` tuple, where ``gt_norm`` and
        ``hyp_norm`` are the normalized texts. ``wer`` is 0.0 when both
        normalize to empty, 1.0 when exactly one of them does, and
        ``jiwer.wer(gt_norm, hyp_norm)`` otherwise.
    """
    gt_norm = normalize_text(gt_text)
    hyp_norm = normalize_text(hyp_text)
    if not gt_norm and not hyp_norm:
        # Empty reference and empty hypothesis agree: not an error.
        return 0.0, gt_norm, hyp_norm
    if not gt_norm or not hyp_norm:
        # One side is empty, so jiwer is not called; count it as a total miss.
        return 1.0, gt_norm, hyp_norm
    return jiwer.wer(gt_norm, hyp_norm), gt_norm, hyp_norm

# Per-utterance records; each model section below appends to this list.
results = []
# ========== 1. PhoWhisper ==========
print("=" * 50)
print("Loading PhoWhisper-Large...")
cleanup_gpu()
# The pipeline is given device 0 when CUDA is available and -1 otherwise.
if torch.cuda.is_available():
    pho_device = 0
else:
    pho_device = -1
pipe = pipeline(
    "automatic-speech-recognition",
    model="vinai/PhoWhisper-large",
    device=pho_device,
)
for idx, row in df.iterrows():
    tensor_path = os.path.join(DATA_PATH, "tensors", row["tensor_filename"])
    arr = load_audio_array(tensor_path)

    # Time only the pipeline call, not audio loading or scoring.
    start = time.time()
    hyp = pipe({"raw": arr, "sampling_rate": 16000})["text"]
    infer_time = time.time() - start

    wer, gt_norm, hyp_norm = compute_wer(row["text"], hyp)

    record = {
        "Model": "PhoWhisper-Large",
        "ID": row["id"],
        "GT": row["text"],
        "Hyp": hyp,
        "WER": wer,
        "Time": infer_time,
    }
    results.append(record)

    # Preview the rows whose index label is below 3 as a sanity check.
    if idx < 3:
        print(
            f"[{idx}] WER: {wer:.3f}",
            f"    GT: {gt_norm[:70]}",
            f"    Hyp: {hyp_norm[:70]}",
            sep="\n",
        )

# Drop the model reference and clear the CUDA cache before the next section.
del pipe
cleanup_gpu()
# ========== 2. Chunkformer ==========
print("\n" + "=" * 50)
print("Loading Chunkformer-Large...")
# Bound up front so the finally clause can always drop the reference, even
# when the import or the model download fails.
chunkformer = None
try:
    # Imported lazily: a missing chunkformer package skips only this section.
    from chunkformer import ChunkFormerModel
    chunkformer = ChunkFormerModel.from_pretrained("khanhld/chunkformer-ctc-large-vie")
    try:
        chunkformer = chunkformer.to(DEVICE)
    except Exception as move_err:
        # Best effort: carry on with the model wherever from_pretrained left
        # it, but report the failure because it affects the timings below.
        print(f"Could not move Chunkformer to {DEVICE}: {move_err}")

    for idx, row in df.iterrows():
        arr = load_audio_array(os.path.join(DATA_PATH, "tensors", row["tensor_filename"]))

        # endless_decode is called with a file path, so the samples are
        # written to a temporary WAV file at a 16000 Hz sample rate first.
        tmp_path = None
        try:
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
                # Remember the path before anything can fail so the finally
                # clause below always removes the file.
                tmp_path = tmp.name
            sf.write(tmp_path, arr, 16000)

            # Time only the decode call, not the WAV write or the cleanup.
            start = time.time()
            hyp = chunkformer.endless_decode(
                audio_path=tmp_path,
                chunk_size=64,
                left_context_size=128,
                right_context_size=128,
                return_timestamps=False
            )
            infer_time = time.time() - start
        finally:
            if tmp_path and os.path.exists(tmp_path):
                os.unlink(tmp_path)

        # Coerce any non-string return value to text before scoring.
        hyp = hyp if isinstance(hyp, str) else str(hyp)
        wer, gt_norm, hyp_norm = compute_wer(row["text"], hyp)

        results.append({
            "Model": "Chunkformer-Large",
            "ID": row["id"],
            "GT": row["text"],
            "Hyp": hyp,
            "WER": wer,
            "Time": infer_time
        })

        # Preview the rows whose index label is below 3 as a sanity check.
        if idx < 3:
            print(f"[{idx}] WER: {wer:.3f}")
            print(f"    GT: {gt_norm[:70]}")
            print(f"    Hyp: {hyp_norm[:70]}")
except Exception as e:
    # This section is optional: report the failure and let the notebook go on.
    print(f"Chunkformer failed: {e}")
    import traceback
    traceback.print_exc()
finally:
    # Release the model on failure as well as on success; otherwise it would
    # stay referenced while the next model is loaded.
    del chunkformer
    cleanup_gpu()

# ========== 3. OpenAI Whisper Large V3 ==========
print("\n" + "=" * 50)
print("Loading OpenAI Whisper-Large-V3...")
cleanup_gpu()

# Load in float16 on GPU to save memory and time; stay in float32 on CPU.
whisper_on_gpu = torch.cuda.is_available()
pipe_whisper = pipeline(
    "automatic-speech-recognition",
    model="openai/whisper-large-v3",
    device=0 if whisper_on_gpu else -1,
    torch_dtype=torch.float16 if whisper_on_gpu else torch.float32,
    chunk_length_s=30,  # process the audio in 30-second chunks
)

for idx, row in df.iterrows():
    tensor_path = os.path.join(DATA_PATH, "tensors", row["tensor_filename"])
    arr = load_audio_array(tensor_path)

    # Time only the pipeline call, not audio loading or scoring.
    start = time.time()
    # Name the language explicitly (Vietnamese) to improve accuracy.
    hyp = pipe_whisper(
        {"raw": arr, "sampling_rate": 16000},
        generate_kwargs={
            "language": "vietnamese",  # or "vi"
            "task": "transcribe",
        },
    )["text"]
    infer_time = time.time() - start

    wer, gt_norm, hyp_norm = compute_wer(row["text"], hyp)

    record = {
        "Model": "Whisper-Large-V3",
        "ID": row["id"],
        "GT": row["text"],
        "Hyp": hyp,
        "WER": wer,
        "Time": infer_time,
    }
    results.append(record)

    # Preview the rows whose index label is below 3 as a sanity check.
    if idx < 3:
        print(
            f"[{idx}] WER: {wer:.3f}",
            f"    GT: {gt_norm[:70]}",
            f"    Hyp: {hyp_norm[:70]}",
            sep="\n",
        )

# Drop the model reference and clear the CUDA cache once the section is done.
del pipe_whisper
cleanup_gpu()

# ========== SUMMARY ==========
print(f"\n{'=' * 50}")
print("BENCHMARK RESULTS")
print(f"{'=' * 50}")
# One row per appended record; the full table is saved before aggregating.
res_df = pd.DataFrame(results)
res_df.to_csv("/kaggle/working/benchmark_results.csv", index=False)
# Mean and standard deviation of WER and inference time for each model.
metrics_by_model = res_df.groupby("Model")[["WER", "Time"]]
summary = metrics_by_model.agg(["mean", "std"])
print(summary)


2025-12-27 00:13:08.104623: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1766794388.304434      55 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1766794388.359884      55 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1766794388.830370      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766794388.830421      55 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1766794388.830424      55 computation_placer.cc:177] computation placer alr

Found 800 samples
Loading PhoWhisper-Large...


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/6.17G [00:00<?, ?B/s]

generation_config.json: 0.00B [00:00, ?B/s]