In [1]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

device = "cuda" if torch.cuda.is_available() else "cpu"
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
print(device, torch_dtype)

cuda torch.float16


In [2]:
# load dataset
dataset = Dataset.load_from_disk("../data/processed/100测试语音/sr_16000.hf")
print(dataset)
print(dataset[0])

df = pd.read_csv("../data/processed/100测试语音/data.csv").drop(columns=["audio_url", "tags", "audio"])
df.head()

Dataset({
    features: ['sentence', 'audio_id', 'audio', 'duration'],
    num_rows: 90
})
{'sentence': '客人青春休闲时尚，喜欢老花，喜欢经典款，喜欢speedy，无锡人，喜欢精致小巧的包型。', 'audio_id': '726157', 'audio': {'path': None, 'array': array([ 0.        ,  0.        ,  0.        , ...,  0.0085144 ,
       -0.00106812, -0.00588989]), 'sampling_rate': 16000}, 'duration': 10.24}


Unnamed: 0,prediction,sentence,audio_id
0,客人青春休闲时尚，喜欢老花，喜欢春天，还喜欢speed，无数人喜欢精致小巧的包型。,客人青春休闲时尚，喜欢老花，喜欢经典款，喜欢speedy，无锡人，喜欢精致小巧的包型。,726157
1,无锡人刚刚结婚，喜欢休闲的包包，喜欢黑牛角酷一点的，喜欢小邮差包包么teeth。,无锡人刚刚结婚，喜欢休闲的包包，喜欢黑牛角酷一点的，喜欢小邮差包包metis。,726162
2,常州人喜欢小猫，喜欢猫的姑娘，喜欢老婆，喜欢奥那个喜欢戳特。,常州人喜欢小包，喜欢monogram，喜欢老花，喜欢onthego喜欢托特。,726098
3,希望金典款选老华轩爆款需要发票自己开建筑，在附近开鹿，我喜欢大车喜欢经典色。,喜欢经典款，喜欢老花，喜欢爆款，需要发票，自己开店，住在附近开鹿，喜欢大车，喜欢经典色。,725801
4,喜欢休闲，喜欢舒适，喜欢老花，喜欢小刘，才喜欢黑色，喜欢灰色。,喜欢休闲，喜欢舒适，喜欢老花，喜欢小邮差，喜欢黑色，喜欢灰色。,725761


In [3]:
def load_pipeline(hf_model_id: str) -> pipeline:
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        hf_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(hf_model_id)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        max_new_tokens=128,
        chunk_length_s=30,
        batch_size=4,
        return_timestamps=False,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe


def inference_dataset(pipe: pipeline, input_dataset: Dataset) -> list:
    transcriptions = []
    for i in range(len(input_dataset)):
        sample = input_dataset[i]["audio"]
        transcriptions.append(pipe(sample, generate_kwargs={"task": "transcribe", "language": "chinese"}))
    return transcriptions

In [3]:
# store the prediction results
candidates = {
    "openai/whisper-small": None,
    "openai/whisper-large-v3": None,
    "BELLE-2/Belle-whisper-large-v3-zh": None
}

import os.path as osp
import joblib

candidates_path = "evaluation_100_samples/candidates.pkl"
if osp.exists(candidates_path):
    candidates = joblib.load(candidates_path)

else:
    for key in candidates.keys():
        if key != "BELLE-2/Belle-whisper-large-v3-zh":
            _pipe = load_pipeline(key)
            candidates[key] = _pipe(dataset["audio"], generate_kwargs={"task": "transcribe", "language": "chinese"})
        elif key == "BELLE-2/Belle-whisper-large-v3-zh":
            _pipe = pipeline(
                "automatic-speech-recognition",
                model="BELLE-2/Belle-whisper-large-v3-zh",
                max_new_tokens=128,
                chunk_length_s=30,
                batch_size=4,
                return_timestamps=False,
                torch_dtype=torch_dtype,
                device=device,
            )
            _pipe.model.config.forced_decoder_ids = (
                _pipe.tokenizer.get_decoder_prompt_ids(
                    language="zh",
                    task="transcribe"
                )
            )
            candidates[key] = inference_dataset(_pipe, dataset)
        else:
            raise NotImplemented

    # save results
    joblib.dump(candidates, candidates_path)

['evaluation_100_samples/candidates.pkl']

In [4]:
from copy import deepcopy


def extract_text(transcription_dict: dict) -> dict:
    """Extract the transcription text."""
    transcription_dict = deepcopy(transcription_dict)
    for k, v in transcription_dict.items():
        _v = [item["text"] for item in v]
        transcription_dict[k] = _v
    return transcription_dict


candidates = extract_text(candidates)

In [17]:
# add to dataframe
df["openai/whisper-small"] = candidates["openai/whisper-small"]
df["openai/whisper-large-v3"] = candidates["openai/whisper-large-v3"]
df["BELLE-2/Belle-whisper-large-v3-zh"] = candidates["BELLE-2/Belle-whisper-large-v3-zh"]

# convert traditional chinese to simplified chinese
import opencc
import re

converter = opencc.OpenCC('t2s.json')


def convert_text(text: str) -> str:
    # remove the punctuation marks and convert to simplified chinese
    text = re.sub(r"[^\w\s]", "", text)
    return converter.convert(text)


for col in ["prediction", "sentence"] + list(candidates.keys()):
    df[col + "_clean"] = df[col].apply(lambda x: convert_text(x))

# save the results
df.to_csv("evaluation_100_samples/data.csv", index=False)

In [26]:
# calculate the cer and wer
import evaluate

metrics = [("cer", evaluate.load("cer")), ("wer", evaluate.load("wer"))]
pred_col = [
    # "sentence",
    # "openai/whisper-small",
    # "openai/whisper-large-v3",
    # "BELLE-2/Belle-whisper-large-v3-zh",
    "sentence_clean",
    "openai/whisper-small_clean",
    "openai/whisper-large-v3_clean",
    "BELLE-2/Belle-whisper-large-v3-zh_clean"
]

for m, m_obj in metrics:
    for col in pred_col:
        print(f"{m} for {col}: {m_obj.compute(predictions=df['prediction_clean'], references=df[col]):.3f}")

cer for sentence_clean: 0.264
cer for openai/whisper-small_clean: 0.368
cer for openai/whisper-large-v3_clean: 0.308
cer for BELLE-2/Belle-whisper-large-v3-zh_clean: 0.302
wer for sentence_clean: 0.950
wer for openai/whisper-small_clean: 1.014
wer for openai/whisper-large-v3_clean: 1.018
wer for BELLE-2/Belle-whisper-large-v3-zh_clean: 1.253
