In [1]:
import jsonlines
import whisper
from fuzzywuzzy import fuzz
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from tqdm import tqdm
import logging
from word2number import w2n
import inflect
# NOTE(review): a `!` line runs in its own shell process, so this `export` most
# likely does not change CUDA_VISIBLE_DEVICES for the notebook kernel itself.
# If restricting GPU visibility is intended, `%env` or `os.environ` would be
# needed instead -- TODO confirm intent before changing, since set_whisper
# addresses the GPU by absolute index.
!export CUDA_VISIBLE_DEVICES=2

In [2]:
def read_jsonl(file_path):
    """Load every record of a JSON Lines file into a list.

    Args:
        file_path: Path of the UTF-8 encoded ``.jsonl`` file to read.

    Returns:
        A list holding the decoded records in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # Materialise the reader while the file is still open.
        records = list(jsonlines.Reader(handle))
    return records

In [3]:
def set_whisper(model_dir, device=None):
    """Build a Hugging Face speech-recognition pipeline from a Whisper checkpoint.

    Args:
        model_dir: Location passed to ``from_pretrained`` for both the model
            and its processor.
        device: Optional target device (for example ``"cuda:0"``, ``"cpu"`` or
            a ``torch.device``). When omitted, GPU 2 is used if CUDA is
            available and at least three GPUs are visible, GPU 0 if fewer are
            visible, and the CPU when CUDA is unavailable.

    Returns:
        The ``"automatic-speech-recognition"`` pipeline wrapping the loaded
        model, tokenizer and feature extractor.
    """
    if device is None:
        if torch.cuda.is_available():
            # "cuda:2" used to be hard-coded; keep it as the preferred device
            # but do not crash on hosts that expose fewer than three GPUs.
            device = "cuda:2" if torch.cuda.device_count() > 2 else "cuda:0"
        else:
            device = "cpu"

    # Half precision on CUDA devices, full precision otherwise; keyed on the
    # device actually selected rather than on mere CUDA availability.
    on_gpu = torch.device(device).type == "cuda"
    torch_dtype = torch.float16 if on_gpu else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_dir, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_dir)

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )
    return pipe

In [4]:
def transcribe_audio(audio_path, pipe):
    """Transcribe one audio file and return the recognised text.

    Args:
        audio_path: Path of the audio file to transcribe.
        pipe: Callable invoked as ``pipe([audio_path], batch_size=1)`` that
            returns a sequence of mappings with a ``'text'`` entry.

    Returns:
        The ``'text'`` value of the first (and only requested) result.
    """
    outputs = pipe([audio_path], batch_size=1)
    first_output = outputs[0]
    return first_output['text']

In [5]:
def get_text_similarity(original_text, transcribed_text):
    """Score how closely two texts match, ignoring case and outer whitespace.

    Both inputs are stripped and lower-cased before being handed to
    ``fuzz.ratio``; its score is returned unchanged.
    """
    reference = original_text.strip().lower()
    hypothesis = transcribed_text.strip().lower()
    return fuzz.ratio(reference, hypothesis)

In [6]:
p = inflect.engine()
def convert_numbers_to_words(text):
    """Spell out stand-alone digit tokens of ``text`` as words.

    The text is split on whitespace, every token made up solely of digits is
    replaced by ``p.number_to_words(token)``, and the tokens are re-joined
    with single spaces. A trailing non-alphanumeric character (typically the
    closing punctuation mark) is detached before splitting and re-attached
    afterwards, so a final token such as ``"12."`` is still converted.

    Args:
        text: The sentence to normalise. Empty or ``None`` yields ``''``.

    Returns:
        The normalised sentence.
    """
    if not text:
        # An empty transcription used to raise IndexError on text[-1].
        return ''

    last_char = text[-1]
    if last_char.isalnum():
        # No trailing punctuation: keep the final character with its word
        # instead of splitting it off (which turned "room 12" into
        # "room one2" and "I am a" into "I ama").
        body, tail = text, ''
    else:
        body, tail = text[:-1], last_char

    converted = [
        p.number_to_words(token) if token.isdigit() else token
        for token in body.split()
    ]
    return ' '.join(converted) + tail

In [7]:
def filter_inaccurate_data(data, dir, threshold=95,
                           model_dir="/data/ruiqi.yan/models/whisper-large-v3/"):
    """Keep the items whose audio transcribes back to (nearly) their source text.

    For every item, ``<dir>/<id>.wav`` is transcribed, digits in both the
    source text and the transcription are spelled out, and the two strings are
    compared with ``get_text_similarity``. Items whose audio file is missing
    are reported with a warning and dropped.

    Args:
        data: Iterable of mappings providing ``'id'`` and ``'source_text'``.
        dir: Directory containing the ``<id>.wav`` files.
        threshold: Minimum similarity score an item needs to be kept.
        model_dir: Whisper checkpoint handed to ``set_whisper``; defaults to
            the location that was previously hard-coded in this function.

    Returns:
        A list of the items that reached the threshold, in input order.
    """
    pipe = set_whisper(model_dir)
    filtered_data = []

    for item in tqdm(data):
        item_id = str(item['id'])
        source_wav = os.path.join(dir, item_id + ".wav")

        if not os.path.exists(source_wav):
            print(f"Warning: Audio file {item_id}.wav does not exist.")
            continue

        # Normalise the reference only once we know there is audio to compare.
        source_text = convert_numbers_to_words(item['source_text'])
        transcribed_text = convert_numbers_to_words(transcribe_audio(source_wav, pipe))

        if get_text_similarity(source_text, transcribed_text) >= threshold:
            filtered_data.append(item)

    return filtered_data

In [8]:
def main(input_file, dir, similarity_threshold=95):
    """Read a JSONL dataset and return the items that pass the ASR check.

    Args:
        input_file: Path of the JSONL file to load with ``read_jsonl``.
        dir: Directory with the audio files, forwarded to
            ``filter_inaccurate_data``.
        similarity_threshold: Forwarded as ``threshold``.

    Returns:
        Whatever ``filter_inaccurate_data`` returns for the loaded records.
    """
    return filter_inaccurate_data(
        read_jsonl(input_file),
        dir,
        threshold=similarity_threshold,
    )

In [9]:
# Run the ASR consistency filter over the test split and its audio directory.
filtered_data = main(
    "/data/ruiqi.yan/data/understanding/gk_listening/gk_test.jsonl",
    "/data/ruiqi.yan/data/eval/gaokao/gaokao",
    similarity_threshold=95,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|          | 0/330 [00:00<?, ?it/s]Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
  3%|▎         | 10/330 [00:13<06:11,  1.16s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 330/330 [06:33<00:00,  1.19s/it]


In [10]:
# Number of items that passed the similarity threshold.
len(filtered_data)

303

In [11]:
# Persist the surviving items as a new JSON Lines file, one item per line.
with jsonlines.open("/data/ruiqi.yan/data/understanding/gk_listening/gk_after_asr.jsonl", mode='w') as writer:
    writer.write_all(filtered_data)