In [1]:
import pandas as pd
import re
from pydub.utils import mediainfo
import os
import jsonlines

In [2]:
# Load the Gaokao listening manifest; the file is tab-separated, hence sep='\t'.
df = pd.read_csv('/data/ruiqi.yan/data/understanding/gk_listening/gaokao.tsv', sep='\t')

In [3]:
# Keep only the rows whose `with_speech` flag compares equal to True.
df_filtered = df.loc[df['with_speech'] == True]

In [4]:
# Parse the numeric suffix of ids shaped like "short_conv_<n>" into an integer column.
# `assign` builds a new frame instead of writing into the filtered slice of `df`
# (the in-place column write is what triggers pandas' SettingWithCopyWarning), and
# `expand=False` makes `str.extract` return a Series rather than a one-column frame.
# NOTE: `.astype(int)` raises if any id does not match the pattern (NaN cannot be cast).
df_filtered = df_filtered.assign(
    id_number=df_filtered['id'].str.extract(r'short_conv_(\d+)', expand=False).astype(int)
)

In [5]:
# Order the rows by the integer `id_number` so they follow numeric order.
df_sorted = df_filtered.sort_values(by='id_number')

In [6]:
# Preview the first rows after sorting by `id_number`.
df_sorted.head()

Unnamed: 0,id,audio,n_frames,prompt,tgt_text,with_speech,id_number
0,short_conv_1,gaokao_audio/short_conv_1.wav,111111,What will the man do next? A. Start to take ex...,A,True,1
1111,short_conv_2,gaokao_audio/short_conv_2.wav,111111,What does the man come for? A. To say goodbye;...,A,True,2
1223,short_conv_3,gaokao_audio/short_conv_3.wav,111111,Where does the talk take place? A. At the woma...,B,True,3
1334,short_conv_4,gaokao_audio/short_conv_4.wav,111111,What’s the time now in New York? A. 5 p.m; B. ...,B,True,4
1445,short_conv_5,gaokao_audio/short_conv_5.wav,111111,Why doesn’t the woman learn drawing? A. She’s ...,C,True,5


In [7]:
# Drop the raw string id, the frame count and the speech flag in a single call.
df_sorted = df_sorted.drop(columns=['id', 'n_frames', 'with_speech'])

In [8]:
# Rename the remaining columns to the output schema with one mapping.
df_sorted = df_sorted.rename(
    columns={
        'id_number': 'id',
        'audio': 'source_wav',
        'prompt': 'source_text',
        'tgt_text': 'target_text',
    }
)

In [9]:
# Move `id` to the front while keeping every other column in its current order.
cols = ['id', *(name for name in df_sorted.columns if name != 'id')]
df_sorted = df_sorted.loc[:, cols]

In [10]:
# Preview the frame after the drop / rename / reorder steps.
df_sorted.head()

Unnamed: 0,id,source_wav,source_text,target_text
0,1,gaokao_audio/short_conv_1.wav,What will the man do next? A. Start to take ex...,A
1111,2,gaokao_audio/short_conv_2.wav,What does the man come for? A. To say goodbye;...,A
1223,3,gaokao_audio/short_conv_3.wav,Where does the talk take place? A. At the woma...,B
1334,4,gaokao_audio/short_conv_4.wav,What’s the time now in New York? A. 5 p.m; B. ...,B
1445,5,gaokao_audio/short_conv_5.wav,Why doesn’t the woman learn drawing? A. She’s ...,C


In [11]:
def filter_text(text):
    """Decide whether a question text is clean enough to keep.

    A text is rejected when it is not a non-empty string, has more than 20
    whitespace-separated words, contains a URL or an e-mail-like token,
    contains ASCII control characters, contains two or more consecutive ASCII
    punctuation characters, or contains ``$`` or ``:``.

    Args:
        text: Candidate text; non-string values (e.g. None or NaN) are rejected.

    Returns:
        bool: True if the text passes every check, False otherwise.
    """
    # Validate the type first: the original called text.split() before this
    # guard, so None / NaN inputs raised instead of being filtered out.
    if not isinstance(text, str) or not text:
        return False
    if len(text.split()) > 20:
        return False
    if re.search(r'http[s]?://\S+', text) or re.search(r'\S+@\S+', text):
        return False
    if re.search(r'[\x00-\x1F\x7F]+', text):
        return False
    # Runs of 2+ punctuation characters. The original pattern left `]` unescaped,
    # which closed the character class early and turned the remainder into
    # `^_...`, a sequence that can never match, so this check was silently dead.
    if re.search(r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]{2,}', text):
        return False
    if re.search(r'[\$:]', text):
        return False
    return True

In [12]:
# Keep only the rows whose question text passes `filter_text`.
df_filtered = df_sorted.loc[df_sorted['source_text'].apply(filter_text)]

In [13]:
def get_audio_duration(path, base_dir="/data/ruiqi.yan/data/understanding/gk_listening"):
    """Return the duration reported by pydub's ``mediainfo`` for an audio file.

    Args:
        path: Audio path relative to ``base_dir`` (e.g. a ``source_wav`` value).
        base_dir: Directory the relative path is resolved against. Defaults to
            the dataset root that used to be hard-coded in the body.

    Returns:
        float: The ``duration`` field of the probe result
        (presumably seconds; TODO confirm against the prober output).

    Raises:
        KeyError: If the probe result has no ``duration`` entry.
    """
    audio_info = mediainfo(os.path.join(base_dir, path))
    return float(audio_info['duration'])

In [14]:
# Keep only clips whose reported duration is at most 15 (presumably seconds; TODO confirm).
df_filtered = df_filtered[df_filtered['source_wav'].apply(get_audio_duration) <= 15]

In [15]:
def modify_source_text(text):
    """Prefix a question with "According to the conversation, ".

    The first character of ``text`` is lower-cased so the sentence reads
    naturally after the prefix; the rest is left untouched.

    Args:
        text: Question text. May be empty.

    Returns:
        str: The prefixed text.
    """
    # Slice instead of indexing so an empty string does not raise IndexError.
    return "According to the conversation, " + text[:1].lower() + text[1:]

In [16]:
# Prefix every question using `modify_source_text`.
# `assign` returns a new frame, so the column is not written into a filtered slice
# (the in-place write triggers pandas' SettingWithCopyWarning).
# NOTE: re-running this cell adds the prefix again.
df_filtered = df_filtered.assign(
    source_text=df_filtered['source_text'].apply(modify_source_text)
)

In [17]:
# Export one JSON record per line (JSON Lines); force_ascii=False keeps non-ASCII characters unescaped.
df_filtered.to_json('/data/ruiqi.yan/data/understanding/gk_listening/gk_test.jsonl', orient='records', lines=True, force_ascii=False)

In [18]:
# Number of rows that survived the text and duration filters.
len(df_filtered)

330

In [51]:
import jsonlines
import whisper
from fuzzywuzzy import fuzz
import os
import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
from tqdm import tqdm
import logging
# NOTE(review): `!` runs the command in a temporary subshell, so this `export` does not
# change the kernel's environment. GPU selection in this notebook comes from the explicit
# "cuda:2" in set_whisper. If device masking is really wanted, set the variable in-process
# (e.g. `%env` or os.environ) before CUDA is initialised, and revisit the device index.
!export CUDA_VISIBLE_DEVICES=2

In [52]:
def read_jsonl(file_path):
    """Load every record of a UTF-8 JSON Lines file into a list.

    Args:
        file_path: Path of the .jsonl file to read.

    Returns:
        list: The parsed objects, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(jsonlines.Reader(handle))

In [53]:
def set_whisper(model_dir, device=None):
    """Build a Hugging Face automatic-speech-recognition pipeline from a local checkpoint.

    Args:
        model_dir: Directory holding the model and processor files.
        device: Target device string such as "cuda:0" or "cpu". When omitted,
            the previous behaviour is kept: "cuda:2" if CUDA is available,
            otherwise "cpu".

    Returns:
        The ``transformers`` pipeline object for "automatic-speech-recognition".
    """
    if device is None:
        device = "cuda:2" if torch.cuda.is_available() else "cpu"
    # Half precision only on CUDA devices; CPU inference stays in float32.
    torch_dtype = torch.float16 if str(device).startswith("cuda") else torch.float32

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_dir, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
    )
    model.to(device)

    processor = AutoProcessor.from_pretrained(model_dir)

    return pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )

In [54]:
def transcribe_audio(audio_path, pipe):
    """Transcribe one audio file with the given pipeline callable.

    Args:
        audio_path: Path of the audio file to transcribe.
        pipe: Callable invoked with a one-element list and ``batch_size=1``;
            it must return a sequence of dicts that carry a 'text' entry.

    Returns:
        The 'text' value of the first (and only) result.
    """
    outputs = pipe([audio_path], batch_size=1)
    first_output = outputs[0]
    return first_output['text']

In [55]:
def get_text_similarity(original_text, transcribed_text):
    """Score how closely a transcription matches the reference text.

    Both strings are lower-cased before being compared with ``fuzz.ratio``.

    Args:
        original_text: Reference text.
        transcribed_text: Text produced by the recogniser.

    Returns:
        The score returned by ``fuzz.ratio``.
    """
    reference = original_text.lower()
    hypothesis = transcribed_text.lower()
    return fuzz.ratio(reference, hypothesis)

In [56]:
def filter_inaccurate_data(data, threshold=95):
    """Keep the items whose audio transcribes back to (almost) their source text.

    For each item the file ``<id>.wav`` is looked up in the gaokao_random
    directory, transcribed with the Whisper pipeline, and compared with the
    item's 'source_text'. Items scoring below ``threshold`` are dropped; items
    without an audio file are reported and skipped.

    Args:
        data: Iterable of dicts that provide 'id' and 'source_text'.
        threshold: Minimum similarity score required to keep an item.

    Returns:
        list: The retained items, in input order.
    """
    audio_root = "/data/ruiqi.yan/data/understanding/gk_listening/gaokao_random/"
    asr = set_whisper("/data/ruiqi.yan/models/whisper-large-v3/")
    kept = []
    for record in tqdm(data):
        wav_path = os.path.join(audio_root, str(record['id']) + ".wav")
        reference = record['source_text']

        # Guard clause: nothing to transcribe when the audio is missing.
        if not os.path.exists(wav_path):
            print(f"Warning: Audio file {str(record['id'])}.wav does not exist.")
            continue

        hypothesis = transcribe_audio(wav_path, asr)
        if get_text_similarity(reference, hypothesis) >= threshold:
            kept.append(record)

    return kept

In [57]:
def main(input_file, similarity_threshold=95):
    """Read a JSON Lines file and return the items that pass the transcription check.

    Args:
        input_file: Path of the .jsonl file to load.
        similarity_threshold: Minimum similarity score forwarded to
            ``filter_inaccurate_data``.

    Returns:
        list: The items kept by ``filter_inaccurate_data``.
    """
    records = read_jsonl(input_file)
    return filter_inaccurate_data(records, threshold=similarity_threshold)

In [58]:
# Run the transcription-consistency filter over the exported test set, keeping items that score at least 95.
filtered_data = main("/data/ruiqi.yan/data/understanding/gk_listening/gk_test.jsonl", similarity_threshold=95)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  0%|          | 0/330 [00:00<?, ?it/s]



100%|██████████| 330/330 [03:00<00:00,  1.83it/s]


In [59]:
# Number of items kept by the transcription-consistency filter.
len(filtered_data)

181

In [60]:
# Persist the retained items, one JSON object per line.
with jsonlines.open("/data/ruiqi.yan/data/understanding/gk_listening/gk_test_new.jsonl", mode='w') as writer:
    writer.write_all(filtered_data)

In [2]:
# Load the exported test set into `origin`; `data` starts empty and is filled by a later cell.
origin = []
data = []
with open("/data/ruiqi.yan/data/understanding/gk_listening/gk_test.jsonl", 'r', encoding='utf-8') as f:
    origin.extend(jsonlines.Reader(f))

In [3]:
# Replace every " A. " in the question text with " ;AY; " (presumably so a TTS engine
# reads the letter name rather than the article "a"; TODO confirm).
# `data` is rebuilt from scratch so the cell is idempotent: the original appended to a list
# created in an earlier cell, so re-running it duplicated every record. Copying each record
# also leaves the items in `origin` unmodified.
data = [
    {**item, 'source_text': item['source_text'].replace(" A. ", " ;AY; ")}
    for item in origin
]

In [4]:
# Persist the TTS-oriented variant of the test set, one JSON object per line.
with jsonlines.open("/data/ruiqi.yan/data/understanding/gk_listening/gk_test_tts.jsonl", mode='w') as writer:
    writer.write_all(data)