# 处理KeSpeech-ASR数据集

|- Each_subdialect    
&emsp;|- Mandarin   
&emsp;|- BJ   
&emsp;|- SW   
&emsp;|- ZY    
&emsp;|- NE   
&emsp;|- LY  
&emsp;|- JH    
&emsp;|- JLu   
&emsp;|- JLo   
|- All_subdialect   
|- Mandarin  
|- Whole_training_set  



In [1]:
'''
KeSpeech official data
ASR:                                            
    train:
        P1: 543633
        P2: 340387       Total: 884020
    dev:
        P1: 2199
        P2: 2207         Total: 4406
    test:               Total: 19723
AR:
    train: 543633
    dev: 2199
    test: 44306
    
data1 为原始ASR数据  为phase1+phase2 data
ASR:                                            
    train audio_datasets.jsonl: 881808
    dev audio_datasets.jsonl: 16883  注意验证数据中可能包含有test数据
    test audio_datasets.jsonl: 19668
    
data2 为原始ASR数据加上口音信息  增加了text_language字段  注意验证数据中可能包含有test数据

data3 在data2的基础上增加了kmeans伪标签 同时生成了单独的phase1数据audio_datasets_phase1.jsonl  注意验证数据中可能包含有test数据
ASR:                                            
    train audio_datasets.jsonl: 881808
    dev audio_datasets.jsonl: 16883  注意验证数据中可能包含有test数据
    test audio_datasets.jsonl: 19668
    
    train audio_datasets_phase1.jsonl: 542697
    dev audio_datasets_phase1.jsonl: 16833
    test audio_datasets_phase1.jsonl: 16727
    
data4 为data2的基础上把所有不含phase1对应的text_language设置成None  注意验证数据中可能包含有test数据

data5 只有phase1的内容 kespeech的默认ar数据集
AR:
    train: 542697
    dev: 2196
    test: 16727

data6 标准的asr phase1+phase2数据集  test数据已从所有dev数据集中删除  P2的口音label为Mandarin
ASR:
    train audio_datasets.jsonl: 881808
    dev audio_datasets.jsonl: 4399
    test audio_datasets.jsonl: 19668
'''


# data1 为原始ASR数据  为phase1+phase2 data 注意验证数据中可能包含有test数据
# data2 为原始ASR数据加上口音信息  增加了text_language字段  注意验证数据中可能包含有test数据
# data3 在data2的基础上增加了kmeans伪标签 同时生成了单独的phase1数据audio_datasets_phase1.jsonl  注意验证数据中可能包含有test数据
# data4 为data2的基础上把所有不含phase1对应的text_language设置成None  注意验证数据中可能包含有test数据
# data5 只有phase1的内容 kespeech的默认ar数据集
# data6 标准的asr phase1+phase2数据集  test数据已从所有dev数据集中删除

In [13]:
import os
import re

# Locations inside the KeSpeech corpus that the following cells read from.
root_dir = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech"
audio_dir = os.path.join(root_dir, "Audio")
asr_text_dir = os.path.join(root_dir, "Tasks/ASR")

# ASR list folders per split; train and dev each come in two phases.
train_phase_1_dir, train_phase_2_dir = (
    os.path.join(asr_text_dir, phase) for phase in ("train_phase1", "train_phase2")
)
train_list = [train_phase_1_dir, train_phase_2_dir]
dev_phase_1_dir, dev_phase_2_dir = (
    os.path.join(asr_text_dir, phase) for phase in ("dev_phase1", "dev_phase2")
)
dev_list = [dev_phase_1_dir, dev_phase_2_dir]
test_dir = os.path.join(asr_text_dir, "test")

# Destination root for the generated data lists; created if missing.
output_dir = "/ssd/zhuang/code/FunASR2024/examples/kespeech/DATA/data"
os.makedirs(output_dir, exist_ok=True)


In [14]:
def gather_wav_scp(wav_scp, dict):
    """Register every utterance of a ``wav.scp`` list whose audio file exists.

    Each line of ``wav_scp`` must be ``<utt_id> <relative_path>`` separated by
    a single space. The relative path is joined onto the module-level
    ``root_dir``; utterances whose joined path does not exist on disk are
    left out.

    Args:
        wav_scp: Path of the ``wav.scp`` file to read.
        dict: Mapping updated in place; ``dict[utt_id]`` is set to
            ``{"path": <joined_path>}`` for each kept utterance.

    Returns:
        The same mapping object that was passed in.
    """
    with open(wav_scp, 'r') as scp_file:
        entries = [raw.strip() for raw in scp_file.readlines()]
    for entry in entries:
        utt_id, rel_path = entry.split(" ")
        full_path = os.path.join(root_dir, rel_path)
        if not os.path.exists(full_path):
            continue
        dict[utt_id] = {"path": full_path}
    return dict

def gether_dialect_info(utt2subdialect, dict):
    """Attach the subdialect label to every utterance already present in ``dict``.

    Each non-blank line of ``utt2subdialect`` must be ``<utt_id> <dialect>``
    separated by a single space.

    Args:
        utt2subdialect: Path of the ``utt2subdialect`` file to read.
        dict: Mapping ``utt_id -> info dict``, updated in place with a
            ``"dialect"`` entry.

    Returns:
        The same mapping object that was passed in.

    Raises:
        ValueError: If a non-blank line does not split into exactly two fields.
    """
    with open(utt2subdialect, 'r') as label_file:
        for raw_line in label_file:
            line = raw_line.strip()
            if not line:
                continue
            utt_id, dialect = line.split(" ")
            entry = dict.get(utt_id)
            if entry is None:
                # The utterance was never registered (gather_wav_scp leaves out
                # utterances whose audio is missing), so there is nothing to label.
                continue
            entry["dialect"] = dialect
    return dict

def gather_text_info(text, dict):
    """Attach the transcript to every utterance already present in ``dict``.

    Each non-blank line of the ``text`` file must be ``<utt_id> <transcript>``;
    only the first space separates the two, so the transcript may itself
    contain spaces. The file is read as UTF-8.

    Args:
        text: Path of the transcript file to read.
        dict: Mapping ``utt_id -> info dict``, updated in place with a
            ``"text"`` entry.

    Returns:
        The same mapping object that was passed in.

    Raises:
        ValueError: If a non-blank line holds an id without any transcript.
    """
    with open(text, 'r', encoding='utf-8') as text_file:
        for raw_line in text_file:
            line = raw_line.strip()
            if not line:
                continue
            utt_id, transcript = line.split(" ", 1)
            entry = dict.get(utt_id)
            if entry is None:
                # Unregistered utterance (e.g. its audio was missing): skip it
                # instead of failing with KeyError.
                continue
            entry["text"] = transcript
    return dict

def contains_non_chinese(text):
    """Tell whether ``text`` holds any character outside U+4E00..U+9FFF.

    Returns:
        True when at least one character falls outside that range
        (spaces, digits, Latin letters and punctuation included),
        False otherwise, including for the empty string.
    """
    first_foreign = re.search(r'[^\u4e00-\u9fff]', text)
    return first_foreign is not None

In [15]:
def _load_split(list_dirs):
    """Collect path, dialect and transcript of every utterance listed under ``list_dirs``."""
    utterances = {}
    for list_dir in list_dirs:
        utterances = gather_wav_scp(os.path.join(list_dir, "wav.scp"), utterances)
        utterances = gether_dialect_info(os.path.join(list_dir, "utt2subdialect"), utterances)
        utterances = gather_text_info(os.path.join(list_dir, "text"), utterances)
    return utterances


def _keep_chinese_only(utterances):
    """Return the utterances that have a transcript accepted by ``contains_non_chinese``."""
    return {
        utt_id: info
        for utt_id, info in utterances.items()
        # An utterance without transcript is unusable for ASR; drop it rather
        # than fail with KeyError.
        if "text" in info and not contains_non_chinese(info["text"])
    }


# Load the three splits, then discard every utterance whose transcript
# contains anything other than Chinese characters.
train_dict = _keep_chinese_only(_load_split(train_list))
dev_dict = _keep_chinese_only(_load_split(dev_list))
test_dict = _keep_chinese_only(_load_split([test_dir]))


print("Done!")
print("Train dataset size: ", len(train_dict))
print("Dev dataset size: ", len(dev_dict))
print("Test dataset size: ", len(test_dict))

Done!
Train dataset size:  881808
Dev dataset size:  4399
Test dataset size:  19668


## Each_subdialect   

In [16]:
# Folder that receives one sub-directory per subdialect; created if missing.
each_subdialect_dir =  os.path.join(output_dir, "ES")
os.makedirs(each_subdialect_dir, exist_ok=True)

###  Make wav.scp and text

In [17]:
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)


subdialect_list=['Mandarin', 'Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
split_dicts = [("train", train_dict), ("dev", dev_dict), ("test", test_dict)]

# For every subdialect write <ES>/<dialect>/<split>/{wav.scp,text}.
for dialect in subdialect_list:
    es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
    os.makedirs(es_subdialect_dir, exist_ok=True)

    for split_name, utterances in split_dicts:
        output = os.path.join(es_subdialect_dir, split_name)
        os.makedirs(output, exist_ok=True)
        wav_scp = os.path.join(output, "wav.scp")
        text_path = os.path.join(output, "text")

        # One pass over the split writes both lists, so they stay aligned
        # line by line. Transcripts are Chinese, hence the explicit UTF-8
        # (merge_files reads these files back as UTF-8).
        with open(wav_scp, 'w', encoding='utf-8') as scp_file, \
                open(text_path, 'w', encoding='utf-8') as text_file:
            for utt_id, info in utterances.items():
                if info["dialect"] != dialect:
                    continue
                scp_file.write(f"{utt_id} {info['path']}\n")
                text_file.write(f"{utt_id} {info['text']}\n")

        # Sanity check on what actually landed on disk.
        scp_size = _count_lines(wav_scp)
        print(dialect, split_name, "size: ", scp_size, "check", scp_size == _count_lines(text_path))

    print("Done!")


Mandarin train size:  678515 check True
Mandarin dev size:  3681 check True
Mandarin test size:  4981 check True
Done!
Beijing train size:  2237 check True
Beijing dev size:  31 check True
Beijing test size:  265 check True
Done!
Southwestern train size:  45359 check True
Southwestern dev size:  132 check True
Southwestern test size:  2684 check True
Done!
Jiao-Liao train size:  20268 check True
Jiao-Liao dev size:  118 check True
Jiao-Liao test size:  1443 check True
Done!
Northeastern train size:  4843 check True
Northeastern dev size:  5 check True
Northeastern test size:  350 check True
Done!
Jiang-Huai train size:  27586 check True
Jiang-Huai dev size:  105 check True
Jiang-Huai test size:  2268 check True
Done!
Lan-Yin train size:  20549 check True
Lan-Yin dev size:  104 check True
Lan-Yin test size:  1646 check True
Done!
Ji-Lu train size:  33861 check True
Ji-Lu dev size:  156 check True
Ji-Lu test size:  2806 check True
Done!
Zhongyuan train size:  48590 check True
Zhongyuan d

### Append the test set to the dev set

In [18]:
def merge_files(read_file, writen_file):
    """Append the content of ``read_file`` to the end of ``writen_file``.

    Both files are handled as UTF-8 text. The destination is created when it
    does not exist. When the destination already has content that does not end
    with a newline, one is inserted first so that the first appended record is
    not glued onto the destination's last line.

    Args:
        read_file: Path of the file whose content is appended.
        writen_file: Path of the file that receives the content.

    Raises:
        FileNotFoundError: If ``read_file`` does not exist (the destination is
            left untouched in that case).
    """
    import shutil

    needs_separator = False
    if os.path.exists(writen_file) and os.path.getsize(writen_file) > 0:
        with open(writen_file, 'rb') as probe:
            probe.seek(-1, os.SEEK_END)
            needs_separator = probe.read(1) != b"\n"

    # The source is opened first, so a missing source fails before the
    # destination is created or modified.
    with open(read_file, 'r', encoding='utf-8') as source, \
            open(writen_file, 'a', encoding='utf-8') as target:
        if needs_separator:
            target.write("\n")
        # Stream in chunks instead of loading the whole source into memory.
        shutil.copyfileobj(source, target)


In [19]:
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)


# For every non-Mandarin subdialect, append the test lists to the dev lists.
# NOTE: merge_files appends, so running this cell again adds the test
# utterances to dev a second time.
subdialect_list=['Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
for dialect in subdialect_list:

    es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
    es_subdialect_dev = os.path.join(es_subdialect_dir, "dev")
    es_subdialect_test = os.path.join(es_subdialect_dir, "test")

    es_subdialect_dev_wav_scp = os.path.join(es_subdialect_dev, "wav.scp")
    es_subdialect_dev_text = os.path.join(es_subdialect_dev, "text")

    es_subdialect_test_wav_scp = os.path.join(es_subdialect_test, "wav.scp")
    es_subdialect_test_text = os.path.join(es_subdialect_test, "text")

    # Sizes before the merge (wav.scp "=" text).
    print(dialect, "Dev dataset oringinal size: ", _count_lines(es_subdialect_dev_wav_scp), "=", _count_lines(es_subdialect_dev_text))
    print(dialect, "Test dataset oringinal size: ", _count_lines(es_subdialect_test_wav_scp), "=", _count_lines(es_subdialect_test_text))

    merge_files(es_subdialect_test_wav_scp, es_subdialect_dev_wav_scp)
    merge_files(es_subdialect_test_text, es_subdialect_dev_text)

    # Sizes after the merge; the test lists themselves are not modified.
    print(dialect, "Dev dataset size: ", _count_lines(es_subdialect_dev_wav_scp), "=", _count_lines(es_subdialect_dev_text))
    print(dialect, "Test dataset size: ", _count_lines(es_subdialect_test_wav_scp), "=", _count_lines(es_subdialect_test_text))

    print("\n")



Beijing Dev dataset oringinal size:  31 = 31
Beijing Test dataset oringinal size:  265 = 265
Beijing Dev dataset size:  296 = 296
Beijing Test dataset size:  265 = 265


Southwestern Dev dataset oringinal size:  132 = 132
Southwestern Test dataset oringinal size:  2684 = 2684
Southwestern Dev dataset size:  2816 = 2816
Southwestern Test dataset size:  2684 = 2684


Jiao-Liao Dev dataset oringinal size:  118 = 118
Jiao-Liao Test dataset oringinal size:  1443 = 1443
Jiao-Liao Dev dataset size:  1561 = 1561
Jiao-Liao Test dataset size:  1443 = 1443


Northeastern Dev dataset oringinal size:  5 = 5
Northeastern Test dataset oringinal size:  350 = 350
Northeastern Dev dataset size:  355 = 355
Northeastern Test dataset size:  350 = 350


Jiang-Huai Dev dataset oringinal size:  105 = 105
Jiang-Huai Test dataset oringinal size:  2268 = 2268
Jiang-Huai Dev dataset size:  2373 = 2373
Jiang-Huai Test dataset size:  2268 = 2268


Lan-Yin Dev dataset oringinal size:  104 = 104
Lan-Yin Test dataset 

## All_subdialect

In [20]:
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)


# Subdialects pooled into the "AS" set (Mandarin is not in this list).
subdialect_list=['Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']

all_subdialect_dir = os.path.join(output_dir, "AS")
os.makedirs(all_subdialect_dir, exist_ok=True)
all_subdialect_train = os.path.join(all_subdialect_dir, "train")
WholeDevSet = os.path.join(all_subdialect_dir, "dev")
WholeTestSet = os.path.join(all_subdialect_dir, "test")
whole_dataset_output = [all_subdialect_train, WholeDevSet, WholeTestSet]

for output in whole_dataset_output:
    os.makedirs(output, exist_ok=True)
    section = os.path.basename(output)
    wav_scp = os.path.join(output, "wav.scp")
    text_path = os.path.join(output, "text")

    # Concatenate the per-subdialect lists, in subdialect_list order, into
    # the pooled wav.scp and then the pooled text file.
    for merged_path in (wav_scp, text_path):
        list_name = os.path.basename(merged_path)
        with open(merged_path, 'w', encoding='utf-8') as merged:
            for dialect in subdialect_list:
                part_path = os.path.join(each_subdialect_dir, dialect, section, list_name)
                with open(part_path, 'r', encoding='utf-8') as part:
                    merged.writelines(part)

    # Sanity check on what actually landed on disk.
    scp_size = _count_lines(wav_scp)
    print(section, "size: ", scp_size, "check", scp_size == _count_lines(text_path))


train size:  203293 check True
dev size:  15405 check True
test size:  14687 check True


## Whole data set

In [21]:
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)


# The whole data set ("WD") pools Mandarin together with every subdialect.
subdialect_list=['Mandarin', 'Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']

# NOTE: despite its name, all_subdialect_dir is rebound to the "WD" folder
# from here on.
all_subdialect_dir = os.path.join(output_dir, "WD")
os.makedirs(all_subdialect_dir, exist_ok=True)
all_subdialect_train = os.path.join(all_subdialect_dir, "train")
WholeDevSet = os.path.join(all_subdialect_dir, "dev")
WholeTestSet = os.path.join(all_subdialect_dir, "test")
whole_dataset_output = [all_subdialect_train, WholeDevSet, WholeTestSet]

for output in whole_dataset_output:
    os.makedirs(output, exist_ok=True)
    section = os.path.basename(output)
    wav_scp = os.path.join(output, "wav.scp")
    text_path = os.path.join(output, "text")

    # Concatenate the per-subdialect lists, in subdialect_list order, into
    # the pooled wav.scp and then the pooled text file.
    for merged_path in (wav_scp, text_path):
        list_name = os.path.basename(merged_path)
        with open(merged_path, 'w', encoding='utf-8') as merged:
            for dialect in subdialect_list:
                part_path = os.path.join(each_subdialect_dir, dialect, section, list_name)
                with open(part_path, 'r', encoding='utf-8') as part:
                    merged.writelines(part)

    # Sanity check on what actually landed on disk.
    scp_size = _count_lines(wav_scp)
    print(section, "size: ", scp_size, "check", scp_size == _count_lines(text_path))

train size:  881808 check True
dev size:  19086 check True
test size:  19668 check True


## After CNAM

In [22]:
def merge_files(read_file, writen_file):
    """Append the content of ``read_file`` to the end of ``writen_file``.

    Both files are handled as UTF-8 text. The destination is created when it
    does not exist. When the destination already has content that does not end
    with a newline, one is inserted first so that the first appended record is
    not glued onto the destination's last line.

    Args:
        read_file: Path of the file whose content is appended.
        writen_file: Path of the file that receives the content.

    Raises:
        FileNotFoundError: If ``read_file`` does not exist (the destination is
            left untouched in that case).
    """
    import shutil

    needs_separator = False
    if os.path.exists(writen_file) and os.path.getsize(writen_file) > 0:
        with open(writen_file, 'rb') as probe:
            probe.seek(-1, os.SEEK_END)
            needs_separator = probe.read(1) != b"\n"

    # The source is opened first, so a missing source fails before the
    # destination is created or modified.
    with open(read_file, 'r', encoding='utf-8') as source, \
            open(writen_file, 'a', encoding='utf-8') as target:
        if needs_separator:
            target.write("\n")
        # Stream in chunks instead of loading the whole source into memory.
        shutil.copyfileobj(source, target)

In [25]:
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)


subdialect_list=['Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
section_list = ["train", "dev", "test"]

all_subdialect_dir = os.path.join(output_dir, "AS")
each_subdialect_dir = os.path.join(output_dir, "ES")


# Build AS/<section>/audio_datasets.jsonl by concatenating the per-subdialect
# audio_datasets.jsonl files.
for section in section_list:
    all_subdialect_section = os.path.join(all_subdialect_dir, section)
    all_subdialect_section_audio_datasets = os.path.join(all_subdialect_section, "audio_datasets.jsonl")
    os.makedirs(all_subdialect_section, exist_ok=True)

    # merge_files appends, so start from an empty aggregate; otherwise
    # running this cell again would duplicate every record.
    with open(all_subdialect_section_audio_datasets, 'w', encoding='utf-8'):
        pass

    for dialect in subdialect_list:
        es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
        es_subdialect_section = os.path.join(es_subdialect_dir, section)
        es_audio_datasets  = os.path.join(es_subdialect_section, "audio_datasets.jsonl")
        print(dialect, section, "dataset size: ", _count_lines(es_audio_datasets))
        merge_files(es_audio_datasets, all_subdialect_section_audio_datasets)

    print(section, "dataset size: ", _count_lines(all_subdialect_section_audio_datasets))



Beijing train dataset size:  2237
Southwestern train dataset size:  45359
Jiao-Liao train dataset size:  20268
Northeastern train dataset size:  4843
Jiang-Huai train dataset size:  27586
Lan-Yin train dataset size:  20549
Ji-Lu train dataset size:  33861
Zhongyuan train dataset size:  48590
train dataset size:  203293
Beijing dev dataset size:  296
Southwestern dev dataset size:  2816
Jiao-Liao dev dataset size:  1561
Northeastern dev dataset size:  355
Jiang-Huai dev dataset size:  2373
Lan-Yin dev dataset size:  1750
Ji-Lu dev dataset size:  2962
Zhongyuan dev dataset size:  3292
dev dataset size:  15405
Beijing test dataset size:  265
Southwestern test dataset size:  2684
Jiao-Liao test dataset size:  1443
Northeastern test dataset size:  350
Jiang-Huai test dataset size:  2268
Lan-Yin test dataset size:  1646
Ji-Lu test dataset size:  2806
Zhongyuan test dataset size:  3225
test dataset size:  14687


In [29]:
def _count_lines(path):
    """Return the number of lines in the UTF-8 text file ``path``."""
    with open(path, 'r', encoding='utf-8') as handle:
        return sum(1 for _ in handle)


subdialect_list=['Mandarin', 'Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
section_list = ["train", "dev", "test"]

whole_dir = os.path.join(output_dir, "WD")
each_subdialect_dir = os.path.join(output_dir, "ES")

# Build WD/<section>/audio_datasets.jsonl by concatenating the per-subdialect
# audio_datasets.jsonl files (Mandarin included).
for section in section_list:
    whole_section = os.path.join(whole_dir, section)
    whole_audio_datasets = os.path.join(whole_section, "audio_datasets.jsonl")
    os.makedirs(whole_section, exist_ok=True)

    # merge_files appends, so start from an empty aggregate; otherwise
    # running this cell again would duplicate every record.
    with open(whole_audio_datasets, 'w', encoding='utf-8'):
        pass

    for dialect in subdialect_list:
        es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
        es_subdialect_section = os.path.join(es_subdialect_dir, section)
        es_audio_datasets  = os.path.join(es_subdialect_section, "audio_datasets.jsonl")
        print(dialect, section, "dataset size: ", _count_lines(es_audio_datasets))
        merge_files(es_audio_datasets, whole_audio_datasets)

    print(section, "dataset size: ", _count_lines(whole_audio_datasets))

Mandarin train dataset size:  678515
Beijing train dataset size:  2237
Southwestern train dataset size:  45359
Jiao-Liao train dataset size:  20268
Northeastern train dataset size:  4843
Jiang-Huai train dataset size:  27586
Lan-Yin train dataset size:  20549
Ji-Lu train dataset size:  33861
Zhongyuan train dataset size:  48590
train dataset size:  881808
Mandarin dev dataset size:  3681
Beijing dev dataset size:  296
Southwestern dev dataset size:  2816
Jiao-Liao dev dataset size:  1561
Northeastern dev dataset size:  355
Jiang-Huai dev dataset size:  2373
Lan-Yin dev dataset size:  1750
Ji-Lu dev dataset size:  2962
Zhongyuan dev dataset size:  3292
dev dataset size:  19086
Mandarin test dataset size:  4981
Beijing test dataset size:  265
Southwestern test dataset size:  2684
Jiao-Liao test dataset size:  1443
Northeastern test dataset size:  350
Jiang-Huai test dataset size:  2268
Lan-Yin test dataset size:  1646
Ji-Lu test dataset size:  2806
Zhongyuan test dataset size:  3225
test

# 添加口音信息

In [1]:
import os
import shutil

# Source folder.
source_dir = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data"
# Destination folder.
data_root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2"

# Make sure the destination exists.
os.makedirs(data_root, exist_ok=True)

# dirs_exist_ok (Python 3.8+) lets copytree copy into an already existing
# directory, overwriting files of the same name.
try:
    shutil.copytree(source_dir, data_root, dirs_exist_ok=True)
except FileExistsError:
    print("目录已存在")
except (shutil.Error, OSError) as e:
    # Only copy/file-system failures are reported and tolerated here.
    # Programming errors (e.g. TypeError on an interpreter that does not know
    # dirs_exist_ok) are no longer swallowed.
    print(f"复制过程中出错: {e}")


In [2]:

end_name = "audio_datasets.jsonl"

# Gather the path of every file named exactly `end_name` below data_root,
# in os.walk order.
data_list = [
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk(data_root)
    for file_name in file_names
    if file_name == end_name
]

print(data_list)

['/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Lan-Yin/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Lan-Yin/dev/audio_datasets.jsonl', '/ssd/zhuang/cod

In [3]:
# Read every "utt2subdialect" file below the SubdialectID task folder and
# build the lookup table {utterance id: subdialect}.
dialect_root = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/SubdialectID"
dialect_dict = {}
for folder, _subdirs, file_names in os.walk(dialect_root):
    if "utt2subdialect" not in file_names:
        continue
    with open(os.path.join(folder, "utt2subdialect"), 'r') as label_file:
        for raw_line in label_file:
            line = raw_line.strip()
            if not line:
                continue
            # Each line is "<utt_id> <dialect>" separated by a single space.
            utt_id, dialect = line.split(" ")
            dialect_dict[utt_id] = dialect

In [4]:
def add_dialect(lines, dialect_dict, path):
    """Rewrite the JSONL file at ``path`` with a ``text_language`` field on every record.

    Args:
        lines: Iterable of JSON lines, one object per line, each holding a
            ``key`` field. Blank lines are ignored.
        dialect_dict: Mapping from record key to dialect label.
        path: File that is replaced by the updated records (UTF-8).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``key`` field.
    """
    import json

    updated_lines = []
    for line in lines:
        if not line.strip():
            continue
        data = json.loads(line)
        # Keys absent from dialect_dict are labelled "Mandarin".
        data['text_language'] = dialect_dict.get(data['key'], "Mandarin")
        updated_lines.append(json.dumps(data, ensure_ascii=False) + '\n')

    # Write next to the target, then swap in one step: the original file is
    # never missing, and directory names containing "audio_datasets" are not
    # touched (a str.replace on the whole path would rewrite them).
    save_path = path + ".tmp"
    with open(save_path, 'w', encoding='utf-8') as file:
        file.writelines(updated_lines)
    os.replace(save_path, path)

    print("文件更新完成。")

In [5]:
# Sanity check: show the first ten (key, dialect) pairs of the lookup table.
first_10_items = list(dialect_dict.items())[0:10]

for utt_key, utt_dialect in first_10_items:
    print(f'{utt_key}: {utt_dialect}')

1000001_0b1a33a3: Mandarin
1000001_0e9793ff: Mandarin
1000001_11f3978b: Mandarin
1000001_1c4b6ce5: Mandarin
1000001_2c863844: Mandarin
1000001_3c84b37d: Mandarin
1000001_492740a5: Mandarin
1000001_5c8b5985: Mandarin
1000001_63740949: Mandarin
1000001_6a7435f1: Mandarin


In [6]:
# Add the text_language field to every collected JSONL file, in place.
for data_path in data_list:
    print("Processing: ", data_path)
    # Read as UTF-8, matching the encoding add_dialect uses when writing.
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    add_dialect(lines, dialect_dict, data_path)

Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/test/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/dev/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/train/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/test/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/dev/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/train/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/test/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/dev/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/train/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/exa

# 新建口音字典

In [7]:
import os

In [8]:
# Read every "utt2subdialect" file below the SubdialectID task folder and
# collect the distinct dialect labels, in order of first appearance.
dialect_root = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/SubdialectID"
dialect_list = []
for folder, _subdirs, file_names in os.walk(dialect_root):
    if "utt2subdialect" not in file_names:
        continue
    with open(os.path.join(folder, "utt2subdialect"), 'r') as label_file:
        for raw_line in label_file:
            line = raw_line.strip()
            if not line:
                continue
            # Each line is "<utt_id> <dialect>"; only the label is needed.
            _utt_id, dialect = line.split(" ")
            if dialect not in dialect_list:
                dialect_list.append(dialect)

In [9]:
# Write the dialect labels to dialects.txt, one label per line.
save_dialect_path = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/zh_token_list/char/dialects.txt"
with open(save_dialect_path, 'w') as out_file:
    out_file.writelines(f"{label}\n" for label in dialect_list)

# 将重复的内容清洗掉

In [1]:
import os
# Start folder for the walk that collects the JSONL files to de-duplicate.
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/"

In [2]:
# Collect every *.jsonl file below `root`.
# The loop variable is deliberately not called `root`: rebinding it would
# leave `root` pointing at the last visited sub-folder, and running this
# cell again would then walk only that sub-folder.
pending_list = []
for folder, _subdirs, file_names in os.walk(root):
    for file_name in file_names:
        if file_name.endswith(".jsonl"):
            pending_list.append(os.path.join(folder, file_name))
print(pending_list)

['/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Southwestern/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespee

In [3]:
def clean_file(file_path):
    """Remove duplicated lines from the UTF-8 text file ``file_path``, in place.

    The first occurrence of each line is kept and the original order is
    preserved, so the result is the same on every run. A final line without
    a trailing newline is treated as if it had one.

    Args:
        file_path: Path of the file to rewrite.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print(file_path, "size: ", len(lines))

    seen = set()
    unique_lines = []
    for line in lines:
        if not line.endswith("\n"):
            line += "\n"
        if line not in seen:
            seen.add(line)
            unique_lines.append(line)

    print(file_path, "size: ", len(unique_lines))
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(unique_lines)
    print(file_path, "cleaned!")

In [4]:
# De-duplicate every file collected in pending_list; clean_file rewrites each one in place.
for file_path in pending_list:
    clean_file(file_path)

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl size:  2268
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl size:  2268
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl cleaned!
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl size:  2373
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl size:  2373
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl cleaned!
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl size:  27586
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl size:  27586
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl cleaned!
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yi

In [9]:
import os
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/"

# Delete every file whose name contains "bac" or "hubert".
# A single combined test removes each file once; two separate tests would
# try to delete a name matching both patterns twice and fail on the second
# attempt. The loop variable is not called `root`, so `root` keeps pointing
# at the start folder.
for folder, _subdirs, file_names in os.walk(root):
    for file in file_names:
        if "bac" in file or "hubert" in file:
            print(file)
            os.remove(os.path.join(folder, file))
            print(file, "removed!")
    

hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9la

# 将data3里面的所有phase2都去掉

In [13]:
import os

In [14]:
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3"

# Delete every *.jsonl file whose name does not end with
# "audio_datasets.jsonl" (for instance earlier *_phase1.jsonl outputs).
# The loop variable is not called `root`, so `root` keeps pointing at the
# start folder.
for folder, _subdirs, file_names in os.walk(root):
    for file_name in file_names:
        if not file_name.endswith(".jsonl"):
            continue
        file_path = os.path.join(folder, file_name)
        if not file_path.endswith("audio_datasets.jsonl"):
            print(file_path)
            os.remove(file_path)
            

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/test/audio_datasets_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/test/audio_datasets_phase1_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/dev/audio_datasets_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/dev/audio_datasets_phase1_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/train/audio_datasets_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/train/audio_datasets_phase1_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/test/audio_datasets_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/test/audio_datasets_phase1_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/dev/audio_datasets_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/dev/audio_datasets_phase1_phase1.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/train/audio_datasets_

In [15]:
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3"

# For every audio_datasets.jsonl, write a sibling audio_datasets_phase1.jsonl
# that keeps only the lines not containing the substring "phase2".
# The loop variable is not called `root`, so `root` keeps pointing at the
# start folder.
for folder, _subdirs, file_names in os.walk(root):
    for file_name in file_names:
        if not file_name.endswith("audio_datasets.jsonl"):
            continue
        file_path = os.path.join(folder, file_name)
        print(file_path)
        with open(file_path, 'r', encoding='utf-8') as source:
            new_lines = [line for line in source if "phase2" not in line]
        # Rename only the file name: a replace on the full path would also
        # rewrite any directory whose name contains "audio_datasets".
        clean_file_path = os.path.join(
            folder, file_name.replace("audio_datasets", "audio_datasets_phase1")
        )
        with open(clean_file_path, 'w', encoding='utf-8') as target:
            target.writelines(new_lines)

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/test/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/dev/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/AS/train/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/test/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/dev/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/WD/train/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/test/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/dev/audio_datasets.jsonl
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/dat

In [1]:
!nvidia-smi

Sun Aug  4 18:37:09 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4090        Off |   00000000:31:00.0 Off |                  Off |
|  0%   37C    P0             44W /  450W |       1MiB /  24564MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 4090       

# 统计语音数量

In [16]:
import os

# Root of the per-dialect ("ES") manifests of data3.
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES"

# Collect every phase-1 manifest below `root`.  The walk uses its own loop
# names so that `root` keeps pointing at the configured directory afterwards
# (re-using `root` as the loop variable would overwrite it).
file_list = []
for dirpath, _dirnames, filenames in os.walk(root):
    for filename in filenames:
        if filename.endswith("audio_datasets_phase1.jsonl"):
            file_list.append(os.path.join(dirpath, filename))
print(file_list)

['/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/dev/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/test/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/train/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiao-Liao/dev/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiao-Liao/test/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiao-Liao/train/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Southwestern/dev/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Southwestern/test/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Southwestern/train/audio_datasets_phase1.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-

In [21]:
# Count the utterances per dialect over all non-test manifests.
# The dialect name is the third path component from the end
# (.../<dialect>/<split>/<manifest>).
len_dict = {}
for file_path in file_list:
    if "test" in file_path:
        continue
    # Named `dialect` rather than `id` so the builtin id() is not shadowed.
    dialect = file_path.split("/")[-3]
    with open(file_path, 'r') as f:
        # One record per line: count lines without loading the whole file.
        n_utts = sum(1 for _ in f)
    len_dict[dialect] = len_dict.get(dialect, 0) + n_utts
    

In [22]:
# Display the per-dialect utterance counts held in len_dict.
len_dict

{'Northeastern': 5198,
 'Jiao-Liao': 21829,
 'Southwestern': 48175,
 'Jiang-Huai': 29959,
 'Beijing': 2533,
 'Mandarin': 340882,
 'Ji-Lu': 36823,
 'Zhongyuan': 51882,
 'Lan-Yin': 22299}

In [23]:
# Reference utterance count per dialect that the measured counts are compared
# against (presumably the officially reported numbers -- TODO confirm source).
report_len = {
    "Mandarin": 370819,
    "Beijing": 2538,
    "Southwestern": 48465,
    "Jiao-Liao": 21847,
    "Northeastern": 5205,
    "Jiang-Huai": 30008,
    "Lan-Yin": 22324,
    "Ji-Lu": 36921,
    "Zhongyuan": 52012,
}

In [24]:
# Difference per dialect: reference count minus the count found in the manifests.
diff_dict = {name: report_len[name] - count for name, count in len_dict.items()}
print(diff_dict)

{'Northeastern': 7, 'Jiao-Liao': 18, 'Southwestern': 290, 'Jiang-Huai': 49, 'Beijing': 5, 'Mandarin': 29937, 'Ji-Lu': 98, 'Zhongyuan': 130, 'Lan-Yin': 25}


In [25]:
utt2dia = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/ASR/test/utt2subdialect"

# Tally the utterances per sub-dialect from the "<utt_id> <dialect>" listing,
# skipping every utterance whose id contains "p2".
dia_dict = {}
with open(utt2dia, 'r') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise fail to unpack.
            continue
        # `utt_id` instead of `id` so the builtin id() stays usable.
        utt_id, dia = line.split(" ")
        if "p2" in utt_id:
            continue
        dia_dict[dia] = dia_dict.get(dia, 0) + 1

In [26]:
# Display the per-dialect tally held in dia_dict.
dia_dict

{'Northeastern': 351,
 'Mandarin': 2045,
 'Ji-Lu': 2809,
 'Zhongyuan': 3239,
 'Jiao-Liao': 1443,
 'Jiang-Huai': 2271,
 'Beijing': 265,
 'Southwestern': 2693,
 'Lan-Yin': 1652}

In [27]:
# Count the utterances per dialect over the test manifests only.
# The dialect name is the third path component from the end
# (.../<dialect>/<split>/<manifest>).
len_dict = {}
for file_path in file_list:
    if "test" not in file_path:
        continue
    # Named `dialect` rather than `id` so the builtin id() is not shadowed.
    dialect = file_path.split("/")[-3]
    with open(file_path, 'r') as f:
        # One record per line: count lines without loading the whole file.
        n_utts = sum(1 for _ in f)
    len_dict[dialect] = len_dict.get(dialect, 0) + n_utts
len_dict

{'Northeastern': 350,
 'Jiao-Liao': 1443,
 'Southwestern': 2684,
 'Jiang-Huai': 2268,
 'Beijing': 265,
 'Mandarin': 2040,
 'Ji-Lu': 2806,
 'Zhongyuan': 3225,
 'Lan-Yin': 1646}

In [29]:
# Per-dialect gap: tally in dia_dict minus the count found in the manifests.
diff_dict = {name: dia_dict[name] - count for name, count in len_dict.items()}
diff_dict

{'Northeastern': 1,
 'Jiao-Liao': 0,
 'Southwestern': 9,
 'Jiang-Huai': 3,
 'Beijing': 0,
 'Mandarin': 5,
 'Ji-Lu': 3,
 'Zhongyuan': 14,
 'Lan-Yin': 6}

In [4]:
import json

# Tally the records of a JSON-lines list per "sample_acc" value.
dia_dict = {}
# NOTE(review): the variable is named "dev" but the path points at
# train_data.list -- confirm which split is intended.
contrast_dev_path = "/data/NAS_PLUS/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/ASContrastive/train_data.list"
with open(contrast_dev_path, 'r') as f:
    lines = f.readlines()
for raw_line in lines:
    record = json.loads(raw_line.strip())
    accent = record["sample_acc"]
    dia_dict[accent] = dia_dict.get(accent, 0) + 1

In [5]:
# Display the per-accent tally held in dia_dict.
dia_dict

{'Zhongyuan': 14753,
 'Northeastern': 1766,
 'Ji-Lu': 13000,
 'Mandarin': 136415,
 'Southwestern': 14117,
 'Jiao-Liao': 7169,
 'Beijing': 784,
 'Jiang-Huai': 9623,
 'Lan-Yin': 6716}

# 更新data2到data4

In [1]:
import os

In [2]:
import shutil

data_2 = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/"
data_4 = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data4/"
os.makedirs(data_4, exist_ok=True)

# Copy every file of data2 into data4, mirroring the directory layout.
# shutil.copy is used instead of shelling out to `cp`, so file names with
# spaces or shell metacharacters are handled and a failed copy raises instead
# of being ignored.  The target is derived from the path relative to data_2
# rather than by replacing the substring "data2" anywhere in the path.
for dirpath, _dirnames, filenames in os.walk(data_2):
    if not filenames:
        continue
    target_dir = os.path.normpath(
        os.path.join(data_4, os.path.relpath(dirpath, data_2))
    )
    os.makedirs(target_dir, exist_ok=True)
    for filename in filenames:
        shutil.copy(os.path.join(dirpath, filename),
                    os.path.join(target_dir, filename))

In [4]:
# For every audio_datasets.jsonl under data4/WD: set "text_language" to None
# on each record whose "source" path does not contain "phase1".  The result is
# written next to the input as audio_datasets_2.jsonl.
import json
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data4/WD"
for dirpath, _dirnames, filenames in os.walk(root):
    for filename in filenames:
        if not filename.endswith("audio_datasets.jsonl"):
            continue
        file_path = os.path.join(dirpath, filename)
        print(file_path)
        # Records hold Chinese text and are re-serialised with
        # ensure_ascii=False, so read and write explicitly as UTF-8 instead of
        # relying on the locale's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        new_lines = []
        for line in lines:
            item = json.loads(line)
            if "phase1" not in item["source"]:
                item["text_language"] = None
            new_lines.append(json.dumps(item, ensure_ascii=False) + "\n")
        print(new_lines[:10])
        # Only the file name is rewritten, never a directory component.
        clean_file_path = os.path.join(
            dirpath, filename.replace("audio_datasets", "audio_datasets_2")
        )
        with open(clean_file_path, 'w', encoding='utf-8') as f:
            f.writelines(new_lines)

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data4/WD/test/audio_datasets.jsonl
['{"key": "1005596_p2_b29a8b0d", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005596/phase2/1005596_p2_b29a8b0d.wav", "source_len": 701, "target": "四 招 消 除 隐 私 外 泄 风 险 注 销", "target_len": 12, "text_language": null}\n', '{"key": "1005598_p2_65747a0c", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005598/phase2/1005598_p2_65747a0c.wav", "source_len": 558, "target": "直 到 你 绝 望 之 后 参 加 他 们 的 旅 行 为 止", "target_len": 16, "text_language": null}\n', '{"key": "1005608_42545eba", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005608/phase1/1005608_42545eba.wav", "source_len": 469, "target": "一 百 零 八 元 极 食 生 活 美 学 即 艺 术 美 食", "target_len": 16, "text_language": "Mandarin"}\n', '{"key": "1005608_p2_29a38a6f", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005608/phase2/1005608_p2_29a38a6f.wav", "source_len": 263, "target": "十 分 钟 后 王 女 士 取 消 订 单", "target_len"

In [5]:
# Spot check: print the path and the first 10 lines of every
# audio_datasets_2.jsonl found under data4/WD.
import json
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data4/WD"
for dirpath, _subdirs, names in os.walk(root):
    for name in names:
        if not name.endswith("audio_datasets_2.jsonl"):
            continue
        file_path = os.path.join(dirpath, name)
        print(file_path)
        with open(file_path, 'r') as f:
            lines = list(f)
        print(lines[:10])

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data4/WD/test/audio_datasets_2.jsonl
['{"key": "1005596_p2_b29a8b0d", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005596/phase2/1005596_p2_b29a8b0d.wav", "source_len": 701, "target": "四 招 消 除 隐 私 外 泄 风 险 注 销", "target_len": 12, "text_language": null}\n', '{"key": "1005598_p2_65747a0c", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005598/phase2/1005598_p2_65747a0c.wav", "source_len": 558, "target": "直 到 你 绝 望 之 后 参 加 他 们 的 旅 行 为 止", "target_len": 16, "text_language": null}\n', '{"key": "1005608_42545eba", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005608/phase1/1005608_42545eba.wav", "source_len": 469, "target": "一 百 零 八 元 极 食 生 活 美 学 即 艺 术 美 食", "target_len": 16, "text_language": "Mandarin"}\n', '{"key": "1005608_p2_29a38a6f", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005608/phase2/1005608_p2_29a38a6f.wav", "source_len": 263, "target": "十 分 钟 后 王 女 士 取 消 订 单", "target_le

In [6]:
# Rename every audio_datasets_2.jsonl under data4/WD to audio_datasets.jsonl.
import os

root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data4/WD"
for dirpath, _subdirs, names in os.walk(root):
    rewritten = [name for name in names if name.endswith("audio_datasets_2.jsonl")]
    for name in rewritten:
        old_path = os.path.join(dirpath, name)
        os.rename(old_path, old_path.replace("audio_datasets_2", "audio_datasets"))

# 更新data4到data5
data5只有phase1的内容

In [1]:
import os

In [3]:
import shutil

data_4 = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data4/"
data_5 = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/"
os.makedirs(data_5, exist_ok=True)

# Copy every file of data4 into data5, mirroring the directory layout.
# shutil.copy is used instead of shelling out to `cp`, so file names with
# spaces or shell metacharacters are handled and a failed copy raises instead
# of being ignored.  The target is derived from the path relative to data_4
# rather than by replacing the substring "data4" anywhere in the path.
for dirpath, _dirnames, filenames in os.walk(data_4):
    if not filenames:
        continue
    target_dir = os.path.normpath(
        os.path.join(data_5, os.path.relpath(dirpath, data_4))
    )
    os.makedirs(target_dir, exist_ok=True)
    for filename in filenames:
        shutil.copy(os.path.join(dirpath, filename),
                    os.path.join(target_dir, filename))

In [4]:
# For every audio_datasets.jsonl under data5/WD: keep only the records whose
# "source" path contains "phase1" and drop all others.  The result is written
# next to the input as audio_datasets_2.jsonl.
import json
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD"
for dirpath, _dirnames, filenames in os.walk(root):
    for filename in filenames:
        if not filename.endswith("audio_datasets.jsonl"):
            continue
        file_path = os.path.join(dirpath, filename)
        print(file_path)
        # Records hold Chinese text and are re-serialised with
        # ensure_ascii=False, so read and write explicitly as UTF-8 instead of
        # relying on the locale's default encoding.
        with open(file_path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        new_lines = []
        for line in lines:
            item = json.loads(line)
            if "phase1" in item["source"]:
                new_lines.append(json.dumps(item, ensure_ascii=False) + "\n")
        print(new_lines[:10])
        # Only the file name is rewritten, never a directory component.
        clean_file_path = os.path.join(
            dirpath, filename.replace("audio_datasets", "audio_datasets_2")
        )
        with open(clean_file_path, 'w', encoding='utf-8') as f:
            f.writelines(new_lines)

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/test/audio_datasets.jsonl
['{"key": "1005608_42545eba", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005608/phase1/1005608_42545eba.wav", "source_len": 469, "target": "一 百 零 八 元 极 食 生 活 美 学 即 艺 术 美 食", "target_len": 16, "text_language": "Mandarin"}\n', '{"key": "1005610_4959cdde", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005610/phase1/1005610_4959cdde.wav", "source_len": 731, "target": "是 一 九 九 二 年 四 月 一 日 开 始 实 施 的", "target_len": 15, "text_language": "Mandarin"}\n', '{"key": "1005610_f637f4ff", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005610/phase1/1005610_f637f4ff.wav", "source_len": 645, "target": "创 造 出 独 一 无 二 的 地 道 澳 门 葡 国 菜", "target_len": 15, "text_language": "Mandarin"}\n', '{"key": "1005621_69034f9e", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005621/phase1/1005621_69034f9e.wav", "source_len": 767, "target": "吃 蟹 时 和 吃 蟹 后 一 小 时 内 忌 饮 茶 水", "targe

In [5]:
# Spot check: print the path and the first 10 lines of every
# audio_datasets_2.jsonl found under data5/WD.
import json
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD"
for dirpath, _subdirs, names in os.walk(root):
    for name in names:
        if not name.endswith("audio_datasets_2.jsonl"):
            continue
        file_path = os.path.join(dirpath, name)
        print(file_path)
        with open(file_path, 'r') as f:
            lines = list(f)
        print(lines[:10])

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/test/audio_datasets_2.jsonl
['{"key": "1005608_42545eba", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005608/phase1/1005608_42545eba.wav", "source_len": 469, "target": "一 百 零 八 元 极 食 生 活 美 学 即 艺 术 美 食", "target_len": 16, "text_language": "Mandarin"}\n', '{"key": "1005610_4959cdde", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005610/phase1/1005610_4959cdde.wav", "source_len": 731, "target": "是 一 九 九 二 年 四 月 一 日 开 始 实 施 的", "target_len": 15, "text_language": "Mandarin"}\n', '{"key": "1005610_f637f4ff", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005610/phase1/1005610_f637f4ff.wav", "source_len": 645, "target": "创 造 出 独 一 无 二 的 地 道 澳 门 葡 国 菜", "target_len": 15, "text_language": "Mandarin"}\n', '{"key": "1005621_69034f9e", "source": "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1005621/phase1/1005621_69034f9e.wav", "source_len": 767, "target": "吃 蟹 时 和 吃 蟹 后 一 小 时 内 忌 饮 茶 水", "tar

In [6]:
# Rename every audio_datasets_2.jsonl under data5/WD to audio_datasets.jsonl.
import os

root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD"
for dirpath, _subdirs, names in os.walk(root):
    rewritten = [name for name in names if name.endswith("audio_datasets_2.jsonl")]
    for name in rewritten:
        old_path = os.path.join(dirpath, name)
        os.rename(old_path, old_path.replace("audio_datasets_2", "audio_datasets"))

In [2]:
# Remove from the data5 dev manifest every line that also occurs in the test
# manifest, then overwrite the dev manifest with what is left.
from collections import Counter

test_json = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/test/audio_datasets.jsonl"
dev_json = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/dev/audio_datasets.jsonl"

with open(test_json, 'r') as f:
    test_lines = f.readlines()

with open(dev_json, 'r') as f:
    dev_lines = f.readlines()

# Each test line cancels at most one identical dev line (the earliest one still
# present).  Counting the test lines once and making a single pass over dev
# gives the same result as repeated `in` / `remove` on the list, without the
# quadratic list scans.
pending = Counter(test_lines)
kept_lines = []
a = 0  # number of dev lines removed
for line in dev_lines:
    if pending[line] > 0:
        pending[line] -= 1
        a += 1
    else:
        kept_lines.append(line)
dev_lines = kept_lines
print(a)

with open(dev_json, 'w') as f:
    f.writelines(dev_lines)

14687


In [29]:
# Update the test split (the original note says "test5"; presumably data5 is
# meant -- TODO confirm).
import os
import re
# root_dir: directory that the paths listed in wav.scp are joined onto.
root_dir = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech"
# test_dir: directory holding the wav.scp / utt2subdialect / text listings.
test_dir = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/SubdialectID/test_phase1/"
# output_dir: destination directory for the regenerated listings; created if missing.
output_dir = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/test2/"
os.makedirs(output_dir, exist_ok=True)

In [30]:
def gather_wav_scp(wav_scp, dict):
    """Register every utterance of a wav.scp listing whose audio file exists.

    Args:
        wav_scp: Path to a listing with one ``<utt_id> <path>`` entry per line.
            ``<path>`` is joined onto the module-level ``root_dir``.
        dict: Mapping to fill in.  For each utterance whose joined path exists
            on disk, ``dict[utt_id]`` is (re)set to ``{"path": <joined path>}``.
            Utterances whose audio is missing are not added.

    Returns:
        The same ``dict`` object that was passed in.

    Raises:
        ValueError: If a non-blank line has no path after the utterance id.
    """
    with open(wav_scp, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no entry; skip them
                # instead of failing on the unpack below.
                continue
            # Split at the first whitespace run only, so a path that itself
            # contains spaces stays intact.
            utt_id, rel_path = line.split(None, 1)
            path = os.path.join(root_dir, rel_path)
            if os.path.exists(path):
                dict[utt_id] = {"path": path}
    return dict

def gether_dialect_info(utt2subdialect, dict):
    """Attach the dialect label to every utterance already present in ``dict``.

    Args:
        utt2subdialect: Path to a listing with one ``<utt_id> <dialect>`` entry
            per line.
        dict: Mapping ``utt_id -> info dict``.  For each listed utterance that
            is present, ``info["dialect"]`` is set.

    Returns:
        The same ``dict`` object that was passed in.

    Raises:
        ValueError: If a non-blank line does not consist of exactly two fields.
    """
    with open(utt2subdialect, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            utt_id, dialect = line.split()
            entry = dict.get(utt_id)
            if entry is None:
                # The utterance was never registered (gather_wav_scp leaves out
                # utterances whose audio file is missing); there is nothing to
                # annotate, so skip it rather than raise KeyError.
                continue
            entry["dialect"] = dialect
    return dict

def gather_text_info(text, dict):
    """Attach the transcript to every utterance already present in ``dict``.

    Args:
        text: Path to a listing with one ``<utt_id> <transcript>`` entry per
            line; the transcript is everything after the first whitespace run.
        dict: Mapping ``utt_id -> info dict``.  For each listed utterance that
            is present, ``info["text"]`` is set.

    Returns:
        The same ``dict`` object that was passed in.

    Raises:
        ValueError: If a non-blank line has no transcript after the id.
    """
    # Transcripts are Chinese text: decode explicitly as UTF-8 rather than
    # with whatever the locale default happens to be.
    with open(text, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            utt_id, transcript = line.split(None, 1)
            entry = dict.get(utt_id)
            if entry is None:
                # The utterance was never registered (gather_wav_scp leaves out
                # utterances whose audio file is missing); skip it rather than
                # raise KeyError.
                continue
            entry["text"] = transcript
    return dict

def contains_non_chinese(text):
    """Return True if any character of ``text`` lies outside U+4E00..U+9FFF."""
    for ch in text:
        if ch < '\u4e00' or ch > '\u9fff':
            return True
    return False

In [31]:
# Build the test set: audio path, dialect and transcript per utterance.
test_dict = {}

test_wav_scp = os.path.join(test_dir, "wav.scp")
test_utts2subdialect = os.path.join(test_dir, "utt2subdialect")
test_text = os.path.join(test_dir, "text")
test_dict = gather_wav_scp(test_wav_scp, test_dict)
test_dict = gether_dialect_info(test_utts2subdialect, test_dict)
test_dict = gather_text_info(test_text, test_dict)

# Keep only utterances that have a transcript made up purely of Chinese
# characters.  Entries without a "text" field are dropped as well instead of
# raising KeyError.  The loop names avoid shadowing the builtin id().
clean_test_dict = {
    utt_id: info
    for utt_id, info in test_dict.items()
    if "text" in info and not contains_non_chinese(info["text"])
}

test_dict = clean_test_dict

print("Done!")
print("Test dataset size: ", len(test_dict))

Done!
Test dataset size:  44240


In [32]:
# Print the first 10 entries of test_dict.
for position, (key, value) in enumerate(test_dict.items()):
    if position == 10:
        break
    print(key, value)

1000043_20cfb79d {'path': '/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1000043/phase1/1000043_20cfb79d.wav', 'dialect': 'Mandarin', 'text': '参与游戏赢取现场抵用卷购车现金券'}
1000043_2412bf4c {'path': '/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1000043/phase1/1000043_2412bf4c.wav', 'dialect': 'Mandarin', 'text': '又可以实现有害垃圾源头回收'}
1000043_2e10312d {'path': '/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1000043/phase1/1000043_2e10312d.wav', 'dialect': 'Mandarin', 'text': '参加了的考试目前成绩也没有出来'}
1000043_33fe2ef2 {'path': '/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1000043/phase1/1000043_33fe2ef2.wav', 'dialect': 'Mandarin', 'text': '参考书的阅读要花费较多的时间'}
1000043_3c3d4a10 {'path': '/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1000043/phase1/1000043_3c3d4a10.wav', 'dialect': 'Mandarin', 'text': '参展企业可以将自己企业的新产品'}
1000043_58fb23b5 {'path': '/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Audio/1000043/phase1/1000043_58fb23b5.wav', 'dialect': 'Northeastern', 'text': '参与风筝表演的都是世界级顶尖高手'}
1000043_64cf3

In [33]:

wav_scp = os.path.join(output_dir, "wav.scp")
text_path = os.path.join(output_dir, "text")

# Write "<utt_id> <path>" and "<utt_id> <transcript>" listings.  Transcripts
# are Chinese, so both files are written explicitly as UTF-8 instead of the
# locale's default encoding.
with open(wav_scp, 'w', encoding='utf-8') as f:
    f.writelines(f"{utt_id} {info['path']}\n" for utt_id, info in test_dict.items())

with open(text_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{utt_id} {info['text']}\n" for utt_id, info in test_dict.items())

# Re-read both files and compare their line counts; context managers make
# sure the handles opened for the check are closed again.
with open(wav_scp, encoding='utf-8') as f:
    n_wav_lines = sum(1 for _ in f)
with open(text_path, encoding='utf-8') as f:
    n_text_lines = sum(1 for _ in f)
print("size: ", n_wav_lines, "check", n_wav_lines == n_text_lines)

print("Done!")

size:  44240 check True
Done!


In [35]:
# Read every "utt2subdialect" file below dialect_root into a
# utt_id -> dialect dictionary.
dialect_root = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/SubdialectID/test_phase1/"
dialect_dict = {}
for dirpath, _dirnames, filenames in os.walk(dialect_root):
    if "utt2subdialect" not in filenames:
        continue
    with open(os.path.join(dirpath, "utt2subdialect"), 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would otherwise fail to unpack.
                continue
            # `utt_id` instead of `id` so the builtin id() stays usable.
            utt_id, dialect = line.split(" ")
            dialect_dict[utt_id] = dialect

In [36]:
# Print the first 10 entries of dialect_dict.
for position, (key, value) in enumerate(dialect_dict.items()):
    if position == 10:
        break
    print(key, value)

1000043_20cfb79d Mandarin
1000043_2412bf4c Mandarin
1000043_2e10312d Mandarin
1000043_33fe2ef2 Mandarin
1000043_3c3d4a10 Mandarin
1000043_58fb23b5 Northeastern
1000043_64cf3fc8 Mandarin
1000043_6674a7fd Northeastern
1000043_671889da Mandarin
1000043_71dd2737 Northeastern


In [37]:
def add_dialect(lines, dialect_dict, path):
    """Label each JSON-lines record with its dialect and rewrite ``path``.

    Args:
        lines: Iterable of JSON strings, one record per item; every record
            must contain a "key" field.  Blank lines are ignored.
        dialect_dict: Mapping from record key to dialect name.
        path: File that is replaced with the labelled records.

    Each record gets a "text_language" field holding its dialect; keys that
    are missing from ``dialect_dict`` fall back to "Mandarin".

    Returns:
        None.  The file at ``path`` is rewritten and a completion message is
        printed.
    """
    import json

    # Write to a sibling temporary file first and then move it over `path`.
    # Deriving the temporary name from `path` itself (instead of substituting
    # "audio_datasets") guarantees it differs from `path`, so the output can
    # never be written and then deleted again.
    save_path = path + ".tmp"

    updated_lines = []
    for line in lines:
        if not line.strip():
            continue
        data = json.loads(line)
        # Look the dialect up by the record's key; default to "Mandarin".
        data['text_language'] = dialect_dict.get(data['key'], "Mandarin")
        updated_lines.append(json.dumps(data, ensure_ascii=False) + '\n')

    with open(save_path, 'w', encoding='utf-8') as file:
        file.writelines(updated_lines)

    # os.replace overwrites the destination in one step, so there is no
    # window in which `path` has been removed but not yet recreated.
    os.replace(save_path, path)

    print("文件更新完成。")

In [38]:
# Load the test2 manifest and rewrite it with dialect labels.
data_path = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/test2/audio_datasets.jsonl"
with open(data_path, 'r') as f:
    lines = list(f)
add_dialect(lines, dialect_dict, data_path)

文件更新完成。


# data6
## 把test data 从dev中删掉

In [10]:
import os

In [12]:
import shutil

# NOTE(review): the source variable is called data_5 but holds the data2
# path; the name is kept because other cells may rely on it -- confirm.
data_5 = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/"
data_6 = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data6/"
os.makedirs(data_6, exist_ok=True)

# Copy every file of data2 into data6, mirroring the directory layout.
# shutil.copy is used instead of shelling out to `cp`, so file names with
# spaces or shell metacharacters are handled and a failed copy raises instead
# of being ignored.  The target is derived from the path relative to the
# source root rather than by replacing the substring "data2" in the path.
for dirpath, _dirnames, filenames in os.walk(data_5):
    if not filenames:
        continue
    target_dir = os.path.normpath(
        os.path.join(data_6, os.path.relpath(dirpath, data_5))
    )
    os.makedirs(target_dir, exist_ok=True)
    for filename in filenames:
        shutil.copy(os.path.join(dirpath, filename),
                    os.path.join(target_dir, filename))

In [13]:
# Remove the test data from dev -- step 1: load both data6 manifests as
# lists of lines.
import json

test_json = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data6/WD/test/audio_datasets.jsonl"
dev_json = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data6/WD/dev/audio_datasets.jsonl"

with open(test_json, 'r') as f:
    test_lines = list(f)

with open(dev_json, 'r') as f:
    dev_lines = list(f)
    



In [14]:
from collections import Counter

# Drop from dev_lines every line that also occurs in test_lines.  Each test
# line cancels at most one identical dev line (the earliest one still
# present).  Counting the test lines once and making a single pass over dev
# gives the same result as repeated `in` / `remove` on the list, without the
# quadratic list scans.
pending = Counter(test_lines)
kept_lines = []
a = 0  # number of dev lines removed
for line in dev_lines:
    if pending[line] > 0:
        pending[line] -= 1
        a += 1
    else:
        kept_lines.append(line)
# Update the existing list object in place, as list.remove() did.
dev_lines[:] = kept_lines
print(a)

14687


In [15]:
# Save dev: overwrite the file at dev_json with the lines in dev_lines.
with open(dev_json, 'w') as f:
    f.writelines(dev_lines)

In [2]:
# From the data5 test manifest (ASR-AR set) drop every record labelled
# "Beijing" or "Northeastern"; the remaining lines are written unchanged to
# audio_datasets2.jsonl next to the original.
import json
ori_path = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/test/audio_datasets.jsonl"
new_path = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data5/WD/test/audio_datasets2.jsonl"
with open(ori_path, 'r') as f:
    lines = f.readlines()

excluded_dialects = ("Beijing", "Northeastern")
new_lines = [
    raw_line
    for raw_line in lines
    if json.loads(raw_line)["text_language"] not in excluded_dialects
]

with open(new_path, 'w') as f:
    f.writelines(new_lines)

print("Done!")

Done!
