# 处理KeSpeech-ASR数据集

|- Each_subdialect    
&emsp;|- Mandarin   
&emsp;|- BJ   
&emsp;|- SW   
&emsp;|- ZY    
&emsp;|- NE   
&emsp;|- LY  
&emsp;|- JH    
&emsp;|- JLu   
&emsp;|- JLo   
|- All_subdialect   
|- Mandarin  
|- Whole_training_set  



In [13]:
import os
import re

# Location of the KeSpeech corpus and of its ASR task metadata.
root_dir = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech"
audio_dir = os.path.join(root_dir, "Audio")
asr_text_dir = os.path.join(root_dir, "Tasks/ASR")

# Train and dev metadata each live in two phase directories; the individual
# directory names stay available next to the lists.
train_list = [os.path.join(asr_text_dir, name) for name in ("train_phase1", "train_phase2")]
train_phase_1_dir, train_phase_2_dir = train_list
dev_list = [os.path.join(asr_text_dir, name) for name in ("dev_phase1", "dev_phase2")]
dev_phase_1_dir, dev_phase_2_dir = dev_list
test_dir = os.path.join(asr_text_dir, "test")

# Root directory for the files generated by the following cells.
output_dir = "/ssd/zhuang/code/FunASR2024/examples/kespeech/DATA/data"
os.makedirs(output_dir, exist_ok=True)


In [14]:
def gather_wav_scp(wav_scp, dict):
    """Register every utterance of a ``wav.scp`` file whose audio file exists.

    Each non-blank line is expected to look like ``<utt_id> <path>``. The path
    is joined onto the module-level ``root_dir``; utterances whose joined path
    does not exist on disk are left out.

    Args:
        wav_scp: Path of the ``wav.scp`` file to read.
        dict: Mapping updated in place. Every kept utterance id is (re)set to
            ``{"path": <joined path>}``.

    Returns:
        The mapping that was passed in.

    Raises:
        ValueError: If a non-blank line has no path after the utterance id.
    """
    with open(wav_scp, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank or trailing empty line is not an error.
                continue
            # Split once only, so that a path containing spaces stays intact.
            utt_id, rel_path = line.split(maxsplit=1)
            path = os.path.join(root_dir, rel_path)
            if os.path.exists(path):
                dict[utt_id] = {"path": path}
    return dict

def gether_dialect_info(utt2subdialect, dict):
    """Attach the subdialect label of each known utterance to its entry.

    Each non-blank line of the file is expected to be ``<utt_id> <dialect>``.

    Args:
        utt2subdialect: Path of the ``utt2subdialect`` file to read.
        dict: Mapping ``utt_id -> info dict``, updated in place. Only ids that
            are already present receive a ``"dialect"`` key.

    Returns:
        The mapping that was passed in.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    with open(utt2subdialect, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            utt_id, dialect = line.split()
            entry = dict.get(utt_id)
            if entry is None:
                # The id was never registered (gather_wav_scp drops utterances
                # whose audio is missing), so there is nothing to annotate.
                continue
            entry["dialect"] = dialect
    return dict

def gather_text_info(text, dict):
    """Attach the transcript of each known utterance to its entry.

    Each non-blank line of the file is expected to be ``<utt_id> <transcript>``;
    everything after the first space is taken as the transcript.

    Args:
        text: Path of the transcript file to read.
        dict: Mapping ``utt_id -> info dict``, updated in place. Only ids that
            are already present receive a ``"text"`` key.

    Returns:
        The mapping that was passed in.

    Raises:
        ValueError: If a non-blank line has no transcript after the id.
    """
    with open(text, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            # A separate local name keeps the `text` parameter (the file path)
            # from being overwritten by a transcript.
            utt_id, transcript = line.split(" ", 1)
            entry = dict.get(utt_id)
            if entry is None:
                # The id was never registered (gather_wav_scp drops utterances
                # whose audio is missing), so there is nothing to annotate.
                continue
            entry["text"] = transcript
    return dict

def contains_non_chinese(text):
    """Tell whether ``text`` has any character outside U+4E00..U+9FFF.

    Digits, Latin letters, punctuation and whitespace all count as
    non-Chinese. An empty string yields False.
    """
    # Negated character class: one hit on any character outside the range.
    first_hit = re.search(r'[^\u4e00-\u9fff]', text)
    return first_hit is not None

In [15]:
def _load_split(split_dirs):
    """Load one data split and keep only its fully described, pure-Chinese utterances.

    For every directory in ``split_dirs`` the ``wav.scp``, ``utt2subdialect``
    and ``text`` files are read in that order into one mapping
    ``utt_id -> {"path", "dialect", "text"}``. Utterances that lack a dialect
    or a transcript, or whose transcript contains a non-Chinese character,
    are dropped.
    """
    entries = {}
    for split_dir in split_dirs:
        entries = gather_wav_scp(os.path.join(split_dir, "wav.scp"), entries)
        entries = gether_dialect_info(os.path.join(split_dir, "utt2subdialect"), entries)
        entries = gather_text_info(os.path.join(split_dir, "text"), entries)
    return {
        utt_id: info
        for utt_id, info in entries.items()
        if "dialect" in info
        and "text" in info
        and not contains_non_chinese(info["text"])
    }


# Train and dev are spread over two phase directories; test has a single one.
train_dict = _load_split(train_list)
dev_dict = _load_split(dev_list)
test_dict = _load_split([test_dir])


print ("Done!")
print ("Train dataset size: ", len(train_dict))
print ("Dev dataset size: ", len(dev_dict))
print ("Test dataset size: ", len(test_dict))

Done!
Train dataset size:  881808
Dev dataset size:  4399
Test dataset size:  19668


## Each_subdialect   

In [16]:
# Output root for the "ES" (Each_subdialect) data; created if it does not exist.
each_subdialect_dir =  os.path.join(output_dir, "ES")
os.makedirs(each_subdialect_dir, exist_ok=True)

###  Make wav.scp and text

In [17]:
def _count_lines(path):
    """Return the number of lines in the text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


subdialect_list = ['Mandarin', 'Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
for dialect in subdialect_list:
    es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
    os.makedirs(es_subdialect_dir, exist_ok=True)

    es_subdialect_train = os.path.join(es_subdialect_dir, "train")
    es_subdialect_dev = os.path.join(es_subdialect_dir, "dev")
    es_subdialect_test = os.path.join(es_subdialect_dir, "test")

    es_subdialect_output = [es_subdialect_train, es_subdialect_dev, es_subdialect_test]
    data_dict = [train_dict, dev_dict, test_dict]

    # The loop variable is deliberately not called `dict`: at notebook scope
    # that would replace the builtin for every later cell.
    for output, split_entries in zip(es_subdialect_output, data_dict):
        os.makedirs(output, exist_ok=True)
        wav_scp = os.path.join(output, "wav.scp")
        text_path = os.path.join(output, "text")

        # One pass over the split fills both files, so their lines stay aligned.
        with open(wav_scp, 'w', encoding='utf-8') as wav_f, \
                open(text_path, 'w', encoding='utf-8') as text_f:
            for utt_id, info in split_entries.items():
                if info["dialect"] != dialect:
                    continue
                wav_f.write(f"{utt_id} {info['path']}\n")
                text_f.write(f"{utt_id} {info['text']}\n")

        # Re-read the files from disk as an independent sanity check.
        wav_count = _count_lines(wav_scp)
        print (dialect, os.path.basename(output), "size: ", wav_count, "check",  wav_count==_count_lines(text_path))

    print ("Done!")


Mandarin train size:  678515 check True
Mandarin dev size:  3681 check True
Mandarin test size:  4981 check True
Done!
Beijing train size:  2237 check True
Beijing dev size:  31 check True
Beijing test size:  265 check True
Done!
Southwestern train size:  45359 check True
Southwestern dev size:  132 check True
Southwestern test size:  2684 check True
Done!
Jiao-Liao train size:  20268 check True
Jiao-Liao dev size:  118 check True
Jiao-Liao test size:  1443 check True
Done!
Northeastern train size:  4843 check True
Northeastern dev size:  5 check True
Northeastern test size:  350 check True
Done!
Jiang-Huai train size:  27586 check True
Jiang-Huai dev size:  105 check True
Jiang-Huai test size:  2268 check True
Done!
Lan-Yin train size:  20549 check True
Lan-Yin dev size:  104 check True
Lan-Yin test size:  1646 check True
Done!
Ji-Lu train size:  33861 check True
Ji-Lu dev size:  156 check True
Ji-Lu test size:  2806 check True
Done!
Zhongyuan train size:  48590 check True
Zhongyuan dev size:  67 check True
Zhongyuan test size:  3225 check True
Done!

### Append the test set to the dev set

In [18]:
def merge_files(read_file, writen_file):
    """Append the whole content of ``read_file`` to the end of ``writen_file``.

    Both files are handled as UTF-8 text. ``writen_file`` is created when it
    does not exist yet.
    """
    # Load the complete source first ...
    with open(read_file, encoding='utf-8') as source:
        payload = source.read()

    # ... then add it after whatever the destination already holds.
    with open(writen_file, mode='a', encoding='utf-8') as sink:
        sink.write(payload)


In [19]:
def _count_lines(path):
    """Return the number of lines in the text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def _append_missing_lines(read_file, writen_file):
    """Append to ``writen_file`` the lines of ``read_file`` it does not hold yet.

    Unlike a blind append, running this twice leaves ``writen_file`` unchanged
    the second time, so re-executing the cell cannot duplicate the test set
    inside the dev set.
    """
    with open(writen_file, 'r', encoding='utf-8') as f:
        present = set(f)
    with open(read_file, 'r', encoding='utf-8') as f:
        missing = [line for line in f if line not in present]
    with open(writen_file, 'a', encoding='utf-8') as f:
        f.writelines(missing)


# Mandarin is not part of this list: only the subdialect dev sets are extended.
subdialect_list = ['Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
for dialect in subdialect_list:

    es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
    es_subdialect_dev = os.path.join(es_subdialect_dir, "dev")
    es_subdialect_test = os.path.join(es_subdialect_dir, "test")

    es_subdialect_dev_wav_scp = os.path.join(es_subdialect_dev, "wav.scp")
    es_subdialect_dev_text = os.path.join(es_subdialect_dev, "text")

    es_subdialect_test_wav_scp = os.path.join(es_subdialect_test, "wav.scp")
    es_subdialect_test_text = os.path.join(es_subdialect_test, "text")

    print (dialect, "Dev dataset oringinal size: ", _count_lines(es_subdialect_dev_wav_scp), "=", _count_lines(es_subdialect_dev_text))
    print (dialect, "Test dataset oringinal size: ", _count_lines(es_subdialect_test_wav_scp), "=", _count_lines(es_subdialect_test_text))

    # The test files are only read; the dev files grow by the test utterances.
    _append_missing_lines(es_subdialect_test_wav_scp, es_subdialect_dev_wav_scp)
    _append_missing_lines(es_subdialect_test_text, es_subdialect_dev_text)

    print (dialect, "Dev dataset size: ", _count_lines(es_subdialect_dev_wav_scp), "=", _count_lines(es_subdialect_dev_text))
    print (dialect, "Test dataset size: ", _count_lines(es_subdialect_test_wav_scp),"=", _count_lines(es_subdialect_test_text))

    print ("\n")



Beijing Dev dataset oringinal size:  31 = 31
Beijing Test dataset oringinal size:  265 = 265
Beijing Dev dataset size:  296 = 296
Beijing Test dataset size:  265 = 265


Southwestern Dev dataset oringinal size:  132 = 132
Southwestern Test dataset oringinal size:  2684 = 2684
Southwestern Dev dataset size:  2816 = 2816
Southwestern Test dataset size:  2684 = 2684


Jiao-Liao Dev dataset oringinal size:  118 = 118
Jiao-Liao Test dataset oringinal size:  1443 = 1443
Jiao-Liao Dev dataset size:  1561 = 1561
Jiao-Liao Test dataset size:  1443 = 1443


Northeastern Dev dataset oringinal size:  5 = 5
Northeastern Test dataset oringinal size:  350 = 350
Northeastern Dev dataset size:  355 = 355
Northeastern Test dataset size:  350 = 350


Jiang-Huai Dev dataset oringinal size:  105 = 105
Jiang-Huai Test dataset oringinal size:  2268 = 2268
Jiang-Huai Dev dataset size:  2373 = 2373
Jiang-Huai Test dataset size:  2268 = 2268


Lan-Yin Dev dataset oringinal size:  104 = 104
Lan-Yin Test dataset 

## All_subdialect

In [20]:
def _count_lines(path):
    """Return the number of lines in the text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def _concat_files(part_paths, merged_path):
    """Overwrite ``merged_path`` with the concatenation of ``part_paths``."""
    with open(merged_path, 'w', encoding='utf-8') as merged:
        for part_path in part_paths:
            with open(part_path, 'r', encoding='utf-8') as part:
                merged.writelines(part)


# "AS" (All_subdialect) combines the eight subdialects; Mandarin is left out.
subdialect_list = ['Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']

all_subdialect_dir = os.path.join(output_dir, "AS")
os.makedirs(all_subdialect_dir, exist_ok=True)
all_subdialect_train = os.path.join(all_subdialect_dir, "train")
WholeDevSet = os.path.join(all_subdialect_dir, "dev")
WholeTestSet = os.path.join(all_subdialect_dir, "test")
whole_dataset_output = [all_subdialect_train, WholeDevSet, WholeTestSet]

for output in whole_dataset_output:
    os.makedirs(output, exist_ok=True)
    wav_scp = os.path.join(output, "wav.scp")
    text_path = os.path.join(output, "text")

    # Matching split directory ("train" / "dev" / "test") of every subdialect.
    split_name = os.path.basename(output)
    es_split_dirs = [os.path.join(each_subdialect_dir, dialect, split_name) for dialect in subdialect_list]

    _concat_files([os.path.join(es_dir, "wav.scp") for es_dir in es_split_dirs], wav_scp)
    _concat_files([os.path.join(es_dir, "text") for es_dir in es_split_dirs], text_path)

    # Re-read the results from disk as an independent sanity check.
    wav_count = _count_lines(wav_scp)
    print (split_name, "size: ", wav_count, "check",  wav_count==_count_lines(text_path))


train size:  203293 check True
dev size:  15405 check True
test size:  14687 check True


## Whole data set

In [21]:
def _count_lines(path):
    """Return the number of lines in the text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


def _concat_files(part_paths, merged_path):
    """Overwrite ``merged_path`` with the concatenation of ``part_paths``."""
    with open(merged_path, 'w', encoding='utf-8') as merged:
        for part_path in part_paths:
            with open(part_path, 'r', encoding='utf-8') as part:
                merged.writelines(part)


# "WD" (whole data set) combines Mandarin with the eight subdialects.
subdialect_list = ['Mandarin', 'Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']

# NOTE: the `all_subdialect_*` names are reused from the AS cell; in this cell
# they point into the WD directory.
all_subdialect_dir = os.path.join(output_dir, "WD")
os.makedirs(all_subdialect_dir, exist_ok=True)
all_subdialect_train = os.path.join(all_subdialect_dir, "train")
WholeDevSet = os.path.join(all_subdialect_dir, "dev")
WholeTestSet = os.path.join(all_subdialect_dir, "test")
whole_dataset_output = [all_subdialect_train, WholeDevSet, WholeTestSet]

for output in whole_dataset_output:
    os.makedirs(output, exist_ok=True)
    wav_scp = os.path.join(output, "wav.scp")
    text_path = os.path.join(output, "text")

    # Matching split directory ("train" / "dev" / "test") of every subdialect.
    split_name = os.path.basename(output)
    es_split_dirs = [os.path.join(each_subdialect_dir, dialect, split_name) for dialect in subdialect_list]

    _concat_files([os.path.join(es_dir, "wav.scp") for es_dir in es_split_dirs], wav_scp)
    _concat_files([os.path.join(es_dir, "text") for es_dir in es_split_dirs], text_path)

    # Re-read the results from disk as an independent sanity check.
    wav_count = _count_lines(wav_scp)
    print (split_name, "size: ", wav_count, "check",  wav_count==_count_lines(text_path))

train size:  881808 check True
dev size:  19086 check True
test size:  19668 check True


## After CNAM

In [22]:
def merge_files(read_file, writen_file):
    """Append everything in ``read_file`` to ``writen_file`` (UTF-8 text).

    ``writen_file`` is opened in append mode, so it is created when missing
    and its existing content is kept in front of the appended data.
    """
    # Read the source completely before touching the destination.
    with open(read_file, encoding='utf-8') as reader:
        content = reader.read()

    with open(writen_file, mode='a', encoding='utf-8') as appender:
        appender.write(content)

In [25]:
def _count_lines(path):
    """Return the number of lines in the text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


subdialect_list = ['Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
section_list = ["train", "dev", "test"]

all_subdialect_dir = os.path.join(output_dir, "AS")
each_subdialect_dir = os.path.join(output_dir, "ES")


for section in section_list:
    all_subdialect_section = os.path.join(all_subdialect_dir, section)
    all_subdialect_section_audio_datasets = os.path.join(all_subdialect_section, "audio_datasets.jsonl")
    os.makedirs(all_subdialect_section, exist_ok=True)

    # merge_files() appends, so start from an empty manifest; otherwise every
    # re-run of this cell would add all records a second time.
    with open(all_subdialect_section_audio_datasets, 'w', encoding='utf-8'):
        pass

    for dialect in subdialect_list:
        es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
        es_subdialect_section = os.path.join(es_subdialect_dir, section)
        es_audio_datasets  = os.path.join(es_subdialect_section, "audio_datasets.jsonl")
        print (dialect, section, "dataset size: ", _count_lines(es_audio_datasets))
        merge_files(es_audio_datasets, all_subdialect_section_audio_datasets)

    print (section, "dataset size: ", _count_lines(all_subdialect_section_audio_datasets))



Beijing train dataset size:  2237
Southwestern train dataset size:  45359
Jiao-Liao train dataset size:  20268
Northeastern train dataset size:  4843
Jiang-Huai train dataset size:  27586
Lan-Yin train dataset size:  20549
Ji-Lu train dataset size:  33861
Zhongyuan train dataset size:  48590
train dataset size:  203293
Beijing dev dataset size:  296
Southwestern dev dataset size:  2816
Jiao-Liao dev dataset size:  1561
Northeastern dev dataset size:  355
Jiang-Huai dev dataset size:  2373
Lan-Yin dev dataset size:  1750
Ji-Lu dev dataset size:  2962
Zhongyuan dev dataset size:  3292
dev dataset size:  15405
Beijing test dataset size:  265
Southwestern test dataset size:  2684
Jiao-Liao test dataset size:  1443
Northeastern test dataset size:  350
Jiang-Huai test dataset size:  2268
Lan-Yin test dataset size:  1646
Ji-Lu test dataset size:  2806
Zhongyuan test dataset size:  3225
test dataset size:  14687


In [29]:
def _count_lines(path):
    """Return the number of lines in the text file at ``path``."""
    with open(path, 'r', encoding='utf-8') as f:
        return sum(1 for _ in f)


subdialect_list = ['Mandarin', 'Beijing', 'Southwestern', 'Jiao-Liao', 'Northeastern', 'Jiang-Huai', 'Lan-Yin', 'Ji-Lu', 'Zhongyuan']
section_list = ["train", "dev", "test"]

whole_dir = os.path.join(output_dir, "WD")
each_subdialect_dir = os.path.join(output_dir, "ES")

for section in section_list:
    whole_section = os.path.join(whole_dir, section)
    whole_audio_datasets = os.path.join(whole_section, "audio_datasets.jsonl")
    os.makedirs(whole_section, exist_ok=True)

    # merge_files() appends, so start from an empty manifest; otherwise every
    # re-run of this cell would add all records a second time.
    with open(whole_audio_datasets, 'w', encoding='utf-8'):
        pass

    for dialect in subdialect_list:
        es_subdialect_dir = os.path.join(each_subdialect_dir, dialect)
        es_subdialect_section = os.path.join(es_subdialect_dir, section)
        es_audio_datasets  = os.path.join(es_subdialect_section, "audio_datasets.jsonl")
        print (dialect, section, "dataset size: ", _count_lines(es_audio_datasets))
        merge_files(es_audio_datasets, whole_audio_datasets)

    print (section, "dataset size: ", _count_lines(whole_audio_datasets))

Mandarin train dataset size:  678515
Beijing train dataset size:  2237
Southwestern train dataset size:  45359
Jiao-Liao train dataset size:  20268
Northeastern train dataset size:  4843
Jiang-Huai train dataset size:  27586
Lan-Yin train dataset size:  20549
Ji-Lu train dataset size:  33861
Zhongyuan train dataset size:  48590
train dataset size:  881808
Mandarin dev dataset size:  3681
Beijing dev dataset size:  296
Southwestern dev dataset size:  2816
Jiao-Liao dev dataset size:  1561
Northeastern dev dataset size:  355
Jiang-Huai dev dataset size:  2373
Lan-Yin dev dataset size:  1750
Ji-Lu dev dataset size:  2962
Zhongyuan dev dataset size:  3292
dev dataset size:  19086
Mandarin test dataset size:  4981
Beijing test dataset size:  265
Southwestern test dataset size:  2684
Jiao-Liao test dataset size:  1443
Northeastern test dataset size:  350
Jiang-Huai test dataset size:  2268
Lan-Yin test dataset size:  1646
Ji-Lu test dataset size:  2806
Zhongyuan test dataset size:  3225
test dataset size:  19668

# 添加口音信息

In [1]:
import os
import shutil

# Source directory with the prepared data.
source_dir = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data"
# Destination directory that receives the copy.
data_root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2"

# Make sure the destination directory exists.
os.makedirs(data_root, exist_ok=True)

# dirs_exist_ok=True (Python 3.8+) lets copytree() copy into the existing
# destination and overwrite files of the same name. Only file-system failures
# are reported and tolerated here (shutil.Error is an OSError subclass);
# anything else, e.g. a TypeError from an interpreter without dirs_exist_ok,
# is a programming error and is allowed to surface.
try:
    shutil.copytree(source_dir, data_root, dirs_exist_ok=True)
except OSError as e:
    print(f"复制过程中出错: {e}")


In [2]:

# File name of the manifests to look for.
end_name = "audio_datasets.jsonl"

# Collect the path of every such manifest anywhere below data_root.
data_list = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(data_root)
    for name in names
    if name == end_name
]

print (data_list)

['/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Lan-Yin/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Lan-Yin/dev/audio_datasets.jsonl', '/ssd/zhuang/cod

In [3]:
# Read every utt2subdialect file below dialect_root into one mapping
# utterance id -> subdialect label.
dialect_root = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/SubdialectID"
dialect_dict = {}
for root, dirs, files in os.walk(dialect_root):
    if "utt2subdialect" not in files:
        continue
    file_path = os.path.join(root, "utt2subdialect")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or trailing empty lines carry no entry.
                continue
            utt_id, dialect = line.split(" ")
            dialect_dict[utt_id] = dialect

In [4]:
def add_dialect(lines, dialect_dict, path):
    """Add a ``text_language`` field to every JSON line and rewrite ``path``.

    Args:
        lines: Iterable of JSON-encoded strings, one object per item. Every
            object must contain a ``key`` entry. Blank items are ignored.
        dialect_dict: Mapping from ``key`` to dialect label. Keys that are
            missing from it are labelled ``"Mandarin"``.
        path: File whose content is replaced by the updated JSON lines.

    Raises:
        json.JSONDecodeError: If a non-blank item is not valid JSON.
        KeyError: If an object has no ``key`` entry.
    """
    import json

    updated_lines = []
    for line in lines:
        if not line.strip():
            continue
        data = json.loads(line)
        # Utterances without an entry in dialect_dict default to "Mandarin".
        data['text_language'] = dialect_dict.get(data['key'], "Mandarin")
        updated_lines.append(json.dumps(data, ensure_ascii=False) + '\n')

    # Write a sibling temporary file and swap it in with os.replace(). The
    # temporary name is always different from `path` (a name derived with
    # str.replace() can equal `path` and would end in deleting the result),
    # and the old content stays on disk until the new file is complete.
    save_path = path + ".tmp"
    with open(save_path, 'w', encoding='utf-8') as file:
        file.writelines(updated_lines)
    os.replace(save_path, path)

    print("文件更新完成。")

In [5]:
from itertools import islice

# Take only the first ten entries; copying the whole items view into a list
# just to slice it would duplicate every entry of the mapping.
first_10_items = list(islice(dialect_dict.items(), 10))

# Print the first 10 entries.
for key, value in first_10_items:
    print(f'{key}: {value}')

1000001_0b1a33a3: Mandarin
1000001_0e9793ff: Mandarin
1000001_11f3978b: Mandarin
1000001_1c4b6ce5: Mandarin
1000001_2c863844: Mandarin
1000001_3c84b37d: Mandarin
1000001_492740a5: Mandarin
1000001_5c8b5985: Mandarin
1000001_63740949: Mandarin
1000001_6a7435f1: Mandarin


In [6]:
# Label every collected manifest in place.
for data_path in data_list:
    print ("Processing: ", data_path)
    # Read as UTF-8, the encoding add_dialect() uses when writing the file back.
    with open(data_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    add_dialect(lines, dialect_dict, data_path)

Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/test/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/dev/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/AS/train/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/test/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/dev/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/WD/train/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/test/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/dev/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/ES/Jiang-Huai/train/audio_datasets.jsonl
文件更新完成。
Processing:  /ssd/zhuang/code/FunASR/exa

# 新建口音字典

In [7]:
import os

In [8]:
# Read every utt2subdialect file below dialect_root and collect the distinct
# subdialect labels in order of first appearance.
dialect_root = "/ssd/zhuang/dataset/data_KeSpeech/KeSpeech/Tasks/SubdialectID"
dialect_list = []
seen_dialects = set()
for root, dirs, files in os.walk(dialect_root):
    if "utt2subdialect" not in files:
        continue
    file_path = os.path.join(root, "utt2subdialect")
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank or trailing empty lines carry no entry.
                continue
            utt_id, dialect = line.split(" ")
            if dialect not in seen_dialects:
                seen_dialects.add(dialect)
                dialect_list.append(dialect)

In [9]:
# Write the collected dialect labels to dialects.txt, one label per line.
save_dialect_path = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data2/zh_token_list/char/dialects.txt"
with open(save_dialect_path, 'w') as f:
    f.writelines(f"{label}\n" for label in dialect_list)

# 将重复的内容清洗掉

In [1]:
import os
# Directory tree that is searched for .jsonl files to de-duplicate.
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/"

In [2]:
pending_list = []
# The directory yielded by os.walk() gets its own name: binding it to `root`
# would overwrite the search root, so re-running this cell would silently
# scan only the last directory visited.
for dirpath, dirs, files in os.walk(root):
    for file in files:
        if file.endswith(".jsonl"):
            file_path = os.path.join(dirpath, file)
            pending_list.append(file_path)
print (pending_list)

['/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yin/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/dev/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Northeastern/train/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Southwestern/test/audio_datasets.jsonl', '/ssd/zhuang/code/FunASR/examples/kespee

In [3]:
def clean_file(file_path):
    """Remove duplicated lines from ``file_path`` in place.

    The first occurrence of every line is kept and the original order is
    preserved. Lines are compared without their trailing newline, and every
    written line is newline-terminated, so an unterminated last line can
    neither survive as a duplicate nor be glued onto another record.

    Args:
        file_path: UTF-8 text file to rewrite.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    print (file_path, "size: ", len(lines))

    seen = set()
    unique_lines = []
    for line in lines:
        record = line.rstrip("\n")
        if record in seen:
            continue
        seen.add(record)
        unique_lines.append(record + "\n")
    print (file_path, "size: ", len(unique_lines))

    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(unique_lines)
    print (file_path, "cleaned!")

In [4]:
# De-duplicate every collected .jsonl file in place.
for file_path in pending_list:
    clean_file(file_path)

/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl size:  2268
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl size:  2268
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/test/audio_datasets.jsonl cleaned!
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl size:  2373
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl size:  2373
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/dev/audio_datasets.jsonl cleaned!
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl size:  27586
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl size:  27586
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Jiang-Huai/train/audio_datasets.jsonl cleaned!
/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/Lan-Yi

In [9]:
import os
root = "/ssd/zhuang/code/FunASR/examples/kespeech/DATA/data3/ES/"

# The directory yielded by os.walk() gets its own name so that `root` keeps
# pointing at the search root.
for dirpath, dirs, files in os.walk(root):
    for file in files:
        # One combined test: a name containing both "bac" and "hubert" must be
        # removed once only; a second os.remove() would raise FileNotFoundError.
        if "bac" in file or "hubert" in file:
            print (file)
            os.remove(os.path.join(dirpath, file))
            print (file, "removed!")
    

hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9layer.npy
hubert_9layer.npy removed!
hubert_9layer.len
hubert_9layer.len removed!
hubert_9la