# Generate Denoising Metadata 6 July 2022 
(Updated v3 & v4 -> change naming to follow chan)

In [1]:
import os
import pandas as pd
import numpy as np
import glob

### v3 Mod

In [2]:
# Build one metadata CSV per (noise type, denoiser) found under
# ./audios/test_denoising_v3_mod and generate the shell scripts that
# preprocess and evaluate each of those manifests (audio-only and "mm" variants).

# Load all dataset
df = pd.read_csv('mm_all_metadata.csv')

# Generate new denoised metadata
preprocess_commands, eval_commands = [], []
mm_preprocess_commands, mm_eval_commands = [], []
# Skip any glob hit whose path contains '.txt' or '.zip'; the rest are treated as noise types
for noise_path in filter(lambda x: '.txt' not in x and '.zip' not in x, glob.glob('./audios/test_denoising_v3_mod/*')):
    # Extract current noise type
    noise_type = noise_path.split('/')[-1]
    for denoise_type in ['gt', 'fullsubnet', 'noisetasnet_fullsubnet', 'noisetasnet', 'mix']:
        # Collect audio path & original audio path
        audio_paths, orig_audio_paths = [], []
        for audio_path in glob.glob(f'{noise_path}/{denoise_type}/*/*.wav'):
            # The last two path components (parent directory, file name) are reused
            # to rebuild the path of the matching original recording under ./audios/
            speaker_id, orig_audio_name = audio_path.split('/')[-2:]
            orig_audio_path = f'./audios/{speaker_id}/{orig_audio_name}'
            audio_paths.append(audio_path)
            orig_audio_paths.append(orig_audio_path)

        # Filter all dataset and map audio file path
        audio_file_map = {orig_path: denoise_path for denoise_path, orig_path in zip(audio_paths, orig_audio_paths)}
        denoise_df = df.loc[df['audio_path'].isin(orig_audio_paths),:].reset_index(drop=True)
        # Every remaining audio_path is a key of audio_file_map (guaranteed by the isin filter)
        denoise_df['audio_path'] = denoise_df['audio_path'].map(audio_file_map)

        # Save new metadata files
        denoise_df.to_csv(f'mm_{noise_type}_{denoise_type}_metadata_v3_mod.csv', index=False)

        # NOTE(review): the manifest path below uses `datasets/` whereas the v3i_mod, v4 and
        # v4i cells of this notebook use `dataset/` -- confirm which directory the scripts expect.
        #
        # In the templates below, each backslash-newline inside the (non-raw) string literal is
        # a line continuation, so every command is written as a single physical line. The
        # replace()/lstrip() chain then removes the source indentation and leaves one blank
        # line after each command.

        # Prepare preprocess scripts
        preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_denoising_v3_mod/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=datasets/mm_{noise_type}_{denoise_type}_metadata_v3_mod.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare eval scripts
        eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=2 python eval.py --output_dir=./save_denoising_v3_mod/{noise_type}_{denoise_type} \
                --model_name_or_path=save_ao/14045/checkpoint-4500  \
                --test_manifest_path=./cache_denoising_v3_mod/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm preprocess scripts
        mm_preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_mm_denoising_v3_mod/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=datasets/mm_{noise_type}_{denoise_type}_metadata_v3_mod.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path  --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm eval scripts
        mm_eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=2 python eval.py --output_dir=./save_mm_denoising_v3_mod/{noise_type}_{denoise_type} \
                --model_name_or_path=save_mm/14045/checkpoint-8980  \
                --test_manifest_path=./cache_mm_denoising_v3_mod/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

# Write run scripts to files
# (context managers guarantee each handle is closed even if a write fails)
with open('/home/samuel/mm-ascend-corpus/run_preprocess_test_denoising_v3_mod.sh', 'w') as preprocess_file:
    preprocess_file.writelines(preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_eval_test_denoising_v3_mod.sh', 'w') as eval_file:
    eval_file.writelines(eval_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_preprocess_test_denoising_v3_mod.sh', 'w') as mm_preprocess_file:
    mm_preprocess_file.writelines(mm_preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_eval_test_denoising_v3_mod.sh', 'w') as mm_eval_file:
    mm_eval_file.writelines(mm_eval_commands)

In [3]:
# Build one metadata CSV per (noise type, denoiser) found under
# ./audios/test_denoising_v3i_mod and generate the shell scripts that
# preprocess and evaluate each of those manifests (audio-only and "mm" variants).

# Load all dataset
df = pd.read_csv('mm_all_metadata.csv')

# Generate new denoised metadata
preprocess_commands, eval_commands = [], []
mm_preprocess_commands, mm_eval_commands = [], []
# Skip any glob hit whose path contains '.txt' or '.zip'; the rest are treated as noise types
for noise_path in filter(lambda x: '.txt' not in x and '.zip' not in x, glob.glob('./audios/test_denoising_v3i_mod/*')):
    # Extract current noise type
    noise_type = noise_path.split('/')[-1]
    for denoise_type in ['gt', 'fullsubnet', 'noisetasnet_fullsubnet', 'noisetasnet', 'mix']:
        # Collect audio path & original audio path
        audio_paths, orig_audio_paths = [], []
        for audio_path in glob.glob(f'{noise_path}/{denoise_type}/*/*.wav'):
            # The last two path components (parent directory, file name) are reused
            # to rebuild the path of the matching original recording under ./audios/
            speaker_id, orig_audio_name = audio_path.split('/')[-2:]
            orig_audio_path = f'./audios/{speaker_id}/{orig_audio_name}'
            audio_paths.append(audio_path)
            orig_audio_paths.append(orig_audio_path)

        # Filter all dataset and map audio file path
        audio_file_map = {orig_path: denoise_path for denoise_path, orig_path in zip(audio_paths, orig_audio_paths)}
        denoise_df = df.loc[df['audio_path'].isin(orig_audio_paths),:].reset_index(drop=True)
        # Every remaining audio_path is a key of audio_file_map (guaranteed by the isin filter)
        denoise_df['audio_path'] = denoise_df['audio_path'].map(audio_file_map)

        # Save new metadata files
        denoise_df.to_csv(f'mm_{noise_type}_{denoise_type}_metadata_v3i_mod.csv', index=False)

        # In the templates below, each backslash-newline inside the (non-raw) string literal is
        # a line continuation, so every command is written as a single physical line. The
        # replace()/lstrip() chain then removes the source indentation and leaves one blank
        # line after each command.

        # Prepare preprocess scripts
        preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_denoising_v3i_mod/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=dataset/mm_{noise_type}_{denoise_type}_metadata_v3i_mod.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare eval scripts
        eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=3 python eval.py --output_dir=./save_denoising_v3i_mod/{noise_type}_{denoise_type} \
                --model_name_or_path=save_ao/14045/checkpoint-4500  \
                --test_manifest_path=./cache_denoising_v3i_mod/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm preprocess scripts
        mm_preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_mm_denoising_v3i_mod/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=dataset/mm_{noise_type}_{denoise_type}_metadata_v3i_mod.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path  --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm eval scripts
        mm_eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=3 python eval.py --output_dir=./save_mm_denoising_v3i_mod/{noise_type}_{denoise_type} \
                --model_name_or_path=save_mm/14045/checkpoint-8980  \
                --test_manifest_path=./cache_mm_denoising_v3i_mod/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

# Write run scripts to files
# (context managers guarantee each handle is closed even if a write fails)
with open('/home/samuel/mm-ascend-corpus/run_preprocess_test_denoising_v3i_mod.sh', 'w') as preprocess_file:
    preprocess_file.writelines(preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_eval_test_denoising_v3i_mod.sh', 'w') as eval_file:
    eval_file.writelines(eval_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_preprocess_test_denoising_v3i_mod.sh', 'w') as mm_preprocess_file:
    mm_preprocess_file.writelines(mm_preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_eval_test_denoising_v3i_mod.sh', 'w') as mm_eval_file:
    mm_eval_file.writelines(mm_eval_commands)

### v4 new

In [6]:
# Build one metadata CSV per (noise type, denoiser) found under
# ./audios/test_denoising_v4 and generate the shell scripts that
# preprocess and evaluate each of those manifests (audio-only and "mm" variants).

# Load all dataset
df = pd.read_csv('mm_all_metadata.csv')

# Generate new denoised metadata
preprocess_commands, eval_commands = [], []
mm_preprocess_commands, mm_eval_commands = [], []
# Skip any glob hit whose path contains '.txt' or '.zip'; the rest are treated as noise types
for noise_path in filter(lambda x: '.txt' not in x and '.zip' not in x, glob.glob('./audios/test_denoising_v4/*')):
    # Extract current noise type
    noise_type = noise_path.split('/')[-1]
    for denoise_type in ['gt', 'fullsubnet', 'noisetasnet_fullsubnet', 'noisetasnet', 'mix']:
        # Collect audio path & original audio path
        audio_paths, orig_audio_paths = [], []
        for audio_path in glob.glob(f'{noise_path}/{denoise_type}/*/*.wav'):
            # The last two path components (parent directory, file name) are reused
            # to rebuild the path of the matching original recording under ./audios/
            speaker_id, orig_audio_name = audio_path.split('/')[-2:]
            orig_audio_path = f'./audios/{speaker_id}/{orig_audio_name}'
            audio_paths.append(audio_path)
            orig_audio_paths.append(orig_audio_path)

        # Filter all dataset and map audio file path
        audio_file_map = {orig_path: denoise_path for denoise_path, orig_path in zip(audio_paths, orig_audio_paths)}
        denoise_df = df.loc[df['audio_path'].isin(orig_audio_paths),:].reset_index(drop=True)
        # Every remaining audio_path is a key of audio_file_map (guaranteed by the isin filter)
        denoise_df['audio_path'] = denoise_df['audio_path'].map(audio_file_map)

        # Save new metadata files
        denoise_df.to_csv(f'mm_{noise_type}_{denoise_type}_metadata_v4.csv', index=False)

        # In the templates below, each backslash-newline inside the (non-raw) string literal is
        # a line continuation, so every command is written as a single physical line. The
        # replace()/lstrip() chain then removes the source indentation and leaves one blank
        # line after each command.

        # Prepare preprocess scripts
        preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_denoising_v4/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=dataset/mm_{noise_type}_{denoise_type}_metadata_v4.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare eval scripts
        eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=2 python eval.py --output_dir=./save_denoising_v4/{noise_type}_{denoise_type} \
                --model_name_or_path=save_ao/14045/checkpoint-4500  \
                --test_manifest_path=./cache_denoising_v4/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm preprocess scripts
        mm_preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_mm_denoising_v4/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=dataset/mm_{noise_type}_{denoise_type}_metadata_v4.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path  --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm eval scripts
        mm_eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=2 python eval.py --output_dir=./save_mm_denoising_v4/{noise_type}_{denoise_type} \
                --model_name_or_path=save_mm/14045/checkpoint-8980  \
                --test_manifest_path=./cache_mm_denoising_v4/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

# Write run scripts to files
# (context managers guarantee each handle is closed even if a write fails)
with open('/home/samuel/mm-ascend-corpus/run_preprocess_test_denoising_v4.sh', 'w') as preprocess_file:
    preprocess_file.writelines(preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_eval_test_denoising_v4.sh', 'w') as eval_file:
    eval_file.writelines(eval_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_preprocess_test_denoising_v4.sh', 'w') as mm_preprocess_file:
    mm_preprocess_file.writelines(mm_preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_eval_test_denoising_v4.sh', 'w') as mm_eval_file:
    mm_eval_file.writelines(mm_eval_commands)

In [5]:
# Build one metadata CSV per (noise type, denoiser) found under
# ./audios/test_denoising_v4i and generate the shell scripts that
# preprocess and evaluate each of those manifests (audio-only and "mm" variants).

# Load all dataset
df = pd.read_csv('mm_all_metadata.csv')

# Generate new denoised metadata
preprocess_commands, eval_commands = [], []
mm_preprocess_commands, mm_eval_commands = [], []
# Skip any glob hit whose path contains '.txt' or '.zip'; the rest are treated as noise types
for noise_path in filter(lambda x: '.txt' not in x and '.zip' not in x, glob.glob('./audios/test_denoising_v4i/*')):
    # Extract current noise type
    noise_type = noise_path.split('/')[-1]
    for denoise_type in ['gt', 'fullsubnet', 'noisetasnet_fullsubnet', 'noisetasnet', 'mix']:
        # Collect audio path & original audio path
        audio_paths, orig_audio_paths = [], []
        for audio_path in glob.glob(f'{noise_path}/{denoise_type}/*/*.wav'):
            # The last two path components (parent directory, file name) are reused
            # to rebuild the path of the matching original recording under ./audios/
            speaker_id, orig_audio_name = audio_path.split('/')[-2:]
            orig_audio_path = f'./audios/{speaker_id}/{orig_audio_name}'
            audio_paths.append(audio_path)
            orig_audio_paths.append(orig_audio_path)

        # Filter all dataset and map audio file path
        audio_file_map = {orig_path: denoise_path for denoise_path, orig_path in zip(audio_paths, orig_audio_paths)}
        denoise_df = df.loc[df['audio_path'].isin(orig_audio_paths),:].reset_index(drop=True)
        # Every remaining audio_path is a key of audio_file_map (guaranteed by the isin filter)
        denoise_df['audio_path'] = denoise_df['audio_path'].map(audio_file_map)

        # Save new metadata files
        denoise_df.to_csv(f'mm_{noise_type}_{denoise_type}_metadata_v4i.csv', index=False)

        # In the templates below, each backslash-newline inside the (non-raw) string literal is
        # a line continuation, so every command is written as a single physical line. The
        # replace()/lstrip() chain then removes the source indentation and leaves one blank
        # line after each command.

        # Prepare preprocess scripts
        preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_denoising_v4i/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=dataset/mm_{noise_type}_{denoise_type}_metadata_v4i.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare eval scripts
        eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=3 python eval.py --output_dir=./save_denoising_v4i/{noise_type}_{denoise_type} \
                --model_name_or_path=save_ao/14045/checkpoint-4500  \
                --test_manifest_path=./cache_denoising_v4i/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm preprocess scripts
        mm_preprocess_commands.append(f"""
            python preprocess_data.py --output_dir=./cache_mm_denoising_v4i/{noise_type}_{denoise_type}/ \
                --model_name_or_path=ctl/wav2vec2-large-xlsr-cantonese \
                --test_manifest_path=dataset/mm_{noise_type}_{denoise_type}_metadata_v4i.csv \
                --preprocessing_num_workers=32 --seed=0 \
                --audio_column_name=audio_path --text_column_name=text_path --video_column_name=lip_image_path  --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

        # Prepare mm eval scripts
        mm_eval_commands.append(f"""
            CUDA_VISIBLE_DEVICES=3 python eval.py --output_dir=./save_mm_denoising_v4i/{noise_type}_{denoise_type} \
                --model_name_or_path=save_mm/14045/checkpoint-8980  \
                --test_manifest_path=./cache_mm_denoising_v4i/{noise_type}_{denoise_type}/preprocess_data.arrow \
                --audio_column_name=audio_path --text_column_name=text_path  --video_column_name=lip_image_path \
                --per_device_train_batch_size=16 --per_device_eval_batch_size=8     --dataloader_num_workers=32 \
                --dataloader_pin_memory     --seed=0 --num_train_epochs=20 --learning_rate=5e-5  \
                --logging_strategy=steps --logging_steps=10 \
                --evaluation_strategy=epoch --eval_steps=1 --eval_accumulation_steps=15 \
                --save_steps=1 --save_strategy=epoch --save_total_limit=1 \
                --metric_for_best_model=mer --greater_is_better=False --load_best_model_at_end=True --use_video
        """.replace('            ','').replace('\n        ','\n\n').lstrip())

# Write run scripts to files
# (context managers guarantee each handle is closed even if a write fails)
with open('/home/samuel/mm-ascend-corpus/run_preprocess_test_denoising_v4i.sh', 'w') as preprocess_file:
    preprocess_file.writelines(preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_eval_test_denoising_v4i.sh', 'w') as eval_file:
    eval_file.writelines(eval_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_preprocess_test_denoising_v4i.sh', 'w') as mm_preprocess_file:
    mm_preprocess_file.writelines(mm_preprocess_commands)

with open('/home/samuel/mm-ascend-corpus/run_mm_eval_test_denoising_v4i.sh', 'w') as mm_eval_file:
    mm_eval_file.writelines(mm_eval_commands)