In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
import json
import matplotlib.pyplot as plt
from adjustText import adjust_text

from scipy.signal import spectrogram
import librosa
from utils import parse_yaml, load_ss_model
from dcase_evaluator_analysis import DCASEEvaluatorAnalysis
import torch
import gc

  from .autonotebook import tqdm as notebook_tqdm
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  checkpoint = torch.load(checkpoint_path, map_location=map_location)


In [2]:
def eval(evaluator,
         encoder_checkpoint_path = None,
         ssnet_checkpoint_path = None,
         config_yaml=None,
         device = "cuda",
         encoder_type = None):
    """Load a separation model with the requested query encoder and evaluate it.

    NOTE: this function shadows Python's built-in ``eval``. The name is kept
    because other cells call it by this name.

    Args:
        evaluator: Callable invoked as ``evaluator(pl_model)``. It must return
            a DataFrame and expose an ``output_dir`` attribute.
        encoder_checkpoint_path: Passed to the encoder as ``pretrained_path``.
        ssnet_checkpoint_path: Passed to ``load_ss_model`` as ``checkpoint_path``.
        config_yaml: Path handed to ``parse_yaml``.
        device: Device the loaded model is moved to.
        encoder_type: Either ``'ONE-PEACE'`` or ``'CLAP'``.

    Returns:
        The DataFrame returned by ``evaluator``. It is also written to
        ``<evaluator.output_dir>/<encoder_type>_output.csv`` without the index.

    Raises:
        AssertionError: If ``encoder_type`` is None.
        ValueError: If ``encoder_type`` is not one of the supported encoders.
    """
    assert encoder_type is not None, 'define encoder type'

    # Fail fast: previously an unknown type fell through both branches and
    # surfaced later as an unbound `query_encoder`.
    supported_encoders = ('ONE-PEACE', 'CLAP')
    if encoder_type not in supported_encoders:
        raise ValueError(
            f'unsupported encoder type {encoder_type!r}; expected one of {supported_encoders}'
        )

    configs = parse_yaml(config_yaml)

    if encoder_type == 'ONE-PEACE':
        from models.one_peace_encoder import ONE_PEACE_Encoder

        # ONE_PEACE modelhub expects some paths to be relative to this dir.
        # Restore the previous working directory even if loading fails, so a
        # failed run does not leave the kernel inside ONE-PEACE/.
        previous_cwd = os.getcwd()
        os.chdir('ONE-PEACE/')
        try:
            # TODO: checkpoint lives in a shared scratch dir for now; move it to
            # the class project dir once that exists.
            query_encoder = ONE_PEACE_Encoder(pretrained_path=encoder_checkpoint_path)
        finally:
            os.chdir(previous_cwd)

        # Put the ONE-PEACE model in eval mode (probably unnecessary).
        query_encoder.model.model.eval()
    else:
        # encoder_type == 'CLAP' (validated above)
        from models.clap_encoder import CLAP_Encoder
        query_encoder = CLAP_Encoder(pretrained_path=encoder_checkpoint_path).eval()

    pl_model = load_ss_model(
        configs=configs,
        checkpoint_path=ssnet_checkpoint_path,
        query_encoder=query_encoder
    ).to(device)

    print('-------  Start Evaluation  -------')
    df_results = evaluator(pl_model)
    df_results.to_csv(os.path.join(evaluator.output_dir, f'{encoder_type}_output.csv'), index=False)
    print('-------------------------  Done  ---------------------------')

    # Drop the local references to the models first, then collect garbage, and
    # only then ask PyTorch to release its cached CUDA blocks.
    # `del evaluator` removes only this function's reference; the caller's
    # variable still keeps the evaluator alive.
    del pl_model
    del query_encoder
    del evaluator

    gc.collect()
    torch.cuda.empty_cache()

    return df_results

In [3]:
# Never truncate long cell values (captions, file paths) when displaying frames.
pd.set_option('display.max_colwidth', None)

# Sampling rate constant (presumably Hz -- confirm against the audio pipeline).
sampling_rate = 16000

In [4]:
# Load the per-sample validation results previously written for each system:
# ONE-PEACE into df_onepeace and the baseline (CLAP) into df_clap.
# NOTE: df_clap is reassigned by a later cell that aggregates the sword-wave runs.
df_onepeace = pd.read_csv('results/one-peace_validation_results.csv')
df_clap = pd.read_csv('results/baseline_validation_results.csv')

#### Sword Swoosh / Wave Noise Example

In [5]:
# Row label 285 of the baseline (CLAP) results: the sword-swoosh caption example.
clap_sample = df_clap.loc[285]
clap_sample

caption                                                  The sword swooshes through the air as someone waves it, making a whooshing sound.
source_path                                                                                         lass_validation/699930_10996917-hq.wav
noise_path                                                                                          lass_validation/683018_10624050-hq.wav
input_path            /fs/nexus-scratch/vla/lass_validation_baseline_output/('699930_10996917-hq.wav', '683018_10624050-hq.wav')_input.wav
output_path          /fs/nexus-scratch/vla/lass_validation_baseline_output/('699930_10996917-hq.wav', '683018_10624050-hq.wav')_output.wav
input_similarity                                                                                                                  0.390963
output_similarity                                                                                                                 0.560391
target_similarity          

In [6]:
# Same row label (285) in the ONE-PEACE results, for a side-by-side comparison.
onepeace_sample = df_onepeace.loc[285]
onepeace_sample

caption                                                  The sword swooshes through the air as someone waves it, making a whooshing sound.
source_path                                                                                         lass_validation/699930_10996917-hq.wav
noise_path                                                                                          lass_validation/683018_10624050-hq.wav
input_path            /fs/nexus-scratch/vla/lass_validation_onepeace_output/('699930_10996917-hq.wav', '683018_10624050-hq.wav')_input.wav
output_path          /fs/nexus-scratch/vla/lass_validation_onepeace_output/('699930_10996917-hq.wav', '683018_10624050-hq.wav')_output.wav
input_similarity                                                                                                                  0.163938
output_similarity                                                                                                                 0.315517
target_similarity          

In [7]:
# Keep the example's caption text; it is used below to find the matching row
# in the synthetic validation index.
caption = onepeace_sample['caption']
caption

'The sword swooshes through the air as someone waves it, making a whooshing sound.'

In [8]:
# Read the synthetic validation index and preview its first two rows.
df_synth_validation = pd.read_csv('lass_synthetic_validation.csv')
df_synth_validation.head(2)

Unnamed: 0,source,noise,snr,caption
0,692211_12333864-hq,701692_6014995-hq,10,"Someone is playing a kind of musical instrument, which makes a buzzing sound."
1,692211_12333864-hq,708399_14710576-hq,1,"Someone is playing the musical instrument, which produces buzzing sounds."


In [9]:
# Every column name except 'source' carries a leading space (' noise', ' snr',
# ' caption'), so cells below must include that space when indexing.
df_synth_validation.columns

Index(['source', ' noise', ' snr', ' caption'], dtype='object')

In [10]:
# Select the validation row(s) whose caption matches the example.
# .copy() makes df_eval an independent frame, so it can be modified later
# without pandas raising SettingWithCopyWarning or touching df_synth_validation.
df_eval = df_synth_validation[df_synth_validation[' caption'] == caption].copy()
df_eval

Unnamed: 0,source,noise,snr,caption
285,699930_10996917-hq,683018_10624050-hq,15,"The sword swooshes through the air as someone waves it, making a whooshing sound."


In [11]:
# ONE-PEACE run: config, query-encoder checkpoint and separation-network checkpoint.
# NOTE(review): these are absolute paths on a shared scratch filesystem and
# will not resolve on another machine.
op_config_yaml = 'config/audiosep_onepeace.yaml'
op_encoder_checkpoint_path = '/fs/nexus-scratch/vla/finetune_al_retrieval.pt'
op_ssnet_checkpoint_path = '/fs/nexus-scratch/vla/checkpoints/train/audiosep_onepeace,devices=1/step=140000.ckpt'

# CLAP baseline run: same three settings, relative to the working directory.
clap_config_yaml = 'config/audiosep_base.yaml'
clap_encoder_checkpoint_path = './checkpoint/music_speech_audioset_epoch_15_esc_89.98.pt'
clap_ssnet_checkpoint_path = 'checkpoint/audiosep_baseline.ckpt'

In [None]:
# Re-run the sword-swoosh mixture with alternative captions on both encoders.
# Slow: every iteration reloads both models from their checkpoints.
captions = ['The sword slashes through the air.',
            'The sword makes a slashing sound.',
            'The sword slices through the air.',
            'The sword makes a slicing sound.',
            'Sword waving around.',
            'The waving sword swooshes around.',
            'The waving sword waves around wavily',
           ]

# One entry per system: (encoder type, encoder checkpoint, separation-net checkpoint, config).
encoder_runs = [
    ('ONE-PEACE', op_encoder_checkpoint_path, op_ssnet_checkpoint_path, op_config_yaml),
    ('CLAP', clap_encoder_checkpoint_path, clap_ssnet_checkpoint_path, clap_config_yaml),
]

# The loop variable is deliberately NOT called `caption`: that global holds the
# original caption used to build df_eval, and overwriting it would make
# re-running the df_eval cell return an empty frame.
for i, variant_caption in enumerate(captions):

    output_dir = f'sword_wave_{i}'
    os.makedirs(output_dir, exist_ok=True)

    # Write a one-row eval index with the caption swapped in. assign() returns a
    # new frame, so df_eval itself keeps the original caption between iterations.
    df_eval.assign(**{' caption': variant_caption}).to_csv('sword_wave_eval.csv', index=False)

    for encoder_type, encoder_ckpt, ssnet_ckpt, run_config_yaml in encoder_runs:
        dcase_evaluator = DCASEEvaluatorAnalysis(
            sampling_rate=16000,
            eval_indexes='sword_wave_eval.csv',
            audio_dir='lass_validation',
            output_dir=output_dir,
            encoder_type=encoder_type
        )

        # eval() writes <output_dir>/<encoder_type>_output.csv as a side effect.
        df_results = eval(dcase_evaluator,
                          encoder_checkpoint_path=encoder_ckpt,
                          ssnet_checkpoint_path=ssnet_ckpt,
                          config_yaml=run_config_yaml,
                          device="cuda",
                          encoder_type=encoder_type
        )
        torch.cuda.empty_cache()
        gc.collect()

    # Record which caption produced this directory's outputs.
    # The `with` block closes the file; no explicit close() is needed.
    with open(os.path.join(output_dir, 'caption.txt'), 'w') as f:
        f.write(variant_caption)
    


In [27]:
# Gather the per-encoder outputs for caption variants 0, 2 and 6.
# (The original note read "0,2,6 + original"; the original caption's results
# are not loaded in this cell.)
# NOTE: this rebinds df_clap, which earlier held the baseline validation results.
clap_frames = []
op_frames = []
for i in [0, 2, 6]:
    output_dir = f'sword_wave_{i}'
    clap_frames.append(pd.read_csv(os.path.join(output_dir, 'CLAP_output.csv')))
    op_frames.append(pd.read_csv(os.path.join(output_dir, 'ONE-PEACE_output.csv')))

# Concatenate once instead of growing a DataFrame inside the loop.
df_clap = pd.concat(clap_frames, ignore_index=True)
df_op = pd.concat(op_frames, ignore_index=True)

 

In [30]:
# CLAP: similarity scores and separation metrics for each caption variant.
df_clap.loc[:, [
    'caption',
    'input_similarity', 'output_similarity', 'target_similarity',
    'sisdr', 'sdri', 'sdr',
]]

Unnamed: 0,caption,input_similarity,output_similarity,target_similarity,sisdr,sdri,sdr
0,The sword slashes through the air.,0.307211,0.313242,0.304953,28.022077,12.944131,27.94413
1,The sword slices through the air.,0.294347,0.29073,0.255574,28.422885,13.386739,28.386738
2,The waving sword waves around wavily,0.238028,0.218852,0.112427,5.634134,-8.31842,6.681579


In [32]:
# ONE-PEACE: similarity scores and separation metrics for each caption variant.
df_op.loc[:, [
    'caption',
    'input_similarity', 'output_similarity', 'target_similarity',
    'sisdr', 'sdri', 'sdr',
]]

Unnamed: 0,caption,input_similarity,output_similarity,target_similarity,sisdr,sdri,sdr
0,The sword slashes through the air.,0.161676,0.414065,0.382297,0.859136,-11.784411,3.215588
1,The sword slices through the air.,0.242677,0.435143,0.456006,27.272389,12.102124,27.102123
2,The waving sword waves around wavily,0.158412,0.136753,0.243362,-39.099274,-15.075331,-0.075332


In [36]:
# Persist the aggregated CLAP sword-wave results without the row index.
df_clap.to_csv('results/clap_sword_wave.csv', index=False)

In [35]:
# Persist the aggregated ONE-PEACE sword-wave results without the row index.
df_op.to_csv('results/op_sword_wave.csv', index=False)