In [1]:
import os
import sys
import json
import pandas as pd

import random
# Fix the global RNG seed so the random caption choice made later in this notebook is repeatable.
random.seed(1234)

In [2]:
# Absolute root directories of the two datasets processed in this notebook.
fsd_dir = '/fs/nexus-scratch/vla/FSD50K'
clotho_dir = '/fs/nexus-scratch/vla/Clotho'

In [3]:
%ls '/fs/nexus-scratch/vla/Clotho'

[0m[01;32mclotho_captions_development.csv[0m*  [01;32mclotho_metadata_evaluation.csv[0m*  LICENSE
[01;32mclotho_captions_evaluation.csv[0m*   [01;32mclotho_metadata_validation.csv[0m*  [34;42mvalidation[0m/
[01;32mclotho_captions_validation.csv[0m*   [34;42mdevelopment[0m/
[01;32mclotho_metadata_development.csv[0m*  [34;42mevaluation[0m/


In [4]:
# Build the Clotho datafile entries: one {'wav', 'caption'} record per row of each
# split's captions CSV.
splits = ['development', 'validation', 'evaluation']
gather = []

# Dedicated RNG seeded with the same value as the notebook-level random.seed(1234).
# On a fresh Restart & Run All (with nothing else drawing from the global RNG first)
# this selects the same captions as before, and re-running only this cell now
# reproduces the same selection instead of continuing the global random stream.
caption_rng = random.Random(1234)

for split in splits:

    captions_path = os.path.join(clotho_dir, 'clotho_captions_' + split + '.csv')
    captions_df = pd.read_csv(captions_path)

    for row in captions_df.to_dict(orient='records'):
        # NOTE: Clotho has 5 captions per sound file (columns caption_1 .. caption_5).
        # One is picked at random for now; emitting all five would instead give a
        # dataset 5x the size.
        caption_number = caption_rng.randint(1, 5)

        entry = {
            'wav': os.path.join(clotho_dir, split, row['file_name']),
            'caption': row[f'caption_{caption_number}'],
        }
        gather.append(entry)

    # Running total of gathered entries after each split.
    print(len(gather))


3839
4884
5929


In [5]:
# Total number of entries gathered across the three Clotho splits.
len(gather)

5929

In [6]:
# Inspect the most recently appended entry to sanity-check the record layout.
gather[-1]

{'wav': '/fs/nexus-scratch/vla/Clotho/evaluation/FR.BirdChatAmbience.26.wav',
 'caption': 'Several birds are singing outside as people speak in the background.'}

In [7]:
# Serialize every gathered entry under a single top-level 'data' key.
datafile = {'data': gather}

with open('clotho.json', 'w', encoding='utf-8') as out_file:
    # ensure_ascii=False keeps non-ASCII characters as-is; indent=4 pretty-prints.
    json.dump(datafile, out_file, indent=4, ensure_ascii=False)

In [None]:
# Small sample file for quick tests: only the first 100 gathered entries.
# NOTE(review): `gather` is reassigned by the FSD50K cell further down, so this cell
# must run while `gather` still holds the Clotho entries.
datafile = {'data': gather[:100]}

with open('clotho_sample.json', 'w', encoding='utf-8') as out_file:
    json.dump(datafile, out_file, indent=4, ensure_ascii=False)

In [16]:
%ls '/fs/nexus-scratch/vla/FSD50K'

[0m[01;34mFSD50K.dev_audio[0m/             [01;34mFSD50K.eval_audio[0m/             [01;34mFSD50K.metadata[0m/
fsd50k_dev_auto_caption.json  fsd50k_eval_auto_caption.json  links.txt
[01;34mFSD50K.doc[0m/                   [01;34mFSD50K.ground_truth[0m/


In [31]:
# Build the FSD50K datafile entries from the per-split auto-caption JSON files.
splits = ['dev', 'eval']

gather = []
for split in splits:

    audio_dir = os.path.join(fsd_dir, 'FSD50K.' + split + '_audio')
    print(audio_dir)
    captions_path = os.path.join(fsd_dir, 'fsd50k_' + split + '_auto_caption.json')

    # Read explicitly as UTF-8 (the encoding this notebook uses when writing its own
    # JSON files) rather than relying on the platform's default text encoding.
    with open(captions_path, 'r', encoding='utf-8') as f:
        captions_json = json.load(f)

    # Prefix each entry's 'wav' value with this split's audio directory.
    # NOTE(review): assumes the caption files store 'wav' as a path relative to the
    # audio directory (e.g. just the file name) — confirm against the JSON files.
    for entry in captions_json['data']:
        entry['wav'] = os.path.join(audio_dir, entry['wav'])

    gather.extend(captions_json['data'])


/fs/nexus-scratch/vla/FSD50K/FSD50K.dev_audio
/fs/nexus-scratch/vla/FSD50K/FSD50K.eval_audio


In [None]:
# Number of FSD50K entries gathered (dev and eval splits combined).
len(gather)

51197

In [47]:
# Inspect the first gathered entry to confirm 'wav' now carries the audio directory prefix.
gather[0]

{'wav': '/fs/nexus-scratch/vla/FSD50K/FSD50K.dev_audio/10000.wav',
 'caption': 'The act of breathing creates audible respiratory sounds.'}

In [None]:
# Serialize every gathered entry under a single top-level 'data' key.
datafile = {'data': gather}

with open('FSD50K.json', 'w', encoding='utf-8') as out_file:
    # ensure_ascii=False keeps non-ASCII characters as-is; indent=4 pretty-prints.
    json.dump(datafile, out_file, indent=4, ensure_ascii=False)