In [None]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution so that edits to project code are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [None]:
import pickle
from collections import defaultdict
from os.path import join, exists, splitext, basename, isdir
from os import listdir, symlink, makedirs
from glob import glob
from praatio import tgio
from termcolor import colored
from tqdm import tqdm
import pandas as pd
import numpy as np
from librosa import get_duration
import scipy.io.wavfile as wav

from cac.utils.pandas import apply_antifilters

In [None]:
# Root directory of the dataset on disk.
data_root = '/data/freesound-kaggle/'

# Inputs are read from <data_root>/raw, outputs are written to <data_root>/processed.
load_dir = join(data_root, 'raw')
save_root = join(data_root, 'processed')

# Audio sub-folders on the source and destination side.
load_audio_dir = join(load_dir, 'audio')
save_audio_dir = join(save_root, 'audio')

# Create the output tree up front; exist_ok=True keeps this cell re-runnable.
for output_dir in (save_root, save_audio_dir):
    makedirs(output_dir, exist_ok=True)

In [None]:
# All raw .wav files directly inside the source audio folder.
raw_wav_pattern = join(load_audio_dir, '*.wav')
files = glob(raw_wav_pattern)

In [None]:
# Sanity check: number of raw .wav files discovered.
len(files)

In [None]:
# Collect every file that scipy cannot parse as WAV so that the matching
# annotation rows can be dropped further down.
invalid_files = []

for file in tqdm(files, desc='Checking valid files'):
    try:
        # Only the ability to read the file matters; the decoded data is discarded.
        wav.read(file)
    except Exception:
        # An unreadable file is an expected outcome of this scan: record it and
        # continue, rather than stopping in a debugger, so the notebook can be
        # run end-to-end unattended. `Exception` (not a bare except) lets
        # KeyboardInterrupt still abort the loop.
        invalid_files.append(file)

In [None]:
# Number of files that failed the WAV readability check.
len(invalid_files)

In [None]:
# -------- Creating `processed/audio` -------- #

In [None]:
# Mirror every raw .wav into processed/audio as a symlink and remember each
# file's name without its extension.
files = []

raw_wav_paths = glob(join(load_audio_dir, '*.wav'))
for raw_path in tqdm(raw_wav_paths, desc='Creating symlinks processed/ <- raw/'):
    wav_name = basename(raw_path)

    # keep only the stem (drop ".wav")
    files.append(splitext(wav_name)[0])

    link_path = join(save_audio_dir, wav_name)
    # NOTE(review): exists() follows symlinks, so a dangling link left over from
    # a previous run is reported as missing and symlink() would then raise
    # FileExistsError — confirm whether that case needs handling.
    if exists(link_path):
        continue
    symlink(raw_path, link_path)

In [None]:
# -------- Creating `processed/annotation.csv` -------- #

In [None]:
# Load the train and test annotation tables shipped with the raw data.
train_csv_path = join(load_dir, 'train_post_competition.csv')
test_csv_path = join(load_dir, 'test_post_competition_scoring_clips.csv')

train_annotations = pd.read_csv(train_csv_path)
test_annotations = pd.read_csv(test_csv_path)

In [None]:
# making both the DFs have the same columns

In [None]:
# Give the train table a constant `usage` column so that its columns match the
# test table's before the two are combined.
train_annotations = train_annotations.assign(usage='Public')

In [None]:
# Preview the train table after adding the `usage` column.
train_annotations.head()

In [None]:
# Give the test table a constant `manually_verified` column (all rows set to 1)
# so that its columns match the train table's before the two are combined.
test_annotations = test_annotations.assign(manually_verified=1)

In [None]:
# Preview the test table after adding the `manually_verified` column.
test_annotations.head()

In [None]:
# Row counts of the train and test tables before they are combined.
len(train_annotations), len(test_annotations)

In [None]:
# Stack the train and test rows into a single table. pd.concat is used because
# DataFrame.append was removed in pandas 2.0; like append, it keeps each
# frame's own row labels, so the combined index is not guaranteed to be unique.
attributes = pd.concat([train_annotations, test_annotations])

In [None]:
# removing rows for which audio file was not extracted properly or does not exist

In [None]:
# Number of unreadable files whose rows are about to be removed.
len(invalid_files)

In [None]:
# File names (with extension) of the clips that failed the WAV readability check.
invalid_fnames = list(map(basename, invalid_files))

# Hand them to the project's anti-filter helper, keyed on the `fname` column
# (presumably drops rows whose `fname` is in the list — confirm in cac.utils.pandas).
attributes = apply_antifilters(attributes, {'fname': invalid_fnames})

In [None]:
# Shape of the combined table after the anti-filter step.
attributes.shape

In [None]:
# Normalise every label to lower case.
attributes['label'] = attributes['label'].map(str.lower)

In [None]:
# File stems (name without extension), one per remaining row.
files = attributes['fname'].map(lambda fname: splitext(fname)[0]).tolist()

# Each row's label wrapped in a single-element list.
classification_targets = attributes['label'].map(lambda label: [label]).tolist()

# Both lists should have the same length.
len(files), len(classification_targets)

In [None]:
# Each annotation spans the whole clip: it starts at 0.0 and ends at the clip
# duration that librosa reports for the raw audio file.
starts = [0.0] * len(files)

ends = []
for stem in tqdm(files):
    clip_path = join(load_audio_dir, stem + '.wav')
    ends.append(get_duration(filename=clip_path))

In [None]:
# create dataframe storing the data
# Every column is passed as a plain list so rows are aligned purely by position
# and the frame gets a clean 0..n-1 index; passing the `manually_verified`
# Series directly would make the frame inherit that Series' index instead.
final_df = pd.DataFrame(
    {
        'file': files,
        'classification': classification_targets,
        'manually_verified': attributes['manually_verified'].tolist(),
        'start': starts,
        'end': ends,
    }
)

In [None]:
# Preview the annotation table before it is written to disk.
final_df.head()

In [None]:
# Write the annotation table to processed/annotation.csv without the index column.
annotation_save_path = join(save_root, 'annotation.csv')
final_df.to_csv(path_or_buf=annotation_save_path, index=False)

In [None]:
# -------- Creating `processed/attributes.csv` -------- #

In [None]:
# Preview the attributes table before it is written to disk.
attributes.head()

In [None]:
# Write the attributes table to processed/attributes.csv without the index column.
attribute_save_path = join(save_root, 'attributes.csv')
attributes.to_csv(path_or_buf=attribute_save_path, index=False)

In [None]:
# -------- Creating `processed/description.txt` -------- #

In [None]:
# Human-readable summary written to processed/description.txt.
# The labels are listed in lower case because the `label` column is lower-cased
# before the annotation and attribute tables are saved, and the label list is
# closed with its `]`. The line layout of the text is otherwise unchanged.
description = '\
Annotation columns: \n \
`classification`: valid labels = ["acoustic_guitar", "applause", "bark", "bass_drum", "burping_or_eructation", "bus", \n \
    "cello", "chime", "clarinet", "computer_keyboard", "cough", "cowbell", "double_bass", "drawer_open_or_close", \n \
    "electric_piano", "fart", "finger_snapping", "fireworks", "flute", "glockenspiel", "gong", "gunshot_or_gunfire", \n \
    "harmonica", "hi-hat", "keys_jangling", "knock", "laughter", "meow", "microwave_oven", "oboe", "saxophone", "scissors", \n \
    "shatter", "snare_drum", "squeak", "tambourine", "tearing", "telephone", "trumpet", "violin_or_fiddle", "writing"]\n \
\n \
Attributes: \n \
`names`: ["fname", "label", "manually_verified", "freesound_id", "license", "usage"]'

In [None]:
# Write the description text to processed/description.txt, replacing any existing file.
description_path = join(save_root, 'description.txt')
with open(description_path, 'w') as description_file:
    description_file.write(description)