In [1]:
import concurrent.futures
import json
import pathlib
import subprocess

import pandas as pd
import requests

from common import *

# Requires the `ffmpeg` and `youtube-dl` command-line tools to be installed and available on PATH.

In [2]:
# Download the mid -> display-name mapping once; later runs reuse the cached file.
if not MID_TO_NAME_PATH.exists():
    response = requests.get(
        "http://storage.googleapis.com/us_audioset/youtube_corpus/strong/mid_to_display_name.tsv",
        timeout=60,  # requests has no default timeout; avoid hanging the notebook forever
    )
    # Fail loudly on an HTTP error instead of caching an error page as if it were the TSV
    # (the exists() guard above would otherwise never retry the download).
    response.raise_for_status()
    # Store the payload byte-for-byte rather than decoding and re-encoding it.
    MID_TO_NAME_PATH.write_bytes(response.content)

In [3]:
# Download each dataset file listed at https://research.google.com/audioset/download_strong.html
# Files that are already present locally are not fetched again.
for dataset in GOOGLE_AUDIO_DATASETS:
    if dataset.exists():
        continue
    response = requests.get(
        f"http://storage.googleapis.com/us_audioset/youtube_corpus/strong/{dataset.name}",
        timeout=60,  # requests has no default timeout; avoid hanging the notebook forever
    )
    # Fail loudly on an HTTP error instead of caching an error page as if it were the dataset
    # (the exists() check above would otherwise never retry the download).
    response.raise_for_status()
    # Store the payload byte-for-byte rather than decoding and re-encoding it.
    dataset.write_bytes(response.content)

In [4]:
# Parse every tab-separated dataset file and stack the rows into one frame.
df = pd.concat(
    objs=[pd.read_csv(tsv_path, sep="\t") for tsv_path in GOOGLE_AUDIO_DATASETS],
)
df

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label
0,s9d-2nhuJCQ_30000,0.000,10.000,/m/04rlf
1,s9d-2nhuJCQ_30000,2.627,7.237,/m/053hz1
2,s9d-2nhuJCQ_30000,2.627,9.239,/m/03qtwd
3,s9d-2nhuJCQ_30000,5.634,6.649,/m/01w250
4,s9d-2nhuJCQ_30000,7.201,8.560,/m/0l15bq
...,...,...,...,...
934816,cq-vfngNXMc_70000,7.836,8.015,/m/07qjznt
934817,cq-vfngNXMc_70000,8.226,8.511,/t/dd00099
934818,cq-vfngNXMc_70000,8.503,8.868,/m/05zppz
934819,cq-vfngNXMc_70000,9.217,9.624,/t/dd00099


In [5]:
# segment_id looks like "<ytid>_<offset>"; the ytid itself may contain "_",
# so only the last underscore is used as the separator.
id_parts = df["segment_id"].str.rsplit("_", n=1, expand=True)
df["ytid"] = id_parts[0]
# the offset suffix is divided by 1000 to obtain seconds
df["start_seconds"] = id_parts[1].astype(int) / 1000
# all clips of same length
df["duration"] = 10.0
df

Unnamed: 0,segment_id,start_time_seconds,end_time_seconds,label,ytid,start_seconds,duration
0,s9d-2nhuJCQ_30000,0.000,10.000,/m/04rlf,s9d-2nhuJCQ,30.0,10.0
1,s9d-2nhuJCQ_30000,2.627,7.237,/m/053hz1,s9d-2nhuJCQ,30.0,10.0
2,s9d-2nhuJCQ_30000,2.627,9.239,/m/03qtwd,s9d-2nhuJCQ,30.0,10.0
3,s9d-2nhuJCQ_30000,5.634,6.649,/m/01w250,s9d-2nhuJCQ,30.0,10.0
4,s9d-2nhuJCQ_30000,7.201,8.560,/m/0l15bq,s9d-2nhuJCQ,30.0,10.0
...,...,...,...,...,...,...,...
934816,cq-vfngNXMc_70000,7.836,8.015,/m/07qjznt,cq-vfngNXMc,70.0,10.0
934817,cq-vfngNXMc_70000,8.226,8.511,/t/dd00099,cq-vfngNXMc,70.0,10.0
934818,cq-vfngNXMc_70000,8.503,8.868,/m/05zppz,cq-vfngNXMc,70.0,10.0
934819,cq-vfngNXMc_70000,9.217,9.624,/t/dd00099,cq-vfngNXMc,70.0,10.0


In [6]:
# Look up the label ids whose display name is one of the requested NAMES.
wanted_rows = LABEL_TO_NAME_DF["name"].isin(NAMES)
labels = LABEL_TO_NAME_DF.loc[wanted_rows, "label"]
labels

2       /m/01280g
42      /m/01j4z9
126    /m/03m9d0z
181      /m/06mb1
394      /m/0jb2l
Name: label, dtype: object

In [7]:
# Keep only rows carrying one of the selected labels, drop the per-event timing columns
# so that each (segment, label) pair collapses to a single row, then attach the display
# name of each label.
selected = (
    df[df["label"].isin(labels)]
    .drop(columns=["start_time_seconds", "end_time_seconds"])
    .drop_duplicates()
)
df = pd.merge(LABEL_TO_NAME_DF, selected)

# Take at most 500 samples from each class.
# The shuffle is seeded so the same segments are chosen on every run; an unseeded draw
# would select different clips each time, and the download step (which only skips files
# that already exist) would then keep fetching new ones.
# Shuffling once and taking the first 500 rows per group is a uniform random subset per
# class and avoids groupby.apply, whose treatment of the grouping column differs between
# pandas releases.
df = (
    df.sample(frac=1, random_state=0)
    .groupby("name", sort=False)
    .head(500)
    .sort_values("name", kind="stable")  # present the classes grouped together, ordered by name
    .reset_index(drop=True)
)
df

Unnamed: 0,label,name,segment_id,ytid,start_seconds,duration
0,/m/01j4z9,Chainsaw,FbUfifj5GZo_30000,FbUfifj5GZo,30.0,10.0
1,/m/01j4z9,Chainsaw,AdCQrrGANTM_30000,AdCQrrGANTM,30.0,10.0
2,/m/01j4z9,Chainsaw,pjtd_BgAg-I_130000,pjtd_BgAg-I,130.0,10.0
3,/m/01j4z9,Chainsaw,rTKuABX5LJ4_260000,rTKuABX5LJ4,260.0,10.0
4,/m/01j4z9,Chainsaw,wE1Y12sIk20_40000,wE1Y12sIk20,40.0,10.0
...,...,...,...,...,...,...
1934,/m/03m9d0z,Wind,-YjW4r2hUTA_30000,-YjW4r2hUTA,30.0,10.0
1935,/m/03m9d0z,Wind,32f7FdCLKSY_30000,32f7FdCLKSY,30.0,10.0
1936,/m/03m9d0z,Wind,SDeYz-Z07J8_30000,SDeYz-Z07J8,30.0,10.0
1937,/m/03m9d0z,Wind,UQHdzPdO7aU_110000,UQHdzPdO7aU,110.0,10.0


In [8]:
def _report(message):
    """Print one full line with a single write so that lines emitted by
    concurrent worker threads are less likely to run into each other."""
    print(f"{message}\n", end="")


def download_yt_strip(d):
    """
    Download a slice of a YouTube video's audio track as an mp3 file.

    d is a mapping with the keys "segment_id", "name", "ytid", "start_seconds"
    and "duration". The clip starts at start_seconds, lasts duration seconds and
    is stored as AUDIO_DIR / name / "<segment_id>.mp3".

    Nothing is downloaded when the target file already exists. Failures of
    youtube-dl or ffmpeg are reported on stdout and leave no output file
    behind, so a later run will retry the clip. Always returns None.
    """
    segid, name, ytid = d["segment_id"], d["name"], d["ytid"]
    start_seconds, duration = d["start_seconds"], d["duration"]

    outfile = AUDIO_DIR / name / f"{segid}.mp3"
    if outfile.exists():
        _report(f"skipping {ytid}")
        return

    # Ask youtube-dl only for the direct URL of the audio stream; ffmpeg does the download.
    res = subprocess.run(["youtube-dl", "--get-url", "--extract-audio", f"http://youtu.be/{ytid}"],
                         capture_output=True,
                         text=True)
    if res.returncode != 0:
        _report(f"error with ytid {ytid}: {res.stderr}")
        return

    url = res.stdout.strip()
    if not url:
        _report(f"error with ytid {ytid}: youtube-dl returned no url")
        return

    _report(f"grabbing {ytid}")
    # Encode into a temporary sibling file first: a failed or interrupted conversion must
    # not leave a truncated "<segment_id>.mp3" that later runs would treat as finished.
    # The temporary name still ends in ".mp3" so ffmpeg picks the same container format.
    partfile = outfile.with_name(f"{segid}.part.mp3")
    conv = subprocess.run(["ffmpeg", "-nostdin", "-y", "-loglevel", "error",
                           "-ss", f"{start_seconds}", "-i", url,
                           "-t", f"{duration}", "-c", "mp3", str(partfile)],
                          capture_output=True,
                          text=True)
    if conv.returncode != 0:
        if partfile.exists():
            partfile.unlink()
        _report(f"error converting ytid {ytid}: {conv.stderr}")
        return

    # Publish the finished clip under its final name.
    partfile.replace(outfile)

In [9]:
# Create one output directory per class.
# parents=True also creates AUDIO_DIR itself when it does not exist yet, so the cell
# works on a fresh checkout; exist_ok=True keeps re-runs harmless.
for name in df["name"].unique():
    (AUDIO_DIR / name).mkdir(parents=True, exist_ok=True)

In [13]:
# Download all clips concurrently.
# Each task is submitted individually and its result is collected: executor.map only
# re-raises a worker's exception when its result iterator is consumed, so an unconsumed
# map() would silently drop failures (for example a missing youtube-dl executable).
with concurrent.futures.ThreadPoolExecutor() as executor:
    futures = {
        executor.submit(download_yt_strip, record): record["segment_id"]
        for record in df.to_dict(orient="records")
    }
    for future in concurrent.futures.as_completed(futures):
        try:
            future.result()
        except Exception as exc:
            # Report the failure and keep going so one bad clip does not hide the others.
            print(f"download failed for {futures[future]}: {exc!r}")

skipping FbUfifj5GZo
skipping AdCQrrGANTMskipping pjtd_BgAg-I

skipping wE1Y12sIk20skipping rTKuABX5LJ4

skipping Vsz6dpRCwSoskipping vP0eDTXyf6sskipping 7BMVg6l2qB0skipping ttsVYFpBAPA
skipping nbZVFniKeOU
skipping 8Da6w-Eg3Ko
skipping 6jf2GKqGHeI



skipping 6iDiC8UJeCIskipping WRdF3NCz9RM
skipping 0okOaucWIzo
skipping DjATl6PNShw
skipping _fNnM9pr1kM
skipping km6wBBns7T8
skipping NJUl3gPX07o
skipping wI8Qs7C-JvA
skipping Gi-Qt6cYeqA
skipping UBEGUXdIZD8
skipping xwN_Rol5tEY
skipping l2EyMhD0IBoskipping ikreHctdx9k
skipping 36yvY7tUqIY
skipping 4f86keIsw7s
skipping B7bNn4BeyKc
skipping bLmeA-3vAN8
skipping VaGwYIZA5mk
skipping zDqI6A-hpYc
skipping p6C0ZGTj1Qw
skipping BgAnsqHXWhI
skipping Bv_-ogBjngI
skipping RhE-dwM8xlw

skipping FZ9i8-5cXy8
skipping XZb_KCgzAKI
skipping n6TL_lCalGU

skipping EoJxkGYGdxUskipping SDNG7SKNUm8skipping C46X66FU_Dw
skipping 0tray-DsgLA
skipping EZpffF9di-Y
skipping 2QGkgIAKaQE
skipping Yf-XSqHz148
skipping KpcEcWJknBs
skipping wZr_M1dUsFw
skipping 6Z1eAb