In [2]:
import os
import zipfile
import shutil
import requests
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
def get_project_root():
    try:
        # Script mode
        return Path(__file__).resolve().parents[1]
    except NameError:
        # Notebook or interactive mode
        return Path.cwd().parents[0]


In [4]:
ROOT_DIR = get_project_root()
AUDIO_DATA_DIR = ROOT_DIR / "audio_data"
AUDIO_DATA_DIR.mkdir(parents=True, exist_ok=True)

DOWNLOADS = [
    {
        "name": "UrbanSound8K",
        "url": "https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz",
        "filename": "UrbanSound8K.tar.gz",
        "extract_folder": "urbansound8k"
    },
    {
        "name": "FSD50K Dev Audio",
        "url": "https://zenodo.org/record/4060432/files/FSD50K.dev_audio.tar.gz",
        "filename": "FSD50K.dev_audio.tar.gz",
        "extract_folder": "fsd50k_dev_audio"
    }
]

# === Config ===
DOWNLOAD_URL = "https://github.com/karoldvl/ESC-50/archive/master.zip"
ARCHIVE_NAME = "ESC-50-master.zip"
EXTRACT_DIR = Path("audio_data/ESC-50-master")
GUNSHOT_DIR = Path("audio_data/ESC50_gunshot")

In [13]:
def download_and_extract(url, filename, extract_subdir):
    archive_path = AUDIO_DATA_DIR / filename
    extract_path = AUDIO_DATA_DIR / extract_subdir

    # Download
    if not archive_path.exists():
        print(f"⬇️ Downloading: {filename}")
        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(archive_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        print(f"✅ Downloaded {archive_path}")
    else:
        print(f"✅ Already downloaded: {archive_path}")

    # Extract
    if extract_path.exists() and any(extract_path.iterdir()):
        print(f"✅ Already extracted: {extract_path}")
        return

    print(f"📦 Extracting to: {extract_path}")
    extract_path.mkdir(parents=True, exist_ok=True)
    if filename.endswith(".zip"):
        with zipfile.ZipFile(archive_path, "r") as zip_ref:
            zip_ref.extractall(extract_path)
    elif filename.endswith(".tar.gz"):
        with tarfile.open(archive_path, "r:gz") as tar_ref:
            tar_ref.extractall(extract_path)
    print(f"✅ Extracted to: {extract_path}")

In [None]:
for d in DOWNLOADS:   
    download_and_extract(d["url"], d["filename"], d["extract_folder"])

In [27]:
def download_with_progress(url, output_path):
    """Download file using requests + tqdm progress bar"""
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get("content-length", 0))
    block_size = 1024  # 1KB
    progress_bar = tqdm(total=total_size, unit='B', unit_scale=True, desc=output_path.name)

    with open(output_path, "wb") as f:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()


def download_esc50():
    if not Path(ARCHIVE_NAME).exists():
        print("⬇️ Downloading ESC-50...")
        download_with_progress(DOWNLOAD_URL, Path(ARCHIVE_NAME))
        print("✅ Downloaded.")
    else:
        print("⚠️ ESC-50 ZIP already exists.")


def extract_zip():
    if not EXTRACT_DIR.exists():
        print("📦 Extracting ZIP...")
        with zipfile.ZipFile(ARCHIVE_NAME, "r") as zip_ref:
            zip_ref.extractall("audio_data")
        print("✅ Extracted.")
    else:
        print("⚠️ Already extracted.")


def filter_gunshot():
    meta_csv = EXTRACT_DIR / "meta/esc50.csv"
    audio_dir = EXTRACT_DIR / "audio"
    
    if not meta_csv.exists():
        raise FileNotFoundError("Metadata CSV not found")

    df = pd.read_csv(meta_csv)
    df_gun = df[df['category'] == 'gun_shot']

    print(f"🔫 Found {len(df_gun)} gun_shot samples.")

    GUNSHOT_DIR.mkdir(parents=True, exist_ok=True)

    for fname in df_gun['filename']:
        src = audio_dir / fname
        dst = GUNSHOT_DIR / fname
        if src.exists():
            shutil.copy(src, dst)

    print(f"✅ Copied to: {GUNSHOT_DIR.resolve()}")

In [28]:
download_esc50()
extract_zip()
filter_gunshot()

⬇️ Downloading ESC-50...


ESC-50-master.zip: 646MB [02:24, 4.46MB/s] 


✅ Downloaded.
📦 Extracting ZIP...
✅ Extracted.
🔫 Found 0 gun_shot samples.
✅ Copied to: /Users/ittichaiboonyarakthunya/Documents/WorkDir/ai-ml/labs-gunshot-classification/notebooks/audio_data/esc50_gunshot


In [31]:
df = pd.read_csv(f"{AUDIO_DATA_DIR}/ESC-50-master/meta/esc50.csv")
print("✅ Unique categories:\n", df['category'].unique())

✅ Unique categories:
 ['dog' 'chirping_birds' 'vacuum_cleaner' 'thunderstorm' 'door_wood_knock'
 'can_opening' 'crow' 'clapping' 'fireworks' 'chainsaw' 'airplane'
 'mouse_click' 'pouring_water' 'train' 'sheep' 'water_drops'
 'church_bells' 'clock_alarm' 'keyboard_typing' 'wind' 'footsteps' 'frog'
 'cow' 'brushing_teeth' 'car_horn' 'crackling_fire' 'helicopter'
 'drinking_sipping' 'rain' 'insects' 'laughing' 'hen' 'engine' 'breathing'
 'crying_baby' 'hand_saw' 'coughing' 'glass_breaking' 'snoring'
 'toilet_flush' 'pig' 'washing_machine' 'clock_tick' 'sneezing' 'rooster'
 'sea_waves' 'siren' 'cat' 'door_wood_creaks' 'crickets']


In [38]:
URBANSOUND8K_ZIP_URL = "https://zenodo.org/record/1203745/files/UrbanSound8K.tar.gz"
URBANSOUND8K_ZIP_NAME = ("../audio_data/UrbanSound8K.tar.gz")
EXTRACT_BASE_DIR = Path("../audio_data/UrbanSound8K")
GUNSHOT_DIR = Path("../audio_data/UrbanSound8K_gunshot")
METADATA_CSV = EXTRACT_BASE_DIR / "metadata" / "UrbanSound8K.csv"

In [34]:
def download_with_progress(url, output_path):
    response = requests.get(url, stream=True)
    total_size = int(response.headers.get("content-length", 0))
    block_size = 1024
    progress_bar = tqdm(total=total_size, unit='B', unit_scale=True, desc=output_path.name)
    with open(output_path, "wb") as f:
        for data in response.iter_content(block_size):
            progress_bar.update(len(data))
            f.write(data)
    progress_bar.close()

def download_metadata_csv():
    # UrbanSound8K metadata CSV is part of the archive,
    # but to avoid full download, we can download metadata only from GitHub repo:
    meta_url = "https://raw.githubusercontent.com/urbansounddataset/urbansound8k/master/metadata/UrbanSound8K.csv"
    meta_path = EXTRACT_BASE_DIR / "metadata"
    meta_path.mkdir(parents=True, exist_ok=True)
    meta_csv_path = meta_path / "UrbanSound8K.csv"
    if not meta_csv_path.exists():
        print("⬇️ Downloading UrbanSound8K metadata CSV...")
        download_with_progress(meta_url, meta_csv_path)
    else:
        print("⚠️ Metadata CSV already exists.")
    return meta_csv_path

def extract_gunshot_files_from_archive(archive_path, metadata_csv_path, output_dir):
    import tarfile

    df = pd.read_csv(metadata_csv_path)
    gunshot_files = df[df['class'] == 'gun_shot']
    print(f"🔫 Found {len(gunshot_files)} gun_shot samples in metadata.")

    output_dir.mkdir(parents=True, exist_ok=True)

    with tarfile.open(archive_path, "r:gz") as tar:
        members = tar.getmembers()

        # Debug: preview archive structure
        print("\n📦 Archive structure preview:")
        for m in members[:10]:
            print(" -", m.name)

        # Build expected file paths
        files_to_extract = set()
        for _, row in gunshot_files.iterrows():
            rel_path = f"UrbanSound8K/audio/fold{row['fold']}/{row['slice_file_name']}"
            files_to_extract.add(rel_path)

        # Debug: preview expected match targets
        print("\n📋 First few expected paths to extract:")
        for p in list(files_to_extract)[:5]:
            print(" -", p)

        print(f"\n🎯 Extracting {len(files_to_extract)} gun_shot files...")
        extracted_count = 0

        for member in tqdm(members, desc="Extracting files"):
            if any(member.name.endswith(p) for p in files_to_extract):
                f = tar.extractfile(member)
                if f is not None:
                    out_file_path = output_dir / Path(member.name).name
                    with open(out_file_path, "wb") as out_f:
                        out_f.write(f.read())
                    extracted_count += 1

    print(f"✅ Extracted {extracted_count} files to {output_dir.resolve()}")


In [14]:
URBANSOUND8K_ZIP_NAME

PosixPath('audio_data/UrbanSound8K.tar.gz')

In [15]:
if not Path(URBANSOUND8K_ZIP_NAME).exists():
    print("⬇️ Downloading UrbanSound8K archive (4GB+)...")
    download_with_progress(URBANSOUND8K_ZIP_URL, Path(URBANSOUND8K_ZIP_NAME))
else:
    print("⚠️ Archive already exists.")

⚠️ Archive already exists.


In [18]:
# Step 2: Download metadata CSV if needed
metadata_csv_path = download_metadata_csv()
metadata_csv_path

⚠️ Metadata CSV already exists.


PosixPath('audio_data/UrbanSound8K/metadata/UrbanSound8K.csv')

In [24]:
print(Path(URBANSOUND8K_ZIP_NAME))
print(metadata_csv_path)
print(GUNSHOT_DIR)

audio_data/UrbanSound8K.tar.gz
audio_data/UrbanSound8K/metadata/UrbanSound8K.csv
audio_data/UrbanSound8K_gunshot


In [36]:
audio_metadata_dir = ("../audio_data/UrbanSound8K/metadata/UrbanSound8K.csv")

In [39]:
# Step 3: Extract only gun_shot files
extract_gunshot_files_from_archive(Path(URBANSOUND8K_ZIP_NAME), audio_metadata_dir, GUNSHOT_DIR)

🔫 Found 374 gun_shot samples in metadata.

📦 Archive structure preview:
 - UrbanSound8K
 - UrbanSound8K/.DS_Store
 - UrbanSound8K/audio
 - UrbanSound8K/audio/.DS_Store
 - UrbanSound8K/audio/fold1
 - UrbanSound8K/audio/fold1/.DS_Store
 - UrbanSound8K/audio/fold1/101415-3-0-2.wav
 - UrbanSound8K/audio/fold1/101415-3-0-3.wav
 - UrbanSound8K/audio/fold1/101415-3-0-8.wav
 - UrbanSound8K/audio/fold1/102106-3-0-0.wav

📋 First few expected paths to extract:
 - UrbanSound8K/audio/fold5/164667-6-0-0.wav
 - UrbanSound8K/audio/fold3/135526-6-6-0.wav
 - UrbanSound8K/audio/fold1/76093-6-0-0.wav
 - UrbanSound8K/audio/fold4/135528-6-3-0.wav
 - UrbanSound8K/audio/fold1/46656-6-3-0.wav

🎯 Extracting 374 gun_shot files...


Extracting files:   0%|          | 0/8761 [00:00<?, ?it/s]

✅ Extracted 374 files to /Users/ittichaiboonyarakthunya/Documents/WorkDir/ai-ml/labs-gunshot-classification/audio_data/UrbanSound8K_gunshot


In [40]:
URL = "https://github.com/junwoopark92/PUBG-Gun-Sound-Dataset/archive/main.zip"
ARCHIVE = "BGG_dataset.zip"

def download_with_progress(url, out_path):
    resp = requests.get(url, stream=True)
    total = int(resp.headers.get("content-length", 0))
    with open(out_path, "wb") as f, tqdm(total=total, unit='B', unit_scale=True, desc=out_path) as p:
        for chunk in resp.iter_content(1024):
            f.write(chunk)
            p.update(len(chunk))

if not Path(ARCHIVE).exists():
    download_with_progress(URL, ARCHIVE)
    print("✅ Download complete")
else:
    print("Archive already exists")

BGG_dataset.zip: 0.00B [00:00, ?B/s]

✅ Download complete
