# Data preparation
Download the DCASE 2022 Task 1 dataset and copy a random subset of it into a separate folder.

In [4]:
%load_ext autoreload
%autoreload 2

from util import config
import os
import zipfile
from os.path import join
import random
import pandas as pd
import numpy as np
import shutil

# Fix the seeds of both random number generators for reproducibility
SEED = 10
random.seed(SEED)     # Python's built-in generator
np.random.seed(SEED)  # NumPy's global (legacy) generator

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Step 1: Download and unzip the DCASE 2022 task 1 dataset

In [None]:
# Get the raw development dataset
# The leading `!` is an IPython shell escape: the line is run as a shell command,
# and `{config.RAW_FULL_DATA_ZIP_PATH}` is substituted with the Python value first.
# NOTE(review): presumably `-o` is the output directory and `-d` the Zenodo DOI of
# the record to download -- confirm against the zenodo_get documentation.
!zenodo_get -o {config.RAW_FULL_DATA_ZIP_PATH} -d 10.5281/zenodo.6337421

In [None]:
# After the dataset is downloaded, unzip every archive found in the download folder
for item in os.listdir(config.RAW_FULL_DATA_ZIP_PATH):
    # Skip anything that is not a zip archive
    if not item.endswith(".zip"):
        continue
    # The context manager closes the archive even if the extraction fails
    with zipfile.ZipFile(join(config.RAW_FULL_DATA_ZIP_PATH, item)) as zip_ref:
        zip_ref.extractall(config.RAW_FULL_DATA_PATH)

## Step 2: Randomly pick a subset of the dataset

In [16]:
# Since the 139 970 train and 29 680 test data points provided by the DCASE dataset are too many for my current computational resources,
# randomly take a subset of 10 000 train and 2 000 test data points

# Load the test and train fold csvs (tab-separated, column names assigned explicitly)
# NOTE(review): with `names=` and no `header=` argument pandas treats the first line of
# the file as data -- confirm that the fold csvs do not start with a header row.
train_metadata = pd.read_csv(config.TRAIN_METADATA_FILE, sep='\t', names=['filename', 'scene_label'])
test_metadata = pd.read_csv(config.TEST_METADATA_FILE, sep='\t', names=['filename'])

# Randomly sample 10 000 train and 2 000 test data points.
# An explicit random_state makes the subset independent of how much of the global
# NumPy random state was consumed before this cell ran, so re-running the cell
# always selects the same files.
train_metadata_subset = train_metadata.sample(n=10000, random_state=10)
test_metadata_subset = test_metadata.sample(n=2000, random_state=10)

# Get the train and test files lists
train_files_list = train_metadata_subset["filename"]
test_files_list = test_metadata_subset["filename"]


def _copy_to_subset(files_list, split):
    """Copy every file in files_list from the full raw data folder to the `split` subset folder."""
    for file_name in files_list:
        destination = join(config.RAW_SUBSET_DATA_PATH, f'{split}/{file_name}')
        # shutil.copyfile does not create missing folders, so make sure the target folder exists
        os.makedirs(os.path.dirname(destination), exist_ok=True)
        shutil.copyfile(join(config.RAW_FULL_DATA_PATH, file_name), destination)


# Copy the files to the raw subset folder
_copy_to_subset(train_files_list, 'train')
_copy_to_subset(test_files_list, 'test')
