In [11]:
import shutil
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
# Folder that holds the training audio; the names of its entries are used as labels.
DATA_PATH = "tensorflow-speech-recognition-challenge/train/audio"
# Every entry found directly under DATA_PATH, in the order the OS returns them.
labels_names = list(os.listdir(DATA_PATH))

In [3]:
# Labels that keep their own class; every other label is pooled into 'unknown'.
commands = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "silence"]

# Map each original label name to the class it will be stored under.
map_dict = {
    name: (name if name in commands else 'unknown')
    for name in labels_names
}
map_dict

{'bed': 'unknown',
 'bird': 'unknown',
 'cat': 'unknown',
 'dog': 'unknown',
 'down': 'down',
 'eight': 'unknown',
 'five': 'unknown',
 'four': 'unknown',
 'go': 'go',
 'happy': 'unknown',
 'house': 'unknown',
 'left': 'left',
 'marvin': 'unknown',
 'nine': 'unknown',
 'no': 'no',
 'off': 'off',
 'on': 'on',
 'one': 'unknown',
 'right': 'right',
 'seven': 'unknown',
 'sheila': 'unknown',
 'silence': 'silence',
 'six': 'unknown',
 'stop': 'stop',
 'three': 'unknown',
 'tree': 'unknown',
 'two': 'unknown',
 'up': 'up',
 'wow': 'unknown',
 'yes': 'yes',
 'zero': 'unknown'}

In [4]:
# Index every file under DATA_PATH: one row per file with its target class,
# the folder it came from, and its file name.
per_label_frames = []
for root, dirs, files in os.walk(DATA_PATH):
    # os.walk yields DATA_PATH itself first; skip it so only sub-folders are indexed.
    if root == DATA_PATH:
        continue
    # basename() understands the platform's path separator; splitting on "\\"
    # only produced the folder name on Windows and raised KeyError elsewhere.
    current_dir = os.path.basename(root)
    per_label_frames.append(pd.DataFrame({
        'class': [map_dict[current_dir]] * len(files),
        'original_label': [current_dir] * len(files),
        'files': files,
    }))
# Each per-folder frame keeps its own 0..n-1 index, so index values repeat across folders.
frame = pd.concat(per_label_frames, axis=0)

In [5]:
# Show the assembled file index (columns: class, original_label, files).
frame

Unnamed: 0,class,original_label,files
0,unknown,bed,00176480_nohash_0.wav
1,unknown,bed,004ae714_nohash_0.wav
2,unknown,bed,004ae714_nohash_1.wav
3,unknown,bed,00f0204f_nohash_0.wav
4,unknown,bed,00f0204f_nohash_1.wav
...,...,...,...
2371,unknown,zero,ffd2ba2f_nohash_1.wav
2372,unknown,zero,ffd2ba2f_nohash_2.wav
2373,unknown,zero,ffd2ba2f_nohash_3.wav
2374,unknown,zero,ffd2ba2f_nohash_4.wav


In [8]:
# First split: hold out 30 % of the rows, stratified on the original label.
X_train, X_holdout = train_test_split(
    frame,
    test_size=0.3,
    stratify=frame['original_label'],
    random_state=123,
)
# Second split: 0.6666 of the held-out rows become the test set, the rest validation.
X_valid, X_test = train_test_split(
    X_holdout,
    test_size=0.6666,
    stratify=X_holdout['original_label'],
    random_state=42,
)
X_train.shape, X_valid.shape, X_test.shape

((45586, 3), (6513, 3), (13024, 3))

In [12]:
def copy_files(frame, path_to_save):
    """Copy the files listed in ``frame`` into per-class folders under ``path_to_save``.

    Args:
        frame: DataFrame with the columns 'class', 'original_label' and 'files'.
        path_to_save: Destination root; one sub-folder per distinct 'class'
            value is created inside it as needed.

    Each source file is read from ``DATA_PATH/<original_label>/<file>`` and
    written to ``path_to_save/<class>/<original_label><file>``. The original
    label is prefixed to the file name so that files with the same name coming
    from different labels do not overwrite each other inside a shared class
    folder.
    """
    created_dirs = set()
    rows = zip(frame['class'], frame['original_label'], frame['files'])
    # zip() has no length, so tell tqdm the total to get a real progress bar.
    for label, orig, fname in tqdm(rows, total=len(frame)):
        target_dir = os.path.join(path_to_save, label)
        # Create each class folder once instead of once per copied file.
        if target_dir not in created_dirs:
            os.makedirs(target_dir, exist_ok=True)
            created_dirs.add(target_dir)
        src = os.path.join(DATA_PATH, orig, fname)
        dst = os.path.join(target_dir, orig + fname)
        shutil.copy(src, dst)


copy_files(X_test, os.path.join("data", 'test'))

In [None]:
# Materialise the remaining splits on disk, training set first, then validation.
for split_name, split_frame in (('train', X_train), ('val', X_valid)):
    copy_files(split_frame, os.path.join("data", split_name))