In [1]:
import os
import shutil
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit
from tqdm import tqdm

In [2]:
DATA_PATH = "tensorflow-speech-recognition-challenge/train/audio"
# One label per sub-folder of DATA_PATH. Plain files that may sit next to the
# label folders are skipped so they are not treated as labels, and sorting
# makes the order independent of the file system.
labels_names = sorted(
    entry for entry in os.listdir(DATA_PATH)
    if os.path.isdir(os.path.join(DATA_PATH, entry))
)

In [3]:
# Folders named after a target command keep their own label; every other
# folder is folded into the single 'unknown' class.
commands = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "silence"]
map_dict = {
    label: (label if label in commands else 'unknown')
    for label in labels_names
}
map_dict

{'bed': 'unknown',
 'bird': 'unknown',
 'cat': 'unknown',
 'dog': 'unknown',
 'down': 'down',
 'eight': 'unknown',
 'five': 'unknown',
 'four': 'unknown',
 'go': 'go',
 'happy': 'unknown',
 'house': 'unknown',
 'left': 'left',
 'marvin': 'unknown',
 'nine': 'unknown',
 'no': 'no',
 'off': 'off',
 'on': 'on',
 'one': 'unknown',
 'right': 'right',
 'seven': 'unknown',
 'sheila': 'unknown',
 'silence': 'silence',
 'six': 'unknown',
 'stop': 'stop',
 'three': 'unknown',
 'tree': 'unknown',
 'two': 'unknown',
 'up': 'up',
 'wow': 'unknown',
 'yes': 'yes',
 'zero': 'unknown'}

In [4]:
# Build one row per audio file: mapped class, original folder label, file name.
per_label_frames = []
for root, dirs, files in os.walk(DATA_PATH):
    dirs.sort()  # in-place sort makes os.walk visit the label folders in a fixed order
    if root == DATA_PATH:
        continue  # the top-level folder itself is not a label
    # basename handles both "/" and "\\" separators; split("\\") only worked on
    # Windows and returned the whole path (-> KeyError in map_dict) elsewhere.
    current_dir = os.path.basename(root)
    # Sorted so the row order does not depend on the file system listing order.
    file_names = sorted(files)
    per_label_frames.append(pd.DataFrame({
        'class': [map_dict[current_dir]] * len(file_names),
        'original_label': [current_dir] * len(file_names),
        'files': file_names
    }))
# Each per-folder frame keeps its own 0..n-1 index, so index values repeat across folders.
frame = pd.concat(per_label_frames, axis=0)

In [5]:
# Display the assembled frame: one row per file with its mapped class and source folder.
frame

Unnamed: 0,class,original_label,files
0,unknown,bed,00176480_nohash_0.wav
1,unknown,bed,004ae714_nohash_0.wav
2,unknown,bed,004ae714_nohash_1.wav
3,unknown,bed,00f0204f_nohash_0.wav
4,unknown,bed,00f0204f_nohash_1.wav
...,...,...,...
2371,unknown,zero,ffd2ba2f_nohash_1.wav
2372,unknown,zero,ffd2ba2f_nohash_2.wav
2373,unknown,zero,ffd2ba2f_nohash_3.wav
2374,unknown,zero,ffd2ba2f_nohash_4.wav


In [6]:
# Non-null entries per column, grouped by class, to see how the classes are balanced.
frame.groupby("class").count()

Unnamed: 0_level_0,original_label,files
class,Unnamed: 1_level_1,Unnamed: 2_level_1
down,2359,2359
go,2372,2372
left,2353,2353
no,2375,2375
off,2357,2357
on,2367,2367
right,2367,2367
silence,402,402
stop,2380,2380
unknown,41039,41039


In [7]:
# person_id is the part of the file name before the first "_"
# (e.g. "00176480_nohash_0.wav" -> "00176480"); it is used later as the
# grouping key for the splits.
file_name_parts = frame["files"].str.split("_")
frame["person_id"] = file_name_parts.str[0]
frame

Unnamed: 0,class,original_label,files,person_id
0,unknown,bed,00176480_nohash_0.wav,00176480
1,unknown,bed,004ae714_nohash_0.wav,004ae714
2,unknown,bed,004ae714_nohash_1.wav,004ae714
3,unknown,bed,00f0204f_nohash_0.wav,00f0204f
4,unknown,bed,00f0204f_nohash_1.wav,00f0204f
...,...,...,...,...
2371,unknown,zero,ffd2ba2f_nohash_1.wav,ffd2ba2f
2372,unknown,zero,ffd2ba2f_nohash_2.wav,ffd2ba2f
2373,unknown,zero,ffd2ba2f_nohash_3.wav,ffd2ba2f
2374,unknown,zero,ffd2ba2f_nohash_4.wav,ffd2ba2f


In [8]:
# The prefix of a silence file name is not a person id, so give every silence
# row its own id ("0", "1", ...), making each clip a separate group.
# The number of ids is derived from the mask instead of a hard-coded 402,
# which would raise on a length mismatch if the number of silence files changed.
silence_mask = frame["class"] == "silence"
frame.loc[silence_mask, 'person_id'] = np.arange(silence_mask.sum()).astype(str)
frame

Unnamed: 0,class,original_label,files,person_id
0,unknown,bed,00176480_nohash_0.wav,00176480
1,unknown,bed,004ae714_nohash_0.wav,004ae714
2,unknown,bed,004ae714_nohash_1.wav,004ae714
3,unknown,bed,00f0204f_nohash_0.wav,00f0204f
4,unknown,bed,00f0204f_nohash_1.wav,00f0204f
...,...,...,...,...
2371,unknown,zero,ffd2ba2f_nohash_1.wav,ffd2ba2f
2372,unknown,zero,ffd2ba2f_nohash_2.wav,ffd2ba2f
2373,unknown,zero,ffd2ba2f_nohash_3.wav,ffd2ba2f
2374,unknown,zero,ffd2ba2f_nohash_4.wav,ffd2ba2f


In [9]:
# Inspect only the silence rows, including the person_id values present on them.
frame.loc[frame["class"] == "silence"]

Unnamed: 0,class,original_label,files,person_id
0,silence,silence,doing_the_dishes_0.wav,0
1,silence,silence,doing_the_dishes_1.wav,1
2,silence,silence,doing_the_dishes_10.wav,2
3,silence,silence,doing_the_dishes_11.wav,3
4,silence,silence,doing_the_dishes_12.wav,4
...,...,...,...,...
397,silence,silence,white_noise_59.wav,397
398,silence,silence,white_noise_6.wav,398
399,silence,silence,white_noise_7.wav,399
400,silence,silence,white_noise_8.wav,400


In [10]:
# Non-null entries per column for each person_id, i.e. how many files each split group holds.
frame.groupby("person_id").count()

Unnamed: 0_level_0,class,original_label,files
person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1,1,1
00176480,7,7,7
004ae714,30,30,30
00b01445,30,30,30
00f0204f,30,30,30
...,...,...,...
ffa76c4a,27,27,27
ffb86d3c,30,30,30
ffbb695d,30,30,30
ffd2ba2f,110,110,110


In [11]:
# Group-wise hold-out: rows sharing a person_id always land on the same side,
# so the same person_id never appears in both train and test.
splitter = GroupShuffleSplit(n_splits=2, test_size=.2, random_state=7)
split = splitter.split(frame, y=frame["class"], groups=frame['person_id'])
train_inds, test_inds = next(split)  # only the first generated split is used
X_train = frame.iloc[train_inds].reset_index(drop=True)
X_test = frame.iloc[test_inds].reset_index(drop=True)
X_train.shape, X_test.shape

((52483, 4), (12640, 4))

In [12]:
# Second group-wise split: carve the validation set out of the training rows.
# NOTE: X_train is overwritten with its own subset, so re-running this cell on
# its own shrinks the training set again; re-run the train/test split cell first.
splitter = GroupShuffleSplit(n_splits=2, test_size=.125, random_state=7)
split = splitter.split(X_train, y=X_train["class"], groups=X_train['person_id'])
train_inds, val_ids = next(split)  # only the first generated split is used
X_valid = X_train.iloc[val_ids]
X_train = X_train.iloc[train_inds]
X_train.shape, X_valid.shape, X_test.shape

((46564, 4), (5919, 4), (12640, 4))

In [13]:
# Class distribution of the training split.
X_train.groupby("class").count()

Unnamed: 0_level_0,original_label,files,person_id
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
down,1707,1707,1707
go,1704,1704,1704
left,1707,1707,1707
no,1752,1752,1752
off,1680,1680,1680
on,1691,1691,1691
right,1716,1716,1716
silence,266,266,266
stop,1719,1719,1719
unknown,29188,29188,29188


In [14]:
# Class distribution of the validation split.
X_valid.groupby("class").count()

Unnamed: 0_level_0,original_label,files,person_id
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
down,218,218,218
go,217,217,217
left,206,206,206
no,198,198,198
off,218,218,218
on,215,215,215
right,203,203,203
silence,50,50,50
stop,208,208,208
unknown,3783,3783,3783


In [15]:
# Class distribution of the test split.
X_test.groupby("class").count()

Unnamed: 0_level_0,original_label,files,person_id
class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
down,434,434,434
go,451,451,451
left,440,440,440
no,425,425,425
off,459,459,459
on,461,461,461
right,448,448,448
silence,86,86,86
stop,453,453,453
unknown,8068,8068,8068


In [16]:
def copy_files(frame, path_to_save, data_path=None):
    """Copy every file listed in ``frame`` into per-class folders.

    Each file is read from ``<data_path>/<original_label>/<file>`` and written
    to ``<path_to_save>/<class>/<original_label><file>``. Prepending the
    original label to the file name keeps same-named files from different
    source folders distinct once they share a class folder.

    Args:
        frame: DataFrame with 'class', 'original_label' and 'files' columns.
        path_to_save: Root folder for the copies; the class sub-folders are
            created as needed.
        data_path: Root folder of the source audio. Defaults to ``DATA_PATH``.
    """
    if data_path is None:
        data_path = DATA_PATH
    # Create each class folder once up front instead of once per copied file.
    for label in frame['class'].unique():
        os.makedirs(os.path.join(path_to_save, label), exist_ok=True)
    rows = zip(frame['class'], frame['original_label'], frame['files'])
    # zip() has no length, so pass total explicitly to get a real progress bar.
    for label, orig, fname in tqdm(rows, total=len(frame)):
        src = os.path.join(data_path, orig, fname)
        dst = os.path.join(path_to_save, label, orig + fname)
        shutil.copy(src, dst)


# Write the test split to group_data/test/<class>/.
copy_files(X_test, os.path.join("group_data", 'test'))

12640it [00:10, 1188.74it/s]


In [17]:
# Write the remaining splits to group_data/<split>/<class>/, train first, then val.
for split_name, split_frame in (('train', X_train), ('val', X_valid)):
    copy_files(split_frame, os.path.join("group_data", split_name))

46564it [00:39, 1184.96it/s]
5919it [00:05, 1076.32it/s]
