In [33]:
import os, subprocess
from pathlib import Path, PurePath
from shutil import copyfile
import pandas as pd


In [None]:
## Prepare data experiment very similar to the way described in the paper: 

## Paths

In [2]:
GOOGLE_DATA_PATH=Path('data/audio_google_original/')
GOOGLE_ORIG_DATA=GOOGLE_DATA_PATH/'original/'
TEST_LIST_FILE=GOOGLE_ORIG_DATA/'testing_list.txt'
VALIDATION_LIST_FILE=GOOGLE_ORIG_DATA/'validation_list.txt'
GOOGLE_DATA_BACKGROUND_PATH=GOOGLE_ORIG_DATA/'_background_noise_'
GOOGLE_DATA_BACKGROUND_SPLITTED_PATH=GOOGLE_ORIG_DATA/'_background_noise_splitted_'
GOOGLE_DATA_BACKGROUND_SPLITTED_PATH.mkdir(exist_ok=True)

TEST_FOLDER=GOOGLE_DATA_PATH/'test'
TRAIN_FOLDER=GOOGLE_DATA_PATH/'train'

LABELS_USED=["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", '_background_noise_splitted_']
UNKNOW_CATEGORY='unknown'
NO_TALK_CATEGORY='_background_noise_'

## Split noize files 1 sec files

In [3]:
def split_wav(input_file, output_path):
    filename_wo_suffix=str(input_file).split('\\')[-1].replace('.wav','')
    print(filename_wo_suffix)
    output_file_begin=str(output_path/filename_wo_suffix)
    print(output_file_begin)
    process = subprocess.Popen(['ffmpeg', '-i',  str(input_file), '-f', 'segment', '-segment_time', '1',
                                '-c', 'copy', f'{output_file_begin}%03d.wav'],
                     stdout=subprocess.PIPE, 
                     stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    return stdout, stderr
    

In [4]:
background_files=list(GOOGLE_DATA_BACKGROUND_PATH.glob('*.wav'))
len(background_files)

6

In [5]:
for background_file in background_files:
    split_wav(background_file, GOOGLE_DATA_BACKGROUND_SPLITTED_PATH)

doing_the_dishes
data\audio_google_original\original\_background_noise_splitted_\doing_the_dishes
dude_miaowing
data\audio_google_original\original\_background_noise_splitted_\dude_miaowing
exercise_bike
data\audio_google_original\original\_background_noise_splitted_\exercise_bike
pink_noise
data\audio_google_original\original\_background_noise_splitted_\pink_noise
running_tap
data\audio_google_original\original\_background_noise_splitted_\running_tap
white_noise
data\audio_google_original\original\_background_noise_splitted_\white_noise


In [6]:
splitted_background_files=[str(fl).replace('\\','/') for fl in GOOGLE_DATA_BACKGROUND_SPLITTED_PATH.rglob('*.wav')]
len(splitted_background_files)

402

## Get list of all files

In [7]:
all_files=[str(fl).replace('\\','/') for fl in GOOGLE_ORIG_DATA.rglob('*.wav')]

len(all_files)

106237

In [8]:
all_files[:5]

['data/audio_google_original/original/backward/0165e0e8_nohash_0.wav',
 'data/audio_google_original/original/backward/017c4098_nohash_0.wav',
 'data/audio_google_original/original/backward/017c4098_nohash_1.wav',
 'data/audio_google_original/original/backward/017c4098_nohash_2.wav',
 'data/audio_google_original/original/backward/017c4098_nohash_3.wav']

In [9]:
### exclude original noize files
all_files=[fl for fl in all_files if '/_background_noise_/' not in fl]
len(all_files)

106231

## Make train file list

In [10]:
TEST_LIST_FILE.as_posix()

'data/audio_google_original/original/testing_list.txt'

In [11]:
test_files=TEST_LIST_FILE.read_text().split('\n')
len(test_files)

11006

In [12]:
test_files[:5]

['right/bb05582b_nohash_3.wav',
 'right/97f4c236_nohash_2.wav',
 'right/f2e59fea_nohash_3.wav',
 'right/fdb5155e_nohash_2.wav',
 'right/dc75148d_nohash_0.wav']

In [13]:
validation_files=VALIDATION_LIST_FILE.read_text().split('\n')
len(validation_files)

9982

In [14]:
validation_files[:5]

['right/a69b9b3e_nohash_0.wav',
 'right/439c84f4_nohash_1.wav',
 'right/409c962a_nohash_1.wav',
 'right/dbaf8fc6_nohash_2.wav',
 'right/a6d586b7_nohash_1.wav']

In [15]:
def make_train_list(all_files, test_files, validation_files, orig_files_folder='data/audio_google_original/original/'):
    test_files_fullpath=[]
    for file in test_files:
        test_files_fullpath.append(orig_files_folder+file)
        
    validation_files_fullpath=[]
    for file in validation_files:
        validation_files_fullpath.append(orig_files_folder+file)
        
    train_files=set(all_files)-set(test_files_fullpath)-set(validation_files_fullpath)
    
    train_files=list(train_files)
    return train_files, test_files_fullpath, validation_files_fullpath, 

In [16]:
train_files, test_files, valid_files=make_train_list(all_files, test_files, validation_files)
len(train_files), len(test_files), len(valid_files)

(85245, 11006, 9982)

In [17]:
[fl for fl in test_files if '_background_noise_' in fl]

[]

In [18]:
## Add backgournd noise files to test set (yes I knwo they are also in train set but there arent many of them so lets use them)
test_files=test_files+splitted_background_files

In [19]:
len(test_files)

11408

## Copy files to destination folders

In [20]:
#as this https://arxiv.org/pdf/1804.03209.pdf says that validation set is used to adjust metrics in training I'll add it to traiingset
train_files_w_valid=train_files+valid_files
len(train_files_w_valid)

95227

In [31]:
def copy_orig2model_folder(files, folder2copy):
    for i, fl in enumerate(files):
        if i%1000==0:
            print(f'working on file {i}')
        folder, filename =fl.split('/')[-2:]
        if folder not in LABELS_USED:
            filename=f'{folder}_{filename}'
            folder=UNKNOW_CATEGORY
        dest_folder=folder2copy/folder
        if not os.path.isdir(dest_folder):
            dest_folder.mkdir()
        dest_file=dest_folder/filename
        try:
            copyfile(fl, dest_file)
        except Exception as e:
            print(f'Exception occured at file {fl}')
            
copy_orig2model_folder(test_files, TEST_FOLDER)

working on file 0
working on file 1000
working on file 2000
working on file 3000
working on file 4000
working on file 5000
working on file 6000
working on file 7000
working on file 8000
working on file 9000
working on file 10000
working on file 11000
Exception occured at file data/audio_google_original/original/


In [32]:
copy_orig2model_folder(train_files, TRAIN_FOLDER)

working on file 0
working on file 1000
working on file 2000
working on file 3000
working on file 4000
working on file 5000
working on file 6000
working on file 7000
working on file 8000
working on file 9000
working on file 10000
working on file 11000
working on file 12000
working on file 13000
working on file 14000
working on file 15000
working on file 16000
working on file 17000
working on file 18000
working on file 19000
working on file 20000
working on file 21000
working on file 22000
working on file 23000
working on file 24000
working on file 25000
working on file 26000
working on file 27000
working on file 28000
working on file 29000
working on file 30000
working on file 31000
working on file 32000
working on file 33000
working on file 34000
working on file 35000
working on file 36000
working on file 37000
working on file 38000
working on file 39000
working on file 40000
working on file 41000
working on file 42000
working on file 43000
working on file 44000
working on file 45000
w

## Overview of files

In [35]:
train_files_fullpath=list(TRAIN_FOLDER.rglob('*.wav'))
len(train_files_fullpath)

85245

In [36]:
test_files_fullpath=list(TEST_FOLDER.rglob('*.wav'))
len(test_files_fullpath)

11407

In [88]:
def get_wav_info(wav_path):
    wav_path=str(wav_path)
    process = subprocess.Popen(['sox','--i',  wav_path],
                     stdout=subprocess.PIPE, 
                     stderr=subprocess.PIPE)
    stdout, stderr = process.communicate()
    return stdout, stderr

def format_output(output_str):
    info_dict={}
    for row in output_str.split('\n'):

        pieces=row.split(' : ')
        if len(pieces)==2:
            info_dict[pieces[0].strip()]=pieces[1].strip()
    return info_dict

def get_wavs_info(wavs_path):
    files_info=[]
    for wav in wavs_path:
        info_raw=get_wav_info(wav)
        info_formatted=format_output(info_raw[0].decode('utf-8'))
        files_info.append(info_formatted)
    return pd.DataFrame(files_info)
        

In [89]:
df_test_info=get_wavs_info(test_files_fullpath)

In [90]:
df_test_info.shape

(11407, 7)

In [91]:
df_train_info=get_wavs_info(train_files_fullpath)

In [92]:
df_train_info.shape

(85245, 7)

In [93]:
df_test_info.tail()

Unnamed: 0,Input File,Channels,Sample Rate,Precision,Duration,File Size,Bit Rate
11402,'data\audio_google_original\test\_background_n...,1,16000,16-bit,00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors,32.8k,257k
11403,'data\audio_google_original\test\_background_n...,1,16000,16-bit,00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors,32.8k,257k
11404,'data\audio_google_original\test\_background_n...,1,16000,16-bit,00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors,32.8k,257k
11405,'data\audio_google_original\test\_background_n...,1,16000,16-bit,00:00:00.90 = 14336 samples ~ 67.2 CDDA sectors,28.8k,257k
11406,'data\audio_google_original\test\_background_n...,1,16000,16-bit,00:00:00.99 = 15872 samples ~ 74.4 CDDA sectors,31.8k,257k


In [94]:
df_test_info.describe(include='all')

Unnamed: 0,Input File,Channels,Sample Rate,Precision,Duration,File Size,Bit Rate
count,11407,11407,11407,11407,11407,11407,11407
unique,11407,1,1,1,53,40,6
top,'data\audio_google_original\test\unknown\bed_8...,1,16000,16-bit,00:00:01.00 = 16000 samples ~ 75 CDDA sectors,32.0k,256k
freq,1,11407,11407,11407,10083,10083,10760


In [95]:
df_train_info.describe(include='all')

Unnamed: 0,Input File,Channels,Sample Rate,Precision,Duration,File Size,Bit Rate
count,85245,85245,85245,85245,85245,85245,85245
unique,85245,1,1,1,142,94,5
top,'data\audio_google_original\train\up\a489191a_...,1,16000,16-bit,00:00:01.00 = 16000 samples ~ 75 CDDA sectors,32.0k,256k
freq,1,85245,85245,85245,76364,76367,83090


In [101]:
df_test_info.Duration.value_counts()

00:00:01.00 = 16000 samples ~ 75 CDDA sectors         10083
00:00:01.02 = 16384 samples ~ 76.8 CDDA sectors         324
00:00:00.90 = 14336 samples ~ 67.2 CDDA sectors         153
00:00:00.98 = 15702 samples ~ 73.6031 CDDA sectors       80
00:00:00.94 = 15019 samples ~ 70.4016 CDDA sectors       63
00:00:00.98 = 15604 samples ~ 73.1437 CDDA sectors       48
00:00:00.85 = 13654 samples ~ 64.0031 CDDA sectors       45
00:00:00.77 = 12288 samples ~ 57.6 CDDA sectors          42
00:00:00.81 = 12971 samples ~ 60.8016 CDDA sectors       41
00:00:00.93 = 14861 samples ~ 69.6609 CDDA sectors       36
00:00:00.64 = 10240 samples ~ 48 CDDA sectors            32
00:00:00.88 = 14118 samples ~ 66.1781 CDDA sectors       29
00:00:00.75 = 12052 samples ~ 56.4937 CDDA sectors       28
00:00:00.84 = 13375 samples ~ 62.6953 CDDA sectors       26
00:00:00.51 = 8192 samples ~ 38.4 CDDA sectors           25
00:00:00.73 = 11606 samples ~ 54.4031 CDDA sectors       24
00:00:00.73 = 11605 samples ~ 54.3984 CD

In [102]:
df_train_info.Duration.value_counts()

00:00:01.00 = 16000 samples ~ 75 CDDA sectors         76364
00:00:00.90 = 14336 samples ~ 67.2 CDDA sectors         614
00:00:00.98 = 15702 samples ~ 73.6031 CDDA sectors      507
00:00:00.98 = 15604 samples ~ 73.1437 CDDA sectors      465
00:00:00.94 = 15019 samples ~ 70.4016 CDDA sectors      457
                                                      ...  
00:00:00.08 = 1266 samples ~ 5.93438 CDDA sectors         1
00:00:00.53 = 8534 samples ~ 40.0031 CDDA sectors         1
00:00:00.10 = 1592 samples ~ 7.4625 CDDA sectors          1
00:00:00.63 = 10069 samples ~ 47.1984 CDDA sectors        1
00:00:00.66 = 10582 samples ~ 49.6031 CDDA sectors        1
Name: Duration, Length: 142, dtype: int64