In [1]:
import os
import glob
import sox
import tqdm
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
from datetime import timedelta
from multiprocessing import Pool

%matplotlib inline  

In [2]:
num_parallel = 16
datapath_root = '/home/tracek/Data/gender/raw/'
datapath_male = os.path.join(datapath_root, 'male/')
datapath_female = os.path.join(datapath_root, 'female/')

In [3]:
waves_male_paths = glob.glob(datapath_male + '/**/*.wav', recursive=True)
waves_female_paths = glob.glob(datapath_female + '/**/*.wav', recursive=True)
readme_paths = glob.glob(datapath_root + '/**/README', recursive=True)

In [4]:
assert len(os.listdir(datapath_male)) + len(os.listdir(datapath_female)) == len(readme_paths)

In [5]:
def get_info(path):
    info = sox.file_info.info(path)
    info['path'] = path
    if 'num_samples' not in info:
        print('No samples in ', path)
    return info

In [6]:
pool = Pool(processes=num_parallel)
male_info = pool.map(get_info, waves_male_paths)
female_info = pool.map(get_info, waves_female_paths)

In [None]:
def get_readme_info(path):
    d = {}
    with open(path, 'r') as readme:
        for line in readme:
            gender_match = re.search("Gender: (\W*\w+\W*)", line, re.IGNORECASE)
            age_match = re.search("Age Range: (\W*\w+\W*)", line, re.IGNORECASE)
            lang_match = re.search("Language: (\W*\w+\W*)", line, re.IGNORECASE)

In [None]:
duration_male = np.array([info['duration'] for info in male_info])
duration_female = np.array([info['duration'] for info in female_info])
total_male = int(duration_male.sum())
total_female = int(duration_female.sum())
print('Total duration of male recordings: {} '.format(str(timedelta(seconds=total_male))))
print('Total duration of female recordings: {} '.format(str(timedelta(seconds=total_female))))

In [None]:
no_bins = 50
fig, ax = plt.subplots(1,2, figsize=(15,7))
ax[0].set_yscale('symlog')
_ = ax[0].hist(duration_male, bins=no_bins, alpha=0.5, label='male')
_ = ax[0].hist(duration_female, bins=no_bins, alpha=0.5, label='female')
bins = np.linspace(0, 10, no_bins)
_ = ax[1].hist(duration_male, bins=bins, alpha=0.5, label='male')
_ = ax[1].hist(duration_female, bins=bins, alpha=0.5, label='female')
ax[0].legend(loc='upper right')
ax[0].set_title('Duration histogram - full length')
ax[1].legend(loc='upper right')
ax[1].set_title('Duration histogram - up to 10s')

In [None]:
name_duration_tuples_m = [(info['path'], info['duration']) for info in male_info]
name_duration_tuples_m_short = [(info['path'], info['duration']) for info in male_info if info['duration'] < 0.5]

In [None]:
len(name_duration_tuples_m_short)

In [9]:
featurespecs = \
    ['LSF: LSF blockSize=%d stepSize=%d LSFNbCoeffs=10 LSFDisplacement=1',
     'MelSpectrum: MelSpectrum blockSize=%d stepSize=%d',
     'SpectralCrestFactorPerBand: SpectralCrestFactorPerBand blockSize=%d stepSize=%d',
     'SpectralDecrease: SpectralDecrease blockSize=%d stepSize=%d',
     'SpectralFlatnessPerBand: SpectralFlatnessPerBand blockSize=%d stepSize=%d',
     'SpectralFlux: SpectralFlux blockSize=%d stepSize=%d'
     'SpectralSlope: SpectralSlope blockSize=%d stepSize=%d',
     'SpectralRolloff: SpectralRolloff blockSize=%d stepSize=%d']


In [15]:
block_size = 1024
for featurespec in featurespecs:
    print(featurespec % (block_size, block_size / 2))

LSF: LSF blockSize=1024 stepSize=512 LSFNbCoeffs=10 LSFDisplacement=1
MelSpectrum: MelSpectrum blockSize=1024 stepSize=512
SpectralCrestFactorPerBand: SpectralCrestFactorPerBand blockSize=1024 stepSize=512
SpectralDecrease: SpectralDecrease blockSize=1024 stepSize=512
SpectralFlatnessPerBand: SpectralFlatnessPerBand blockSize=1024 stepSize=512


TypeError: not enough arguments for format string

In [14]:
'SpectralFlux: SpectralFlux blockSize=%d stepSize=%d' % (block_size, block_size / 2)

'SpectralFlux: SpectralFlux blockSize=1024 stepSize=512'