In [1]:
import os
import shlex
import subprocess
import sys
import tempfile
import wave

import numpy as np
import pandas as pd
from scipy.io import wavfile

# Conversion between types
This part and the following ones require the [SoX package](http://sox.sourceforge.net/) (Sound eXchange). On macOS, run
```
brew install sox
```

In [2]:
def wav_to_data(wav_filepath):
  """Read a WAV file and return its samples; the sample rate is discarded."""
  _, samples = wavfile.read(wav_filepath)
  return samples

In [3]:
def sphere_to_data(sphere_filepath):
  """Decode a NIST SPHERE audio file into an array of samples.

  The file is first converted to WAV by the external `sox` program, then read
  back with `wav_to_data`.

  Args:
    sphere_filepath: path to the SPHERE file.
  Returns:
    Whatever `wav_to_data` returns for the converted WAV file.
  Raises:
    ValueError: if `sox` cannot be started or exits with a non-zero status.
  """
  # A private temporary directory (removed on exit) replaces the fixed, shared
  # '/tmp/haha.wav', which was never cleaned up and could be clobbered.
  with tempfile.TemporaryDirectory() as tmp_dir:
    tmp_filepath = os.path.join(tmp_dir, 'converted.wav')
    try:
      # Argument list, no shell: paths with spaces or metacharacters stay intact.
      subprocess.check_call(['sox', '-t', 'sph', sphere_filepath, tmp_filepath])
    except OSError:
      # The `sox` executable itself could not be launched.
      raise ValueError("Converting SPHERE files requires SoX installed. For macOS, run 'brew install sox'.")
    except subprocess.CalledProcessError as err:
      raise ValueError("SoX failed to convert {} (exit status {:d})."
                       .format(sphere_filepath, err.returncode))
    return wav_to_data(tmp_filepath)
    
def sphere_to_bytes(sphere_filepath):
  """Return the raw content of a file after its first 1024 bytes.

  The first 1024 bytes are skipped without being interpreted.
  """
  with open(sphere_filepath, 'rb') as sphere_file:
    sphere_file.seek(1024)
    return sphere_file.read()

# Play audio files

In [4]:
PLAY_CMD = 'play' # For Mac OS with SoX installed

def play_audio_from_file(audio_filename, cmd=PLAY_CMD):
  """Play an audio file with an external command-line player.

  Args:
    audio_filename: path of the file to play.
    cmd: player command; it may carry its own options (e.g. 'play -q').
  Returns:
    None. The player's exit status is ignored, and a player that cannot be
    started is only reported on stdout.
  """
  # The file name is passed as a single argument without going through a shell,
  # so names containing spaces or shell metacharacters are handled correctly.
  try:
    subprocess.call(shlex.split(cmd) + [audio_filename])
  except OSError as err:
    print("Could not run {!r}: {}".format(cmd, err))

In [5]:
def play_audio_from_bytes(data_in_bytes, nchannels=1, sampwidth=2,
                          framerate=16000, comptype='NONE',
                          compname='not compressed'):
  """Write raw audio samples to a temporary WAV file and play it.

  Args:
    data_in_bytes: the audio frames as `bytes`; any other object is converted
      with `bytes(...)` first.
    nchannels: number of channels written to the WAV header.
    sampwidth: sample width in bytes.
    framerate: sampling frequency in Hz.
    comptype: compression type passed to `wave`.
    compname: human-readable compression name passed to `wave`.
  Returns:
    None.
  """
  if not isinstance(data_in_bytes, bytes):
    data_in_bytes = bytes(data_in_bytes)

  # Unique temporary file instead of a fixed path shared by every call
  fd, tmp_filepath = tempfile.mkstemp(suffix='.wav')
  os.close(fd)
  try:
    with wave.open(tmp_filepath, 'w') as f:
      # Set audio file parameters
      f.setcomptype(comptype, compname)
      f.setframerate(framerate)
      f.setnchannels(nchannels)
      f.setsampwidth(sampwidth)
      # One frame holds one sample per channel
      f.setnframes(len(data_in_bytes) // (sampwidth * nchannels))

      # Write data
      f.writeframes(data_in_bytes)

    play_audio_from_file(tmp_filepath)
  finally:
    # Delete the tmp file even if writing or playing failed
    os.remove(tmp_filepath)

In [6]:
filename1 = 'example1.sph' # NIST SPHERE file
filename2 = 'example2.sph' # NIST SPHERE file

In [7]:
data1 = sphere_to_data(filename1)
data1

ValueError: Converting SPHERE files requires SoX installed. For Max OS, run 'brew install sox'.

In [84]:
data2 = sphere_to_data(filename2)
data2

array([-1,  0,  1, ...,  3,  1, -2], dtype=int16)

In [85]:
# Play audio
play_audio_from_bytes(data1)

In [10]:
play_audio_from_bytes(data2)

In [11]:
# Combine 2 audio files
play_audio_from_bytes(data1 + np.pad(data2, (0, data1.size - data2.size), mode='constant'))

In [12]:
# Play an interval
play_audio_from_bytes(data1[10000:20000])

# Dealing with TIMIT dataset

In [8]:
%ls ../../datasets/speech/timit/

[1m[34mCONVERT[m[m/    README.txt  [1m[34mSPHERE[m[m/     [1m[34mTIMIT[m[m/


In [43]:
timit_dir = '../../datasets/speech/timit/'

In [10]:
os.listdir(timit_dir)

['.DS_Store', 'SPHERE', 'README.txt', 'TIMIT', 'CONVERT']

In [301]:
# traverse training set of TIMIT
path = timit_dir + 'TIMIT/train'

li = []

for (dirpath, dirnames, filenames) in os.walk(path):
  for filename in filenames:
    if filename.endswith('.WAV'):
      dir_split = os.path.normpath(dirpath).split(os.sep)
      # Use the LAST three components (<data_type>/<region>/<speaker>) rather than
      # fixed positions, so the result does not depend on how deep `timit_dir` is.
      data_type, region, speaker = dir_split[-3:]
      gender = speaker[0]  # first character of the speaker directory name
      basename = os.path.splitext(filename)[0]
      phonetic_label = basename + '.PHN'
      word_label = basename + '.WRD'
      sentence_label = basename + '.TXT'
      li.append((dirpath, filename, phonetic_label, word_label, sentence_label, data_type, region, speaker, gender))

In [302]:
# Verify that all files found are indeed valid files on the disk

# Entries 1..4 of each record are the wav file and its three label files,
# all located in the directory stored in entry 0.
missing_files = []
for x in li:
  for idx in range(1, 5):
    filepath = os.path.abspath(os.path.join(x[0], x[idx]))
    if not os.path.isfile(filepath):
      missing_files.append(filepath)

if missing_files:
  for filepath in missing_files:
    print("The file {} doesn't exist!".format(filepath))
else:
  print("It's all good. All files exist on the disk.")

It's all good. All files exist on the disk.


In [303]:
li[0]

('../../datasets/speech/timit/TIMIT/train/DR4/MMDM0',
 'SI681.WAV',
 'SI681.PHN',
 'SI681.WRD',
 'SI681.TXT',
 'train',
 'DR4',
 'MMDM0',
 'M')

In [304]:
# Convert to pandas dataframes

# Column names, listed in the same order as the fields of each tuple in `li`
timit_columns = ['dirpath', 'wavfile', 'phonetic_label', 'word_label', 'sentence_label',
                 'data_type', 'region', 'speaker', 'gender']
timit_df = pd.DataFrame({column: [record[position] for record in li]
                         for position, column in enumerate(timit_columns)})

In [561]:
timit_df.head()

Unnamed: 0,data_type,dirpath,gender,phonetic_label,region,sentence_label,speaker,wavfile,word_label
0,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
1,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SA2.PHN,DR4,SA2.TXT,MMDM0,SA2.WAV,SA2.WRD
2,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SX411.PHN,DR4,SX411.TXT,MMDM0,SX411.WAV,SX411.WRD
3,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SA1.PHN,DR4,SA1.TXT,MMDM0,SA1.WAV,SA1.WRD
4,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SX231.PHN,DR4,SX231.TXT,MMDM0,SX231.WAV,SX231.WRD


In [65]:
# Save...
timit_df.to_csv('timit_files_info.csv', index=False)

# and load
timit_df = pd.read_csv('timit_files_info.csv')

In [None]:
# and load
timit_df = pd.read_csv('timit_files_info.csv')

In [86]:
# and we see we get the same thing back
timit_df.head()

Unnamed: 0,data_type,dirpath,gender,phonetic_label,region,sentence_label,speaker,wavfile,word_label
0,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
1,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SA2.PHN,DR4,SA2.TXT,MMDM0,SA2.WAV,SA2.WRD
2,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SX411.PHN,DR4,SX411.TXT,MMDM0,SX411.WAV,SX411.WRD
3,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SA1.PHN,DR4,SA1.TXT,MMDM0,SA1.WAV,SA1.WRD
4,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SX231.PHN,DR4,SX231.TXT,MMDM0,SX231.WAV,SX231.WRD


## Extract features and labels on phonetic level

In [562]:
def parse_label_file(label_file):
  """Parse a label file in TIMIT dataset to a pandas.DataFrame object.

  Each non-blank line is expected to read `<begin> <end> <label>`, separated by
  single spaces; the label is everything after the second space.

  Args:
    label_file: path of the label file.
  Returns:
    a pandas.DataFrame object containing 4 columns: begin, end, label
    (categorical) and label_file (the path that was passed in).
  Raises:
    ValueError: if `begin` or `end` is not an integer.
    IndexError: if a non-blank line has fewer than two fields.
  """
  begins, ends, labels = [], [], []
  with open(label_file, 'r') as f:
    for line in f:
      # Strip only the line terminator: slicing off the last character would
      # truncate the label when the final line has no trailing newline.
      line = line.rstrip('\r\n')
      if not line.strip():
        continue  # tolerate blank lines
      fields = line.split(' ', 2)
      begins.append(int(fields[0]))
      ends.append(int(fields[1]))
      labels.append(fields[2] if len(fields) > 2 else '')
  df = pd.DataFrame({'begin': begins, 'end': ends, 'label': labels})
  df['label'] = df['label'].astype('category')
  df['label_file'] = label_file
  return df

In [563]:
label_file = os.path.join(li[0][0], li[0][2])
label_file

'../../datasets/speech/timit/TIMIT/train/DR4/MMDM0/SI681.PHN'

In [564]:
haha = parse_label_file(label_file)
haha.head()

Unnamed: 0,begin,end,label,label_file
0,0,1960,h#,../../datasets/speech/timit/TIMIT/train/DR4/MM...
1,1960,2466,w,../../datasets/speech/timit/TIMIT/train/DR4/MM...
2,2466,3480,ix,../../datasets/speech/timit/TIMIT/train/DR4/MM...
3,3480,4000,dcl,../../datasets/speech/timit/TIMIT/train/DR4/MM...
4,4000,5960,s,../../datasets/speech/timit/TIMIT/train/DR4/MM...


In [565]:
merged_df.index

Int64Index([], dtype='int64')

In [567]:
level='phonetic'

# One parsed DataFrame per row of `timit_df`, read from that row's `<level>_label` file
li = [parse_label_file(os.path.join(row['dirpath'], row[level + '_label']))
      for _, row in timit_df.iterrows()]

In [568]:
phonetic_df = pd.concat(li, ignore_index=True)

In [569]:
phonetic_df

Unnamed: 0,begin,end,label,label_file
0,0,1960,h#,../../datasets/speech/timit/TIMIT/train/DR4/MM...
1,1960,2466,w,../../datasets/speech/timit/TIMIT/train/DR4/MM...
2,2466,3480,ix,../../datasets/speech/timit/TIMIT/train/DR4/MM...
3,3480,4000,dcl,../../datasets/speech/timit/TIMIT/train/DR4/MM...
4,4000,5960,s,../../datasets/speech/timit/TIMIT/train/DR4/MM...
5,5960,7480,ah,../../datasets/speech/timit/TIMIT/train/DR4/MM...
6,7480,7880,tcl,../../datasets/speech/timit/TIMIT/train/DR4/MM...
7,7880,9400,ch,../../datasets/speech/timit/TIMIT/train/DR4/MM...
8,9400,9960,ix,../../datasets/speech/timit/TIMIT/train/DR4/MM...
9,9960,10680,n,../../datasets/speech/timit/TIMIT/train/DR4/MM...


In [571]:
timit_df['label_file'] = timit_df.apply(lambda row:
      os.path.join(row['dirpath'], row[level + '_label']), axis=1)

In [572]:
merged_df = pd.merge(phonetic_df, timit_df, on='label_file')

In [258]:
useful_df = merged_df[['begin', 'end', 'label', 'dirpath', 'wavfile', 'gender', 'region', 'data_type']]
useful_df.head()

Unnamed: 0,begin,end,label,dirpath,wavfile,gender,region,data_type
0,0,1960,h#,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,SI681.WAV,M,DR4,train
1,1960,2466,w,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,SI681.WAV,M,DR4,train
2,2466,3480,ix,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,SI681.WAV,M,DR4,train
3,3480,4000,dcl,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,SI681.WAV,M,DR4,train
4,4000,5960,s,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,SI681.WAV,M,DR4,train


## Convert categorical labels to integers

In [309]:
labels_df = merged_df[['gender', 'region', 'label']]\
              .astype({'label':'category', 
                       'gender':'category', 
                       'region':'category'})
labels_df.head()

Unnamed: 0,gender,region,label
0,M,DR4,h#
1,M,DR4,w
2,M,DR4,ix
3,M,DR4,dcl
4,M,DR4,s


In [493]:
def total_num_class(categorical_labels_df):
  """Return the number of categories summed over all (categorical) columns."""
  return sum(len(categorical_labels_df[name].cat.categories)
             for name in categorical_labels_df.columns)

total_num_class(labels_df)

def cat_to_num(categorical_labels_df):
  """Convert a pd.DataFrame object having only categorical columns to 
  a pd.DataFrame object having only integer values. 
  
  This can be considered as an extension of `index encoding` for several
  columns at the same time: each categorical column is first converted to
  integer codes independently, then shifted by the total number of categories
  of all columns on its LEFT, so that codes of different columns never collide.

  Args:
    categorical_labels_df: DataFrame whose columns all have `category` dtype.
  Returns:
    A DataFrame of int64 codes with columns `label0`, `label1`, ... in the
    same order as the input columns.
  Raises:
    AttributeError: (from the `.cat` accessor) if a column is not categorical.
  """
  df = categorical_labels_df
  translation = 0
  li = []
  for idx, col in enumerate(df.columns):
    categorical = df[col].cat
    # The codes come back in a narrow integer dtype (e.g. int8 for few categories);
    # widen them before shifting so the offset cannot overflow that dtype.
    translated_codes = categorical.codes.astype('int64') + translation # to avoid index conflict
    li.append(translated_codes.rename('label' + str(idx)))
    translation += len(categorical.categories)
  return pd.concat(li, axis=1)
  
cat_to_num(labels_df).head()

71

In [308]:
def cat_to_num(categorical_labels_df):
  """Convert a pd.DataFrame object having only categorical columns to 
  a pd.DataFrame object having only integer values. 
  
  This can be considered as an extension of `index encoding` for several
  columns at the same time: each categorical column is first converted to
  integer codes independently, then shifted by the total number of categories
  of all columns on its LEFT, so that codes of different columns never collide.

  Args:
    categorical_labels_df: DataFrame whose columns all have `category` dtype.
  Returns:
    A DataFrame of int64 codes with columns `label0`, `label1`, ... in the
    same order as the input columns.
  Raises:
    AttributeError: (from the `.cat` accessor) if a column is not categorical.
  """
  df = categorical_labels_df
  translation = 0
  li = []
  for idx, col in enumerate(df.columns):
    categorical = df[col].cat
    # The codes come back in a narrow integer dtype (e.g. int8 for few categories);
    # widen them before shifting so the offset cannot overflow that dtype.
    translated_codes = categorical.codes.astype('int64') + translation # to avoid index conflict
    li.append(translated_codes.rename('label' + str(idx)))
    translation += len(categorical.categories)
  return pd.concat(li, axis=1)
  
cat_to_num(labels_df).head()

Unnamed: 0,label0,label1,label2
0,1,5,37
1,1,5,67
2,1,5,41
3,1,5,23
4,1,5,58


In [295]:
def has_repeated_labels(labels_df):
  """Tell whether at least one row holds the same value in two of its columns."""
  def row_has_duplicate(row):
    return len(set(row)) != len(row)

  duplicate_flags = labels_df.apply(row_has_duplicate, axis=1)
  return any(duplicate_flags)

In [313]:
# Check if some row has repeated index (a conflict).
# This step can be a bit slow
translated_labels_df = cat_to_num(labels_df)
has_repeated_labels(translated_labels_df)

False

In [318]:
useful_df.shape

(177080, 8)

In [316]:
translated_labels_df.shape

(177080, 3)

## Get interval data

In [321]:
def get_interval_data(wav_filepath, begin, end):
  """Decode `wav_filepath` with `sphere_to_data` and return samples [begin, end)."""
  return sphere_to_data(wav_filepath)[begin: end]

def get_interval_data_from_row(row):
  """Return the samples of the interval described by `row`.

  `row` must provide `dirpath`, `wavfile`, `begin` and `end`.
  """
  return get_interval_data(os.path.join(row['dirpath'], row['wavfile']),
                           row['begin'],
                           row['end'])

In [424]:
for index, row in merged_df.iterrows():
  sample_interval = get_interval_data_from_row(row)
  if index > 42:
    print(row)
    break

begin                                                         13200
end                                                           14360
label                                                             k
label_file        ../../datasets/speech/timit/TIMIT/train/DR4/MM...
data_type                                                     train
dirpath           ../../datasets/speech/timit/TIMIT/train/DR4/MMDM0
gender                                                            M
phonetic_label                                              SA2.PHN
region                                                          DR4
sentence_label                                              SA2.TXT
speaker                                                       MMDM0
wavfile                                                     SA2.WAV
word_label                                                  SA2.WRD
Name: 43, dtype: object


In [425]:
# Play the cut audio to see whether it corresponds to the label
play_audio_from_bytes(sample_interval)

In [394]:
len(haha)

1890

# Convert TIMIT to AutoDLDataset (TFRecord)

In [88]:
import tensorflow as tf

In [426]:
time_series = sample_interval

In [434]:
# Helper functions

def _int64_feature(value):
  """Wrap `value`, a list of integers, in a tf.train.Feature."""
  int64_list = tf.train.Int64List(value=value)
  return tf.train.Feature(int64_list=int64_list)

def _bytes_feature(value):
  """Wrap `value`, a list of bytes, in a tf.train.Feature."""
  bytes_list = tf.train.BytesList(value=value)
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Wrap `value`, a list of floats, in a tf.train.Feature."""
  float_list = tf.train.FloatList(value=value)
  return tf.train.Feature(float_list=float_list)

def _feature_list(feature):
  """Wrap `feature`, a list of tf.train.Feature, in a tf.train.FeatureList."""
  wrapped = tf.train.FeatureList(feature=feature)
  return wrapped

In [433]:
filename = 'timit-train-phonetic'

In [436]:
type(time_series)

numpy.ndarray

In [None]:
time

In [486]:
timit_tfrecord_filename = 'timit-train-phonetic'

In [488]:
def time_series_to_sequence_example_df(merged_df, labels_df, filename):
  """Write one SequenceExample per row of `merged_df` to a TFRecord file.

  For each row, the audio samples of the interval [begin, end) of the row's
  file form the sequence `0_dense_input` (one float feature per sample); the
  matching row of `labels_df` provides `label_index`, each with score 1.

  Args:
    merged_df: a pd.DataFrame object containing columns: `dirpath`, `wavfile`, `begin`, `end`
    labels_df: a pd.DataFrame object only containing integer labels
    filename: a string
  Returns:
    None. Save a TFRecord to filename
  Raises:
    ValueError: if the two DataFrames do not have the same number of rows.
  """
  num_examples = merged_df.shape[0]
  num_labels = labels_df.shape[0] # number of lines of labels
  if num_examples != num_labels:
    raise ValueError("Number of examples {:d} does not match number of labels {:d}."\
                     .format(num_examples, num_labels))

  feature_label_generator = zip(merged_df.iterrows(), labels_df.iterrows())
  print("Writing to: {}... Total number of examples: {:d}".format(filename, num_examples))

  # Decoding a file means a full SoX conversion. Keep the most recently decoded
  # file so consecutive rows that point to the same file decode it only once.
  cached_wav_filepath = None
  cached_wav_data = None

  with tf.python_io.TFRecordWriter(filename) as writer:
    for (index, feature_row), (_, label_row) in feature_label_generator:
      if index % 1000 == 0:
        print("Writing example of index: ", index)
      le = len(label_row) # number of labels in this line
      label_array = label_row.values

      wav_filepath = os.path.join(feature_row['dirpath'], feature_row['wavfile'])
      if wav_filepath != cached_wav_filepath:
        cached_wav_data = sphere_to_data(wav_filepath)
        cached_wav_filepath = wav_filepath
      feature_array = cached_wav_data[feature_row['begin']: feature_row['end']]
      feature_list = [_float_feature([x]) for x in feature_array]

      context = tf.train.Features(
            feature={
                'id': _int64_feature([index]), # use index as id
                'label_index': _int64_feature(label_array),
                'label_score': _float_feature([1]*le)
            })
      feature_lists = tf.train.FeatureLists(
          feature_list={
          '0_dense_input': _feature_list(feature_list)
          })
      sequence_example = tf.train.SequenceExample(
          context=context,
          feature_lists=feature_lists)
      writer.write(sequence_example.SerializeToString())

In [485]:
time_series_to_sequence_example_df(merged_df, translated_labels_df, timit_tfrecord_filename)

Writing to: timit-train-phonetic... Total number of examples: 177080
Writing example of index:  0
Writing example of index:  100
Writing example of index:  200
Writing example of index:  300
Writing example of index:  400
Writing example of index:  500
Writing example of index:  600
Writing example of index:  700
Writing example of index:  800
Writing example of index:  900


In [437]:
def time_series_to_sequence_example(li_time_series, labels, filename):
  """Convert a list of time series (Numpy array) to TFRecords 
  following SequenceExample proto.
  
  Args:
    li_time_series: a generator(or list) of numpy array
    labels: a generator(or list) of lists (or tuples) of integers
    filename: a string
  Returns:
    None. Save a TFRecord to filename
  Raises:
    ValueError: if there are not as many label lines as time series.
  """
  # Materialize both inputs so that generators, which have no len(), are accepted
  li_time_series = list(li_time_series)
  labels = list(labels)

  num_examples = len(li_time_series)
  num_labels = len(labels) # number of lines of labels
  if num_examples != num_labels:
    raise ValueError("Number of examples {:d} does not match number of labels {:d}."\
                     .format(num_examples, num_labels))
  
  print("Writing: {}... Total number of examples: {:d}".format(filename, num_examples))
  with tf.python_io.TFRecordWriter(filename) as writer:
    for index, (series, label_indices) in enumerate(zip(li_time_series, labels)):
      # Progress report only; every example is written (no debugging cut-off)
      if index % 1000 == 0:
        print("Writing example of index: ", index)
      le = len(label_indices) # number of labels in this line
      # One single-value float feature per time step
      feature_list = [_float_feature([x]) for x in series]
      context = tf.train.Features(
            feature={
                'id': _int64_feature([index]), # use index as id
                'label_index': _int64_feature(label_indices),
                'label_score': _float_feature([1]*le)
            })
      feature_lists = tf.train.FeatureLists(
          feature_list={
          '0_dense_input': _feature_list(feature_list)
          })
      sequence_example = tf.train.SequenceExample(
          context=context,
          feature_lists=feature_lists)
      writer.write(sequence_example.SerializeToString())

In [489]:
def print_first_sequence_example(path_to_tfrecord):
  """Print the first record of a TFRecord file, parsed as a SequenceExample.

  Nothing is printed if the file holds no record.
  """
  records = tf.python_io.tf_record_iterator(path_to_tfrecord)
  for serialized in records:
    print(tf.train.SequenceExample.FromString(serialized))
    return

In [596]:
# `os.path.pardir` is the string '..' and cannot be called; the last path
# component is obtained with `os.path.basename`.
os.path.basename('hahahah/haha')

'haha'

In [490]:
print_first_sequence_example(timit_tfrecord_filename)

context {
  feature {
    key: "id"
    value {
      int64_list {
        value: 0
      }
    }
  }
  feature {
    key: "label_index"
    value {
      int64_list {
        value: 1
        value: 5
        value: 37
      }
    }
  }
  feature {
    key: "label_score"
    value {
      float_list {
        value: 1.0
        value: 1.0
        value: 1.0
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "0_dense_input"
    value {
      feature {
        float_list {
          value: -7.0
        }
      }
      feature {
        float_list {
          value: 2.0
        }
      }
      feature {
        float_list {
          value: 1.0
        }
      }
      feature {
        float_list {
          value: 1.0
        }
      }
      feature {
        float_list {
          value: -3.0
        }
      }
      feature {
        float_list {
          value: 0.0
        }
      }
      feature {
        float_list {
          value: -1.0
        }
      }
      feature 

## Test this file with dataset.py

In [494]:
dataset_dir = 'timit-train'
os.path.isdir(dataset_dir)

True

In [498]:
%ls ../../tfrecord_format/autodl-format-definition/

data.proto   data_pb2.py  dataset.py


In [500]:
autodl_def_dir = "../../tfrecord_format/autodl-format-definition/"
sys.path.append(autodl_def_dir)

In [516]:
from dataset import AutoDLDataset

def test_autodldataset(dataset_dir):
  """Load `dataset_dir` as an AutoDLDataset and fetch one (features, labels) pair.

  Args:
    dataset_dir: directory passed to `AutoDLDataset`.
  Returns:
    A tuple (val_features, val_labels) with the evaluated values of the first
    element produced by the dataset iterator.
  """
  autodl_dataset = AutoDLDataset(dataset_dir)
  autodl_dataset.init()
  dataset = autodl_dataset.get_dataset()
  iterator = dataset.make_one_shot_iterator()
  features, labels = iterator.get_next()
  with tf.Session() as sess:
    # Fetch both tensors in ONE run call. Evaluating them separately runs the
    # iterator twice, pairing the features of one element with the labels of the next.
    val_features, val_labels = sess.run([features, labels])
    print("Successfully parsed the dataset!")
    print("Feature shape: {}, label shape: {}".format(val_features.shape, val_labels.shape))
    print("Features: ", val_features)
    print("Labels: ", val_labels)
  return val_features, val_labels

In [517]:
val_features, val_labels = test_autodldataset(dataset_dir)

INFO:tensorflow:Number of training files: 1.
Successfully parsed the dataset!
Feature shape: (30, 2000, 1, 1), label shape: (30, 71)
Features:  [[[[  -7.]]

  [[   2.]]

  [[   1.]]

  ...

  [[   0.]]

  [[   0.]]

  [[   0.]]]


 [[[   9.]]

  [[  11.]]

  [[   8.]]

  ...

  [[   0.]]

  [[   0.]]

  [[   0.]]]


 [[[ -86.]]

  [[ -92.]]

  [[-105.]]

  ...

  [[   0.]]

  [[   0.]]

  [[   0.]]]


 ...


 [[[ 104.]]

  [[ 102.]]

  [[  93.]]

  ...

  [[   0.]]

  [[   0.]]

  [[   0.]]]


 [[[  -6.]]

  [[  86.]]

  [[ -64.]]

  ...

  [[   0.]]

  [[   0.]]

  [[   0.]]]


 [[[  11.]]

  [[  -5.]]

  [[ -17.]]

  ...

  [[   0.]]

  [[   0.]]

  [[   0.]]]]
Labels:  [[0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]]


In [537]:
def generate_labels_array(test_dataset_dir):
  """Collect every label batch of a dataset and save them as a `.solution` file.

  The labels are written with `np.savetxt` to
  `<test_dataset_dir>/<last component of test_dataset_dir>.solution`.

  Args:
    test_dataset_dir: directory passed to `AutoDLDataset`.
  Returns:
    The concatenated labels array that was written to disk.
  """
  autodl_dataset = AutoDLDataset(test_dataset_dir)
  autodl_dataset.init(batch_size=10000, repeat=False)
  dataset = autodl_dataset.get_dataset()
  iterator = dataset.make_one_shot_iterator()
  _, labels = iterator.get_next()  # only the labels are needed here
  
  li_batch = []
  num_batch = 0
  with tf.train.MonitoredTrainingSession() as sess:
    while not sess.should_stop():
        label_batch = sess.run(labels)
        li_batch.append(label_batch)
        num_batch += 1
        print("Num Batch: ", num_batch)
        
  all_labels = np.concatenate(li_batch)
  # Name the file after the directory's last component so that nested paths or a
  # trailing separator still give a valid location inside the directory.
  dataset_name = os.path.basename(os.path.normpath(test_dataset_dir))
  np.savetxt(os.path.join(test_dataset_dir, dataset_name + '.solution'), all_labels, fmt='%.0f')
  return all_labels

In [538]:
haha = generate_labels_array('timit-train')

INFO:tensorflow:Number of training files: 1.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
Num Batch:  1


In [526]:
haha.shape

(1000, 71)

# Aggregate all to have a global dataset formatter

In [542]:
timit_dir

'../../datasets/speech/timit/'

In [None]:
os.path.isfx

In [None]:
if not num_examples or index < num_examples

In [541]:
1 < 0

False

In [576]:
merged_df[merged_df['data_type'] == 'train']

Unnamed: 0,begin,end,label,label_file,data_type,dirpath,gender,phonetic_label,region,sentence_label,speaker,wavfile,word_label
0,0,1960,h#,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
1,1960,2466,w,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
2,2466,3480,ix,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
3,3480,4000,dcl,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
4,4000,5960,s,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
5,5960,7480,ah,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
6,7480,7880,tcl,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
7,7880,9400,ch,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
8,9400,9960,ix,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD
9,9960,10680,n,../../datasets/speech/timit/TIMIT/train/DR4/MM...,train,../../datasets/speech/timit/TIMIT/train/DR4/MMDM0,M,SI681.PHN,DR4,SI681.TXT,MMDM0,SI681.WAV,SI681.WRD


In [574]:
merged_df.shape

(177080, 13)

In [584]:
labels_df.columns

Index(['gender', 'region', 'label'], dtype='object')

In [585]:
yo = labels_df['region'].cat

In [586]:
yo.categories

Index(['DR1', 'DR2', 'DR3', 'DR4', 'DR5', 'DR6', 'DR7', 'DR8'], dtype='object')

In [591]:
bool('a')

True

In [598]:
'hha/hahah/'.split(os.path.sep)

['hha', 'hahah', '']

# TensorFlow Speech Recognition Challenge

In [92]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline

In [50]:
dataset_dir = '/Users/evariste/projects/autodl-contrib/raw_datasets/speech/speech_file_format/speech_commands_v0.01/'

In [51]:
filename = 'left/0e5193e6_nohash_0.wav'

In [52]:
filepath = os.path.join(dataset_dir, filename)
os.path.isfile(filepath)

True

In [34]:
from scipy.io import wavfile

def wav_to_data(wav_filepath):
  """Read a WAV file and return its samples; the sample rate is discarded."""
  _, samples = wavfile.read(wav_filepath)
  return samples

In [35]:
haha = wav_to_data(filepath)

In [37]:
haha

array([  3,   1,   2, ..., -53, -58, -51], dtype=int16)

In [21]:
%ls -al files/zero_24befdb3_nohash_1*

ls: files/zero_24befdb3_nohash_1*: No such file or directory


In [28]:
with open(filepath, 'rb') as f:
  print(f.readlines())

[b'R\x9b\x8f\x8cj\xa1}\x00W\x98\x97\x9b\xab\xd3\xe1\x940\x10\x00\x00\x01\x01\x01\x01\x80\xbe>\x00\x00}}\x00\x02\x02\x10\x10d\xc5\xd5\xd5a}}\x00\x01\x01\xf7\xf6\xf6\xf6\xfa\xfa\xf8\xf8\xf5\xf5\xf6\xf6\xf7\xf7\xf8\xf8\xfa\xfa\xfc\xfc\xf7\xf7\xf6\xf6\xfc\xfc\xff\x00\xf7\xf6\xef\xef\x00\x01\x0e\x0e\xfa\xf9\xea\xea\x05\x06\x1a\x1a\xf6\xf5\xde\xde\t\n', b'--\x01\x01\xcf\xce\xf9\xf967\x07\x07\xc4\xc3\xf3\xf334\n', b'\n', b'\xc9\xc8\xf2\xf267\r\r\xcc\xcb\xf0\xf0/0\x11\x11\xd3\xd2\xf0\xf001\x17\x17\xe3\xe2\xf9\xf9"#\x0e\x0e\xe6\xe5\xf7\xf7\x16\x17\x08\x08\xf6\xf5\xfd\xfd\t\n', b'\x06\x06\x01\x01\x05\x05\x06\x06\x00\x00\xfd\xfc\xfe\xfe\xfe\xfe\x01\x02\x07\x07\x03\x03\xfa\xf9\xfd\xfd\x02\x03\xfc\xfb\xf5\xf5\x03\x04\n', b'\n', b'\xf0\xef\xf0\xf0\t\n', b'\t\t\xeb\xea\xe7\xe7\x12\x13\x13\x13\xe6\xe5\xe8\xe8\x14\x15\x17\x17\xed\xec\xe8\xe8\x16\x17\x15\x15\xe9\xe8\xeb\xeb\x13\x14\x16\x16\xee\xed\xe9\xe9\x0f\x10\x11\x11\xf4\xf3\xf4\xf4\x0b\x0c\x0b\x0b\xf4\xf3\xf2\xf2\x04\x05\x05\x05\xfd\xfc\x01\x02\x06

In [None]:
for index, row in df.iterrows():
  

In [53]:
play_audio_from_bytes(haha)

## Construct a pandas dataframe to store dataset info

In [54]:
dataset_dit = '/Users/evariste/projects/autodl-contrib/raw_datasets/speech/speech_file_format/speech_commands_v0.01'

In [188]:
def get_speech_commands_info_df(dataset_dir, tmp_dir='/tmp/', from_scratch=False, classes=None):
  """Build, or load from a CSV cache, a table describing the Speech Commands files.

  Every `.wav` file found under `dataset_dir` is listed, except those whose
  parent directory is `_background_noise_`. The table has three columns:
  `ext_filename` (`<label>/<file name>`), `label` (name of the parent
  directory) and `subset` ('test' or 'valid' if the file is listed in
  `testing_list.txt` or `validation_list.txt`, 'train' otherwise).

  Args:
    dataset_dir: root directory of the dataset.
    tmp_dir: directory holding the cache `speech_commands_info.csv`; it is
      created if it does not exist.
    from_scratch: if True, ignore an existing cache and rebuild the table.
    classes: currently unused.
  Returns:
    A pd.DataFrame. NOTE: a freshly built table has categorical `label` and
    `subset` columns, whereas a table loaded from the cache comes from a plain
    `pd.read_csv` and is not converted to categoricals.
  """
  csv_filepath = os.path.join(tmp_dir, 'speech_commands_info.csv')
  if not from_scratch and os.path.isfile(csv_filepath):
    info_df = pd.read_csv(csv_filepath)
    print("Successfully loaded existing info table. Now life is easier.")
    return info_df
  else:
    print("Couldn't load existing info table. Now building from scatch...")
  path = os.path.abspath(dataset_dir)
  records = []
  for (dirpath, dirnames, filenames) in os.walk(path):
    for filename in filenames:
      if filename.endswith('.wav'):
        label = dirpath.split(os.sep)[-1]
        if label != '_background_noise_':
          ext_filename = os.path.join(label, filename)
          assert(os.path.isfile(os.path.join(dataset_dir, ext_filename)))
          records.append((ext_filename, label))
        else:
          print("Background noise file! Passing...")
  info_df = pd.DataFrame({'ext_filename': [x[0] for x in records],
                          'label':        [x[1] for x in records]})
  info_df['label'] = info_df['label'].astype('category')
  test_df = pd.read_csv(os.path.join(dataset_dir, 'testing_list.txt'), header=None)
  valid_df = pd.read_csv(os.path.join(dataset_dir, 'validation_list.txt'), header=None)
  ext_filenames_test = set(test_df[0])
  ext_filenames_valid = set(valid_df[0])
  def get_subset(ext_filename):
    if ext_filename in ext_filenames_test:
      return 'test'
    elif ext_filename in ext_filenames_valid:
      return 'valid'
    else:
      return 'train'
  info_df['subset'] = info_df['ext_filename'].apply(get_subset).astype('category')
  # Make sure the cache directory exists so the scan is not lost at the very end
  if tmp_dir:
    os.makedirs(tmp_dir, exist_ok=True)
  info_df.to_csv(csv_filepath, index=False)
  return info_df

In [194]:
info_df = get_speech_commands_info_df(dataset_dir, from_scratch=False)

Successfully loaded existing info table. Now life is easier.


In [172]:
haha.loc[haha['label'].isin(['zero'])]

Unnamed: 0,ext_filename,label,subset
35166,zero/988e2f9a_nohash_0.wav,zero,train
35167,zero/93ec8b84_nohash_0.wav,zero,train
35168,zero/439c84f4_nohash_1.wav,zero,valid
35169,zero/2f813234_nohash_1.wav,zero,train
35170,zero/6a27a9bf_nohash_0.wav,zero,valid
35171,zero/b87bdb22_nohash_1.wav,zero,train
35172,zero/97f4c236_nohash_2.wav,zero,test
35173,zero/563aa4e6_nohash_2.wav,zero,test
35174,zero/51055bda_nohash_4.wav,zero,train
35175,zero/5fe4a278_nohash_0.wav,zero,train


In [173]:
import numpy as np

In [175]:
df = haha

In [184]:
df_subsampled = df.loc[np.random.rand(len(df)) < 1.0]

In [181]:
len(df)

64721

In [196]:
import numpy as np
from sklearn.utils import shuffle
np.random.seed(42)

def get_processed_df(info_df, classes=None, proba_keep=1.0, shuffled=True):
  """Filter, subsample and shuffle the info table, then add integer labels.

  Args:
    info_df: a pd.DataFrame with at least a `label` column.
    classes: an iterable of class names, should be a subset of 
    ['yes', 'no', 'up', 'down', 'left', 'right', 'on', 'off', 'stop', 'go', 
    'zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine', 
    'bed', 'bird', 'cat', 'dog', 'happy', 'house', 'marvin', 'sheila', 'tree', 'wow'].
    Only examples with a label in `classes` are kept; if empty or None, all
    examples are kept.
    proba_keep: if lower than 1.0, each row is kept independently with this
      probability.
    shuffled: if True, shuffle the rows.
  Returns:
    A new pd.DataFrame with a categorical `label` column and a `label_num`
    column holding the category codes. The codes only count the labels that
    are actually present in the result.
  """
  if classes:
    info_df = info_df.loc[info_df['label'].isin(classes)]
  if proba_keep < 1.0:
    info_df = info_df.loc[np.random.rand(len(info_df)) < proba_keep]
  if shuffled:
    info_df = shuffle(info_df)
  processed_df = info_df.copy()
  label_column = processed_df['label'].astype('category')
  # An already-categorical column keeps the categories of the rows that were
  # filtered out; drop them so the codes do not depend on the input dtype.
  processed_df['label'] = label_column.cat.remove_unused_categories()
  processed_df['label_num'] = processed_df['label'].cat.codes
  return processed_df

In [204]:
classes = ['zero', 'one', 'two', 'three', 'four', 'five', 'six', 'seven', 'eight', 'nine']
proba_keep = 0.1
shuffled = True
processed_df = get_processed_df(info_df, classes=classes, proba_keep=proba_keep, shuffled=shuffled)

In [208]:
info_df

Unnamed: 0,ext_filename,label,subset
0,right/988e2f9a_nohash_0.wav,right,train
1,right/1eddce1d_nohash_3.wav,right,train
2,right/93ec8b84_nohash_0.wav,right,train
3,right/6272b231_nohash_1.wav,right,train
4,right/439c84f4_nohash_1.wav,right,valid
5,right/2f813234_nohash_1.wav,right,train
6,right/b87bdb22_nohash_1.wav,right,train
7,right/97f4c236_nohash_2.wav,right,test
8,right/563aa4e6_nohash_2.wav,right,test
9,right/51055bda_nohash_4.wav,right,train


In [207]:
processed_df

Unnamed: 0,ext_filename,label,subset,label_num
39609,seven/b7a0754f_nohash_3.wav,seven,train,5
44236,two/c1d39ce8_nohash_5.wav,two,train,8
39621,seven/099d52ad_nohash_1.wav,seven,valid,5
2911,eight/db9cd41d_nohash_0.wav,eight,train,0
28454,three/e5c48e53_nohash_0.wav,three,train,7
35634,zero/1a5b9ca4_nohash_0.wav,zero,train,9
38508,seven/4ec7d027_nohash_0.wav,seven,train,5
36159,zero/199da2bf_nohash_0.wav,zero,train,9
22133,nine/cd911ace_nohash_0.wav,nine,train,3
50622,six/5b09db89_nohash_4.wav,six,train,6


In [206]:
processed_df.groupby(['label', 'subset']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ext_filename,label_num
label,subset,Unnamed: 2_level_1,Unnamed: 3_level_1
eight,test,30,30
eight,train,196,196
eight,valid,20,20
five,test,27,27
five,train,203,203
five,valid,21,21
four,test,20,20
four,train,198,198
four,valid,30,30
nine,test,26,26


In [231]:
for x in processed_df.iterrows():
  print(x)
  break

(39609, ext_filename    seven/b7a0754f_nohash_3.wav
label                                 seven
subset                                train
label_num                                 5
Name: 39609, dtype: object)


In [243]:
def get_features_labels_pairs_generator(processed_df, subset='train'):
  """Build a lazy iterator of (features, labels) pairs for one data split,
  to be used for dataset_formatter.UniMediaDatasetFormatter.

  Args:
    processed_df: DataFrame with at least the columns 'subset',
      'ext_filename' and 'label_num'.
    subset: 'train' selects the rows whose subset is 'train' or 'valid';
      'test' selects the rows whose subset is 'test'.

  Returns:
    A `map` object. Each item is a tuple (features, labels) where features
    is a list holding one single-element list per entry returned by
    wav_to_data for the row's file, and labels is a one-element list with
    the row's 'label_num'. Files are only read when the iterator is consumed.

  Raises:
    ValueError: if subset is neither 'train' nor 'test'.

  Note:
    Relies on the module-level names `dataset_dir` and `wav_to_data`.
  """
  subset_column = processed_df['subset']
  if subset == 'train':
    # Validation rows are folded into the training split.
    keep_mask = (subset_column == 'train') | (subset_column == 'valid')
  elif subset == 'test':
    keep_mask = (subset_column == 'test')
  else:
    raise ValueError("Wrong subset key! Should be 'train' or 'test'.")
  selected_df = processed_df.loc[keep_mask]

  def to_features_labels_pair(index_and_row):
    # iterrows() yields (index, row); only the row is needed here.
    _, record = index_and_row
    samples = wav_to_data(os.path.join(dataset_dir, record['ext_filename']))
    # Wrap every sample and the label in their own list.
    return [[sample] for sample in samples], [record['label_num']]

  return map(to_features_labels_pair, selected_df.iterrows())

In [244]:
# Build the lazy (features, labels) iterator for the 'train' selection
# (rows whose subset is 'train' or 'valid'). It is single-pass.
features_labels_generator = get_features_labels_pairs_generator(processed_df, subset='train')

In [241]:
# Print only the labels of the first pair. This consumes one item from the
# single-pass iterator, so re-running the cell shows the next pair.
for x in features_labels_generator:
  print(x[1])
  break

[5]


In [159]:
# Lower-cased names of all 30 word classes.
all_classes = [
  word.lower()
  for word in (
    "Yes", "No", "Up", "Down", "Left", "Right", "On", "Off", "Stop", "Go",
    "Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine",
    "Bed", "Bird", "Cat", "Dog", "Happy", "House", "Marvin", "Sheila", "Tree", "Wow",
  )
]

In [160]:
# Display the list to check the lower-casing.
all_classes

['yes',
 'no',
 'up',
 'down',
 'left',
 'right',
 'on',
 'off',
 'stop',
 'go',
 'zero',
 'one',
 'two',
 'three',
 'four',
 'five',
 'six',
 'seven',
 'eight',
 'nine',
 'bed',
 'bird',
 'cat',
 'dog',
 'happy',
 'house',
 'marvin',
 'sheila',
 'tree',
 'wow']

In [143]:
# Add an integer code per row, taken from the categorical codes of 'label'.
# NOTE(review): this mutates haha in place, and its execution count (143) is
# later than that of the cell below that assigns haha (132) -- the cells look
# out of order; confirm the notebook survives Restart & Run All.
haha['label_num'] = haha['label'].cat.codes

In [132]:
# Get the info table for the dataset under dataset_dir. Judging from the
# printed output, the helper tries to load an existing table first and
# rebuilds it otherwise -- TODO confirm against get_speech_commands_info_df.
haha = get_speech_commands_info_df(dataset_dir)

Couldn't load existing info table. Now building from scatch...
Background noise file! Passing...
Background noise file! Passing...
Background noise file! Passing...
Background noise file! Passing...
Background noise file! Passing...
Background noise file! Passing...


In [145]:
from collections import Counter

# Tally how many rows of haha fall into each subset and into each label.
counter_subset, counter_label = Counter(haha['subset']), Counter(haha['label'])

In [154]:
# Count rows per (label, subset) pair over the whole info table.
haha.groupby(['label', 'subset']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,ext_filename,label_num
label,subset,Unnamed: 2_level_1,Unnamed: 3_level_1
bed,test,176,176
bed,train,1340,1340
bed,valid,197,197
bird,test,158,158
bird,train,1411,1411
bird,valid,162,162
cat,test,166,166
cat,train,1399,1399
cat,valid,168,168
dog,test,180,180


In [151]:
# Lower-cased set of the 20 words that keep their own label in
# get_transformed_label; anything else is mapped to 'unknown' there.
relevant_words = {
  word.lower()
  for word in (
    "Yes", "No", "Up", "Down", "Left", "Right", "On", "Off", "Stop", "Go",
    "Zero", "One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine",
  )
}

In [153]:
# Sanity check: the set should hold 20 distinct words.
len(relevant_words)

20

In [None]:
def get_transformed_label(label):
  """Return `label` unchanged if it is in the module-level `relevant_words`
  collection, otherwise return the string 'unknown'.
  """
  return label if label in relevant_words else 'unknown'

In [139]:
# Display the per-subset row counts (the Counter built from haha['subset']).
counter_subset

Counter({'train': 51088, 'valid': 6798, 'test': 6835})

In [99]:
# Display the info table; the saved output shows the ext_filename and label columns.
haha

Unnamed: 0,ext_filename,label
0,right/988e2f9a_nohash_0.wav,right
1,right/1eddce1d_nohash_3.wav,right
2,right/93ec8b84_nohash_0.wav,right
3,right/6272b231_nohash_1.wav,right
4,right/439c84f4_nohash_1.wav,right
5,right/2f813234_nohash_1.wav,right
6,right/b87bdb22_nohash_1.wav,right
7,right/97f4c236_nohash_2.wav,right
8,right/563aa4e6_nohash_2.wav,right
9,right/51055bda_nohash_4.wav,right


In [111]:
# Distinct file names listed in the info table.
all_filenames = set(haha['ext_filename'])

In [112]:
# Number of distinct file names in the info table.
len(all_filenames)

64721

In [113]:
# Distinct file names listed in valid_df.
all_filenames_valid = set(valid_df['ext_filename'])

In [114]:
# Number of distinct file names in valid_df.
len(all_filenames_valid)

6798

In [115]:
# Distinct file names listed in test_df.
all_filenames_test = set(test_df['ext_filename'])

In [116]:
# Number of distinct file names in test_df.
len(all_filenames_test)

6835

In [117]:
# Sanity check: every file name in test_df also appears in the info table.
all_filenames_test.issubset(all_filenames)

True

In [118]:
# Sanity check: every file name in valid_df also appears in the info table.
all_filenames_valid.issubset(all_filenames)

True

In [120]:
# Tag every row of valid_df with its split name (sets the 'subset' column in place).
valid_df['subset'] = 'valid'

In [None]:
# Tag every row of test_df with its split name, mirroring the 'valid' tag
# set on valid_df (sets the 'subset' column in place).
test_df['subset'] = 'test'

In [121]:
# Display valid_df to check the newly set 'subset' column.
valid_df

Unnamed: 0,ext_filename,subset
0,bed/026290a7_nohash_0.wav,valid
1,bed/060cd039_nohash_0.wav,valid
2,bed/060cd039_nohash_1.wav,valid
3,bed/099d52ad_nohash_0.wav,valid
4,bed/0e17f595_nohash_0.wav,valid
5,bed/0e17f595_nohash_1.wav,valid
6,bed/105e72bb_nohash_0.wav,valid
7,bed/1657c9fa_nohash_0.wav,valid
8,bed/16db1582_nohash_0.wav,valid
9,bed/171b56dc_nohash_0.wav,valid


In [119]:
# Display the info table; the saved output shows the ext_filename and label columns.
haha

Unnamed: 0,ext_filename,label
0,right/988e2f9a_nohash_0.wav,right
1,right/1eddce1d_nohash_3.wav,right
2,right/93ec8b84_nohash_0.wav,right
3,right/6272b231_nohash_1.wav,right
4,right/439c84f4_nohash_1.wav,right
5,right/2f813234_nohash_1.wav,right
6,right/b87bdb22_nohash_1.wav,right
7,right/97f4c236_nohash_2.wav,right
8,right/563aa4e6_nohash_2.wav,right
9,right/51055bda_nohash_4.wav,right


In [100]:
# Check the column dtypes; 'label' must be categorical for `.cat.codes` to work.
haha.dtypes

ext_filename      object
label           category
dtype: object

In [103]:
# Inspect the integer code of each row's label category.
haha['label'].cat.codes

0        18
1        18
2        18
3        18
4        18
5        18
6        18
7        18
8        18
9        18
10       18
11       18
12       18
13       18
14       18
15       18
16       18
17       18
18       18
19       18
20       18
21       18
22       18
23       18
24       18
25       18
26       18
27       18
28       18
29       18
         ..
64691     7
64692     7
64693     7
64694     7
64695     7
64696     7
64697     7
64698     7
64699     7
64700     7
64701     7
64702     7
64703     7
64704     7
64705     7
64706     7
64707     7
64708     7
64709     7
64710     7
64711     7
64712     7
64713     7
64714     7
64715     7
64716     7
64717     7
64718     7
64719     7
64720     7
Length: 64721, dtype: int8