This notebook requires running tf_speech_EDA notebook first to generate spectrograms

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.python import debug as tf_debug
import os
import random

In [3]:
PATH = "./data/tf_speech/"
PIC_FOLDER = "picts"
audio_path = f'{PATH}train/audio/'
pict_path = f'{PATH}{PIC_FOLDER}/train/'
test_pict_path = f'{PATH}{PIC_FOLDER}/test/'
test_audio_path = f'{PATH}test/audio/'
PNG_CHANNELS = 3

Function for splitting data into validation and test sets (from README, not used yet)

In [4]:
MAX_NUM_WAVS_PER_CLASS = 2**27 - 1  # ~134M

def which_set(filename, validation_percentage, testing_percentage):
  """Determines which data partition the file should belong to.

  We want to keep files in the same training, validation, or testing sets even
  if new ones are added over time. This makes it less likely that testing
  samples will accidentally be reused in training when long runs are restarted
  for example. To keep this stability, a hash of the filename is taken and used
  to determine which set it should belong to. This determination only depends on
  the name and the set proportions, so it won't change as other files are added.

  It's also useful to associate particular files as related (for example words
  spoken by the same person), so anything after '_nohash_' in a filename is
  ignored for set determination. This ensures that 'bobby_nohash_0.wav' and
  'bobby_nohash_1.wav' are always in the same set, for example.

  Args:
    filename: File path of the data sample.
    validation_percentage: How much of the data set to use for validation.
    testing_percentage: How much of the data set to use for testing.

  Returns:
    String, one of 'training', 'validation', or 'testing'.
  """
  base_name = os.path.basename(filename)
  # We want to ignore anything after '_nohash_' in the file name when
  # deciding which set to put a wav in, so the data set creator has a way of
  # grouping wavs that are close variations of each other.
  hash_name = re.sub(r'_nohash_.*$', '', base_name)
  # This looks a bit magical, but we need to decide whether this file should
  # go into the training, testing, or validation sets, and we want to keep
  # existing files in the same set even if more files are subsequently
  # added.
  # To do that, we need a stable way of deciding based on just the file name
  # itself, so we do a hash of that and then use that to generate a
  # probability value that we use to assign it.
  hash_name_hashed = hashlib.sha1(hash_name).hexdigest()
  percentage_hash = ((int(hash_name_hashed, 16) %
                      (MAX_NUM_WAVS_PER_CLASS + 1)) *
                     (100.0 / MAX_NUM_WAVS_PER_CLASS))
  if percentage_hash < validation_percentage:
    result = 'validation'
  elif percentage_hash < (testing_percentage + validation_percentage):
    result = 'testing'
  else:
    result = 'training'
  return result

Load converted validation and test sets:
(sed -i "s/wav/png/g" testing_list.txt)

In [5]:
with open(f"{PATH}validation_list.txt", "r") as f:
    valid_list = f.read().splitlines()
with open(f"{PATH}testing_list.txt", "r") as f:
    test_list = f.read().splitlines()
print(valid_list[:10])
print(test_list[:10])
len(valid_list)

['bed/026290a7_nohash_0.png', 'bed/060cd039_nohash_0.png', 'bed/060cd039_nohash_1.png', 'bed/099d52ad_nohash_0.png', 'bed/0e17f595_nohash_0.png', 'bed/0e17f595_nohash_1.png', 'bed/105e72bb_nohash_0.png', 'bed/1657c9fa_nohash_0.png', 'bed/16db1582_nohash_0.png', 'bed/171b56dc_nohash_0.png']
['bed/0c40e715_nohash_0.png', 'bed/0ea0e2f4_nohash_0.png', 'bed/0ea0e2f4_nohash_1.png', 'bed/105a0eea_nohash_0.png', 'bed/1528225c_nohash_0.png', 'bed/1528225c_nohash_1.png', 'bed/1528225c_nohash_2.png', 'bed/1528225c_nohash_3.png', 'bed/1b4c9b89_nohash_0.png', 'bed/1cb788bc_nohash_0.png']


6798

In [6]:
labels = ["yes", "no", "up", "down", "left", "right", "on", "off", "stop", "go", "silence","unknown"]

Create train list by walking through folders and checking if folder name is one of my labels:

In [7]:
train_list = []
silence_list = []
for x in os.listdir(pict_path):#audio_path):
    if os.path.isdir(pict_path + x):
        if "silence" in x:
            silence_list.extend([x+"/"+y for y in os.listdir(pict_path + x) if '.png' in y])
        if x not in labels:
            train_list.extend([x+"/"+y for y in random.sample(os.listdir(pict_path + x), 150) if '.png' in y])
        else:
            train_list.extend([x+"/"+y for y in os.listdir(pict_path + x) if '.png' in y])
print(train_list[:10])
len(train_list)
silence_list = random.sample(silence_list, len(silence_list))

['left/aff582a1_nohash_1.png', 'left/5fadb538_nohash_4.png', 'left/53eb0a88_nohash_0.png', 'left/918a2473_nohash_3.png', 'left/e5dadd24_nohash_0.png', 'left/7257420c_nohash_0.png', 'left/c0e0f834_nohash_0.png', 'left/39c13eed_nohash_0.png', 'left/1daa5ada_nohash_0.png', 'left/fd395b74_nohash_4.png']


In [8]:
len(silence_list)/100*10

210.0

In [9]:
train_list = list(set(train_list)-set(valid_list))
train_list = list(set(train_list)-set(test_list))
train_list.extend(silence_list[:1680])
valid_list.extend(silence_list[1680:1890])
test_list.extend(silence_list[1890:])
len(train_list)

24719

In [10]:
submit_list = []
submit_list.extend([test_pict_path+y for y in os.listdir(test_pict_path) if '.png' in y])
submit_list[:5]

['./data/tf_speech/picts/test/clip_cec3f56cb.png',
 './data/tf_speech/picts/test/clip_fa022b6ea.png',
 './data/tf_speech/picts/test/clip_bc8564798.png',
 './data/tf_speech/picts/test/clip_c6913eb1f.png',
 './data/tf_speech/picts/test/clip_b62f28b2a.png']

In [11]:
train_df = pd.DataFrame(train_list, columns = ["Filepath"])
valid_df = pd.DataFrame(valid_list, columns = ["Filepath"])
test_df  = pd.DataFrame(test_list,  columns = ["Filepath"])

In [12]:
train_df = train_df.sample(frac=1).reset_index(drop=True)
valid_df = valid_df.sample(frac=1).reset_index(drop=True)

Turn word labels into number labels:

In [13]:
label_dict = {}
for i, label in enumerate(labels):
    label_dict[label] = i
label_dict

{'yes': 0,
 'no': 1,
 'up': 2,
 'down': 3,
 'left': 4,
 'right': 5,
 'on': 6,
 'off': 7,
 'stop': 8,
 'go': 9,
 'silence': 10,
 'unknown': 11}

In [14]:
def make_label(x):
    label = x.split("/")[0]
    if label not in label_dict:
        return label_dict["unknown"]
    else:
        return label_dict[label]

train_df["Label"] = train_df["Filepath"].apply(make_label)
valid_df["Label"] = valid_df["Filepath"].apply(make_label)
test_df["Label"]  = test_df["Filepath"].apply(make_label)
train_df.head(5)

Unnamed: 0,Filepath,Label
0,right/106a6183_nohash_0.png,5
1,right/be7a5b2d_nohash_3.png,5
2,go/742d6431_nohash_3.png,9
3,silence/pinknoise0649.png,10
4,marvin/3fdafe25_nohash_0.png,11


In [15]:
test_df.head(5)

Unnamed: 0,Filepath,Label
0,bed/0c40e715_nohash_0.png,11
1,bed/0ea0e2f4_nohash_0.png,11
2,bed/0ea0e2f4_nohash_1.png,11
3,bed/105a0eea_nohash_0.png,11
4,bed/1528225c_nohash_0.png,11


In [16]:
#important for monitoring metrics in tensorboard
tf.logging.set_verbosity(tf.logging.INFO)

Create input functions:

In [17]:
def create_train_input_fn(files,labels, batch_size, num_epochs=1,shuffle=True):
    
    def _input_fn():
        # step 1
        #files = files.apply(lambda x: path + x)
        filenames = tf.constant(list(files))
        _labels = tf.constant(list(labels))

        # step 2: create a dataset returning slices of `filenames`
        dataset = tf.data.Dataset.from_tensor_slices((filenames, _labels))

        # step 3: parse every image in the dataset using `map`
        def _parse_function(filename, label):
            image_string = tf.read_file(filename)#tf.strings.join([path,filename])
            image_decoded = tf.image.decode_png(image_string, channels=PNG_CHANNELS)
            image = tf.image.convert_image_dtype(image_decoded, tf.float32)
            return image, label

        dataset = dataset.map(_parse_function)
        dataset = dataset.batch(batch_size).repeat(num_epochs)

        if shuffle:
          dataset = dataset.shuffle(200)
        
        # step 4: create iterator and final input tensor
        image_batch, label_batch = dataset.make_one_shot_iterator().get_next()
        
        #tf.Print(label_batch,[label_batch],message="Image: ")
        
        return image_batch, label_batch
    return _input_fn

In [18]:
def create_predict_input_fn(files, labels, batch_size):
    
    def _input_fn():
        # step 1
        filenames = tf.constant(list(files))
        _labels = tf.constant(list(labels))

        # step 2: create a dataset returning slices of `filenames`
        dataset = tf.data.Dataset.from_tensor_slices((filenames, _labels))

        # step 3: parse every image in the dataset using `map`
        def _parse_function(filename, label):
            image_string = tf.read_file(filename)#tf.strings.join([path,filename])
            image_decoded = tf.image.decode_png(image_string, channels=PNG_CHANNELS)
            image = tf.image.convert_image_dtype(image_decoded, tf.float32)
            return image, label

        dataset = dataset.map(_parse_function)
        dataset = dataset.batch(batch_size)
        
        # step 4: create iterator and final input tensor
        image_batch, label_batch = dataset.make_one_shot_iterator().get_next()
        
        return image_batch, label_batch
    return _input_fn

In [19]:
def create_submit_input_fn(files, batch_size):
    
    def _input_fn():
        # step 1
        filenames = tf.constant(list(files))
        #_labels = tf.constant(list(labels))

        # step 2: create a dataset returning slices of `filenames`
        dataset = tf.data.Dataset.from_tensor_slices(filenames)

        # step 3: parse every image in the dataset using `map`
        def _parse_function(filename):
            image_string = tf.read_file(filename)#tf.strings.join([path,filename])
            image_decoded = tf.image.decode_png(image_string, channels=PNG_CHANNELS)
            image = tf.image.convert_image_dtype(image_decoded, tf.float32)
            return image

        dataset = dataset.map(_parse_function)
        dataset = dataset.batch(batch_size)
        
        # step 4: create iterator and final input tensor
        image_batch = dataset.make_one_shot_iterator().get_next()
        
        return image_batch
    return _input_fn

In [20]:
BATCH_SIZE = 64
NUM_EPOCHS = 5
IMG_SIZE = (256, 128)

In [21]:
train_input_fn = create_train_input_fn(train_df["Filepath"].apply(lambda x: pict_path + x),train_df["Label"],
                                       batch_size=BATCH_SIZE)
valid_input_fn = create_predict_input_fn(valid_df["Filepath"].apply(lambda x: pict_path + x),valid_df["Label"],
                                       batch_size=BATCH_SIZE)


test_input_fn = create_predict_input_fn(test_df["Filepath"].apply(lambda x: pict_path + x),test_df["Label"],
                                       batch_size=BATCH_SIZE)
submit_input_fn = create_submit_input_fn(submit_list, batch_size=BATCH_SIZE)

In [22]:
#hooks = [tf_debug.LocalCLIDebugHook()]

Create model:

In [23]:
def conv2d(input_tensor, depth, kernel, name, strides=(1, 1), padding="VALID"):
    return tf.layers.conv2d(input_tensor, filters=depth, kernel_size=kernel, strides=strides, padding=padding, activation=tf.nn.leaky_relu, name=name)

In [24]:
def conv_model(features, labels, mode):
    net = tf.reshape(features, [-1,IMG_SIZE[0],IMG_SIZE[1],PNG_CHANNELS])
    net = conv2d(net,16,[3,3],"conv1",(1,1)) #254,126,16
    net = tf.layers.max_pooling2d(net, 2,2) # 127,63,16
    net = conv2d(net,32,[3,3],"conv2",(2,2)) #63,31,32
    net = tf.layers.max_pooling2d(net, 2,1)# 62,30,32   
    net = conv2d(net,64,[2,2],"conv3",(2,2)) #31,15,64
    net = tf.layers.max_pooling2d(net, 2,1)# 30,14,64
    net = conv2d(net,128,[2,2],"conv4",(2,2))#15,7,128
    net = tf.layers.max_pooling2d(net, 2,1)#14,6,128
    net = conv2d(net,256,[2,2],"conv5", (2,2))# 7,3,256
    net = tf.layers.flatten(net)
    net = tf.layers.dense(net, units = 256, activation=tf.nn.leaky_relu)
    net = tf.layers.dropout(net, rate = 0.2, training=(mode == tf.estimator.ModeKeys.TRAIN))
    logits = tf.layers.dense(net,units = 12)

    predictions = {
                    "classes": tf.argmax(input=logits, axis=1),
                    "probabilities": tf.nn.softmax(logits)
    }
    
    #define predict method logic
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode = mode, predictions=predictions)
    
    #sparse_softmax does one-hot automatically
    loss = tf.losses.sparse_softmax_cross_entropy(labels = labels, logits = logits)
    accuracy = tf.metrics.accuracy(labels = labels, predictions = predictions["classes"], name = "acc_op")
    
    
    #define train method logic
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.0005)
        train_op  = optimizer.minimize(
            loss = loss,
            global_step=tf.train.get_global_step())
        tf.identity(accuracy[1], name='train_accuracy')
        tf.summary.scalar('train_accuracy', accuracy[1])
        eval_metric_ops = {"train_accuracy":accuracy}
        return tf.estimator.EstimatorSpec(mode = mode, loss = loss, train_op = train_op, eval_metric_ops=eval_metric_ops)
    
    
    
    #define evaluate method logic
    tf.identity(accuracy[1], name='val_accuracy')
    tf.summary.scalar('val_accuracy', accuracy[1])
    eval_metric_ops = {"val_accuracy":accuracy}
    return tf.estimator.EstimatorSpec(mode = mode, loss = loss, eval_metric_ops=eval_metric_ops)

Train model:

In [25]:
!rm -rf tf_files
OUT_DIR = "./tf_files"

In [26]:
my_cnn_classifier = tf.estimator.Estimator(model_fn=conv_model, 
                                           config=tf.estimator.RunConfig(keep_checkpoint_max=1).replace(save_summary_steps=2),
                                           model_dir=OUT_DIR)
file_writer = tf.summary.FileWriter(OUT_DIR)

def train_and_eval(estimator, num_epochs = NUM_EPOCHS, steps=100):
    for n in range(num_epochs):
        estimator.train(input_fn = train_input_fn,steps = steps)
        estimator.evaluate(input_fn = valid_input_fn)
        
train_and_eval(my_cnn_classifier)

INFO:tensorflow:Using config: {'_model_dir': './tf_files', '_tf_random_seed': None, '_save_summary_steps': 2, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 1, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7f80d2360eb8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_ini

In [27]:
train_and_eval(my_cnn_classifier)

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./tf_files/model.ckpt-500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 500 into ./tf_files/model.ckpt.
INFO:tensorflow:loss = 0.8912085, step = 500
INFO:tensorflow:Saving checkpoints for 600 into ./tf_files/model.ckpt.
INFO:tensorflow:Loss for final step: 0.9930417.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-11-27-02:38:36
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./tf_files/model.ckpt-600
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-11-27-02:38:48
INFO:tensorflow:Saving dict for global step 600: global_step = 600, loss = 1.2341

Submission file generation:

In [28]:
generator = my_cnn_classifier.predict(input_fn=submit_input_fn)
predictions = [next(generator) for i in range(len(submit_list))]
classes = [predictions[i]["classes"] for i in range(len(predictions))]

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from ./tf_files/model.ckpt-1000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.


In [29]:
def make_submission_file(classes, filename):
    submission = pd.DataFrame()
    submission["fname"] = submit_list
    submission["fname"] = submission["fname"].apply(lambda x: x.split("/")[-1][:-3]+"wav")
    submission["label"] = classes
    submission["label"] = submission["label"].apply(lambda x: labels[x])
    submission.set_index("fname", inplace=True)
    submission.to_csv(filename)
    
make_submission_file(classes, "tf_speech_pred_cnn.csv")