In [2]:
import pandas as pd
import re
import glob
import os
import librosa
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
%matplotlib inline
plt.style.use('ggplot')

  return f(*args, **kwds)


# Extracting features

In [180]:
# Cutting the audio files into segments of same length
def windows(data, window_size):
    start = 0
    while start < len(data):
        yield start, start + window_size
        start += int(window_size / 2)

def extract_features(df,bands = 60, frames = 41):
    window_size = 512 * (frames - 1) # setting sound clip length to 20480 frames
    labels = []
    log_specgrams = []
    log_mfccs = []
    for index in df.index.tolist():
        fn = df.loc[index]['filename'] # getting filenames from csv file
        try:sound_clip,s = librosa.load('SoundData/Train/'+fn)
        except FileNotFoundError:
            print(fn)
        label = df.loc[index]['classID']-1
        for (start,end) in windows(sound_clip,window_size):
            if(len(sound_clip[start:end]) == window_size):
                signal = sound_clip[start:end]
                logspec,logmfcc = audioFeatures(signal,s)
                log_specgrams.append(logspec)
                log_mfccs.append(logmfcc)
                labels.append(label)

    features1 = np.asarray(log_specgrams).reshape(len(log_specgrams),bands,frames,1)
    features2 = np.asarray(log_mfccs).reshape(len(log_mfccs),bands,frames,1)
    features = np.concatenate((features1,features2),axis=3)
    return features, np.array(labels,dtype = np.int)
    
def audioFeatures(signal,sample_rate,bands = 60):
    melspec = librosa.feature.melspectrogram(signal, n_mels=bands)
    logspec = librosa.amplitude_to_db(melspec)
    logspec = logspec.T.flatten()[:, np.newaxis].T
    mfccs = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=bands)
    logmfcc = librosa.amplitude_to_db(mfccs).T.flatten()[:, np.newaxis].T
    return logspec, logmfcc


In [190]:
# reading dataset
audioFile = pd.read_csv('/Users/qizhilin/Downloads/SoundData/audio.csv')
audio_train = pd.DataFrame()
audio_test = pd.DataFrame()
# making sure there are enough dataset for each label
for i in range(1,12):
    _train, _test = train_test_split(audioFile[audioFile['classID']==i], test_size=0.3)
    audio_train = audio_train.append(_train)
    audio_test = audio_test.append(_test)
train_features, train_labels = extract_features(audio_train)
test_features, test_labels = extract_features(audio_test)


# CNN tensorflow

In [197]:
def cnn_model_fn(features, labels, mode):
    input_layer = tf.reshape(features["x"], [-1, 60, 41, 2])
    #build two CNN layers. Set ksize as [9,7].
    conv1 = tf.layers.conv2d(input_layer,64, [9,7],
                           padding="same",
                           activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    conv2 = tf.layers.conv2d(pool1,128, [9,7],
                           padding="same",
                           activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    # output size 15*11*128
    shape = conv2.get_shape().as_list()
    pool2_flat = tf.reshape(conv2, [-1, shape[1] * shape[2] * shape[3]])
    # build fully connected layers
    dense = tf.layers.dense(inputs=pool2_flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(inputs=dense, rate=0.4, training=mode == tf.estimator.ModeKeys.TRAIN)
    
    logits = tf.layers.dense(inputs=dropout, units=11)
    #use argmax to get the maximum output among labels as prediction class
    #use softmax to compute probilities of each label for each input
    predictions = {"classes": tf.argmax(input=logits, axis=1),
                   "probabilities": tf.nn.softmax(logits, name="softmax_tensor")} 
    
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    #use cross entropy to compute loss for classification 
    loss = tf.losses.sparse_softmax_cross_entropy(labels, logits) 
    tf.summary.scalar("loss", loss)
    accuracy = tf.metrics.accuracy(labels=labels, predictions=predictions["classes"], name="acc_op")
    eval_metric_ops = {"accuracy": accuracy}
    tf.summary.scalar('accuracy', accuracy[1])
    
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss=loss,global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)

## Train & Evaluation

In [199]:
# generate Estimator
classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, model_dir="Downloads/A")
tensors_to_log = {}
logging_hook = tf.train.LoggingTensorHook(tensors_to_log, every_n_iter=100)
train_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": train_features},
    y=train_labels,
    batch_size=100,
    num_epochs=None,
    shuffle=True)
classifier.train(
    input_fn=train_input_fn,
    steps=1500, 
    hooks=[logging_hook])
eval_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": test_features},
    y=test_labels,
    num_epochs=1,
    shuffle=False)
eval_results = classifier.evaluate(input_fn=eval_input_fn)
print(eval_results)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'Downloads/A', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x109f3cc50>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finali

## Prediction

In [249]:
from collections import Counter
# extract prediction file
predictFile = pd.read_csv('/Users/qizhilin/Downloads/SoundData/predict.csv')
predict_features, predict_labels = extract_features(predictFile)
pred_input_fn = tf.estimator.inputs.numpy_input_fn(
    x={"x": predict_features},
    y=predict_labels,
    num_epochs=1,
    shuffle=False)
# make prediction
predictions = classifier.predict(input_fn=pred_input_fn)
expected_labels = {}
class_dict = {0:'Thunderstrom',
              1:'Fireworks',
              2:'Chainsaw',
              3:'Drilling',
              4:'Engine Idling',
              5:'Gunshot',
              6:'Jackhammer',
              7:'Airplane',
              8:'Train',
              9:'Church Bells',
              10:'Car Horn'}
for key in predict_labels:
    expected_labels[key] = expected_labels.get(key, 0) + 1

for k,v in expected_labels.items():
    class_ids = []
    valid_predict = []
    prob = dict.fromkeys(class_dict.keys(),0)
    for pred_dict in predictions:
        class_id = pred_dict['classes']
        class_ids.append(class_id)
        probability = pred_dict['probabilities'][class_id]
        if probability > 0.8: # drop out predictions that has confidence less than 80%
            valid_predict.append(class_id)
            prob[class_id]+=probability
        label_counts = Counter(valid_predict)
        top = label_counts.most_common(1) # get most common predictions of an input audio file
        if len(class_ids)>=v:
            break
    final_prob = 100*prob[top[0][0]]/float(top[0][1]) # calculate the average probability of most common predictions
    if final_prob <= 90:
        print('\nPrediction is "Other sound".')
    else: print('\nPrediction is "{}"({:.1f}%),correct sound label is"{}".'.format(class_dict[top[0][0]],
                                                                               final_prob,class_dict[k]))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from Downloads/A/model.ckpt-1500
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.

Prediction is "Gunshot"(100.0%),correct sound label is"Gunshot".

Prediction is "Jackhammer"(100.0%),correct sound label is"Jackhammer".

Prediction is "Drilling"(100.0%),correct sound label is"Drilling".

Prediction is "Engine Idling"(100.0%),correct sound label is"Engine Idling".

Prediction is "Thunderstrom"(99.9%),correct sound label is"Thunderstrom".

Prediction is "Church Bells"(100.0%),correct sound label is"Church Bells".

Prediction is "Jackhammer"(95.3%),correct sound label is"Airplane".

Prediction is "Airplane"(95.3%),correct sound label is"Train".

Prediction is "Chainsaw"(99.6%),correct sound label is"Chainsaw".

Prediction is "Fireworks"(100.0%),correct sound label is"Fireworks".

Prediction is "Car Horn"(99.5%)