In [28]:
import os
import numpy as np
import scipy
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as ipd
import matplotlib.pyplot as plt

%matplotlib inline

In [29]:

ipd.Audio(TRAIN_PATH + "chunk_0.wav")

In [30]:
def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio

In [31]:
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    # hop_size in ms
    
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames

In [32]:
def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def met_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)

In [120]:
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    

    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
    freqs = met_to_freq(mels)
    
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs


In [34]:
def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
    
    for n in range(len(filter_points)-2):
        filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    
    return filters

In [159]:
def pad_audio(data, fs, T=3):
    # Calculate target number of samples
    N_tar = int(fs * T)
    # Calculate number of zero samples to append
    shape = data.shape
    # Create the target shape    
    N_pad = N_tar - shape[0]
    print("Padding with %s seconds of silence" % str(N_pad/fs) )
    shape = (N_pad,) + shape[1:]
    # Stack only if there is something to append    
    if shape[0] > 0:                
        if len(shape) > 1:
            return np.vstack((np.zeros(shape),
                              data))
        else:
            return np.hstack((np.zeros(shape),
                              data))
    else:
        return data

In [160]:
def dct(dct_filter_num, filter_len):
    basis = np.empty((dct_filter_num,filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
        
    return basis

In [161]:
def create_cepstral_coefficients(file):
    sample_rate, audio = wavfile.read(file)
    audio = pad_audio(audio, sample_rate)
    
    hop_size = 10#ms
    FFT_size = 1024

    audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)

    window = get_window("hann", FFT_size, fftbins=True)

    audio_win = audio_framed * window

    ind = 69

    audio_winT = np.transpose(audio_win)

    audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')

    for n in range(audio_fft.shape[1]):
        audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

    audio_fft = np.transpose(audio_fft)

    audio_power = np.square(np.abs(audio_fft))

    freq_min = 0
    freq_high = sample_rate / 2
    mel_filter_num = 10
    
    filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=44100)

    
    filters = get_filters(filter_points, FFT_size)

    # taken from the librosa library
    enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]

    audio_filtered = np.dot(filters, np.transpose(audio_power))
    audio_log = 10.0 * np.log10(audio_filtered)
    audio_log.shape

    dct_filter_num = 40

    dct_filters = dct(dct_filter_num, mel_filter_num)

    cepstral_coefficents = np.dot(dct_filters, audio_log)
    print(cepstral_coefficents.shape)
    
    return cepstral_coefficents



In [173]:
import boto3

In [174]:
import os
# assign directory

import glob
rootdir = '../dataset/'
cepstrals = []
labels = []

for subdir, dirs, files in os.walk(rootdir):
    for file in files:

        if 'chunk' in file:
            labels.append(os.path.join(subdir, file).split('-')[3])
            cepstrals.append(create_cepstral_coefficients(os.path.join(subdir, file)))
            
cepstrals = np.concatenate(cepstrals, axis=1)

Padding with 0.4411818181818182 seconds of silence
(40, 301)
Padding with 0.9061818181818182 seconds of silence
(40, 301)
Padding with 0.6871818181818182 seconds of silence
(40, 301)
Padding with 0.7861818181818182 seconds of silence
(40, 301)
Padding with 0.7331818181818182 seconds of silence
(40, 301)
Padding with 0.7691818181818182 seconds of silence
(40, 301)
Padding with 0.7351818181818182 seconds of silence
(40, 301)
Padding with 0.9771818181818182 seconds of silence
(40, 301)
Padding with 1.012181818181818 seconds of silence
(40, 301)
Padding with 1.0101818181818183 seconds of silence
(40, 301)
Padding with 1.8381818181818181 seconds of silence
(40, 301)
Padding with 1.4821818181818183 seconds of silence


  audio_log = 10.0 * np.log10(audio_filtered)


(40, 301)
Padding with 1.5591818181818182 seconds of silence
(40, 301)
Padding with 1.536181818181818 seconds of silence
(40, 301)
Padding with 1.8821818181818182 seconds of silence
(40, 301)
Padding with 1.9261818181818182 seconds of silence
(40, 301)
Padding with 1.8731818181818183 seconds of silence
(40, 301)
Padding with 1.8431818181818183 seconds of silence
(40, 301)
Padding with 1.8511818181818183 seconds of silence
(40, 301)
Padding with 1.9341818181818182 seconds of silence
(40, 301)
Padding with 0.6921818181818182 seconds of silence
(40, 301)
Padding with 1.3621818181818182 seconds of silence
(40, 301)
Padding with 1.4451818181818181 seconds of silence
(40, 301)
Padding with 1.159181818181818 seconds of silence
(40, 301)
Padding with 1.3611818181818183 seconds of silence
(40, 301)
Padding with 1.427181818181818 seconds of silence
(40, 301)
Padding with 1.3801818181818182 seconds of silence
(40, 301)
Padding with 1.264181818181818 seconds of silence
(40, 301)
Padding with 1.280

In [175]:
cepstrals

array([[-inf, -inf, -inf, ..., -inf, -inf, -inf],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       ...,
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

In [181]:


where_are_NaNs = np.isnan(cepstrals)
cepstrals[where_are_NaNs] = 0
cepstrals[cepstrals == -np.inf] = 0
cepstrals[cepstrals == np.inf] = 0
cepstrals




array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [182]:
from sklearn import svm

clf = svm.SVC(gamma=0.001, C=100)
print(labels.shape)
clf.fit(cepstrals, labels)

(80,)


ValueError: Found input variables with inconsistent numbers of samples: [40, 80]