In [193]:
import os
import numpy as np
import scipy
import librosa
from scipy.io import wavfile
import scipy.fftpack as fft
from scipy.signal import get_window
import IPython.display as ipd
import matplotlib.pyplot as plt
from tqdm import tqdm
from python_speech_features import mfcc
%matplotlib inline

In [194]:

#ipd.Audio(TRAIN_PATH + "chunk_0.wav")

In [195]:
def normalize_audio(audio):
    audio = audio / np.max(np.abs(audio))
    return audio

In [196]:
def frame_audio(audio, FFT_size=2048, hop_size=10, sample_rate=44100):
    # hop_size in ms
    
    audio = np.pad(audio, int(FFT_size / 2), mode='reflect')
    frame_len = np.round(sample_rate * hop_size / 1000).astype(int)
    frame_num = int((len(audio) - FFT_size) / frame_len) + 1
    frames = np.zeros((frame_num,FFT_size))
    
    for n in range(frame_num):
        frames[n] = audio[n*frame_len:n*frame_len+FFT_size]
    
    return frames

In [197]:
def freq_to_mel(freq):
    return 2595.0 * np.log10(1.0 + freq / 700.0)

def met_to_freq(mels):
    return 700.0 * (10.0**(mels / 2595.0) - 1.0)

In [198]:
def get_filter_points(fmin, fmax, mel_filter_num, FFT_size, sample_rate=44100):
    fmin_mel = freq_to_mel(fmin)
    fmax_mel = freq_to_mel(fmax)
    

    mels = np.linspace(fmin_mel, fmax_mel, num=mel_filter_num+2)
    freqs = met_to_freq(mels)
    
    return np.floor((FFT_size + 1) / sample_rate * freqs).astype(int), freqs


In [199]:
def get_filters(filter_points, FFT_size):
    filters = np.zeros((len(filter_points)-2,int(FFT_size/2+1)))
    
    for n in range(len(filter_points)-2):
        filters[n, filter_points[n] : filter_points[n + 1]] = np.linspace(0, 1, filter_points[n + 1] - filter_points[n])
        filters[n, filter_points[n + 1] : filter_points[n + 2]] = np.linspace(1, 0, filter_points[n + 2] - filter_points[n + 1])
    
    return filters

In [200]:
def pad_audio(data, fs, T=4):
    # Calculate target number of samples
    N_tar = int(fs * T)
    # Calculate number of zero samples to append
    shape = data.shape
    # Create the target shape    
    N_pad = N_tar - shape[0]
    print("Padding with %s seconds of silence" % str(N_pad/fs) )
    shape = (N_pad,) + shape[1:]
    # Stack only if there is something to append    
    if shape[0] > 0:                
        if len(shape) > 1:
            return np.vstack((np.zeros(shape),
                              data))
        else:
            return np.hstack((np.zeros(shape),
                              data))
    else:
        return data

In [201]:
def dct(dct_filter_num, filter_len):
    basis = np.empty((dct_filter_num,filter_len))
    basis[0, :] = 1.0 / np.sqrt(filter_len)
    
    samples = np.arange(1, 2 * filter_len, 2) * np.pi / (2.0 * filter_len)

    for i in range(1, dct_filter_num):
        basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / filter_len)
        
    return basis

In [202]:
def create_cepstral_coefficients(file):
    sample_rate, audio = wavfile.read(file)
    audio = pad_audio(audio, sample_rate)
    
    hop_size = 10#ms
    FFT_size = 1024

    audio_framed = frame_audio(audio, FFT_size=FFT_size, hop_size=hop_size, sample_rate=sample_rate)

    window = get_window("hann", FFT_size, fftbins=True)

    audio_win = audio_framed * window

    ind = 6

    audio_winT = np.transpose(audio_win)

    audio_fft = np.empty((int(1 + FFT_size // 2), audio_winT.shape[1]), dtype=np.complex64, order='F')

    for n in range(audio_fft.shape[1]):
        audio_fft[:, n] = fft.fft(audio_winT[:, n], axis=0)[:audio_fft.shape[0]]

    audio_fft = np.transpose(audio_fft)

    audio_power = np.square(np.abs(audio_fft))

    freq_min = 0
    freq_high = sample_rate / 2
    mel_filter_num = 10
    
    filter_points, mel_freqs = get_filter_points(freq_min, freq_high, mel_filter_num, FFT_size, sample_rate=44100)

    
    filters = get_filters(filter_points, FFT_size)

    # taken from the librosa library
    enorm = 2.0 / (mel_freqs[2:mel_filter_num+2] - mel_freqs[:mel_filter_num])
    filters *= enorm[:, np.newaxis]

    audio_filtered = np.dot(filters, np.transpose(audio_power))
    audio_log = 10.0 * np.log10(audio_filtered)
    audio_log.shape

    dct_filter_num = 40

    dct_filters = dct(dct_filter_num, mel_filter_num)

    cepstral_coefficents = np.dot(dct_filters, audio_log)
    print(cepstral_coefficents.shape)
    
    return cepstral_coefficents



In [203]:
import boto3

In [204]:
import os
# assign directory

import glob
rootdir = 'dataset/'
output = []
labels = []

for subdir, dirs, files in os.walk(rootdir):
    for file in files:
        print(file)
        if 'chunk' in file:
            labels.append(os.path.join(subdir, file).split('-')[3])
            data = create_cepstral_coefficients(os.path.join(subdir, file))
            where_are_NaNs = np.isnan(data)
            data[where_are_NaNs] = 0
            data[data == -np.inf] = 0
            data[data == np.inf] = 0
            output.append(data.flatten())
            


2021-11-08-J-Jo.wav
2021-11-10-G-James.wav
2021-11-15-D-Michael.wav
2021-11-24-A-Seattle.wav
2021-12-19-A-Bruce-fixed.wav
2021-12-19-A-Bruce-free.wav
2021-12-19-B-Bruce-fixed.wav
2021-12-19-C-Bruce-fixed.wav
2021-12-19-D-Bruce-fixed.wav
2021-12-19-E-Bruce-fixed.wav
2021-12-19-F-Bruce-fixed.wav
2021-12-19-G-Bruce-fixed.wav
2021-12-19-H-Bruce-fixed.wav
2021-12-19-I-Bruce-fixed.wav
python.zip
chunk_0.wav
Padding with 1.4411818181818181 seconds of silence
(40, 401)
chunk_1.wav
Padding with 1.9061818181818182 seconds of silence
(40, 401)
chunk_2.wav
Padding with 1.6871818181818181 seconds of silence
(40, 401)
chunk_3.wav
Padding with 1.786181818181818 seconds of silence
(40, 401)
chunk_4.wav
Padding with 1.7331818181818182 seconds of silence
(40, 401)
chunk_5.wav
Padding with 1.7691818181818182 seconds of silence
(40, 401)
chunk_6.wav
Padding with 1.7351818181818182 seconds of silence




(40, 401)
chunk_7.wav
Padding with 1.9771818181818182 seconds of silence
(40, 401)
chunk_8.wav
Padding with 2.0121818181818183 seconds of silence
(40, 401)
chunk_9.wav
Padding with 2.010181818181818 seconds of silence
(40, 401)
chunk_0.wav
Padding with 2.8381818181818184 seconds of silence
(40, 401)
chunk_1.wav
Padding with 2.482181818181818 seconds of silence
(40, 401)
chunk_2.wav
Padding with 2.559181818181818 seconds of silence
(40, 401)
chunk_3.wav
Padding with 2.5361818181818183 seconds of silence
(40, 401)
chunk_4.wav
Padding with 2.8821818181818184 seconds of silence
(40, 401)
chunk_5.wav
Padding with 2.926181818181818 seconds of silence
(40, 401)
chunk_6.wav
Padding with 2.873181818181818 seconds of silence
(40, 401)
chunk_7.wav
Padding with 2.8431818181818183 seconds of silence
(40, 401)
chunk_8.wav
Padding with 2.8511818181818183 seconds of silence
(40, 401)
chunk_9.wav
Padding with 2.934181818181818 seconds of silence
(40, 401)
chunk_0.wav
Padding with 1.6921818181818182 sec

In [205]:
cepstrals = np.array(output)

In [206]:
cepstrals


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [207]:
from sklearn import svm

clf = svm.SVC(gamma=0.001, C=100)
#print(labels.shape)

labels = np.array(labels).ravel()
print(labels)
#labels = np.concatenate(labels, axis=1)
cepstrals.shape
#clf.fit(cepstrals, labels)

['B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'B' 'C' 'C' 'C' 'C' 'C' 'C' 'C' 'C'
 'C' 'C' 'D' 'D' 'D' 'D' 'D' 'D' 'D' 'D' 'D' 'D' 'E' 'E' 'E' 'E' 'E' 'E'
 'E' 'E' 'E' 'E' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'G' 'G' 'G' 'G'
 'G' 'G' 'G' 'G' 'G' 'G' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'H' 'I' 'I'
 'I' 'I' 'I' 'I' 'I' 'I' 'I' 'I']


(80, 16040)

In [208]:
from sklearn.model_selection import train_test_split


In [209]:
X_train, X_test, y_train, y_test = train_test_split(cepstrals, labels, test_size=0.3,random_state=109) # 70% training and 30% test

In [210]:
labels.ravel()

array(['B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'B', 'C', 'C', 'C',
       'C', 'C', 'C', 'C', 'C', 'C', 'C', 'D', 'D', 'D', 'D', 'D', 'D',
       'D', 'D', 'D', 'D', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E', 'E',
       'E', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'F', 'G', 'G',
       'G', 'G', 'G', 'G', 'G', 'G', 'G', 'G', 'H', 'H', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'H', 'I', 'I', 'I', 'I', 'I', 'I', 'I', 'I',
       'I', 'I'], dtype='<U1')

In [211]:
clf.fit(X_train, y_train)

SVC(C=100, gamma=0.001)

In [212]:
y_pred = clf.predict(X_test)

In [213]:
from sklearn import metrics
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.0


In [214]:
def extract_features(audio_data):

	# Remember that the audio data consists of raw audio wave followed by sample rate
	# so we need to only take the raw audio wave.

    audio_waves = audio_data[:,0]
    samplerate = audio_data[:,1][1]

    features = []
    for audio_wave in tqdm(audio_waves):
        features.append(mfcc(audio_wave, samplerate=samplerate, numcep=26))
    
    features = np.array(features)
    return features

# Define a function to load the raw audio files
def load_audio():
	# Allocate empty list for male and female voices
    chars, labels, = [], []
    letters = []
    for subdir, dirs, files in os.walk(rootdir):
        for file in files:
            if 'chunk' in file:
                print(librosa.load(os.path.join(subdir, file)))
                audio, sample_rate = librosa.load(os.path.join(subdir, file))
                data = pad_audio(audio, sample_rate)
                
                chars.append((data, sample_rate))
                labels.append(os.path.join(subdir, file).split('-')[3])
        if chars:
            letters.append(chars)
	# Convert the list into Numpy array
    chars = np.array(chars)
    labels = np.array(labels)
    
    return chars, labels

# How to use load_audio() function
characters, labels = load_audio()

(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.4411791383219954 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.9061678004535147 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.6871655328798185 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.7861678004535146 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.7331519274376417 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.7691609977324263 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.7351473922902494 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 1.977142857142857 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 2.012154195011338 seconds of si

Padding with 2.9341496598639454 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 2.825170068027211 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 2.9541496598639454 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 2.9541496598639454 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 2.97015873015873 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 2.962176870748299 seconds of silence
(array([0., 0., 0., ..., 0., 0., 0.], dtype=float32), 22050)
Padding with 2.3081632653061224 seconds of silence




In [215]:
character_features = extract_features(characters)




100%|██████████| 80/80 [00:01<00:00, 63.93it/s]


In [242]:
def concatenate_features(audio_features):
    concatenated = audio_features[0]
    for audio_feature in tqdm(audio_features):
        concatenated = np.vstack((concatenated, audio_feature))
        
    return concatenated
concat = concatenate_features(character_features)
            

100%|██████████| 80/80 [00:00<00:00, 2906.00it/s]


In [244]:
np.vstack(concat).shape

(32238, 26)

In [241]:


X_train, X_test, y_train, y_test = train_test_split(np.vstack(character_features), labels.ravel(), test_size=0.2, random_state=22)


ValueError: Found input variables with inconsistent numbers of samples: [31840, 80]

In [240]:
from sklearn.svm import SVC
clf = SVC(kernel='rbf')   

In [226]:
clf.fit(X_train, y_train)


ValueError: Found array with dim 3. Estimator expected <= 2.