In [1]:
### Load a single example clip with librosa to inspect the raw signal
import librosa

audio_file_path = '../UrbanSound8K/audio/fold1/7061-6-0-0.wav'
# librosa.load hands back the decoded samples together with the sample rate
(librosa_audio_data,
 librosa_sample_rate) = librosa.load(audio_file_path)

In [2]:
### Compute Mel-Frequency Cepstral Coefficients (MFCCs) for the sample clip
mfccs = librosa.feature.mfcc(
    y=librosa_audio_data,
    sr=librosa_sample_rate,
    n_mfcc=40,  # number of coefficients requested
)
# Print the dimensions of the resulting coefficient matrix
print(mfccs.shape)

(40, 97)


In [3]:
# Display the MFCC matrix of the sample clip (the notebook renders a truncated repr)
mfccs

array([[-1.27213120e+02,  8.04792309e+00,  1.60044117e+01, ...,
        -5.69353333e+02, -5.91727539e+02, -6.10657837e+02],
       [ 3.00206680e+01,  2.29403477e+01,  1.06696825e+01, ...,
         4.30511932e+01,  2.58880424e+01,  1.28297863e+01],
       [-2.53601742e+00, -1.12524357e+01, -1.42598038e+01, ...,
        -1.11037216e+01, -7.55062675e+00,  2.01516241e-01],
       ...,
       [ 3.23359013e-01, -3.54810309e+00, -5.02934265e+00, ...,
         2.40957856e+00,  3.16997337e+00, -2.12704360e-01],
       [-9.78005695e+00, -9.09051323e+00, -6.19126558e+00, ...,
        -5.01989722e-02,  7.28950620e-01, -9.00552392e-01],
       [ 1.65660703e+00,  1.82233298e+00,  2.83062148e+00, ...,
        -2.06092095e+00, -2.23126650e+00, -1.77458656e+00]], dtype=float32)

In [4]:
### Preparation for extracting MFCCs from every audio file:
### point at the dataset and load the UrbanSound8K metadata table
import pandas as pd
import os
import librosa

audio_dataset_path = '../UrbanSound8K/audio/'
metadata = pd.read_csv('../UrbanSound8K/metadata/UrbanSound8K.csv')

# Preview the first few rows of the metadata
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [7]:
import numpy as np
def features_extractor(file_name, n_mfcc=40):
    """Return one fixed-length MFCC feature vector for an audio file.

    The clip is loaded with librosa, its MFCC matrix is computed, and the
    coefficients are averaged over all frames, so every clip maps to a
    vector of the same length regardless of its duration.

    Parameters
    ----------
    file_name : str or path-like
        Path of the audio file to load.
    n_mfcc : int, optional
        Number of MFCCs to compute (default 40, the value that used to be
        hard-coded here).

    Returns
    -------
    numpy.ndarray
        1-D array with one mean value per MFCC coefficient.
    """
    # res_type='kaiser_fast' asks librosa for its faster resampling mode
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    mfccs_features = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    # Transpose to (frames, coefficients) and average over the frames; the
    # expression is kept exactly as before so results stay bit-identical.
    mfccs_scaled_features = np.mean(mfccs_features.T, axis=0)

    return mfccs_scaled_features

# Mel-Frequency Cepstral Coefficients

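What are MFCCs? They are a small set of coefficients that summarize the short-term power spectrum of a sound on the perceptually motivated mel frequency scale.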

In [8]:
from tqdm import tqdm
### Iterate through every row of the metadata table and extract the
### mean-MFCC feature vector of the audio file it describes
# Resolve the dataset root once instead of on every iteration
audio_root = os.path.abspath(audio_dataset_path)
extracted_features = []
# iterrows() is a generator without a length, so the row count is passed
# explicitly; tqdm can then show a percentage and ETA instead of a bare counter
for index_num, row in tqdm(metadata.iterrows(), total=len(metadata)):
    # Each clip lives in the fold<N> sub-directory named by its "fold" column
    file_name = os.path.join(audio_root, 'fold' + str(row["fold"]), str(row["slice_file_name"]))
    final_class_labels = row["class"]
    data = features_extractor(file_name)
    extracted_features.append([data, final_class_labels])

8732it [05:45, 25.29it/s]


## Spectrograms

Some notes about spectrograms:

1. They are generated using the Fourier transform.
    - More precisely, they use the Short-Time Fourier Transform (STFT): we define a frame (window) and compute the Fourier transform on each segment. In this way, the spectrogram also encodes how the frequency components change over time.
2. What is the output of the Fourier transform?
    - a magnitude (expressing the similarity between the signal/segment and a sinusoid with a given frequency)
3. The magnitudes obtained for each frame are then grouped together to form a frequency vector for that frame.

In [9]:
### Turn the list of [feature_vector, label] pairs into a two-column DataFrame
extracted_features_df = pd.DataFrame(
    extracted_features,
    columns=['feature', 'class'],
)
extracted_features_df.head()

Unnamed: 0,feature,class
0,"[-217.35526, 70.22338, -130.38527, -53.282898,...",dog_bark
1,"[-424.09818, 109.34076, -52.919525, 60.86475, ...",children_playing
2,"[-458.79114, 121.38419, -46.520653, 52.00812, ...",children_playing
3,"[-413.89984, 101.66371, -35.42945, 53.036358, ...",children_playing
4,"[-446.60352, 113.68541, -52.402218, 60.302044,...",children_playing


In [10]:
### Separate the independent variables (X: feature vectors)
### from the dependent variable (y: class labels)
X = np.array(
    extracted_features_df['feature'].tolist()
)
y = np.array(
    extracted_features_df['class'].tolist()
)

In [12]:
### Label Encoder: map class names to integers, then one-hot encode them
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
# Encode from the DataFrame's label column rather than from `y` itself.
# The previous `y = f(y)` form fed the already one-hot matrix back into the
# encoder whenever the cell was run a second time; reading the untouched
# column yields the same `y` and keeps the cell safe to re-run.
integer_labels = labelencoder.fit_transform(extracted_features_df['class'])
y = to_categorical(integer_labels)

In [13]:
### Train/test split: hold out 20% of the samples, with a fixed seed
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Model

In [14]:
import tensorflow as tf
# Print the installed TensorFlow version
print(tf.__version__)

2.17.0


In [15]:
### Number of classes = number of columns in the one-hot encoded label matrix
num_labels = y.shape[1]