In [None]:
import struct
import librosa
import numpy as np

class WavFileHelper():
    """Reads basic format and length properties of WAV files."""

    def read_file_properties(self, filename):
        """Return header and length information for one WAV file.

        The channel count, sample rate and bit depth are read from the
        file's ``fmt `` chunk, so they describe the file as stored on disk.
        The two length values come from decoding the file with
        ``librosa.load(filename, sr=None, mono=True)``.

        Args:
            filename: Path of the WAV file to inspect.

        Returns:
            A tuple ``(num_channels, sample_rate, bit_depth, avg_rms,
            length_in_seconds, length_in_samples)``. ``avg_rms`` is always
            ``None`` because the RMS computation is currently disabled; the
            slot is kept so the tuple layout stays stable for callers.

        Raises:
            ValueError: If the file is not a RIFF/WAVE file or does not
                contain a complete ``fmt `` chunk.
        """
        num_channels, sample_rate, bit_depth = WavFileHelper._read_format_chunk(filename)

        # sr=None is passed so librosa does not resample; mono=True asks for a
        # single-channel signal, so len(y) is a per-channel sample count.
        y, sr = librosa.load(filename, sr=None, mono=True)

        # RMS is intentionally not computed at the moment. To re-enable:
        #   avg_rms = np.mean(librosa.feature.rms(y=y)[0])
        avg_rms = None

        length_in_samples = len(y)
        length_in_seconds = length_in_samples / sr  # total samples / sample rate

        return (num_channels, sample_rate, bit_depth, avg_rms, length_in_seconds, length_in_samples)

    @staticmethod
    def _read_format_chunk(filename):
        """Locate the ``fmt `` chunk and return (num_channels, sample_rate, bit_depth)."""
        # The context manager closes the file even when parsing raises.
        with open(filename, "rb") as wave_file:
            riff = wave_file.read(12)
            if len(riff) < 12 or riff[0:4] != b"RIFF" or riff[8:12] != b"WAVE":
                raise ValueError("Not a RIFF/WAVE file: {!r}".format(filename))

            # Walk the chunk list instead of assuming that 'fmt ' directly
            # follows the RIFF header: other chunks (e.g. LIST, JUNK) may
            # legally come first.
            while True:
                chunk_header = wave_file.read(8)
                if len(chunk_header) < 8:
                    raise ValueError("No 'fmt ' chunk found in {!r}".format(filename))
                chunk_id, chunk_size = struct.unpack("<4sI", chunk_header)

                if chunk_id == b"fmt ":
                    fmt = wave_file.read(16)
                    if chunk_size < 16 or len(fmt) < 16:
                        raise ValueError("Truncated 'fmt ' chunk in {!r}".format(filename))
                    # Layout: format tag, channels, sample rate, byte rate,
                    # block align, bits per sample (all little-endian).
                    _, num_channels, sample_rate, _, _, bit_depth = struct.unpack("<HHIIHH", fmt)
                    return num_channels, sample_rate, bit_depth

                # Chunks are word-aligned: an odd-sized chunk is followed by
                # one pad byte. The second argument 1 means "relative to the
                # current position".
                wave_file.seek(chunk_size + (chunk_size & 1), 1)



In [None]:
#Import and install dependencies
# tensorflow_io 0.28 is compatible with TensorFlow 2.11
# Python 3.10* needed.
#! pip install tensorflow==2.11.* tensorflow-io==0.31.0 matplotlib

In [None]:
# imports and constants
import os
from matplotlib import pyplot as plt
import tensorflow as tf
import tensorflow_io as tfio
import pandas as pd
import librosa
import librosa.display
import numpy as np
from scipy.io import wavfile as wav
import IPython.display as ipd

# Shared helper instance used to read WAV header and length properties.
wavfilehelper = WavFileHelper()

# Root folder of the data. The default is written as a raw string so the
# backslashes are taken literally (sequences such as "\C" or "\d" are invalid
# escapes in a normal string literal and trigger a SyntaxWarning on recent
# Python versions). Set the ML_AUDIO_DATA_DIR environment variable to use a
# different location without editing the notebook.
DATA_DIR = os.environ.get("ML_AUDIO_DATA_DIR", r'D:\Code\ProjectsPython\ML_TrainingGround\ML_Audio\data')
METADATA = os.path.join(DATA_DIR, "UrbanSound8K", "metadata", "UrbanSound8K.csv")
AUDIO_DIR = os.path.join(DATA_DIR, "UrbanSound8K", "audio")

<h3><i>Observe the Dataset</i></h3>

In [None]:
# Two example clips (paths relative to AUDIO_DIR) used by the cells below.
AUDIO01, AUDIO02 = (
    os.path.join(AUDIO_DIR, relative_clip_path)
    for relative_clip_path in ("fold1/101415-3-0-2.wav", "fold10/2937-1-0-0.wav")
)

<p><b>Dataset metadata</b></p>

In [None]:
# Load the metadata CSV into a DataFrame; the first rows are rendered as the cell output.
df = pd.read_csv(METADATA)
df.head()

<p><b>Class distribution</b></p>

In [None]:
# Number of metadata rows per value of the "class" column
print(df["class"].value_counts())

**Diversity in sample properties**
*  Number of channels  
*  Sample rates
*  Bit depths

In [None]:
# Read the header/length properties of every clip listed in the metadata.
# Each clip lives in AUDIO_DIR/fold<fold>/<slice_file_name>.
property_columns = ['num_channels', 'sample_rate', 'bit_depth', 'avg_rms', 'length_in_seconds', 'length_in_samples']

audiodata = [
    wavfilehelper.read_file_properties(
        os.path.join(AUDIO_DIR, 'fold' + str(fold) + '/', str(slice_file_name))
    )
    for fold, slice_file_name in zip(df["fold"], df["slice_file_name"])
]

# One row per clip, one column per property returned by read_file_properties
audiodf = pd.DataFrame(audiodata, columns=property_columns)

In [None]:
# Preview the first rows. The call is wrapped in display() because a notebook
# only auto-renders the LAST expression of a cell; a bare `audiodf.head()`
# followed by other statements produces no output at all.
ipd.display(audiodf.head())

# Relative frequency of each channel count
print("Channels: ")
print(audiodf.num_channels.value_counts(normalize=True))

# Relative frequency of each sample rate
print("Sample Rates: ")
print(audiodf.sample_rate.value_counts(normalize=True))

# Relative frequency of each bit depth
print("Bit Depth: ")
print(audiodf.bit_depth.value_counts(normalize=True))

# Absolute counts of each clip length (in samples)
print("Samples: ")
print(audiodf.length_in_samples.value_counts())

# RMS (disabled: avg_rms is currently always None)
# print(audiodf.avg_rms.describe())

### Preprocessing with Librosa
* Resample to common sample rate
* Bit-depth Normalization
* Mix-down to mono channel
* Cut (or zero-pad) the length to 1 sec

In [None]:
# Resample
filename = AUDIO01

# Load the same file twice so the two loaders can be compared:
# - scipy's wav.read returns the sample rate and the samples as stored in the file,
# - librosa.load is called with its defaults, which (per the librosa
#   documentation) resample, mix down to mono and convert the bit depth.
scipy_sample_rate, scipy_audio = wav.read(filename) 
librosa_audio, librosa_sample_rate = librosa.load(filename) 

print('Original sample rate:', scipy_sample_rate) 
print('Librosa sample rate:', librosa_sample_rate) 

In [None]:
# Bit depth
# Also known as bit resolution: the number of bits used to represent each sample in a digital audio file.
# Librosa's load function also normalises the data so its values range between -1 and 1.
# This removes the complication of the dataset having a wide range of bit depths.

print('Original audio file min~max range:', np.min(scipy_audio), 'to', np.max(scipy_audio))
print('Librosa audio file min~max range:', np.min(librosa_audio), 'to', np.max(librosa_audio))

In [None]:
# Mix down to mono

# Original audio as read by scipy (presumably 2 channels for this clip;
# the shape printed at the end of the cell confirms it)
plt.figure(figsize=(12, 4))
plt.plot(scipy_audio)

# Librosa audio with channels merged 
plt.figure(figsize=(12, 4))
plt.plot(librosa_audio)

# Compare the array shapes of the two loads
print(f"Scipy audio shape: {scipy_audio.shape}, Librosa audio shape: {librosa_audio.shape}")

In [None]:
# Slice (or zero-pad) audio to a common length, 1 second by default
def slice_audio(librosa_audio, librosa_sample_rate = 22050, duration_seconds = 1):
    """Return `librosa_audio` trimmed or zero-padded to a fixed duration.

    Args:
        librosa_audio: Array of audio samples (assumed to be 1-D, i.e. mono).
        librosa_sample_rate: Sample rate of `librosa_audio` in Hz.
        duration_seconds: Target duration in seconds. Defaults to 1.

    Returns:
        An array with exactly ``round(duration_seconds * librosa_sample_rate)``
        samples. Longer input is truncated (the leading samples are kept);
        shorter input is padded with zeros at the end.

    Raises:
        ValueError: If the requested length is negative.
    """
    # Slice bounds and pad widths must be integers, so convert explicitly;
    # this also allows fractional durations and float sample rates.
    sample_length = int(round(duration_seconds * librosa_sample_rate))
    if sample_length < 0:
        raise ValueError("duration_seconds * librosa_sample_rate must not be negative")

    missing_samples = sample_length - len(librosa_audio)
    if missing_samples > 0:
        # (0, missing_samples): nothing is added in front, zeros are appended
        # at the end until the target length is reached.
        return np.pad(librosa_audio, (0, missing_samples), constant_values=0)

    return librosa_audio[:sample_length]

# print(f"Librosa audio before: {librosa_audio.shape} and after: {slice_audio(librosa_audio).shape}")

In [None]:
from IPython.display import Audio

# Play back the slice returned by slice_audio (called with its default sample rate argument)
Audio(data=slice_audio(librosa_audio), rate=librosa_sample_rate)

# ______________________________________________________________________________
## <i>FEATURE EXTRACTION:</i>

### Option 2
**Extract Spectrogram**

In [None]:
def extract_spectrogram(audio_path):
    """Load an audio clip and return its spectrogram in decibels.

    The clip is loaded with librosa (default sample rate, 'kaiser_fast'
    resampling), brought to a common length with `slice_audio`, transformed
    with a short-time Fourier transform (n_fft=512, win_length=512) and
    converted from magnitude to dB relative to the clip's maximum.

    Args:
        audio_path: Path of the audio file to process.

    Returns:
        2-D array of dB values (frequency bins x time frames).
    """
    audio_file, librosa_sample_rate = librosa.load(audio_path, res_type='kaiser_fast')
    audio_file = slice_audio(audio_file, librosa_sample_rate)

    # librosa.stft's `dtype` is the COMPLEX type of the result. Passing a real
    # type such as float32 would discard the imaginary part, so the "magnitude"
    # below would only be |Re|. complex64 keeps single precision: its absolute
    # value is float32.
    stft_matrix = librosa.stft(audio_file, n_fft=512, win_length=512, dtype=np.complex64)
    spectrogram = librosa.amplitude_to_db(np.abs(stft_matrix), ref=np.max)
    #librosa.display.specshow(spectrogram, sr=librosa_sample_rate, x_axis='time')

    # spectrogram = tf.expand_dims(spectrogram, axis = 2)

    return spectrogram

In [None]:
# Test the Fourier transform on a single example clip (metadata row at position 99)
row = df.iloc[99]

# Folder and file name are passed as separate components so os.path.join
# inserts the separator of the current platform (a hard-coded "\\" only works
# on Windows).
file = os.path.join(AUDIO_DIR, "fold" + str(row["fold"]), str(row["slice_file_name"]))
audio_file, librosa_sample_rate = librosa.load(file, res_type='kaiser_fast')
audio_file = slice_audio(audio_file, librosa_sample_rate)

# Magnitude of the STFT, converted to dB relative to the maximum
spectrogram = librosa.stft(audio_file, n_fft=512, win_length=512)
spectrogram = librosa.amplitude_to_db(abs(spectrogram), ref=np.max)
librosa.display.specshow(spectrogram, sr=librosa_sample_rate, x_axis='time')
print(spectrogram.shape)

In [None]:
features = []

# Extract one spectrogram per metadata row and keep it together with the class label.
# In each iteration `index` is the row's index value and `row` is the row's data (a Series).
for index, row in df.iterrows():
    # Folder and file name are passed as separate components so os.path.join
    # inserts the separator of the current platform (a hard-coded "\\" only
    # works on Windows).
    file = os.path.join(AUDIO_DIR, "fold" + str(row["fold"]), str(row["slice_file_name"]))
    class_label = row["class"]
    data = extract_spectrogram(file)
    
    features.append([data, class_label])
    
# One row per clip: the spectrogram array and its class label
featuresdf = pd.DataFrame(features, columns=["features", "class_label"])
print('Finished feature extraction from ', len(featuresdf), ' files') 

In [None]:
# Preview the first extracted feature rows
featuresdf.head()

In [None]:
# Shape of the first clip's feature array
featuresdf.iloc[0]["features"].shape

#### <i>END OF FEATURE EXTRACTION</i>
# ______________________________________________________________________________

**Convert the data and labels**
<p>We use sklearn.preprocessing.LabelEncoder to turn the categorical class names into integer ids, and then keras.utils.to_categorical to one-hot encode those ids into model-understandable numerical data.</p>
<p><i>Meaning that every class gets its own column, which is either 0 or 1.</i></p>

In [None]:
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical

# Stack the per-clip feature arrays and collect the class labels as numpy arrays
X = np.array(featuresdf["features"].tolist())
y = np.array(featuresdf["class_label"].tolist())

# Map the class names to integer ids, then expand the ids into one-hot rows
le = LabelEncoder()
yy = to_categorical(le.fit_transform(y))

### Split the dataset

In [None]:
# Split the dataset: test_size=0.2 holds out 20% of the samples for testing,
# and the fixed random_state makes the split reproducible between runs.
from sklearn.model_selection import train_test_split 

x_train, x_test, y_train, y_test = train_test_split(X, yy, test_size=0.2, random_state = 42)

<b>Store the preprocessed data</b>

In [None]:
### store the preprocessed data for use in the next notebook
# %store persists each variable via IPython's storage magic; another notebook
# can restore them with `%store -r`.

%store x_train 
%store x_test 
%store y_train 
%store y_test 
%store yy 
%store le