In [None]:
import os
import pandas as pd
import tensorflow as tf
import librosa
import librosa.display as ld
import numpy as np
import random
import matplotlib.pyplot as plt

# Render every matplotlib figure produced below with the dark theme.
plt.style.use("dark_background")

In [None]:
# Directory holding the audio clips. A raw string keeps the Windows
# backslashes literal: "\s" and "\p" are not valid escape sequences, so a
# plain string literal triggers an invalid-escape warning on recent Pythons.
audio_data_dir = r"d:\soundofai\pitch_60_audio"

# Keep only WAV files. The extension is matched case-insensitively and with
# its dot, so a name that merely ends in "wav" is not picked up. Sorting
# makes the order (and therefore audio_files[0]) reproducible, because
# os.listdir returns entries in arbitrary order.
audio_files = sorted(
    name for name in os.listdir(audio_data_dir) if name.lower().endswith(".wav")
)
print(f"found {len(audio_files)} files")

In [None]:
# Read the annotation table; the first CSV column is used as the row index.
labels_csv_path = "../data/old_final.csv"
df = pd.read_csv(labels_csv_path, index_col=0)
df.head()

In [None]:
# Maps the value in each row's first column (used later as the audio file
# name) to that row's list of 0/1 quality labels.
dataset = {}

for _index, record in df.iterrows():
    # Columns from position 2 up to, but excluding, the last one hold the
    # quality values; clipping squashes them into the range [0, 1].
    classes = np.clip(record.iloc[2:-1].values, 0, 1)
    # Rows whose labels sum to zero carry no positive class and are left out.
    if np.sum(classes) != 0:
        dataset[record.iloc[0]] = classes.tolist()

In [None]:
# Number of output labels. Taken from the same `[2:-1]` column slice that is
# used per row when `dataset` is built, instead of relying on a variable that
# happens to leak out of that loop (which is undefined when the frame has no
# rows and makes this cell depend on hidden state).
num_classes = len(df.columns[2:-1])

# Spectrogram parameters shared by the helper functions below.
frame_size = 2048     # FFT window length (passed to librosa as n_fft)
hop_len = 512         # hop length between successive frames, in samples
num_mels = 128        # number of mel bands
sample_rate = 16000   # rate (Hz) every clip is resampled to on load

def get_melspectrogram(signal):
    """Compute a log-scaled (dB) mel spectrogram of an audio signal.

    Args:
        signal: Audio time series, expected to be sampled at `sample_rate`.

    Returns:
        The mel power spectrogram (computed with `frame_size`, `hop_len` and
        `num_mels`) converted to decibels by `librosa.power_to_db`.
    """
    melspec = librosa.feature.melspectrogram(
        # Passed by keyword: newer librosa releases no longer accept the
        # signal as a positional argument.
        y=signal,
        # Without this librosa falls back to its own default rate, which does
        # not match the rate the clips are loaded at (nor the rate used when
        # plotting), so the mel filter bank would be built for the wrong
        # frequency range.
        sr=sample_rate,
        hop_length=hop_len,
        n_fft=frame_size,
        n_mels=num_mels,
    )
    return librosa.power_to_db(melspec)

def plot_melspectrogram(ms):
    """Display a mel spectrogram on time / mel-frequency axes with a colorbar.

    Args:
        ms: Spectrogram array to draw (as returned by `get_melspectrogram`).
    """
    # Axis scaling uses the same sample rate and hop length as the analysis.
    specshow_options = dict(
        sr=sample_rate,
        hop_length=hop_len,
        x_axis='time',
        y_axis='mel',
        cmap='viridis',
    )

    plt.figure(figsize=(12, 6))
    ld.specshow(ms, **specshow_options)
    plt.colorbar()
    plt.show()


def get_ms_from_file(file_path, num_samples=32400):
    """Load an audio file and return the mel spectrogram of a fixed-length excerpt.

    Args:
        file_path: Path of the audio file; it is resampled to `sample_rate`
            on load.
        num_samples: Number of samples taken from the start of the clip.
            Shorter clips are zero-padded at the end up to this length.

    Returns:
        The result of `get_melspectrogram` for exactly `num_samples` samples.
    """
    audio, _ = librosa.load(file_path, sr=sample_rate)
    audio = audio[:num_samples]

    shortfall = num_samples - len(audio)
    if shortfall > 0:
        # A clip shorter than the excerpt would yield fewer spectrogram
        # frames and could not be stored in the fixed-size batch buffers, so
        # pad it with trailing silence.
        audio = np.pad(audio, (0, shortfall))

    return get_melspectrogram(audio)

In [None]:
# Load the first listed clip at the working sample rate as a sanity check.
first_clip_path = os.path.join(audio_data_dir, audio_files[0])
audio, _ = librosa.load(first_clip_path, sr=sample_rate)
print(audio.shape)

In [None]:
# Spectrogram of the first 32400 samples -- the same excerpt length that
# get_ms_from_file uses.
mel_sp = get_melspectrogram(audio[:32400])

# The model input and the batch buffers defined further down expect (128, 64).
print(mel_sp.shape)

plot_melspectrogram(mel_sp)

In [None]:
from tensorflow.keras.layers import Conv2D, MaxPool2D, BatchNormalization
from tensorflow.keras.layers import Input, Lambda, Flatten, Dropout, Dense


def create_model(input_shape=(128, 64)):
    """Build the convolutional multi-label classifier over mel spectrograms.

    The network stacks four Conv2D -> MaxPool2D -> BatchNormalization blocks
    with 16, 32, 64 and 128 filters, then Flatten, Dropout(0.25) and a Dense
    layer with `num_classes` sigmoid outputs (one independent probability per
    label).

    Args:
        input_shape: Shape of one spectrogram without a channel axis.
            Defaults to (128, 64), the shape used throughout this notebook.

    Returns:
        An uncompiled `tf.keras` Model.
    """
    def conv_block(input_, num_filters):
        """3x3 ReLU convolution, 2x2 max pooling, then batch normalization."""
        x = Conv2D(num_filters, 3, activation='relu')(input_)
        x = MaxPool2D(2)(x)
        return BatchNormalization()(x)

    input_ = Input(shape=input_shape)
    # Conv2D needs a trailing channel axis. Reshape is a built-in layer, so
    # the model can be saved and reloaded without the serialization caveats
    # of a Lambda layer wrapping a Python lambda.
    x = tf.keras.layers.Reshape((*input_shape, 1))(input_)
    for num_filters in (16, 32, 64, 128):
        x = conv_block(x, num_filters)
    x = Flatten()(x)
    x = Dropout(0.25)(x)
    output_ = Dense(num_classes, activation='sigmoid')(x)

    return tf.keras.models.Model(input_, output_)

# Instantiate the network and print its layer-by-layer summary.
model = create_model()
model.summary()

In [None]:
# Number of labelled examples available for training.
len(dataset)

In [None]:
def data_generator(batch_size=16):
    """Yield an endless stream of random (spectrogram, label) training batches.

    Every example in a batch is drawn independently and uniformly from
    `dataset`, so the same file may appear more than once in a batch.

    Args:
        batch_size: Number of examples per yielded batch.

    Yields:
        A tuple `(x_batch, y_batch)` of NumPy arrays with shapes
        `(batch_size, 128, 64)` and `(batch_size, num_classes)`.
    """
    # Materialize the (file name, label) pairs once. Rebuilding this list for
    # every single draw cost O(len(dataset)) work per sample.
    examples = list(dataset.items())

    while True:
        # Fresh arrays for each batch so an already-yielded batch is never
        # overwritten by the next one.
        x_batch = np.zeros((batch_size, 128, 64))
        y_batch = np.zeros((batch_size, num_classes))

        for i in range(batch_size):
            example, label = random.choice(examples)
            file_path = os.path.join(audio_data_dir, example)
            x_batch[i] = get_ms_from_file(file_path)
            y_batch[i] = np.array(label)

        yield (x_batch, y_batch)

In [None]:
# Multi-label training setup: binary cross-entropy over the sigmoid outputs.
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Stop once the training accuracy has not improved for 5 consecutive epochs.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor="accuracy", patience=5)

# The generator never terminates, so the length of an epoch is set explicitly.
_ = model.fit(
    data_generator(),
    steps_per_epoch=50,
    epochs=50,
    callbacks=[early_stopping],
)