In [1]:
import os
import librosa
import numpy as np

# Define the path to the dataset
dataset_path = "Data"

# Define the path to save the preprocessed data
preprocessed_path = "Preprocessed-Data"

# Define the sampling rate
sampling_rate = 16000

# Define the number of mel frequency bins
n_mels = 128

# Define the hop length
hop_length = 512


In [14]:
# Loop over the clean speech files
for filename in os.listdir(os.path.join(dataset_path, "CleanSpeech_training")):
    # Load the audio file
    audio, sr = librosa.load(os.path.join(dataset_path, "CleanSpeech_training", filename), sr=sampling_rate)

    mel_spec = librosa.feature.melspectrogram(y = audio, sr=sampling_rate, n_mels=n_mels, hop_length=hop_length)

    # Convert the mel spectrogram to decibels
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    print(mel_spec_db.shape)

    # Save the mel spectrogram to disk
    np.save(os.path.join(preprocessed_path, f"{filename}.npy"), mel_spec_db)

(128, 333)
(128, 330)
(128, 380)
(128, 339)
(128, 338)
(128, 390)
(128, 322)
(128, 315)
(128, 347)
(128, 334)
(128, 370)
(128, 364)
(128, 383)
(128, 342)
(128, 390)
(128, 363)
(128, 389)
(128, 375)
(128, 332)
(128, 344)
(128, 315)
(128, 316)
(128, 446)
(128, 384)
(128, 334)
(128, 338)
(128, 328)
(128, 326)
(128, 319)
(128, 334)
(128, 373)
(128, 358)
(128, 363)
(128, 383)
(128, 327)
(128, 316)
(128, 414)
(128, 366)
(128, 352)
(128, 338)
(128, 318)
(128, 343)
(128, 361)
(128, 323)
(128, 374)
(128, 399)
(128, 409)
(128, 597)
(128, 360)
(128, 385)
(128, 332)
(128, 358)
(128, 387)
(128, 416)
(128, 360)
(128, 341)
(128, 411)
(128, 360)
(128, 330)
(128, 372)
(128, 317)
(128, 355)
(128, 375)


In [8]:
# Loop over the noisy speech files
for filename in os.listdir(os.path.join(dataset_path, "NoisySpeech_training")):
    # Load the audio file
    audio, sr = librosa.load(os.path.join(dataset_path, "NoisySpeech_training", filename), sr=sampling_rate)

    mel_spec = librosa.feature.melspectrogram(y = audio, sr=sampling_rate, n_mels=n_mels, hop_length=hop_length)

    # Convert the mel spectrogram to decibels
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    # Save the mel spectrogram to disk
    np.save(os.path.join(preprocessed_path + '/NoisySpeech_training', f"{filename}.npy"), mel_spec_db)

In [78]:
# Define the path to the preprocessed data
preprocessed_path = "Preprocessed-Data"
from skimage.transform import resize

# Function to load preprocessed data and labels
def load_data_labels(data_path, label):
    data = []
    labels = []

    for filename in os.listdir(os.path.join(data_path, label)):
        if (label == "NoisySpeech_training" and filename.split('_')[2] == '20.0' or label == "CleanSpeech_training") :

            file_path = os.path.join(data_path, label, filename)
            mel_spec_db = np.load(file_path)

            mel_spec_db_resized = resize(mel_spec_db, (128,431), anti_aliasing=True)
            
            # Extract the label from the filename
            if label == "NoisySpeech_training":
                fn = filename.split('_')[-1]
                sample_number = int(''.join(filter(str.isdigit, fn)))
            else:
                sample_number = int(''.join(filter(str.isdigit, filename)))

    
            # Add the data and label to the lists
            data.append(mel_spec_db_resized)
            labels.append(sample_number)  # Convert label to integer

    return data, labels

In [79]:
# Load clean speech data and labels
clean_data, clean_labels = load_data_labels(preprocessed_path, "CleanSpeech_training")

print("Clean data: ", len(clean_data))
print("Clean labels: ", len(clean_labels))

Clean data:  63
Clean labels:  63


In [81]:

# Load noisy speech data and labels
noisy_data, noisy_labels = load_data_labels(preprocessed_path, "NoisySpeech_training")

print("Noisy data: ", len(noisy_data))
print("Noisy labels: ", len(noisy_labels))


Noisy data:  63
Noisy labels:  63


In [98]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(sparse=False)
clean_labels_encoded = encoder.fit_transform(np.array(clean_labels).reshape(-1, 1))
noisy_labels_encoded = encoder.fit_transform(np.array(noisy_labels).reshape(-1, 1))




In [99]:
# Concatenate clean and noisy data along with their labels.
all_data = np.concatenate((clean_data, noisy_data), axis=0)
all_labels = np.concatenate((clean_labels_encoded, noisy_labels_encoded), axis=0)


In [100]:
# Shuffle the data and labels
from sklearn.utils import shuffle

all_data, all_labels = shuffle(all_data, all_labels, random_state=42)


In [101]:
from sklearn.model_selection import train_test_split

train_data, test_data, train_labels, test_labels = train_test_split(
    all_data, all_labels, test_size=0.2, random_state=42
)


In [102]:
train_data_normalized = train_data / 255.0  # Assuming pixel values range from 0 to 255
test_data_normalized = test_data / 255.0
print(train_data_normalized[0])
print(train_data_normalized.shape)


[[-0.15411733 -0.13628708 -0.12317581 ... -0.11594322 -0.11928651
  -0.12002021]
 [-0.13866602 -0.1237194  -0.12153348 ... -0.1162685  -0.11604972
  -0.11476348]
 [-0.16813572 -0.15899122 -0.15530667 ... -0.1363196  -0.13454363
  -0.12525561]
 ...
 [-0.3137255  -0.3137255  -0.3137255  ... -0.3137255  -0.3137255
  -0.3137255 ]
 [-0.3137255  -0.3137255  -0.3137255  ... -0.3137255  -0.3137255
  -0.3137255 ]
 [-0.3137255  -0.3137255  -0.3137255  ... -0.3137255  -0.3137255
  -0.3137255 ]]
(100, 128, 431)


In [103]:
import tensorflow as tf
from keras import layers

# Input shape
input_shape = (128, 431, 1)

# Number of classes
num_classes = 63

# Define the model
model = tf.keras.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape, padding='same'),
    layers.MaxPooling2D((2, 2), padding='same'),
    layers.Conv2D(64, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2), padding='same'),
    layers.Conv2D(128, (3, 3), activation='relu', padding='same'),
    layers.MaxPooling2D((2, 2), padding='same'),
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes, activation='softmax')  # Use softmax for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [104]:

# Train the model
history = model.fit(train_data_normalized, train_labels, epochs=10, validation_data=(test_data_normalized, test_labels))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [105]:

## Assuming your model is already trained and saved in the 'model' variable

# Evaluate the model on the test set
mse = model.evaluate(test_data_normalized, test_labels)

# Print the MSE
print(f'Mean Squared Error on Test Set: {mse}')



Mean Squared Error on Test Set: [4.428445816040039, 0.0]
