In [26]:
# importing required libraries
import os
import librosa
import numpy as np
import pandas as pd
import multiprocessing.dummy as mp
import tensorflow as tf
import matplotlib.pyplot as plt
import librosa.display
import seaborn as sns


#Sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

#Tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Root directory of the dataset: one sub-folder per spoken-word class.
main_dir = 'ML_TACTIGON/customTSkin/data/audiodati'
# os.listdir() returns entries in arbitrary, platform-dependent order; sorting
# keeps the later `folders[:4]` selection and the row order of the dataset
# identical from run to run.
folders = sorted(
    folder
    for folder in os.listdir(main_dir)
    if os.path.isdir(os.path.join(main_dir, folder))
)
print(folders)

['down', 'no', 'up', 'yes']


In [22]:
# Sample rate (Hz) requested from librosa for every clip.
sampling_rate = 16000


def load_audio(file_path):
    """Load one audio file and return it as a (filename, label, samples) tuple.

    The label is the name of the directory that directly contains the file;
    the samples are the signal returned by ``librosa.load`` when called with
    ``sr=sampling_rate``.
    """
    samples, _ = librosa.load(file_path, sr=sampling_rate)
    parent_dir, filename = os.path.split(file_path)
    label = os.path.basename(parent_dir)
    return filename, label, samples

In [21]:

# collecting the .wav paths from (at most) the first 4 class folders
file_paths = []
for folder in folders[:4]:
    folder_path = os.path.join(main_dir, folder)
    # sorted(): os.listdir() order is arbitrary, and the seeded train/test
    # split further down is only reproducible if the rows always arrive in
    # the same order.
    file_paths.extend(
        os.path.join(folder_path, file)
        for file in sorted(os.listdir(folder_path))
        if file.endswith('.wav')
    )

In [None]:
# Load every file concurrently. `mp` is multiprocessing.dummy, i.e. a pool of
# threads (not processes); one worker per reported CPU.
worker_count = os.cpu_count()
with mp.Pool(processes=worker_count) as loader_pool:
    audio_data = loader_pool.map(load_audio, file_paths)

In [None]:
# One row per loaded clip; the column order mirrors the tuples returned by load_audio.
audio_columns = ['filename', 'label', 'audio_data']
audio_df = pd.DataFrame(audio_data, columns=audio_columns)
audio_df.head()

In [None]:
# (rows, columns) of audio_df: one row per loaded clip
audio_df.shape

In [None]:
# defining a function for extracting MFCCs for a single audio sample
def extract_mfcc_parallel(audio_data, sr, n_mfcc=13):
    """Compute the MFCC matrix of one audio signal.

    Args:
        audio_data: 1-D array of audio samples.
        sr: sample rate of ``audio_data`` in Hz. It is forwarded to librosa;
            previously this argument was silently ignored in favour of the
            global ``sampling_rate``.
        n_mfcc: number of MFCC coefficients to compute per frame.

    Returns:
        The transposed output of ``librosa.feature.mfcc``, so that the
        coefficient axis comes last.
    """
    return librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=n_mfcc).T

# one (signal, sample_rate) argument tuple per clip for starmap
audio_data_list = audio_df['audio_data'].tolist()
mfcc_jobs = [(clip, sampling_rate) for clip in audio_data_list]

# extracting the MFCCs with a thread pool (mp is multiprocessing.dummy)
with mp.Pool(os.cpu_count()) as pool:
    mfcc_features = pool.starmap(extract_mfcc_parallel, mfcc_jobs)

# storing each clip's MFCC matrix next to its raw audio
audio_df['mfcc'] = mfcc_features
audio_df.head()

In [None]:
# stacking the per-clip MFCC matrices row-wise into one 2D array for standardization
mfcc_flattened = np.concatenate(list(audio_df['mfcc']), axis=0)

# NOTE(review): the scaler is fitted on EVERY clip in audio_df, not on the
# training data only - the train/validation/test split happens in a later
# cell, so the statistics learned here also include the held-out clips.
scaler = StandardScaler()
scaler.fit(mfcc_flattened)

# function to apply scaler to each sample
def scale_features(features, scaler):
    """Standardise every per-sample feature matrix with an already fitted scaler.

    Args:
        features: iterable of 2-D arrays, one per sample; each one is passed
            to ``scaler.transform`` unchanged.
        scaler: object exposing ``transform(array)`` (e.g. a fitted
            ``StandardScaler``).

    Returns:
        A 1-D NumPy array of dtype ``object`` with one scaled matrix per
        sample, in input order.
    """
    scaled_features = [scaler.transform(feature) for feature in features]

    # np.array(list, dtype=object) collapses into a single 3-D array whenever
    # all samples happen to have the same shape, and such an array cannot be
    # stored as one DataFrame column. Filling a pre-allocated 1-D object array
    # keeps "one element per sample" regardless of the individual shapes.
    result = np.empty(len(scaled_features), dtype=object)
    for position, scaled in enumerate(scaled_features):
        result[position] = scaled
    return result

# standardising every clip's MFCC matrix with the fitted scaler
mfcc_per_clip = audio_df['mfcc'].values
audio_df['scaled_mfcc'] = scale_features(mfcc_per_clip, scaler)
audio_df.head()

In [None]:
# distinct class labels present in audio_df (load_audio derives each label from the clip's parent folder name)
audio_df['label'].unique()

In [None]:
# selecting an audio sample: the first clip of the first label that is actually
# present in the DataFrame (a hard-coded label such as 'backward' raises
# IndexError when that class was not loaded)
sample_label = audio_df['label'].unique()[0]
audio_sample = audio_df.loc[audio_df['label'] == sample_label, 'audio_data'].iloc[0]

# creating a figure with subplots
fig, axes = plt.subplots(2, 2, figsize=(13, 7))

# 1. Waveform
librosa.display.waveshow(audio_sample, sr=sampling_rate, ax=axes[0, 0])
axes[0, 0].set_title("Waveform")
axes[0, 0].set_xlabel("Time (s)")
axes[0, 0].set_ylabel("Amplitude")

# 2. Spectrogram (magnitude STFT in dB relative to its maximum)
D = librosa.amplitude_to_db(np.abs(librosa.stft(audio_sample)), ref=np.max)
librosa.display.specshow(D, sr=sampling_rate, x_axis='time', y_axis='log', ax=axes[0, 1], cmap='viridis')
axes[0, 1].set_title("Spectrogram")
axes[0, 1].set_xlabel("Time (s)")
axes[0, 1].set_ylabel("Frequency (Hz)")

# 3. MFCC
# The image is drawn once and its handle reused for the colorbar; drawing it a
# second time only to obtain a handle stacks a duplicate image on the axes.
mfccs = librosa.feature.mfcc(y=audio_sample, sr=sampling_rate, n_mfcc=13)
mfcc_img = librosa.display.specshow(mfccs, x_axis='time', sr=sampling_rate, ax=axes[1, 0])
fig.colorbar(mfcc_img, ax=axes[1, 0])
axes[1, 0].set_title("MFCC")
axes[1, 0].set_xlabel("Time (s)")
axes[1, 0].set_ylabel("MFCC Coefficients")

# 4. Chromagram
chroma = librosa.feature.chroma_stft(y=audio_sample, sr=sampling_rate)
chroma_img = librosa.display.specshow(chroma, y_axis='chroma', x_axis='time', sr=sampling_rate, ax=axes[1, 1])
fig.colorbar(chroma_img, ax=axes[1, 1])
axes[1, 1].set_title("Chromagram")
axes[1, 1].set_xlabel("Time (s)")
axes[1, 1].set_ylabel("Pitch Class")

# Adjust layout
plt.tight_layout()
plt.show()

In [None]:
# 80 % of the clips go to training; the remaining 20 % are held out ...
X_train, X_holdout, y_train, y_holdout = train_test_split(
    audio_df['scaled_mfcc'],
    audio_df['label'],
    test_size=0.2,
    random_state=42,
)
# ... and the held-out part is halved into validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    test_size=0.5,
    random_state=42,
)

In [None]:
# pad_sequences defaults to dtype='int32', which would truncate the
# standardised (float) MFCC values to integers - request float32 explicitly.
X_train_padded = pad_sequences(X_train, padding='post', dtype='float32')

# The network's input length is fixed by the training set, so validation and
# test clips are padded (or cut at the end) to that same number of frames.
# Padding each split to its own longest clip can yield mismatched shapes.
max_frames = X_train_padded.shape[1]
X_test_padded = pad_sequences(X_test, maxlen=max_frames, padding='post', truncating='post', dtype='float32')
X_val_padded = pad_sequences(X_val, maxlen=max_frames, padding='post', truncating='post', dtype='float32')

In [None]:
# The encoder learns the label -> integer mapping from the training labels only;
# the same mapping is then applied to all three splits.
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train, y_test, y_val = (
    label_encoder.transform(split_labels)
    for split_labels in (y_train, y_test, y_val)
)

In [None]:
# defining the model: two Conv1D + MaxPooling1D stages over the (frames, coefficients)
# input, followed by a dense classifier with one softmax unit per label
model = Sequential([
    Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_train_padded.shape[1], X_train_padded.shape[2])),
    MaxPooling1D(pool_size=2),
    Conv1D(128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(len(audio_df['label'].unique()), activation='softmax')
])

model.summary()

# compiling the model (sparse loss: the targets are the integer codes produced by the LabelEncoder)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# define the ModelCheckpoint callback to save the best model
checkpoint_callback = ModelCheckpoint(
    'model_A.keras',           # filepath to save the model
    monitor='val_accuracy',       # metric to monitor
    save_best_only=True,          # save only the best model
    mode='max',                   # mode to determine if the monitored quantity is improving
    verbose=1                     # verbosity mode
)

# Train the model with the callback
history = model.fit(
    X_train_padded, 
    y_train, 
    epochs=50, 
    validation_data=(X_val_padded, y_val),
    callbacks=[checkpoint_callback]  # include the callback here
)

# loading the best model after training
best_model = tf.keras.models.load_model('model_A.keras')

# evaluating the best model on training and validation data
# evaluate() needs both the inputs and the targets of the same split; calling
# it with inputs only, or with the labels in place of the inputs, cannot
# produce a loss/accuracy pair.
train_loss, train_accuracy = best_model.evaluate(X_train_padded, y_train)
val_loss, val_accuracy = best_model.evaluate(X_val_padded, y_val)

# print the model evaluation results
print('------------------------------')
print('------Best Model Summary------')
print(f"Training Loss: {train_loss:.4f}, \nTraining Accuracy: {train_accuracy:.4f}")
print(f"Validation Loss: {val_loss:.4f}, \nValidation Accuracy: {val_accuracy:.4f}")

In [None]:
# Improved model: three Conv1D -> BatchNormalization -> MaxPooling1D stages
# (64, 128 and 256 filters) followed by a wider dense head.
input_shape = (X_train_padded.shape[1], X_train_padded.shape[2])
num_classes = len(audio_df['label'].unique())

model = Sequential()
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(128, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(256, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Adam with an explicit starting learning rate; the plateau callback below may lower it.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Keep on disk only the epoch with the highest validation accuracy.
checkpoint_callback = ModelCheckpoint(
    'model_B.keras',
    monitor='val_accuracy',
    save_best_only=True,
    mode='max',
    verbose=1,
)

# Halve the learning rate after 5 epochs without val_loss improvement, down to 1e-6 at most.
reduce_lr_callback = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
    verbose=1,
)

# Train with both callbacks attached.
training_callbacks = [checkpoint_callback, reduce_lr_callback]
history = model.fit(
    X_train_padded,
    y_train,
    epochs=50,
    validation_data=(X_val_padded, y_val),
    callbacks=training_callbacks,
)

In [None]:
# Load the best checkpoint of the improved model.
# The checkpoints written by this notebook are 'model_A.keras' and
# 'model_B.keras'; no cell saves a 'best_model.keras'.
model = tf.keras.models.load_model('model_B.keras')

# X_test_padded is the padded test data and y_test holds the integer-encoded true labels
# Make predictions on the test data
y_pred = model.predict(X_test_padded)
y_pred_labels = np.argmax(y_pred, axis=1)  # Most probable class index per sample (softmax output)

# Get the label names from the label encoder
label_names = label_encoder.classes_

# Compute the confusion matrix
# Passing every class index keeps the matrix len(label_names) x len(label_names)
# even if a class is absent from the test split, so the tick labels always line up.
cm = confusion_matrix(y_test, y_pred_labels, labels=list(range(len(label_names))))

# Plot the confusion matrix
plt.figure(figsize=(5, 4))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_names, yticklabels=label_names)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()