In [12]:
# import os
import librosa

import numpy as np
import pandas as pd
import seaborn as sns
# import tensorflow as tf

import librosa.display

import plotly.express as px
import IPython.display as ipd
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from tqdm import tqdm, trange
from librosa import feature, amplitude_to_db, load

from tqdm.auto import tqdm
from plotly.subplots import make_subplots

from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

# from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense , Activation , Dropout

pd.plotting.register_matplotlib_converters()

%matplotlib inline


In [5]:
import librosa
import numpy as np
import pandas as pd
import os
from tqdm import tqdm  # for progress bar

# Function to extract mean MFCC features
def extract_mean_mfcc(audio_file, n_mfcc=13, frame_length_ms=25, hop_length_ms=10):
    """
    Extracts MFCC features from an audio file and computes the mean across frames.
    Returns the mean MFCC features as a list.
    """
    # Load the audio file
    y, sr = librosa.load(audio_file, sr=None)

    # Compute hop length and FFT window size
    hop_length = int((hop_length_ms / 1000) * sr)  
    n_fft = int((frame_length_ms / 1000) * sr)    

    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, hop_length=hop_length, n_fft=n_fft)

    # Compute the mean of MFCCs across time frames
    mean_mfccs = np.mean(mfccs, axis=1)

    return mean_mfccs

# Function to process audio files in multiple folders for different frame lengths and n_mfcc values
def process_audio_folders(input_folders, output_directory="output_csvs_noise_factory_f1"):
    """
    Processes all audio files in multiple input folders for multiple n_mfcc values,
    extracts mean MFCCs, and saves results to separate CSVs.
    """
    # Ensure the output directory exists
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    # Define the list of n_mfcc values
    n_mfcc_values = [13, 20, 30, 40, 50, 60, 70, 80]
    
    # Process each folder and n_mfcc value
    for n_mfcc in n_mfcc_values:
        # List to store results for the current n_mfcc
        mfcc_results = []

        # Process each folder
        for folder in input_folders:
            # Get all audio files in the folder
            audio_files = [f for f in os.listdir(folder) if f.endswith(('.wav', '.mp3'))]
            total_files = len(audio_files)
            
            if total_files == 0:
                print(f"No audio files found in folder: {folder}!")
                continue
            
            # Create a progress message for processing each audio file in the current n_mfcc
            print(f"\nProcessing {total_files} files for n_mfcc={n_mfcc} in folder: {folder}")
            for i, audio_file in enumerate(audio_files):
                audio_path = os.path.join(folder, audio_file)
                
                # For each audio file, process with the given frame length
                for frame_length in [25]:  # Process for only one frame length (25 ms)
                    mean_mfccs = extract_mean_mfcc(audio_path, n_mfcc=n_mfcc, frame_length_ms=frame_length, hop_length_ms=10)
                    
                    # Append results with filename, folder, frame length, and n_mfcc value
                    mfcc_results.append([audio_file, folder, frame_length, n_mfcc] + mean_mfccs.tolist())

                # Output progress in the console for the current file
                print(f"Processing file {i + 1}/{total_files} ({(i + 1) / total_files * 100:.2f}%) - {audio_file}")
            
        # Convert to DataFrame for the current n_mfcc
        column_names = ["Filename", "Folder", "Frame_Length_ms", "n_mfcc"] + [f"MFCC_{i+1}" for i in range(len(mean_mfccs))]
        mfcc_df = pd.DataFrame(mfcc_results, columns=column_names)

        # Save the DataFrame to a CSV file
        output_csv = os.path.join(output_directory, f"mfcc_means_nmfcc_{n_mfcc}.csv")
        mfcc_df.to_csv(output_csv, index=False)
        
        # Print success message after saving the CSV
        print(f"\n✅ Mean MFCCs for n_mfcc={n_mfcc} saved to {output_csv}")

# Define input folders containing audio files
input_folders = [
    r"C:\Users\sukal\Downloads\Noise Augmented Speech Sample (Factory Noise f1)\Dementia_Noise",  # First folder
    r"C:\Users\sukal\Downloads\Noise Augmented Speech Sample (Factory Noise f1)\COntrol_Noise"  # Second folder (replace with the second folder path)
]

# Call the function to process the files and save the results to separate CSV files
process_audio_folders(input_folders)



Processing 241 files for n_mfcc=13 in folder: C:\Users\sukal\Downloads\Noise Augmented Speech Sample (Factory Noise f1)\Dementia_Noise
Processing file 1/241 (0.41%) - 001-0_noisy.wav
Processing file 2/241 (0.83%) - 001-2_noisy.wav
Processing file 3/241 (1.24%) - 003-0_noisy.wav
Processing file 4/241 (1.66%) - 005-0_noisy.wav
Processing file 5/241 (2.07%) - 005-2_noisy.wav
Processing file 6/241 (2.49%) - 007-1_noisy.wav
Processing file 7/241 (2.90%) - 007-3_noisy.wav
Processing file 8/241 (3.32%) - 010-0_noisy.wav
Processing file 9/241 (3.73%) - 010-1_noisy.wav
Processing file 10/241 (4.15%) - 010-2_noisy.wav
Processing file 11/241 (4.56%) - 010-3_noisy.wav
Processing file 12/241 (4.98%) - 010-4_noisy.wav
Processing file 13/241 (5.39%) - 014-2_noisy.wav
Processing file 14/241 (5.81%) - 016-0_noisy.wav
Processing file 15/241 (6.22%) - 016-1_noisy.wav
Processing file 16/241 (6.64%) - 016-3_noisy.wav
Processing file 17/241 (7.05%) - 016-4_noisy.wav
Processing file 18/241 (7.47%) - 018-0_n

In [9]:
import os
import pandas as pd

# Specify the folder where the CSV files are located
folder_path = r'C:\Users\sukal\output_csvs_noise_factory_f1'  # Replace this with your folder path

# Get the list of CSV files in the specified folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Loop through each file
for file in csv_files:
    file_path = os.path.join(folder_path, file)  # Full path of the file
    
    # Load the CSV file into a DataFrame
    df = pd.read_csv(file_path)
    
    # Drop the specified column
    df = df.drop(columns=['Frame_Length_ms', 'n_mfcc'])
    
    # Binary encode the 'Folder' column
    df['Folder'] = df['Folder'].map({
        r'C:\Users\sukal\Downloads\Noise Augmented Speech Sample (Factory Noise f1)\Dementia_Noise': 1,
        r'C:\Users\sukal\Downloads\Noise Augmented Speech Sample (Factory Noise f1)\COntrol_Noise': 0
    })
    
    # Create a new filename for the modified file
    new_file = file.replace('.csv', '_SVM.csv')
    new_file_path = os.path.join(r'C:\Users\sukal\output_csvs_noise_factory_f1_SVM', new_file)  # Full path for saving
    
    # Save the modified DataFrame to a new CSV file
    df.to_csv(new_file_path, index=False)
    
    # Print a message indicating the file was processed
    print(f"Column dropped and binary encoding done successfully for {file}, saved as {new_file}.")
        

Column dropped and binary encoding done successfully for mfcc_means_nmfcc_13.csv, saved as mfcc_means_nmfcc_13_SVM.csv.
Column dropped and binary encoding done successfully for mfcc_means_nmfcc_20.csv, saved as mfcc_means_nmfcc_20_SVM.csv.
Column dropped and binary encoding done successfully for mfcc_means_nmfcc_30.csv, saved as mfcc_means_nmfcc_30_SVM.csv.
Column dropped and binary encoding done successfully for mfcc_means_nmfcc_40.csv, saved as mfcc_means_nmfcc_40_SVM.csv.
Column dropped and binary encoding done successfully for mfcc_means_nmfcc_50.csv, saved as mfcc_means_nmfcc_50_SVM.csv.
Column dropped and binary encoding done successfully for mfcc_means_nmfcc_60.csv, saved as mfcc_means_nmfcc_60_SVM.csv.
Column dropped and binary encoding done successfully for mfcc_means_nmfcc_70.csv, saved as mfcc_means_nmfcc_70_SVM.csv.
Column dropped and binary encoding done successfully for mfcc_means_nmfcc_80.csv, saved as mfcc_means_nmfcc_80_SVM.csv.


In [11]:
import os
import librosa

import numpy as np
import pandas as pd
import seaborn as sns
# import tensorflow as tf

import librosa.display

import plotly.express as px
import IPython.display as ipd
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from tqdm import tqdm, trange
from librosa import feature, amplitude_to_db, load

from tqdm.auto import tqdm
from plotly.subplots import make_subplots

from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV

# from tensorflow.keras.utils import to_categorical
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Dense , Activation , Dropout

pd.plotting.register_matplotlib_converters()

%matplotlib inline


In [17]:
import os
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Specify the folder containing CSV files
folder_path = r'C:\Users\sukal\output_csvs_noise_factory_f1_SVM'  # Replace with the path to your folder

# Get the list of CSV files in the specified folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Loop through each file in the folder
for file in csv_files:
    file_path = os.path.join(folder_path, file)  # Full path of the CSV file
    
    # Read CSV file
    df = pd.read_csv(file_path)
    
    # Ensure the target column 'Folder' exists
    if 'Folder' not in df.columns:
        print(f"Skipping {file} (No 'Folder' column)")
    else:
        # Separate features (X) and target variable (y)
        X = df.drop(columns=['Folder', 'Filename'])  # Drop target & filename columns
        y = df['Folder']

        # Standardize features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

        # Train SVM model
        model = SVC(kernel='poly')  # Using polynomial kernel
        model.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Print the accuracy for each file
        print(f"SVM Model with poly kernel and test ratio 0.1 trained on {file} - Accuracy: {accuracy:.4f}")


SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_13_SVM.csv - Accuracy: 0.5306
SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_20_SVM.csv - Accuracy: 0.5510
SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_30_SVM.csv - Accuracy: 0.5918
SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_40_SVM.csv - Accuracy: 0.5510
SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_50_SVM.csv - Accuracy: 0.6122
SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_60_SVM.csv - Accuracy: 0.6327
SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_70_SVM.csv - Accuracy: 0.5918
SVM Model with poly kernel and test ratio 0.1 trained on mfcc_means_nmfcc_80_SVM.csv - Accuracy: 0.6122


In [19]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Specify the folder containing CSV files
folder_path = r'output_csvs_noise_factory_f1_SVM'  # Replace with the path to your folder

# Get the list of CSV files in the specified folder
csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]

# Loop through each file in the folder
for file in csv_files:
    file_path = os.path.join(folder_path, file)  # Full path of the CSV file

    # Read CSV
    df = pd.read_csv(file_path)

    # Ensure the target column 'Folder' exists
    if 'Folder' not in df.columns:
        print(f"Skipping {file_path} (No 'Folder' column)")
    else:
        # Separate features (X) and target variable (y)
        X = df.drop(columns=['Folder', 'Filename'])  # Drop target & serial columns
        y = df['Folder']

        # Standardize features
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Split data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42)

        # Train KNN model with n_neighbors=8
        model = KNeighborsClassifier(n_neighbors=8)
        model.fit(X_train, y_train)

        # Predict and evaluate
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Print the accuracy for each file
        print(f"KNN Model with n_neighbors=8 and test ratio 0.1 trained on {file} - Accuracy: {accuracy:.4f}")
        

KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_13_SVM.csv - Accuracy: 0.4694
KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_20_SVM.csv - Accuracy: 0.5510
KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_30_SVM.csv - Accuracy: 0.5510
KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_40_SVM.csv - Accuracy: 0.5510
KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_50_SVM.csv - Accuracy: 0.5510
KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_60_SVM.csv - Accuracy: 0.6122
KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_70_SVM.csv - Accuracy: 0.6327
KNN Model with n_neighbors=8 and test ratio 0.1 trained on mfcc_means_nmfcc_80_SVM.csv - Accuracy: 0.5714
