In [26]:
import os
from pathlib import Path
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def feature_extractor(sound_path, window_size):
    """Compute MFCCs and their first- and second-order deltas for one audio file.

    Parameters
    ----------
    sound_path : str or path-like
        Audio file handed to ``librosa.load`` (called with its defaults).
    window_size : int
        Forwarded to ``librosa.feature.mfcc`` as ``hop_length``. Despite the
        name, it sets the step between frames; no other framing argument is
        overridden here.

    Returns
    -------
    tuple
        ``(mfccs, delta_mfccs, delta2_mfccs, mfccs_features)`` where the last
        element is the three arrays concatenated along the first axis
        (13 coefficients each, so 39 rows in total).
    """
    waveform, sample_rate = librosa.load(sound_path)

    # 13 cepstral coefficients per frame.
    cepstra = librosa.feature.mfcc(
        y=waveform,
        sr=sample_rate,
        n_mfcc=13,
        hop_length=window_size,
    )

    # First- and second-order derivatives, both computed from the raw MFCCs.
    velocity = librosa.feature.delta(cepstra)
    acceleration = librosa.feature.delta(cepstra, order=2)

    # Stack the three blocks on top of each other (feature axis first).
    stacked = np.concatenate([cepstra, velocity, acceleration], axis=0)

    return cepstra, velocity, acceleration, stacked


In [27]:
def process_audio_folder(audio_folder, window_size):
    """Build one DataFrame of per-frame features for every file in a folder.

    Parameters
    ----------
    audio_folder : str or path-like
        Directory whose entries are each passed to ``feature_extractor``.
    window_size : int
        Forwarded unchanged to ``feature_extractor``.

    Returns
    -------
    pandas.DataFrame
        One row per frame. The first column, ``File Name``, holds the source
        file name; the remaining columns are ``Feature_1`` .. ``Feature_N``,
        where N is the number of rows in the extracted feature matrix.
        If the folder has no entries, an empty frame with only the
        ``File Name`` column is returned.
    """
    frames = []

    # os.listdir returns entries in arbitrary order; sort so the row order is
    # reproducible across runs and machines.
    for audio_file in sorted(os.listdir(audio_folder)):
        # Progress output: name of the file being processed.
        print(audio_file)

        audio_path = os.path.join(audio_folder, audio_file)

        # The fourth element of feature_extractor's result is the stacked
        # feature matrix (features x frames); transpose to frames x features.
        mfcc_features = feature_extractor(audio_path, window_size=window_size)[3]
        per_frame = mfcc_features.T

        # Derive the column names from the actual feature count rather than
        # assuming 39, so a different extractor configuration still works.
        feature_columns = [f"Feature_{i + 1}" for i in range(per_frame.shape[1])]
        df = pd.DataFrame(per_frame, columns=feature_columns)

        # Put the file name in front, directly under its final column name.
        df.insert(0, "File Name", os.path.basename(audio_file))

        frames.append(df)

    # pd.concat raises ValueError on an empty list; return an empty frame
    # instead so an empty folder does not abort a multi-folder run.
    if not frames:
        return pd.DataFrame(columns=["File Name"])

    return pd.concat(frames, ignore_index=True)

# Generate the feature DataFrame for every language folder

In [28]:
# Run this cell to get a processed DataFrame for every language folder.
root_folder = "/kaggle/input/language-recognition/Language-Recognition"

# Change window size from here. It must be defined in this cell so the
# notebook survives Restart & Run All; 512 matches the CSV export cell.
window_size = 512

# Language folder name -> DataFrame of features for that folder.
processed_dfs = {}

# Process each language folder in sorted order
for language_folder in sorted(os.listdir(root_folder)):
    print(language_folder)
    # os.listdir yields bare names; join with the root so the folder is found
    # regardless of the current working directory.
    language_path = os.path.join(root_folder, language_folder)
    processed_df = process_audio_folder(language_path, window_size=window_size)
    # Keep every result; processed_df alone only holds the last folder.
    processed_dfs[language_folder] = processed_df

In [None]:
def process_language_folder_to_csv(language_folder, relative_name, window_size,
                                   output_folder="/kaggle/working"):
    """Extract features for one language folder and save them as a CSV file.

    Parameters
    ----------
    language_folder : str or path-like
        Folder of audio files, passed to ``process_audio_folder``.
    relative_name : str
        Base name of the output file; ``".csv"`` is appended.
    window_size : int
        Forwarded unchanged to ``process_audio_folder``.
    output_folder : str or path-like, optional
        Directory the CSV is written to. Defaults to ``"/kaggle/working"``;
        it is created (including missing parents) if it does not exist.

    Returns
    -------
    None
        The result is the file ``<output_folder>/<relative_name>.csv``,
        written without the DataFrame index.
    """
    # Process audio files in the language folder
    processed_df = process_audio_folder(language_folder, window_size=window_size)

    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the directory in between.
    os.makedirs(output_folder, exist_ok=True)

    # Save DataFrame as CSV in the output directory
    output_file = os.path.join(output_folder, relative_name + ".csv")
    processed_df.to_csv(output_file, index=False)


# Generate a CSV file for every language folder

In [None]:
# Main code: write one CSV of features per language folder.
root_folder = "/kaggle/input/language-recognition/Language-Recognition"

# Language folders are visited in sorted order.
# Change window size from here (the window_size argument below).
for language_folder in sorted(os.listdir(root_folder)):
    print(language_folder)
    # The folder name doubles as the CSV base name.
    process_language_folder_to_csv(
        language_folder=os.path.join(root_folder, language_folder),
        relative_name=language_folder,
        window_size=512,
    )