In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree (including sub-directories) and print the full
# path of every file found, so the available datasets are visible in the output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
import librosa
import numpy as np
import pandas as pd

In [None]:
def is_good_sample(y, sr, min_duration=2.0, rms_threshold=0.01, zcr_threshold=(0.05, 0.3)):
    """Decide whether an audio signal is worth extracting features from.

    The signal has to pass three gates, evaluated in this order; the first
    failing gate rejects the sample and the later ones are not computed.

    1. Duration: ``librosa.get_duration`` must be at least ``min_duration``.
    2. Loudness: the mean of ``librosa.feature.rms`` must be at least
       ``rms_threshold``.
    3. Zero-crossing rate: the mean of ``librosa.feature.zero_crossing_rate``
       must lie inside the closed interval
       ``[zcr_threshold[0], zcr_threshold[1]]``.

    Args:
        y: Audio samples, as passed to the librosa calls above.
        sr: Sampling rate of ``y``.
        min_duration: Smallest accepted duration.
        rms_threshold: Smallest accepted mean RMS energy.
        zcr_threshold: ``(low, high)`` bounds for the mean zero-crossing rate.

    Returns:
        bool: ``True`` when every gate passes, ``False`` otherwise.
    """
    # Gate 1: too-short clips are rejected before any feature is computed
    if librosa.get_duration(y=y, sr=sr) < min_duration:
        return False

    # Gate 2: reject clips whose average energy is below the threshold
    mean_energy = np.mean(librosa.feature.rms(y=y))
    if mean_energy < rms_threshold:
        return False

    # Gate 3: the average zero-crossing rate must fall within the given bounds
    mean_crossings = np.mean(librosa.feature.zero_crossing_rate(y))
    within_bounds = zcr_threshold[0] <= mean_crossings <= zcr_threshold[1]
    return bool(within_bounds)

In [None]:
def extract_features(file_path, label):
    """Load one audio file and summarise it as a dictionary of features.

    The file is loaded with ``librosa.load(..., sr=None)`` and screened with
    ``is_good_sample``. For accepted samples, every time-varying feature is
    reduced to its mean over time.

    Args:
        file_path: Path of the audio file to load.
        label: Value stored under the ``'label'`` key of the result.

    Returns:
        One of three things:

        * ``None`` if ``is_good_sample`` rejects the signal.
        * A feature dictionary with the keys ``file_name``, ``label``,
          ``mfcc_mean`` (list), ``chroma_stft_mean`` (list),
          ``spectral_contrast_mean`` (list), ``spectral_centroid_mean``,
          ``spectral_rolloff_mean``, ``zcr_mean``, ``tempo`` (plain float),
          ``tonnetz_mean`` (list), ``rms_mean`` and ``onset_mean``.
        * ``{'file_name', 'label', 'error'}`` if any exception was raised
          while loading or analysing the file. The exception is printed and
          never propagated.
    """
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Screen out unusable samples before doing the expensive analysis
        if not is_good_sample(y, sr):
            print(f" ---Skipping {file_path} due to low quality.")
            return None

        features = {}
        features['file_name'] = os.path.basename(file_path)
        features['label'] = label

        # Per-coefficient features: average over the time axis, keep one
        # value per coefficient as a plain list.
        features['mfcc_mean'] = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1).tolist()
        features['chroma_stft_mean'] = np.mean(librosa.feature.chroma_stft(y=y, sr=sr), axis=1).tolist()
        features['spectral_contrast_mean'] = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1).tolist()

        # Scalar summaries: average over everything
        features['spectral_centroid_mean'] = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        features['spectral_rolloff_mean'] = np.mean(librosa.feature.spectral_rolloff(y=y, sr=sr))
        features['zcr_mean'] = np.mean(librosa.feature.zero_crossing_rate(y))

        # beat_track returns (tempo, beat_frames). Depending on the librosa
        # release, tempo is either a scalar or a one-element array; an array
        # would be written to the CSV as the string "[120.]". Normalise it to
        # a plain float so the column is always numeric.
        tempo = librosa.beat.beat_track(y=y, sr=sr)[0]
        features['tempo'] = float(np.atleast_1d(tempo)[0])

        features['tonnetz_mean'] = np.mean(librosa.feature.tonnetz(y=y, sr=sr), axis=1).tolist()
        features['rms_mean'] = np.mean(librosa.feature.rms(y=y))
        features['onset_mean'] = np.mean(librosa.onset.onset_strength(y=y, sr=sr))

        return features

    except Exception as e:
        # Best effort: one unreadable or odd file must not abort a whole
        # directory run. Report it and return a record carrying the error.
        print(f"Error processing {file_path}: {e}")
        return {'file_name': os.path.basename(file_path), 'label': label, 'error': str(e)}

def process_directory(directory_path, label, output_csv='extracted_features.csv'):
    """Extract features from every audio file under a directory and save them as CSV.

    The directory tree is walked recursively in sorted order, so the row order
    of the CSV is reproducible. Files whose extension is ``.wav``, ``.mp3`` or
    ``.flac`` (matched case-insensitively) are passed to ``extract_features``.
    Results that are ``None`` (rejected sample) or that carry an ``'error'``
    key are counted as processed but not written.

    List-valued features (``mfcc_mean``, ``chroma_stft_mean``,
    ``spectral_contrast_mean``, ``tonnetz_mean``) are expanded into numbered
    columns (``mfcc_1``, ``mfcc_2``, ...) placed after the scalar columns.

    Args:
        directory_path: Root directory to scan.
        label: Label forwarded to ``extract_features`` for every file.
        output_csv: Destination path of the CSV file (written without index).

    Returns:
        None. The table is written to ``output_csv`` and a short summary is
        printed.
    """
    audio_extensions = ('.wav', '.mp3', '.flac')
    # (list-valued column, prefix of the numbered columns it expands into)
    list_columns = (
        ('mfcc_mean', 'mfcc'),
        ('chroma_stft_mean', 'chroma'),
        ('spectral_contrast_mean', 'spectral_contrast'),
        ('tonnetz_mean', 'tonnetz'),
    )

    features_list = []
    total_samples = 0

    for root, dirs, files in os.walk(directory_path):
        # Sorting dirs in place makes os.walk descend in a deterministic order
        dirs.sort()
        for file in sorted(files):
            # Lower-case first so that e.g. 'TRACK.WAV' is not silently skipped
            if not file.lower().endswith(audio_extensions):
                continue
            total_samples += 1
            file_path = os.path.join(root, file)
            print('count : ',total_samples,'  filename:  ',file)
            features = extract_features(file_path, label)
            # Keep only good samples: skip rejected (None) and failed (error) ones
            if features is not None and 'error' not in features:
                features_list.append(features)

    successful_extractions = len(features_list)

    # Convert the list of dictionaries to a DataFrame
    features_df = pd.DataFrame(features_list)

    # Expand lists into separate columns
    if not features_df.empty:
        expanded = [_expand_list_column(features_df, column, prefix)
                    for column, prefix in list_columns]
        scalar_part = features_df.drop(columns=[column for column, _ in list_columns])
        features_df = pd.concat([scalar_part, *expanded], axis=1)

    # Save to a CSV file
    features_df.to_csv(output_csv, index=False)

    print(f"Features extracted and saved to {output_csv}")
    print(f"Total samples processed: {total_samples}")
    print(f"Successful feature extractions: {successful_extractions}")


def _expand_list_column(frame, column, prefix):
    """Turn a column of lists into numbered columns ``prefix_1 .. prefix_k``.

    The result reuses ``frame``'s index so it lines up row-for-row when
    concatenated, and ``k`` is taken from the data rather than hard-coded.
    """
    expanded = pd.DataFrame(frame[column].tolist(), index=frame.index)
    expanded.columns = [f'{prefix}_{i}' for i in range(1, expanded.shape[1] + 1)]
    return expanded
    
# Example usage: extract features for every audio file found under the dataset
# directory and write them to a single CSV in the Kaggle working directory.
directory_path = '/kaggle/input/lhgy-music-dataset'
label = 'lhgy'  # Stored under the 'label' key of every extracted sample; change as needed
output_csv = '/kaggle/working/lhgy_style_features.csv'
process_directory(directory_path, label, output_csv)