In [1]:
import numpy as np
import os
import librosa
import pandas as pd 
from tqdm import tqdm #for progress bars 

In [2]:
# Project root on the local drive (machine-specific absolute path).
_project_dir = r'T:\TOSHITH\PROGRAMMING\music_genere_classification'
# Folder holding the raw audio clips, organised in one sub-folder per genre.
input_file_path = _project_dir + r'\genres_original'
# Folder that receives the extracted feature CSVs.
output_file_path = _project_dir + r'\genres_processed'

### Step 1 Define all the audio extraction functions

In [3]:
def amplitude_envolope(signal,N,FRAME_SIZE,HOP_SIZE):
    """Return the amplitude envelope of ``signal``: the maximum sample of each frame.

    Frames start at index 0 and advance by ``HOP_SIZE`` samples. Each frame
    covers up to ``FRAME_SIZE`` samples, so frames near the end of the signal
    are shorter. The maximum is taken over the raw (signed) sample values.

    Parameters
    ----------
    signal : array-like
        1-D sequence of audio samples.
    N : int
        Number of leading samples to cover with frame starts. Values larger
        than ``len(signal)`` are clamped to the signal length.
    FRAME_SIZE : int
        Maximum number of samples per frame.
    HOP_SIZE : int
        Distance in samples between consecutive frame starts (must be > 0).

    Returns
    -------
    numpy.ndarray
        One value per frame; empty when there are no samples to cover.
    """
    samples = np.asarray(signal)
    # Clamp N so no frame starts past the end of the data: an empty frame has
    # no maximum and would raise ValueError.
    n_samples = min(N, len(samples))
    # np.max reduces each frame in compiled code; the builtin max() would walk
    # the numpy slice element by element in Python.
    return np.array([
        np.max(samples[start:start + FRAME_SIZE])
        for start in range(0, n_samples, HOP_SIZE)
    ])

In [4]:
def band_energy_ratio_calculator(split_freq,spectogram,sample_rate):
    """Compute the band energy ratio (low-band power / high-band power) per frame.

    Parameters
    ----------
    split_freq : float
        Frequency in Hz separating the low band from the high band. The bin
        containing ``split_freq`` is counted in the high band.
    spectogram : numpy.ndarray
        2-D spectrogram of shape ``(n_bins, n_frames)``; real or complex.
        Rows are assumed to be evenly spaced from 0 Hz up to and including
        the Nyquist frequency, as in a one-sided STFT.
    sample_rate : float
        Sample rate in Hz of the signal the spectrogram was computed from.

    Returns
    -------
    numpy.ndarray
        One ratio per frame. Frames with no power in the high band get 0.
    """
    n_bins = spectogram.shape[0]
    nyquist = sample_rate / 2
    # n_bins rows spanning 0..Nyquist inclusive leave n_bins - 1 gaps between
    # neighbouring bins, so that is the divisor for the spacing (not n_bins).
    # max(..., 1) avoids a division by zero for a single-row spectrogram.
    bin_width = nyquist / max(n_bins - 1, 1)
    # Index of the bin the split frequency falls into.
    split_bin = int(np.floor(split_freq / bin_width))
    # Keep the index inside [0, n_bins] so that a negative or very large
    # split frequency cannot wrap around through negative slicing.
    split_bin = min(max(split_bin, 0), n_bins)

    power_spectrum = np.abs(spectogram) ** 2
    if not np.issubdtype(power_spectrum.dtype, np.floating):
        # Integer input: switch to floats so the ratio is not truncated.
        power_spectrum = power_spectrum.astype(np.float64)

    # Sum the power over the frequency axis for every frame at once.
    low_band_power = power_spectrum[:split_bin].sum(axis=0)
    high_band_power = power_spectrum[split_bin:].sum(axis=0)

    # Frames whose high band is silent keep the default ratio of 0 instead of
    # dividing by zero.
    band_energy_ratio = np.zeros_like(high_band_power)
    np.divide(low_band_power, high_band_power,
              out=band_energy_ratio, where=high_band_power != 0)
    return band_energy_ratio


#### Note: all other feature extractors used below are built into librosa.

### Step 2: Process every audio file and store its features in a CSV file (one per clip) using pandas

In [5]:
# Analysis window length in samples. A power of two suits the FFT algorithm;
# 4096 samples is roughly 186 ms of audio at a 22,050 Hz sample rate.
FRAME_SIZE = 4096
# Step between consecutive frames: half a window, so neighbouring frames
# overlap for the calculations that follow.
HOP_SIZE = FRAME_SIZE // 2
# Arbitrarily chosen cut-off (Hz) that splits the spectrum into the two bands
# compared by the band energy ratio.
SPLIT_FREQ = 2500

In [6]:
os.makedirs(output_file_path, exist_ok=True)

genere_names = sorted(os.listdir(input_file_path))

# Sample rate (Hz) every clip is resampled to on load. Passing it explicitly
# keeps the crop length below tied to the rate actually used.
SAMPLE_RATE = 22050
# Clips are cropped to at most 30 s (sample rate multiplied by time) so no file
# yields more frames than another; shorter clips are left as they are.
no_of_samples = SAMPLE_RATE * 30

for genere_name in tqdm(genere_names, desc="Processing genres"):
    genere_path = os.path.join(input_file_path, genere_name)
    if not os.path.isdir(genere_path):
        continue

    # Mirror the genre folder in the output tree.
    output_genere_path = os.path.join(output_file_path, genere_name)
    os.makedirs(output_genere_path, exist_ok=True)

    # Sorted so that files are processed in a reproducible order.
    audio_files = sorted(os.listdir(genere_path))
    for Audio_file_name in tqdm(audio_files, desc=f"Processing {genere_name}", leave=False):
        audio_path = os.path.join(genere_path, Audio_file_name)
        try:
            audio, Fs = librosa.load(audio_path, sr=SAMPLE_RATE)
            audio = audio[:no_of_samples]  # crop to 30 s

            # Time-domain features
            amp_envelope = amplitude_envolope(audio, len(audio), FRAME_SIZE, HOP_SIZE)
            rms_energy = librosa.feature.rms(y=audio, frame_length=FRAME_SIZE, hop_length=HOP_SIZE)[0]
            zero_cr = librosa.feature.zero_crossing_rate(y=audio, frame_length=FRAME_SIZE, hop_length=HOP_SIZE)[0]
            sfose = librosa.onset.onset_strength(y=audio, sr=Fs, hop_length=HOP_SIZE)

            # MFCCs: the 13 coefficients are averaged within each frame so the
            # column holds a single value per frame.
            # NOTE(review): this discards the per-coefficient information;
            # confirm that is intended.
            mfcc = librosa.feature.mfcc(y=audio, n_mfcc=13, sr=Fs, hop_length=HOP_SIZE, n_fft=FRAME_SIZE)
            mfcc = np.mean(mfcc, axis=0)
            delta_mfcc = librosa.feature.delta(mfcc)
            delta2_mfcc = librosa.feature.delta(mfcc, order=2)

            # Frequency-domain features
            spec_centroid = librosa.feature.spectral_centroid(y=audio, sr=Fs, hop_length=HOP_SIZE, n_fft=FRAME_SIZE)[0]
            spec_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=Fs, hop_length=HOP_SIZE, n_fft=FRAME_SIZE)[0]
            spectrogram = librosa.stft(audio, n_fft=FRAME_SIZE, hop_length=HOP_SIZE)
            band_energy_ratio = band_energy_ratio_calculator(SPLIT_FREQ, spectrogram, Fs)

            feature_columns = {
                "Amplitude_Envelope": amp_envelope,
                "RMS_Energy": rms_energy,
                "Zero_Crossing_Rate": zero_cr,
                "SFOSE": sfose,
                "MFCC": mfcc,
                "Delta_MFCC": delta_mfcc,
                "Delta2_MFCC": delta2_mfcc,
                "Spectral_Centroid": spec_centroid,
                "Spectral_Bandwidth": spec_bandwidth,
                "Band_Energy_Ratio": band_energy_ratio,
            }
            # The amplitude envelope has one frame per hop start, while the
            # centred librosa features have one more frame when the clip length
            # is an exact multiple of the hop. Trim every column to the common
            # length so the DataFrame can always be built.
            n_frames = min(len(values) for values in feature_columns.values())
            df = pd.DataFrame({name: values[:n_frames] for name, values in feature_columns.items()})

            # Save to CSV in the same folder structure as the input.
            csv_name = os.path.splitext(Audio_file_name)[0] + "_features.csv"
            csv_path = os.path.join(output_genere_path, csv_name)
            df.to_csv(csv_path, index=False)

        except Exception as e:
            # Best effort: an unreadable or unprocessable file is reported and
            # skipped so that one bad clip does not abort the whole run. The
            # exception type is printed because some errors have an empty message.
            print(f"Failed to process {audio_path}: {type(e).__name__}: {e}")


Processing genres:   0%|          | 0/10 [00:00<?, ?it/s]

  audio, Fs = librosa.load(audio_path)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


Failed to load T:\TOSHITH\PROGRAMMING\music_genere_classification\genres_original\jazz\jazz.00054.wav: 


Processing genres: 100%|██████████| 10/10 [02:47<00:00, 16.79s/it]


In [7]:
# Sanity check: report how many files were written for each genre.
genere_names = sorted(
    entry for entry in os.listdir(output_file_path)
    if os.path.isdir(os.path.join(output_file_path, entry))
)

for genere_name in genere_names:
    genere_path = os.path.join(output_file_path, genere_name)
    # Only regular files are counted; nested folders are ignored.
    file_count = sum(
        1 for entry in os.listdir(genere_path)
        if os.path.isfile(os.path.join(genere_path, entry))
    )
    print(f"Genre: {genere_name}, Processed files: {file_count}")

Genre: blues, Processed files: 100
Genre: classical, Processed files: 100
Genre: country, Processed files: 100
Genre: disco, Processed files: 100
Genre: hiphop, Processed files: 100
Genre: jazz, Processed files: 99
Genre: metal, Processed files: 100
Genre: pop, Processed files: 100
Genre: reggae, Processed files: 100
Genre: rock, Processed files: 100
