In [1]:
import os
import concurrent.futures
import librosa
import pandas as pd
import numpy as np
import scipy
from tqdm import tqdm

In [2]:
def columns() -> list:
    """
    Zwraca listę kolumn, w tym ostatnia kolumna = 'genre'.
    """
    # Przyjęte nazwy cech (według Twojej definicji):
    feature_sizes = dict(
        chroma_stft=12, chroma_cqt=12, chroma_cens=12,
        harmonic_separation=12, percussive_separation=12,
        tempogram_ratio=13, mfcc=12, spectral_contrast=7,
        tonnetz=6, poly_features=3, spectral_centroid=1,
        spectral_bandwidth=1, spectral_flatness=1,
        spectral_rolloff=1, rms=1, zcr=1, onset_strength=1,
        plp=1, spectral_entropy=1, autocorelation=1,
        pitch_features=1, tempo_variability=1,
        spectral_decrease=1, dtempo=1
    )
    single_features = [
        'tempo', 'beat_count', 'dtempo_changes', 'onset_count',
        'low_energy_rate', 'harmonic_to_noise_rate', 'dynamic_range',
        'swing_ratio', 'syncopation', 'roughness', 'warmth'
    ]
    moments = ('kurtosis','max','mean','median','min','skew','std','sum')
    cols = []
    for name, size in feature_sizes.items():
        for moment in moments:
            # np. "chroma_stft_00_mean"...
            it = (f"{name}_{i:02d}_{moment}" for i in range(size))
            cols.extend(it)
    cols = sorted(cols)
    # Dodajemy single features
    cols += single_features
    # Dodajemy kolumnę 'genre'
    cols.append('genre')
    return cols

In [3]:
def calculate_features_for_single_record(file_path:str) -> list:
    y, sr = librosa.load(file_path)

    # Chroma features
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr, n_chroma=12)
    chroma_cqt = librosa.feature.chroma_cqt(y=y, sr=sr, n_chroma=12)
    chroma_cens = librosa.feature.chroma_cens(y=y, sr=sr)

    # MFCC, harmonic, Percusive
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=12)
    harmonic, percussive = librosa.effects.hpss(y)
    harmonic_separation = librosa.feature.mfcc(y=harmonic, sr=sr, n_mfcc=12)
    percussive_separation = librosa.feature.mfcc(y=percussive, sr=sr, n_mfcc=12)

    # Tempogram
    tempogram = librosa.feature.tempogram(y=y, sr=sr)
    tempogram_ratio = librosa.feature.tempogram_ratio(tg=tempogram, sr=sr)

    # Tonnetz
    tonnetz = librosa.feature.tonnetz(y=harmonic, sr=sr)

    # Poly features
    poly_features = librosa.feature.poly_features(y=y, sr=sr, order=2)

    # Spectral
    def calculate_spectral_entropy(y, sr):
        psd = np.abs(librosa.stft(y)) ** 2
        psd_sum = np.sum(psd)
        if psd_sum == 0:
            psd_norm = psd
        else:
            psd_norm = psd / (psd_sum + 1e-10)
        entropy = -np.sum(psd_norm * np.log2(psd_norm + 1e-10), axis=0)
        return entropy
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spectral_flatness = librosa.feature.spectral_flatness(y=y)
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    spectral_entropy = calculate_spectral_entropy(y, sr)

    # RMS
    rms = librosa.feature.rms(y=y)
    
    # ZCR
    zcr = librosa.feature.zero_crossing_rate(y)
    
    # Onset strength
    onset_env = librosa.onset.onset_strength(y=y, sr=sr)
    # plp
    plp = librosa.beat.plp(onset_envelope=onset_env, sr=sr)

    # Autocorrelation
    autocorrelation = librosa.autocorrelate(y)

    # Pitch features
    pitches, magnitudes = librosa.core.piptrack(y=y, sr=sr)
    pitch_features = pitches[pitches > 0]

    # Tempo variability
    tempo_variability = librosa.feature.rhythm.tempo(onset_envelope=onset_env, sr=sr, aggregate=None)

    # Spectral decrease
    def calculate_spectral_decrease(y, sr):
        S = np.abs(librosa.stft(y))
        decrease = np.mean(np.diff(S, axis=0), axis=1)
        return decrease
    spectral_decrease = calculate_spectral_decrease(y, sr)

    # Dynamic tempo
    dtempo = librosa.feature.tempo(onset_envelope=onset_env, sr=sr, aggregate=None)
    

    # Single features
    def count_value_changes(arr:list) -> int:
        changes = 0
        for i in range(1, len(arr)):
            if arr[i] != arr[i - 1]:
                changes += 1
        return changes

    def calculate_swing_ratio(y, sr):
        onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
        onset_times = librosa.frames_to_time(onset_frames, sr=sr)
        swing_ratios = []
        for i in range(1, len(onset_times)-1, 2):
            duration_1 = onset_times[i] - onset_times[i-1]
            duration_2 = onset_times[i+1] - onset_times[i]
            if duration_2 != 0:
                swing_ratio = duration_1 / duration_2
                swing_ratios.append(swing_ratio)
        return np.mean(swing_ratios) if swing_ratios else 0
    
    def calculate_syncopation(beats, sr):
        onset_frames = librosa.onset.onset_detect(y=y, sr=sr)
        onset_times = librosa.frames_to_time(onset_frames, sr=sr)
        beat_times = librosa.frames_to_time(beats, sr=sr)
        syncopation = 0
        for onset in onset_times:
            closest_beat = min(beat_times, key=lambda x: abs(x - onset))
            syncopation += abs(onset - closest_beat)
        return syncopation / len(onset_times) if len(onset_times) else 0
    
    def calculate_roughness(harmonic, sr):
        S = np.abs(librosa.stft(harmonic))
        frequencies = librosa.fft_frequencies(sr=sr)
        magnitudes = np.mean(S, axis=1)
        roughness = 0
        for i in range(len(frequencies) - 1):
            for j in range(i + 1, len(frequencies)):
                if abs(frequencies[i] - frequencies[j]) < 20:
                    roughness += magnitudes[i] * magnitudes[j] / abs(frequencies[i] - frequencies[j])
        return roughness
    
    def calculate_warmth(y, sr):
        mel_spectrogram = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        S_db = librosa.power_to_db(mel_spectrogram, ref=np.max)
        low_freq_idx = np.where(librosa.mel_frequencies(n_mels=128, fmax=sr/2) < 200)[0]
        low_freq_mean = np.mean(S_db[low_freq_idx, :])
        overall_mean = np.mean(S_db)
        if overall_mean == 0:
            return 0
        warmth = low_freq_mean / (overall_mean + 1e-10)
        return warmth

    dtempo_changes = count_value_changes(dtempo)
    tempo, beats = librosa.beat.beat_track(onset_envelope=onset_env, sr=sr)
    beat_count = len(beats)
    onset_count = len(librosa.onset.onset_detect(onset_envelope=onset_env, sr=sr))
    low_energy = np.sum(rms < 0.5 * np.mean(rms)) / len(rms)

    def calculate_harmonic_ratio(harmonic, percussive):
        harmonic_sum = np.sum(harmonic)
        percussive_sum = np.sum(percussive)
        denominator = percussive_sum + harmonic_sum
        if denominator == 0:
            return 0
        harmonic_ratio = harmonic_sum / denominator
        return harmonic_ratio
        
    harmonic_ratio = calculate_harmonic_ratio(harmonic, percussive)
    dynamic_range = np.max(rms) - np.min(rms)
    swing_ratio = calculate_swing_ratio(y, sr)
    syncopation = calculate_syncopation(beats, sr)
    roughness = calculate_roughness(harmonic, sr)
    warmth = calculate_warmth(y, sr)

    moments = ['kurtosis', 'max', 'mean', 'median', 'min', 'skew', 'std', 'sum']

    def aggregate_feature(feature):
        if np.allclose(feature, feature[0]):
            return [np.nan, np.max(feature), np.mean(feature), np.median(feature), 
                np.min(feature), np.std(feature), np.nan, sum(feature)]
        else:
            return [scipy.stats.kurtosis(feature), np.max(feature), np.mean(feature), np.median(feature), 
                    np.min(feature), np.std(feature), scipy.stats.skew(feature), sum(feature)]
    
    features = []

    for f in [autocorrelation, chroma_cens, chroma_cqt, chroma_stft, dtempo, harmonic_separation,
              mfcc, onset_env, percussive_separation, pitch_features, plp, poly_features, 
              rms, spectral_bandwidth, spectral_centroid, spectral_contrast, spectral_decrease, 
              spectral_entropy, spectral_flatness, spectral_rolloff, tempo_variability, 
              tempogram_ratio, tonnetz, zcr]:
        if f.ndim == 1:
            features.extend(aggregate_feature(f))
        else:
            features.extend(np.hstack([aggregate_feature(f[i]) for i in range(f.shape[0])]))
    
    single_features = [tempo[0], beat_count, dtempo_changes, onset_count, low_energy, harmonic_ratio,
                       dynamic_range, swing_ratio, syncopation, roughness, warmth]
    features.extend(single_features)

    return features

In [4]:
def build_metadata_map(metadata_csv: str) -> dict:
    """
    Wczytuje plik z metadanymi chunków (zawierający PATH i GENRE),
    Zwraca słownik: relatywna_sciezka -> gatunek
    Zakładamy, że w pliku CSV jest kolumna 'PATH' oraz 'GENRE'.
    """
    df = pd.read_csv(metadata_csv)
    # np. PATH = "train/Rock/123_chunk_0.mp3"
    # GENRE = "Rock"
    # W zależności od nazewnictwa w Twoim pliku, zmień np. "Genre" -> "GENRE"
    
    if 'PATH' not in df.columns:
        raise ValueError("Brak kolumny 'PATH' w metadata CSV.")
    if 'TAGS' not in df.columns:
        raise ValueError("Brak kolumny 'TAGS' w metadata CSV.")
    
    path_to_genre = {}
    for _, row in df.iterrows():
        rel_path = row['PATH']
        g = row['TAGS']  # lub cokolwiek innego, np. row['TAGS']
        path_to_genre[rel_path] = g
    return path_to_genre

In [9]:
def process_file(file_path: str, base_folder: str, path_to_genre_map: dict) -> list:
    """
    Oblicza cechy (calculate_features_for_single_record), a następnie
    dołącza 'genre' na końcu, zaczerpnięty z path_to_genre_map.
    
    :param file_path: pełna ścieżka do pliku MP3 (np. ".../train/Rock/123.mp3")
    :param base_folder: folder bazowy (np. ".../split_audio_dataset/") 
                        aby wyliczyć ścieżkę relatywną do mapy.
    :param path_to_genre_map: dict: rel_path -> genre
    :return: list cech + [genre], lub None w razie błędu
    """
    try:
        # Obliczamy cechy audio
        features = calculate_features_for_single_record(file_path)
        
        # Wyznaczamy ścieżkę relatywną względem base_folder,
        # bo w metadata CSV pewnie jest PATH relatywna do base_folder
        rel_path = os.path.relpath(file_path, base_folder).replace("\\","/")
        
        # Pobieramy gatunek z mapy
        genre = path_to_genre_map.get(rel_path, "Unknown")
        
        # Dodajemy go jako OSTATNI element listy
        features.append(genre)
        
        return features
    except Exception as e:
        print(f"[BŁĄD] Podczas przetwarzania pliku {file_path}: {e}")
        return None


def process_files_in_parallel(folder: str, path_to_genre_map: dict) -> list:
    """
    Wyszukuje pliki .mp3 w folderze (rekurencyjnie),
    wywołuje process_file w wątkach, dołączając 'genre' z path_to_genre_map.
    Zwraca listę wierszy (list cech).
    """
    all_features = []

    mp3_paths = []
    for root, _, files in os.walk(folder):
        for f in files:
            if f.lower().endswith('.mp3'):
                mp3_paths.append(os.path.join(root, f))

    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        future_tasks = {
            executor.submit(process_file, p, folder, path_to_genre_map): p for p in mp3_paths
        }
        for future in tqdm(
            concurrent.futures.as_completed(future_tasks),
            total=len(future_tasks),
            desc=f"Przetwarzanie plików w {folder}",
            unit="plik"
        ):
            result = future.result()
            if result is not None:
                all_features.append(result)

    return all_features

In [6]:
def create_features_as_csv(
    base_folder: str,
    metadata_csv: str,
    output_csv: str
) -> None:
    """
    1) Tworzymy mapę: relatywna_ścieżka -> gatunek (z pliku metadata_csv).
    2) Przetwarzamy wszystkie mp3 w base_folder,
       obliczamy cechy + dołączamy gatunek z mapy,
       zapisujemy do output_csv.
    """
    # Budujemy mapę PATH->GENRE
    path_to_genre_map = build_metadata_map(metadata_csv)
    # print(path_to_genre_map)
    # print(len(path_to_genre_map))
    
    # Obliczamy cechy w wątkach
    rows = process_files_in_parallel(base_folder, path_to_genre_map)
    
    # Budujemy DataFrame i zapisujemy
    df = pd.DataFrame(rows, columns=columns())
    df.to_csv(output_csv, index=False)
    print(f"[INFO] Zapisano cechy do pliku {output_csv}")

In [7]:
val_folder    = "../../../datasets/jamendo/split_audio_dataset/val/"
val_metadata  = "../../../datasets/jamendo/metadata/val_metadata.csv"
val_features_csv = "val_features.csv"

create_features_as_csv(
    base_folder=val_folder,
    metadata_csv=val_metadata,
    output_csv=val_features_csv
)

  return pitch_tuning(
Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/val/: 100%|██████████| 5982/5982 [1:27:01<00:00,  1.15plik/s]


[INFO] Zapisano cechy do pliku val_features.csv


In [8]:
test_folder    = "../../../datasets/jamendo/split_audio_dataset/test/"
test_metadata  = "../../../datasets/jamendo/metadata/test_metadata.csv"
test_features_csv = "test_features.csv"

create_features_as_csv(
    base_folder=test_folder,
    metadata_csv=test_metadata,
    output_csv=test_features_csv
)

  return pitch_tuning(
Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/test/:  74%|███████▍  | 4736/6372 [58:06<38:10,  1.40s/plik]  

[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/test/76\10376_chunk_10.mp3: index 0 is out of bounds for axis 0 with size 0


Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/test/:  74%|███████▍  | 4738/6372 [58:06<23:22,  1.17plik/s]

[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/test/76\10376_chunk_16.mp3: index 0 is out of bounds for axis 0 with size 0
[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/test/76\10376_chunk_14.mp3: index 0 is out of bounds for axis 0 with size 0


Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/test/:  74%|███████▍  | 4739/6372 [58:07<18:18,  1.49plik/s]

[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/test/76\10376_chunk_13.mp3: index 0 is out of bounds for axis 0 with size 0
[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/test/76\10376_chunk_12.mp3: index 0 is out of bounds for axis 0 with size 0
[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/test/76\10376_chunk_15.mp3: index 0 is out of bounds for axis 0 with size 0
[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/test/76\10376_chunk_11.mp3: index 0 is out of bounds for axis 0 with size 0


Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/test/: 100%|██████████| 6372/6372 [1:20:27<00:00,  1.32plik/s]


[INFO] Zapisano cechy do pliku test_features.csv


In [10]:
train_folder    = "../../../datasets/jamendo/split_audio_dataset/train/"
train_metadata  = "../../../datasets/jamendo/metadata/train_metadata.csv"
train_features_csv = "train_features.csv"

create_features_as_csv(
    base_folder=train_folder,
    metadata_csv=train_metadata,
    output_csv=train_features_csv
)

  return pitch_tuning(
Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/train/:   5%|▌         | 1365/26743 [16:55<4:03:03,  1.74plik/s] 

[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/train/04\433604_chunk_31.mp3: index 0 is out of bounds for axis 0 with size 0


Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/train/:   7%|▋         | 1763/26743 [22:54<5:24:40,  1.28plik/s] 

[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/train/06\12306_chunk_11.mp3: index 0 is out of bounds for axis 0 with size 0
[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/train/06\12306_chunk_12.mp3: index 0 is out of bounds for axis 0 with size 0


Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/train/:   7%|▋         | 1766/26743 [22:55<3:45:33,  1.85plik/s]

[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/train/06\12306_chunk_13.mp3: index 0 is out of bounds for axis 0 with size 0


Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/train/:  84%|████████▍ | 22506/26743 [4:46:42<1:36:31,  1.37s/plik]

[BŁĄD] Podczas przetwarzania pliku ../../../datasets/jamendo/split_audio_dataset/train/82\433582_chunk_31.mp3: index 0 is out of bounds for axis 0 with size 0


Przetwarzanie plików w ../../../datasets/jamendo/split_audio_dataset/train/: 100%|██████████| 26743/26743 [5:57:51<00:00,  1.25plik/s]  


[INFO] Zapisano cechy do pliku train_features.csv
