In [1]:
import librosa
import librosa.display
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

def extract_features(file_path):
    """Extract a flat dictionary of time-averaged audio features from one file.

    The audio is loaded at its native sampling rate (``sr=None``) and every
    descriptor is reduced to a scalar, so each track yields a single row.

    Args:
        file_path: Path to an audio file, passed to ``librosa.load``.

    Returns:
        A dict with the keys ``energy``, ``tempo``, ``loudness``,
        ``acousticness``, ``chroma``, ``spectral_centroid``,
        ``spectral_bandwidth``, ``zero_crossing_rate`` and ``mfcc_1`` ...
        ``mfcc_13``. ``tempo`` is always a Python ``float``. Returns ``None``
        if any step raises; the error is printed rather than propagated.
    """
    try:
        y, sr = librosa.load(file_path, sr=None)

        # Energy: mean of the frame-wise RMS.
        energy = librosa.feature.rms(y=y).mean()

        # Danceability proxy: estimated tempo.
        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        # beat_track may hand the tempo back as a one-element array instead of
        # a scalar; storing the array would put strings like "[120.]" into the
        # CSV column, so reduce it to a plain float in either case.
        tempo = float(np.atleast_1d(tempo).ravel()[0])

        # Loudness: RMS of the whole signal.
        loudness = np.sqrt(np.mean(y**2))

        # Acousticness proxy: mean spectral rolloff.
        acousticness = librosa.feature.spectral_rolloff(y=y, sr=sr).mean()

        # Tonality: mean over the whole chromagram.
        chroma = librosa.feature.chroma_stft(y=y, sr=sr).mean()

        # Mean of each of the 13 MFCC coefficients over time.
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13).mean(axis=1)

        # Spectral shape features.
        spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr).mean()
        spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr).mean()

        # Instrumentalness proxy: mean zero-crossing rate.
        zero_crossing_rate = librosa.feature.zero_crossing_rate(y=y).mean()

        features = {
            "energy": energy,
            "tempo": tempo,
            "loudness": loudness,
            "acousticness": acousticness,
            "chroma": chroma,
            "spectral_centroid": spectral_centroid,
            "spectral_bandwidth": spectral_bandwidth,
            "zero_crossing_rate": zero_crossing_rate,
        }

        # One column per MFCC coefficient, numbered from 1.
        for i, mfcc in enumerate(mfccs):
            features[f"mfcc_{i+1}"] = mfcc

        return features

    except Exception as e:
        # Best effort: report the failing file and let the caller skip it.
        print(f"Ошибка с файлом {file_path}: {e}")
        return None


In [2]:
# Directory that is scanned for .mp3 files.
dataset_dir = "/kaggle/input/pmemo2019/PMEmo2019/chorus"
# NOTE: despite the name, this is the path of the output CSV file, not a directory.
output_dir = "/kaggle/working/audio_features.csv"

# os.listdir returns entries in arbitrary order; sort them so the row order of
# the resulting CSV is the same on every run.
audio_files = sorted(f for f in os.listdir(dataset_dir) if f.endswith(".mp3"))

all_features = []

for file_name in tqdm(audio_files):
    file_path = os.path.join(dataset_dir, file_name)
    features = extract_features(file_path)

    # extract_features returns None for files it could not process; skip those.
    if features is not None:
        # The file name (including extension) identifies the row.
        features["id"] = file_name
        all_features.append(features)

# One row per successfully processed file.
df = pd.DataFrame(all_features)
df.to_csv(output_dir, index=False)

100%|██████████| 794/794 [18:46<00:00,  1.42s/it]
