In [None]:
import librosa
import librosa.feature
import librosa.display
import numpy as np
import pandas as pd
from PIL import Image
import os

def load_fixed_audio(path, duration=30, sr=22050):
    """Load an audio file and force it to a fixed number of samples.

    The clip is read with ``librosa.load`` (at sampling rate ``sr``, reading at
    most ``duration`` seconds) and is then zero-padded at the end or truncated
    so that the result always has the same length.

    Args:
        path: Path of the audio file to load.
        duration: Target clip length in seconds; fractional values are allowed.
        sr: Sampling rate passed to ``librosa.load``.

    Returns:
        NumPy array holding exactly ``int(round(duration * sr))`` samples.
    """
    audio, _ = librosa.load(path, sr=sr, duration=duration)
    # A fractional duration makes ``duration * sr`` a float, which neither
    # ``np.pad`` nor slicing accepts, so convert to a whole sample count first.
    desired_length = int(round(duration * sr))

    missing = desired_length - len(audio)
    if missing > 0:
        audio = np.pad(audio, (0, missing))
    else:
        audio = audio[:desired_length]

    return audio

def audio_to_melspec(audio, sr=22050, n_mels=128, hop_length=512):
    """Turn a waveform into a mel spectrogram expressed in decibels.

    Args:
        audio: Waveform samples handed to ``librosa.feature.melspectrogram``.
        sr: Sampling rate of ``audio``.
        n_mels: Number of mel bands.
        hop_length: Hop length between successive frames.

    Returns:
        The mel power spectrogram converted with ``librosa.power_to_db``,
        using the spectrogram's own maximum as the reference level.
    """
    power = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=n_mels,
        hop_length=hop_length,
    )
    # The small offset (and matching ``amin``) keeps zero-power bins finite.
    return librosa.power_to_db(power + 1e-10, ref=np.max, amin=1e-10)

def normalize(melspec_db):
    """Min-max scale ``melspec_db`` into the range [0, 1].

    Returns an all-zero array of the same shape when every value is identical,
    since the value range would otherwise be a zero divisor.
    """
    lowest = np.min(melspec_db)
    value_range = np.max(melspec_db) - lowest
    if value_range != 0:
        return (melspec_db - lowest) / value_range
    # Constant input: nothing to scale.
    return np.zeros_like(melspec_db)

def get_all_wav_files(directory):
    """Recursively list every file under ``directory`` whose name ends in ``.wav``.

    Each returned path is the containing folder and the file name joined
    with ``'/'``; entries appear in ``os.walk`` traversal order.
    """
    return [
        folder + '/' + name
        for folder, _, names in os.walk(directory)
        for name in names
        if name.endswith('.wav')
    ]

def get_all_image_files(directory):
    """Recursively list every file under ``directory`` whose name ends in ``.png``.

    Each returned path is the containing folder and the file name joined
    with ``'/'``; entries appear in ``os.walk`` traversal order.
    """
    image_paths = []
    for folder, _subdirs, names in os.walk(directory):
        image_paths.extend(
            folder + '/' + name for name in names if name.endswith('.png')
        )
    return image_paths

Feature Extraction

In [None]:
# Recursively collect the path of every .wav file under the audio folder.
files = get_all_wav_files("../data/genres")
# Accumulates one record per successfully processed file; it is filled by the
# extraction loop at the end of this cell and then turned into a DataFrame.
entries = []

def clean_labels(s):
    """Return the part of ``s`` before its first ``'.'`` (all of ``s`` if it has none)."""
    label, _, _ = s.partition('.')
    return label

def _per_row_stats(matrix):
    """Return the per-row mean, std, min and max of a 2-D array as one flat list of floats."""
    return [
        *matrix.mean(axis=1).tolist(),
        *matrix.std(axis=1).tolist(),
        *matrix.min(axis=1).tolist(),
        *matrix.max(axis=1).tolist(),
    ]


def _overall_stats(values):
    """Return ``[mean, std, min, max]`` taken over every element of ``values``."""
    return [
        float(values.mean()),
        float(values.std()),
        float(values.min()),
        float(values.max()),
    ]


def extract_features(file_path, sr=22050, total_duration=30.0, segment_duration=3, n_mfcc=64):
    """Split an audio file into consecutive segments and describe each one.

    The file is loaded as mono audio (at most ``total_duration`` seconds) and
    cut into back-to-back segments of ``segment_duration`` seconds. With the
    default arguments this yields 10 segments of 3 seconds.

    Each segment's feature list has this layout, where "stats" means
    mean, std, min, max in that order:

    1. MFCC stats per coefficient (``4 * n_mfcc`` values)
    2. Chroma stats per chroma bin
    3. Spectral contrast stats per band
    4. Tonnetz stats per dimension (computed on the harmonic component)
    5. Tempo estimate (1 value)
    6. Spectral rolloff stats (4 values)
    7. Spectral centroid stats (4 values)
    8. Zero-crossing rate (1 value)
    9. RMS energy stats (4 values)
    10. Flag: 1 if ``file_path`` ends with ``_m2.wav`` or ``_p2.wav``, else 0

    Args:
        file_path: Path of the audio file.
        sr: Sampling rate passed to ``librosa.load``.
        total_duration: Number of seconds of audio to load and segment.
        segment_duration: Length of each segment in seconds.
        n_mfcc: Number of MFCC coefficients.

    Returns:
        A list with one entry per segment; each entry is a flat list of plain
        Python numbers (no NumPy scalars), so it serialises cleanly to text.

    Raises:
        ValueError: If ``segment_duration`` is shorter than one sample, or if
            the audio ends before one of the expected segments begins.
    """
    y, sr = librosa.load(file_path, sr=sr, mono=True, duration=total_duration)

    samples_per_segment = int(segment_duration * sr)
    if samples_per_segment <= 0:
        raise ValueError(
            f"segment_duration={segment_duration} is shorter than one sample at sr={sr}"
        )
    # Work in whole samples so float rounding (e.g. 30.0 / 0.1) cannot drop a segment.
    num_segments = int(total_duration * sr) // samples_per_segment

    # NOTE(review): files ending in _m2 / _p2 are presumably augmented copies
    # of an original track; confirm against the data preparation step.
    suffix_flag = int(file_path.endswith(("_m2.wav", "_p2.wav")))

    all_segments = []
    for i in range(num_segments):
        start = i * samples_per_segment
        segment_y = y[start:start + samples_per_segment]
        if segment_y.size == 0:
            raise ValueError(
                f"{file_path}: audio ends before segment {i} "
                f"(expected {num_segments} segments of {segment_duration}s)"
            )

        features = []

        # 1. MFCCs
        features.extend(_per_row_stats(
            librosa.feature.mfcc(y=segment_y, sr=sr, n_mfcc=n_mfcc)))

        # 2. Chroma features
        features.extend(_per_row_stats(
            librosa.feature.chroma_stft(y=segment_y, sr=sr)))

        # 3. Spectral contrast
        features.extend(_per_row_stats(
            librosa.feature.spectral_contrast(y=segment_y, sr=sr)))

        # 4. Tonnetz features (need harmonic component of audio for tonnetz)
        y_harm = librosa.effects.harmonic(segment_y)  # isolate harmonic component
        features.extend(_per_row_stats(
            librosa.feature.tonnetz(y=y_harm, sr=sr)))

        # 5. Tempo. Depending on the librosa version the estimate is either a
        # scalar or a one-element array, so normalise before indexing.
        tempo, _ = librosa.beat.beat_track(y=segment_y, sr=sr)
        features.append(float(np.atleast_1d(tempo)[0]))

        # 6. Spectral rolloff
        features.extend(_overall_stats(
            librosa.feature.spectral_rolloff(y=segment_y, sr=sr)[0]))

        # 7. Spectral centroid
        features.extend(_overall_stats(
            librosa.feature.spectral_centroid(y=segment_y, sr=sr)[0]))

        # 8. Zero-crossing rate: fraction of samples flagged as a crossing.
        zc = librosa.zero_crossings(y=segment_y, pad=False)
        features.append(float(np.mean(zc)))

        # 9. RMS energy
        features.extend(_overall_stats(librosa.feature.rms(y=segment_y)))

        # 10. File-name suffix flag
        features.append(suffix_flag)

        all_segments.append(features)
    return all_segments

# Process every audio file; a file that fails is reported and skipped so one
# bad recording does not abort the whole run.
for file in files:
    try:
        print(file)
        filename = os.path.basename(file)
        row = extract_features(file)
        # One record per file: [label from the file name, list of per-segment
        # feature lists, suffix flag]. The per-segment lists stay nested in a
        # single cell rather than being spread over columns.
        new_row = [clean_labels(filename)] + [row]

        if file.endswith("_m2.wav") or file.endswith("_p2.wav"):
            new_row.append(1)
        else:
            new_row.append(0)

        entries.append(new_row)
    except Exception as e:
        print(e)
        print(f"Could not process {file}")

df = pd.DataFrame(entries)
# The CSV is written only after the whole (slow) extraction has finished, so
# create the output folder first rather than lose the results to a missing directory.
features_csv_path = "../data/files/audio_classification_3sec_deep_32.csv"
os.makedirs(os.path.dirname(features_csv_path), exist_ok=True)
df.to_csv(features_csv_path)

In [None]:
# Flatten every spectrogram image into one row: [file name, pixel values...].
files = get_all_image_files('../data/images/')
mel_spectograms = []

for file in files:
    try:
        filename = os.path.basename(file)
        print(filename)
        # The context manager closes the file-backed image; convert('L')
        # returns a separate single-channel grayscale copy to read pixels from.
        with Image.open(file) as source_img:
            img_array = np.asarray(source_img.convert('L')).flatten()
        img_array = img_array.astype(np.float16)
        mel_spectograms.append([filename] + img_array.tolist())
    except Exception as e:
        # Best effort: report the failure and continue with the next image.
        print(e)
        print(f"Could not process {file}")

df = pd.DataFrame.from_records(mel_spectograms)
# Create the output folder first so the write cannot fail on a missing directory.
spectrogram_csv_path = "../data/files/mel_spectograms_flat.csv"
os.makedirs(os.path.dirname(spectrogram_csv_path), exist_ok=True)
df.to_csv(spectrogram_csv_path)
# Only the last expression of a cell is displayed, so the preview goes at the end.
df.head(10)