In [1]:
!pip install numpy pandas matplotlib librosa pydub



In [4]:
import os
import librosa
import numpy as np
import multiprocessing as mp
from glob import glob
from tqdm import tqdm
from pydub import AudioSegment
from concurrent.futures import ProcessPoolExecutor
import librosa
from scipy.signal import istft
import subprocess
import soundfile as sf
from concurrent.futures import ThreadPoolExecutor
from pydub.utils import mediainfo

#mp.set_start_method("spawn")

In [None]:
# to download the data used in this project please go to https://github.com/mdeff/fma?tab=readme-ov-file and download the appropriate size data
# it will be in a zip format and consider  your computers processing capabilities


In [6]:


def is_mp3_valid(file_path):
    """Checks if the MP3 file is valid using mediainfo and ensures it can be opened."""
    try:
        if not os.path.exists(file_path) or not file_path.lower().endswith('.mp3'):
            print(f"‚ö† Not an MP3 file: {file_path}")
            return False
        
        info = mediainfo(file_path)
        if 'audio' not in info.get('codec_type', ''):
            print(f"‚ö† Invalid MP3 file: {file_path} (wrong codec type)")
            return False
        
        # Attempt to open the file using pydub
        from pydub import AudioSegment
        AudioSegment.from_file(file_path)
        return True

    except Exception as e:
        print(f"‚ö† MP3 validation failed for {file_path}: {e}")
        return False




def process_segment(segment, sr, num_freqs=5):
    """Extracts the top frequencies, amplitudes, and phases for a segment."""
    if len(segment) == 0:
        return None

    fft_complex = np.fft.rfft(segment)
    magnitudes = np.abs(fft_complex)
    phases = np.angle(fft_complex)
    freqs = np.fft.rfftfreq(len(segment), d=1/sr)

    # Get top N frequencies
    top_indices = np.argsort(magnitudes)[-num_freqs:]
    top_freqs = freqs[top_indices]
    top_mags = magnitudes[top_indices]
    top_phases = phases[top_indices]

    return list(zip(top_freqs, top_mags, top_phases))

def load_audio(file_path, sample_rate):
    """Loads an audio file with a fallback to pydub if librosa fails."""
    try:
        # Try loading with soundfile backend first
        y, sr = sf.read(file_path, always_2d=True)
        y = y.mean(axis=1)  # Convert stereo to mono
        return y, sample_rate
    except Exception as e1:
        print(f"‚ö† Soundfile failed for {file_path}, trying Librosa (FFmpeg)... Error: {e1}")
        try:
            return librosa.load(file_path, sr=sample_rate, mono=True, backend="ffmpeg")
        except Exception as e2:
            print(f"‚ö† Librosa (FFmpeg) failed for {file_path}, trying pydub... Error: {e2}")
            try:
                audio = AudioSegment.from_file(file_path)
                audio = audio.set_frame_rate(sample_rate).set_channels(1)
                samples = np.array(audio.get_array_of_samples(), dtype=np.float32) / 32768.0
                return samples, sample_rate
            except Exception as e3:
                print(f"‚ùå Error loading {file_path} with all methods: {e3}")
                return None, None
                
def process_audio_file(file_path, segment_length=1/8, sample_rate=44100, num_freqs=5, max_segments=128):
    """Processes an MP3 file into fixed-length frequency tokens for transformer training."""
    if not is_mp3_valid(file_path):
        print(f"‚ö† Skipping corrupt file: {file_path}")
        return None  # Skip this file

    try:
        y, sr = load_audio(file_path, sample_rate)
        if y is None:
            return None  # Skip files that couldn't be loaded

        samples_per_segment = int(segment_length * sr)
        segments = [y[i:i+samples_per_segment] for i in range(0, len(y), samples_per_segment)]

        # Process each segment into (num_freqs, 3) tokens
        results = [process_segment(seg, sr, num_freqs) for seg in segments]

        # Remove None values
        results = [res for res in results if res is not None]

        # Convert to NumPy array and ensure fixed shape (max_segments, num_freqs, 3)
        results = np.array(results, dtype=np.float32)

        # Pad or truncate to a fixed shape (128, num_freqs, 3) for transformer input
        if len(results) < max_segments:
            padding = np.zeros((max_segments - len(results), num_freqs, 3), dtype=np.float32)
            results = np.vstack((results, padding))
        else:
            results = results[:max_segments]  # Truncate if too long

        return results  # Returns (128, num_freqs, 3) array
    except Exception as e:
        print(f"‚ùå Error processing {file_path}: {e}")
        return None


def process_folder(folder_path, output_folder, folder_index, total_folders):
    """Process all MP3 files in a given folder sequentially."""
    folder_name = os.path.basename(folder_path)
    audio_files = sorted(glob(os.path.join(folder_path, "*.mp3")))

    if not audio_files:
        print(f"‚ö† No MP3 files found in {folder_name}")
        return

    print(f"üü¢ Processing folder {folder_index + 1}/{total_folders}: {folder_name} ({len(audio_files)} files)")

    token_dict = {}
    skipped_files = 0  # Track number of skipped files

    for i, audio_file in enumerate(audio_files):
        if not is_mp3_valid(audio_file):  # Skip invalid MP3s
            skipped_files += 1
            continue

        tokens = process_audio_file(audio_file)  
        if tokens is not None:
            token_dict[os.path.basename(audio_file)] = tokens

    if token_dict:
        save_path = os.path.join(output_folder, f"{folder_name}_tokens.npz")
        np.savez_compressed(save_path, **token_dict)
        print(f"‚úÖ Saved: {save_path}")
    else:
        print(f"‚ö† No valid tokens extracted in {folder_name} (Skipped {skipped_files} corrupt files)")

def process_all_folders(base_folder, output_folder, num_workers=2):
    """Process multiple folders in parallel using threads (safer alternative)."""
    os.makedirs(output_folder, exist_ok=True)
    folders = sorted([f for f in os.listdir(base_folder) if os.path.isdir(os.path.join(base_folder, f))])
    total_folders = len(folders)

    print(f"üìÇ Found {total_folders} folders. Processing with {num_workers} workers...")

    with ThreadPoolExecutor(max_workers=num_workers) as executor:  # Use threads instead of processes
        futures = [executor.submit(process_folder, os.path.join(base_folder, folder), output_folder, idx, total_folders) 
                   for idx, folder in enumerate(folders)]
        
        for future in futures:
            future.result()


In [7]:
base_folder = "/home/renzo/projects/stempalooza/audio"
output_folder = "/home/renzo/projects/stempalooza/transformer/procesed"

#print(process_audio_file("/home/renzo/projects/stempalooza/audio/000/000002.mp3"))
#process_folder("/home/renzo/projects/stempalooza/audio/000/",output_folder,1,100)

process_all_folders(base_folder, output_folder, num_workers=2)

üìÇ Found 156 folders. Processing with 2 workers...
üü¢ Processing folder 1/156: 000 (199 files)
üü¢ Processing folder 2/156: 001 (215 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/000_tokens.npz
üü¢ Processing folder 3/156: 002 (5 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/002_tokens.npz
üü¢ Processing folder 4/156: 003 (196 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/001_tokens.npz
üü¢ Processing folder 5/156: 004 (240 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/003_tokens.npz
üü¢ Processing folder 6/156: 005 (129 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/004_tokens.npz
üü¢ Processing folder 7/156: 006 (151 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/005_tokens.npz
üü¢ Processing folder 8/156: 007 (116 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/006_tokens.npz
üü¢ Processing folder 9/156: 008 (78 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/007_tok

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/010_tokens.npz
üü¢ Processing folder 13/156: 012 (202 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/011_tokens.npz
üü¢ Processing folder 14/156: 013 (158 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/012_tokens.npz
üü¢ Processing folder 15/156: 014 (269 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/013_tokens.npz
üü¢ Processing folder 16/156: 015 (142 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/015_tokens.npz
üü¢ Processing folder 17/156: 016 (214 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/014_tokens.npz
üü¢ Processing folder 18/156: 017 (169 files)


[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3264) too large for available bit count (3224)


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/016_tokens.npz
üü¢ Processing folder 19/156: 018 (136 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/017_tokens.npz
üü¢ Processing folder 20/156: 019 (187 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/018_tokens.npz
üü¢ Processing folder 21/156: 020 (192 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/019_tokens.npz
üü¢ Processing folder 22/156: 021 (170 files)


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/020_tokens.npz
üü¢ Processing folder 23/156: 022 (140 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/021_tokens.npz
üü¢ Processing folder 24/156: 023 (159 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/022_tokens.npz
üü¢ Processing folder 25/156: 024 (177 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/023_tokens.npz
üü¢ Processing folder 26/156: 025 (103 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/024_tokens.npz
üü¢ Processing folder 27/156: 026 (160 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/025_tokens.npz
üü¢ Processing folder 28/156: 027 (141 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/027_tokens.npz
üü¢ Processing folder 29/156: 028 (135 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/026_tokens.npz
üü¢ Processing folder 30/156: 029 (145 files)


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/028_tokens.npz
üü¢ Processing folder 31/156: 030 (135 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/029_tokens.npz
üü¢ Processing folder 32/156: 031 (117 files)


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/031_tokens.npz
üü¢ Processing folder 33/156: 032 (184 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/030_tokens.npz
üü¢ Processing folder 34/156: 033 (134 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/033_tokens.npz
üü¢ Processing folder 35/156: 034 (95 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/032_tokens.npz
üü¢ Processing folder 36/156: 035 (91 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/034_tokens.npz
üü¢ Processing folder 37/156: 036 (170 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/035_tokens.npz
üü¢ Processing folder 38/156: 037 (119 files)


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/037_tokens.npz
üü¢ Processing folder 39/156: 038 (150 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/036_tokens.npz
üü¢ Processing folder 40/156: 039 (109 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/039_tokens.npz
üü¢ Processing folder 41/156: 040 (206 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/038_tokens.npz
üü¢ Processing folder 42/156: 041 (124 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/041_tokens.npz
üü¢ Processing folder 43/156: 042 (208 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/040_tokens.npz
üü¢ Processing folder 44/156: 043 (166 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/042_tokens.npz
üü¢ Processing folder 45/156: 044 (141 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/043_tokens.npz
üü¢ Processing folder 46/156: 045 (115 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/045_tokens.npz
üü¢ Process

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3360) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1774] error: part2_3_length (3328) too large for available bit count (3240)
[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/054_tokens.npz
üü¢ Processing folder 57/156: 056 (178 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/055_tokens.npz
üü¢ Processing folder 58/156: 057 (205 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/056_tokens.npz
üü¢ Processing folder 59/156: 058 (119 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/057_tokens.npz
üü¢ Processing folder 60/156: 059 (201 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/058_tokens.npz
üü¢ Processing folder 61/156: 060 (197 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/059_tokens.npz
üü¢ Processing folder 62/156: 061 (122 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/060_tokens.npz
üü¢ Processing folder 63/156: 062 (168 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/061_tokens.npz
üü¢ Processing folder 64/156: 063 (237 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/062_tokens.npz
üü¢ Process

[src/libmpg123/layer3.c:INT123_do_layer3():1844] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/073_tokens.npz
üü¢ Processing folder 76/156: 075 (242 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/074_tokens.npz
üü¢ Processing folder 77/156: 076 (113 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/076_tokens.npz
üü¢ Processing folder 78/156: 077 (64 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/077_tokens.npz
üü¢ Processing folder 79/156: 078 (73 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/075_tokens.npz
üü¢ Processing folder 80/156: 079 (92 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/078_tokens.npz
üü¢ Processing folder 81/156: 080 (132 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/079_tokens.npz
üü¢ Processing folder 82/156: 081 (115 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/080_tokens.npz
üü¢ Processing folder 83/156: 082 (188 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/081_tokens.npz
üü¢ Processing

Note: Illegal Audio-MPEG-Header 0x00000000 at offset 33361.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


‚ö† Soundfile failed for /home/renzo/projects/stempalooza/audio/098/098565.mp3, trying Librosa (FFmpeg)... Error: Unspecified internal error.
‚ö† Librosa (FFmpeg) failed for /home/renzo/projects/stempalooza/audio/098/098565.mp3, trying pydub... Error: load() got an unexpected keyword argument 'backend'


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 187493.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


‚ö† Soundfile failed for /home/renzo/projects/stempalooza/audio/098/098566.mp3, trying Librosa (FFmpeg)... Error: Unspecified internal error.
‚ö† Librosa (FFmpeg) failed for /home/renzo/projects/stempalooza/audio/098/098566.mp3, trying pydub... Error: load() got an unexpected keyword argument 'backend'


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 22401.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


‚ö† Soundfile failed for /home/renzo/projects/stempalooza/audio/098/098567.mp3, trying Librosa (FFmpeg)... Error: Unspecified internal error.
‚ö† Librosa (FFmpeg) failed for /home/renzo/projects/stempalooza/audio/098/098567.mp3, trying pydub... Error: load() got an unexpected keyword argument 'backend'


Note: Illegal Audio-MPEG-Header 0x00000000 at offset 106439.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


‚ö† Soundfile failed for /home/renzo/projects/stempalooza/audio/098/098568.mp3, trying Librosa (FFmpeg)... Error: Unspecified internal error.
‚ö† Librosa (FFmpeg) failed for /home/renzo/projects/stempalooza/audio/098/098568.mp3, trying pydub... Error: load() got an unexpected keyword argument 'backend'


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!
Note: Illegal Audio-MPEG-Header 0x00000000 at offset 63168.
Note: Trying to resync...
Note: Skipped 1024 bytes in input.
[src/libmpg123/parse.c:wetwork():1349] error: Giving up resync after 1024 bytes - your stream is not nice... (maybe increasing resync limit could help).


‚ö† Soundfile failed for /home/renzo/projects/stempalooza/audio/098/098569.mp3, trying Librosa (FFmpeg)... Error: Unspecified internal error.
‚ö† Librosa (FFmpeg) failed for /home/renzo/projects/stempalooza/audio/098/098569.mp3, trying pydub... Error: load() got an unexpected keyword argument 'backend'
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/097_tokens.npz
üü¢ Processing folder 100/156: 099 (199 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/098_tokens.npz
üü¢ Processing folder 101/156: 100 (59 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/100_tokens.npz
üü¢ Processing folder 102/156: 101 (52 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/099_tokens.npz
üü¢ Processing folder 103/156: 102 (48 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/101_tokens.npz
üü¢ Processing folder 104/156: 103 (32 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/103_tokens.npz
üü¢ Processing folder 105/156: 104 (107 files)
‚úÖ 

[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/106_tokens.npz
üü¢ Processing folder 109/156: 108 (254 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/107_tokens.npz
üü¢ Processing folder 110/156: 109 (242 files)


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/108_tokens.npz
üü¢ Processing folder 111/156: 110 (190 files)


[src/libmpg123/layer3.c:INT123_do_layer3():1804] error: dequantization failed!


‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/109_tokens.npz
üü¢ Processing folder 112/156: 111 (221 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/110_tokens.npz
üü¢ Processing folder 113/156: 112 (165 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/112_tokens.npz
üü¢ Processing folder 114/156: 113 (232 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/111_tokens.npz
üü¢ Processing folder 115/156: 114 (189 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/114_tokens.npz
üü¢ Processing folder 116/156: 115 (163 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/113_tokens.npz
üü¢ Processing folder 117/156: 116 (233 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/115_tokens.npz
üü¢ Processing folder 118/156: 117 (124 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/116_tokens.npz
üü¢ Processing folder 119/156: 118 (200 files)
‚úÖ Saved: /home/renzo/projects/stempalooza/procesed/117_tokens.npz
üü¢

In [7]:
from scipy.io.wavfile import write

def reconstruct_audio(tokens, output_path, segment_length=1/8, sample_rate=44100):
    """Reconstructs a simplified audio waveform using ISTFT."""
    time_steps = int(sample_rate * segment_length)
    reconstructed_waveform = np.zeros(time_steps * len(tokens))

    for i, token_set in enumerate(tokens):
        t = np.linspace(0, segment_length, time_steps, endpoint=False)
        segment_wave = np.zeros(time_steps)

        for freq, mag, phase in token_set:
            segment_wave += mag * np.sin(2 * np.pi * freq * t + phase)

        start_idx = i * time_steps
        reconstructed_waveform[start_idx:start_idx + time_steps] = segment_wave

    # Normalize to 16-bit WAV
    reconstructed_waveform = np.int16(reconstructed_waveform / np.max(np.abs(reconstructed_waveform)) * 32767)
    
    write(output_path, sample_rate, reconstructed_waveform)

# Example usage
tokens = np.load("/home/renzo/projects/stempalooza/procesed/133_tokens.npz", allow_pickle=True)["133003.mp3"]
reconstruct_audio(tokens, "reconstructed_audio.wav")
