In [1]:
import os
import numpy as np
import pandas as pd
import librosa
from tqdm import tqdm

In [2]:
AUDIO_DIR = "GTZAN"  # root folder containing genre subfolders
OUTPUT_CSV = "features_for_ml.csv"
OUTPUT_CNN_DIR = "features_for_cnn"
os.makedirs(OUTPUT_CNN_DIR, exist_ok=True)

In [3]:
genres = [g for g in os.listdir(AUDIO_DIR) if os.path.isdir(os.path.join(AUDIO_DIR, g))]
print("Genres found:", genres)

Genres found: ['reggae', 'rock', 'hiphop', 'disco', 'classical', 'pop', 'blues', 'metal', 'jazz', 'country']


In [4]:
def extract_features_for_ml(y, sr):
    # Pre-emphasis filter
    y_pre = np.append(y[0], y[1:] - 0.97 * y[:-1])
    
    # Zero-Crossing Rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y_pre))
    
    #Root Mean Square Energy (RMS)
    rmse = np.mean(librosa.feature.rms(y=y_pre))
    
    # Tempo (BPM)
    tempo, _ = librosa.beat.beat_track(y=y_pre, sr=sr)
    
    # MFCCs (Mel-Frequency Cepstral Coefficients)
    mfccs = librosa.feature.mfcc(y=y_pre, sr=sr, n_mfcc=20)
    mfccs_mean = np.mean(mfccs, axis=1)

    # Chroma features
    chroma = librosa.feature.chroma_stft(y=y_pre, sr=sr)
    chroma_mean = np.mean(chroma, axis=1)
    
    # Spectral centroid
    spec_centroid = np.mean(librosa.feature.spectral_centroid(y=y_pre, sr=sr))

    # Spectral bandwidth
    spec_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=y_pre, sr=sr))
    
    # Combine features
    features = np.hstack([
        mfccs_mean,
        chroma_mean,
        zcr,
        rmse,
        tempo,
        spec_centroid,
        spec_bandwidth
    ])
    return features

In [5]:
def extract_features_for_cnn(y, sr=22050, n_mels=128, fmax=8000):
    # Apply pre-emphasis to boost high frequencies
    y_pre = np.append(y[0], y[1:] - 0.97 * y[:-1])
    
    # Compute Mel-spectrogram
    S = librosa.feature.melspectrogram(y=y_pre, sr=sr, n_mels=n_mels, fmax=fmax)
    
    # Convert power to log scale (dB)
    S_db = librosa.power_to_db(S, ref=np.max)
    
    # Add channel dimension for CNN (grayscale image)
    cnn_image = S_db[..., np.newaxis]
    
    return cnn_image



In [6]:
all_files = []
for genre in genres:
    genre_dir = os.path.join(AUDIO_DIR, genre)
    for fname in os.listdir(genre_dir):
        if fname.endswith('.wav'):
            file_path = os.path.join(genre_dir, fname)
            all_files.append((file_path, genre))

In [7]:
feature_list = []
label_list = []
file_list = []

# Single tqdm progress bar
for file_path, genre in tqdm(all_files, desc="Processing all GTZAN files"):
    try:
        y, sr = librosa.load(file_path, sr=22050, mono=True)
        
        # Segment into 3-second clips
        clip_length = 3 * sr
        num_clips = len(y) // clip_length
        for i in range(num_clips):
            clip = y[i*clip_length : (i+1)*clip_length]
            features = extract_features_for_ml(clip, sr)
            feature_list.append(features)
            label_list.append(genre)
            file_list.append(os.path.basename(file_path))
            
    except Exception as e:
        print(f"Error processing {file_path}: {e}")


  y, sr = librosa.load(file_path, sr=22050, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Processing all GTZAN files:  84%|████████▎ | 835/1000 [04:56<00:44,  3.69it/s]

Error processing GTZAN/jazz/jazz.00054.wav: 


Processing all GTZAN files: 100%|██████████| 1000/1000 [05:51<00:00,  2.84it/s]


In [8]:
feature_names = [f"mfcc_{i+1}" for i in range(20)] + \
                [f"chroma_{i+1}" for i in range(12)] + \
                ['zcr','rmse','tempo','spec_centroid','spec_bandwidth']

df_features = pd.DataFrame(feature_list, columns=feature_names)
df_features['genre'] = label_list
df_features['filename'] = file_list

df_features.to_csv(OUTPUT_CSV, index=False)
print(f"Feature extraction done. Saved to {OUTPUT_CSV}")

Feature extraction done. Saved to features_for_ml.csv


In [10]:
for file_path, genre in tqdm(all_files, desc="Converting audio to CNN images"):
    try:
        y, sr = librosa.load(file_path, sr=22050, mono=True)
        cnn_img = extract_features_for_cnn(y, sr)
        
        # Create genre folder if it doesn't exist
        genre_folder = os.path.join(OUTPUT_CNN_DIR, genre)
        os.makedirs(genre_folder, exist_ok=True)
        
        # Save CNN image in its genre folder
        filename = os.path.splitext(os.path.basename(file_path))[0]
        npy_path = os.path.join(genre_folder, f"{filename}.npy")
        np.save(npy_path, cnn_img)
        
    except Exception as e:
        print(f"Error processing {file_path}: {e}")

print(f"All CNN images saved individually in '{OUTPUT_CNN_DIR}', organized by genre.")

  y, sr = librosa.load(file_path, sr=22050, mono=True)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)
Converting audio to CNN images:  84%|████████▍ | 839/1000 [00:34<00:07, 22.33it/s]

Error processing GTZAN/jazz/jazz.00054.wav: 


Converting audio to CNN images: 100%|██████████| 1000/1000 [00:40<00:00, 24.50it/s]

All CNN images saved individually in 'features_for_cnn', organized by genre.



