In [1]:
import os
import pickle
import warnings

import librosa
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from any library are hidden as well — confirm that is intended.
warnings.filterwarnings('ignore')

# Opening banner for the notebook output.
print(
    "=" * 60,
    "ADVANCED PREPROCESSING FOR CONVOLUTIONAL VAE",
    "=" * 60,
    "\nLibraries imported successfully!",
    sep="\n",
)


# Central preprocessing settings. Every extraction helper in this notebook reads
# its parameters from here, and the dict is pickled next to the outputs.
CONFIG = dict(
    sample_rate=22050,            # Target sample rate (Hz) audio is resampled to on load
    duration=30,                  # Seconds of audio loaded from each file
    n_mels=128,                   # Mel bands, i.e. spectrogram height
    n_fft=2048,                   # FFT window size (samples)
    hop_length=512,               # STFT hop length (samples)
    n_mfcc=40,                    # MFCC coefficients per frame
    fixed_time_steps=128,         # Frames kept per 2D feature, i.e. spectrogram width
    max_samples_per_class=160,    # Cap on .wav files taken from each genre folder
    lyrics_max_features=100,      # Maximum TF-IDF vocabulary size for lyrics
)

# Define paths.
# The original machine-specific locations remain the defaults, so the notebook
# behaves exactly as before when no environment variable is set. Setting
# MUSIC_VAE_DATA_DIR / MUSIC_VAE_OUTPUT_DIR lets it run on another machine
# without editing the code.
BASE_PATH = os.environ.get(
    "MUSIC_VAE_DATA_DIR",
    r"f:\BRACU\Semester 12 Final\CSE425\FInal_project\Datasets",
)
BANGLA_PATH = os.path.join(BASE_PATH, "Bangla_Datasets")
ENGLISH_PATH = os.path.join(BASE_PATH, "English_Datasets")
METADATA_PATH = os.path.join(BASE_PATH, "updated_metadata.csv")
OUTPUT_PATH = os.environ.get(
    "MUSIC_VAE_OUTPUT_DIR",
    r"f:\BRACU\Semester 12 Final\CSE425\FInal_project\processed_data2",
)

# Create output directory if it doesn't exist
os.makedirs(OUTPUT_PATH, exist_ok=True)

print("\nConfiguration loaded!")
print(f"Bangla datasets path: {BANGLA_PATH}")
print(f"English datasets path: {ENGLISH_PATH}")
print(f"Output path: {OUTPUT_PATH}")
print(f"\nSpectrogram dimensions: {CONFIG['n_mels']} x {CONFIG['fixed_time_steps']}")

# Surface a wrong data location immediately: the file collection step skips a
# missing dataset folder without raising, which would otherwise just yield an
# empty dataset.
for _path_label, _path_value in (
    ("Bangla datasets", BANGLA_PATH),
    ("English datasets", ENGLISH_PATH),
    ("Metadata file", METADATA_PATH),
):
    if not os.path.exists(_path_value):
        print(f"WARNING: {_path_label} path does not exist: {_path_value}")

print("\n" + "=" * 60, "LOADING METADATA", "=" * 60, sep="\n")

# Read the per-song metadata table (one row per song ID).
metadata_df = pd.read_csv(METADATA_PATH)
print(f"Metadata shape: {metadata_df.shape}")
print(f"Columns: {metadata_df.columns.tolist()}")

# Lookup tables keyed by the song ID as a string, so they can be probed with a
# file name stem. Missing lyrics become '' rather than NaN.
# NOTE(review): IDs are stringified after pandas has parsed the column; if the
# CSV stores zero-padded IDs they may have been read as integers first —
# confirm the keys still match the .wav file stems.
metadata_ids = metadata_df['ID'].astype(str)
genre_lookup = {
    song_id: genre
    for song_id, genre in zip(metadata_ids, metadata_df['genre'])
}
lyrics_lookup = {
    song_id: text
    for song_id, text in zip(metadata_ids, metadata_df['lyrics'].fillna(''))
}

print(f"Loaded {len(genre_lookup)} genre entries from metadata")
print(f"Loaded {len(lyrics_lookup)} lyrics entries from metadata")

ADVANCED PREPROCESSING FOR CONVOLUTIONAL VAE

Libraries imported successfully!

Configuration loaded!
Bangla datasets path: f:\BRACU\Semester 12 Final\CSE425\FInal_project\Datasets\Bangla_Datasets
English datasets path: f:\BRACU\Semester 12 Final\CSE425\FInal_project\Datasets\English_Datasets
Output path: f:\BRACU\Semester 12 Final\CSE425\FInal_project\processed_data2

Spectrogram dimensions: 128 x 128

LOADING METADATA
Metadata shape: (1859, 4)
Columns: ['ID', 'language', 'genre', 'lyrics']
Loaded 1859 genre entries from metadata
Loaded 1859 lyrics entries from metadata


In [2]:

def load_audio_file(file_path):
    """Load one audio clip at the configured sample rate and length.

    At most CONFIG['duration'] seconds are read, resampled to
    CONFIG['sample_rate']. Clips shorter than that are zero-padded at the end
    so every returned waveform has at least sample_rate * duration samples.

    Returns:
        (audio, sr) on success, or (None, None) if anything raised while
        loading; the error is printed rather than propagated.
    """
    try:
        target_sr = CONFIG['sample_rate']
        clip_seconds = CONFIG['duration']

        waveform, rate = librosa.load(file_path, sr=target_sr, duration=clip_seconds)

        # Number of trailing zero samples needed to reach the full clip length.
        shortfall = target_sr * clip_seconds - len(waveform)
        if shortfall > 0:
            waveform = np.pad(waveform, (0, shortfall), mode='constant')

        return waveform, rate
    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        return None, None


def extract_mel_spectrogram(audio, sr):
    """Compute a dB-scaled mel spectrogram with a fixed number of frames.

    The power spectrogram is converted to dB relative to the clip's own
    maximum (ref=np.max), so the loudest bin is 0 dB.

    The time axis is then forced to CONFIG['fixed_time_steps'] frames:
    longer inputs are truncated to their FIRST frames (no resampling or
    centering), shorter ones are right-padded with the spectrogram's minimum
    value. With the CONFIG values defined in this notebook (hop 512 at
    22050 Hz, 128 frames) the kept portion is roughly the first 3 seconds.

    Returns:
        2D array of shape (CONFIG['n_mels'], CONFIG['fixed_time_steps']).
    """
    power_spec = librosa.feature.melspectrogram(
        y=audio,
        sr=sr,
        n_mels=CONFIG['n_mels'],
        n_fft=CONFIG['n_fft'],
        hop_length=CONFIG['hop_length'],
    )
    mel_db = librosa.power_to_db(power_spec, ref=np.max)

    target_frames = CONFIG['fixed_time_steps']
    n_frames = mel_db.shape[1]

    if n_frames > target_frames:
        return mel_db[:, :target_frames]

    # Pad with the quietest value present: 0 would mean "maximum level" on
    # this dB scale, so it must not be used to represent silence.
    missing_frames = target_frames - n_frames
    return np.pad(
        mel_db,
        ((0, 0), (0, missing_frames)),
        mode='constant',
        constant_values=mel_db.min(),
    )


def extract_mfcc(audio, sr):
    """Compute MFCCs with a fixed number of frames.

    The time axis is forced to CONFIG['fixed_time_steps'] frames: longer
    inputs keep only their first frames, shorter ones are right-padded with
    the smallest coefficient value found in the matrix.

    Returns:
        2D array of shape (CONFIG['n_mfcc'], CONFIG['fixed_time_steps']).
    """
    coeffs = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=CONFIG['n_mfcc'],
        n_fft=CONFIG['n_fft'],
        hop_length=CONFIG['hop_length'],
    )

    target_frames = CONFIG['fixed_time_steps']
    n_frames = coeffs.shape[1]

    if n_frames > target_frames:
        return coeffs[:, :target_frames]

    # Right-pad the time axis only; rows (coefficients) are left untouched.
    missing_frames = target_frames - n_frames
    return np.pad(
        coeffs,
        ((0, 0), (0, missing_frames)),
        mode='constant',
        constant_values=coeffs.min(),
    )


def extract_flattened_features(audio, sr):
    """
    Extract a flat statistical feature vector (for MLP-based models).

    Intended to match the format produced by 1_preprocessing.py
    (NOTE(review): that script is not visible here — confirm the layout).

    Every frame-level descriptor is computed with the same analysis window
    (CONFIG['n_fft']) and hop (CONFIG['hop_length']) and then summarised over
    time by its mean and standard deviation. Layout of the returned vector:

        mel dB mean (n_mels), mel dB std (n_mels),
        MFCC mean (n_mfcc), MFCC std (n_mfcc),
        (mean, std) of spectral centroid, bandwidth, rolloff,
        zero-crossing rate and RMS energy (10 values),
        chroma mean, chroma std (one value per chroma bin each).

    Args:
        audio: 1D waveform.
        sr: Sample rate of `audio` in Hz.

    Returns:
        1D numpy array of features in the order listed above.
    """
    n_fft = CONFIG['n_fft']
    hop_length = CONFIG['hop_length']

    # Extract mel spectrogram (dB relative to the clip's own maximum)
    mel = librosa.feature.melspectrogram(
        y=audio, sr=sr, n_mels=CONFIG['n_mels'],
        n_fft=n_fft, hop_length=hop_length
    )
    mel_db = librosa.power_to_db(mel, ref=np.max)

    # Extract MFCC
    mfcc = librosa.feature.mfcc(
        y=audio, sr=sr, n_mfcc=CONFIG['n_mfcc'],
        n_fft=n_fft, hop_length=hop_length
    )

    # Extract spectral features.
    # The window size is passed explicitly: previously these calls relied on
    # librosa's default (2048), which only matched the mel/MFCC/chroma window
    # while CONFIG['n_fft'] happened to be 2048.
    spectral_centroid = librosa.feature.spectral_centroid(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length
    )
    spectral_bandwidth = librosa.feature.spectral_bandwidth(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length
    )
    spectral_rolloff = librosa.feature.spectral_rolloff(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length
    )
    zcr = librosa.feature.zero_crossing_rate(
        audio, frame_length=n_fft, hop_length=hop_length
    )
    rms = librosa.feature.rms(
        y=audio, frame_length=n_fft, hop_length=hop_length
    )

    # Extract chroma features
    chroma = librosa.feature.chroma_stft(
        y=audio, sr=sr, n_fft=n_fft, hop_length=hop_length
    )

    # Aggregate features into a fixed-size vector using statistical measures
    features = []

    # Mel spectrogram statistics (per band, over time)
    features.extend(np.mean(mel_db, axis=1))
    features.extend(np.std(mel_db, axis=1))

    # MFCC statistics (per coefficient, over time)
    features.extend(np.mean(mfcc, axis=1))
    features.extend(np.std(mfcc, axis=1))

    # Spectral feature statistics (one mean and one std per descriptor)
    for feat in [spectral_centroid, spectral_bandwidth, spectral_rolloff, zcr, rms]:
        features.append(np.mean(feat))
        features.append(np.std(feat))

    # Chroma statistics (per chroma bin, over time)
    features.extend(np.mean(chroma, axis=1))
    features.extend(np.std(chroma, axis=1))

    return np.array(features)


# Confirmation line so the cell output shows the helpers above were (re)defined.
print("Feature extraction functions defined!")


Feature extraction functions defined!


In [3]:
def _collect_language_files(root_path, language, min_lyrics_chars):
    """Scan one language root (genre sub-folders holding .wav files).

    Args:
        root_path: Directory whose immediate sub-directories contain .wav files.
        language: Language code stored on every collected entry ('bn' / 'en').
        min_lyrics_chars: Minimum length of the stripped lyrics for a file to
            be kept.

    Returns:
        (entries, missing_metadata, insufficient_lyrics): the list of file
        descriptors plus the number of files skipped for each reason.
    """
    entries = []
    missing_metadata = 0
    insufficient_lyrics = 0

    if not os.path.isdir(root_path):
        return entries, missing_metadata, insufficient_lyrics

    # os.listdir returns entries in arbitrary order; sorting makes both the
    # sample order and the subset selected by the per-folder cap reproducible.
    for genre_folder in sorted(os.listdir(root_path)):
        genre_path = os.path.join(root_path, genre_folder)
        if not os.path.isdir(genre_path):
            continue

        wav_files = sorted(f for f in os.listdir(genre_path) if f.endswith('.wav'))
        # Limit samples per folder (applied before the metadata/lyrics filters)
        for audio_file in wav_files[:CONFIG['max_samples_per_class']]:
            file_id = os.path.splitext(audio_file)[0]

            if file_id not in genre_lookup:
                missing_metadata += 1
                continue

            lyrics = lyrics_lookup.get(file_id, '')
            # Filter out samples with empty or insufficient lyrics
            if not isinstance(lyrics, str) or len(lyrics.strip()) < min_lyrics_chars:
                insufficient_lyrics += 1
                continue

            entries.append({
                'path': os.path.join(genre_path, audio_file),
                'language': language,
                # The label comes from the metadata, not from the folder name.
                'genre': genre_lookup[file_id],
                'filename': audio_file,
                'file_id': file_id,
                'lyrics': lyrics
            })

    return entries, missing_metadata, insufficient_lyrics


def collect_audio_files():
    """Collect all audio file paths with their labels from METADATA (not folder names).

    Walks the Bangla and English dataset roots, keeps at most
    CONFIG['max_samples_per_class'] .wav files per genre folder, and drops
    files that have no metadata entry or whose lyrics are shorter than
    CONFIG['min_lyrics_chars'] characters (10 when that key is absent).

    Returns:
        List of dicts with keys 'path', 'language', 'genre', 'filename',
        'file_id' and 'lyrics'. Bangla entries come first, then English.
    """
    min_lyrics_chars = CONFIG.get('min_lyrics_chars', 10)

    audio_files = []
    missing_metadata = 0
    insufficient_lyrics = 0

    for language_name, root_path, language in (
        ("Bangla", BANGLA_PATH, 'bn'),
        ("English", ENGLISH_PATH, 'en'),
    ):
        print(f"Collecting {language_name} song files...")
        entries, n_missing, n_short = _collect_language_files(
            root_path, language, min_lyrics_chars
        )
        audio_files.extend(entries)
        missing_metadata += n_missing
        insufficient_lyrics += n_short

    print(f"Total audio files collected: {len(audio_files)}")
    # Report the two skip reasons separately; they used to be summed and all
    # attributed to missing metadata.
    if missing_metadata > 0:
        print(f"Skipped {missing_metadata} files (not found in metadata)")
    if insufficient_lyrics > 0:
        print(f"Skipped {insufficient_lyrics} files (empty or insufficient lyrics)")
    return audio_files


print("\n" + "=" * 60, "COLLECTING AUDIO FILES", "=" * 60, sep="\n")

# Build the work list of file descriptors consumed by the extraction loop.
audio_files = collect_audio_files()
print("\nTotal files to process: {}".format(len(audio_files)))


COLLECTING AUDIO FILES
Collecting Bangla song files...
Collecting English song files...
Total audio files collected: 1766
Skipped 93 files (not found in metadata)

Total files to process: 1766


In [4]:
print("\n" + "=" * 60)
print("EXTRACTING FEATURES")
print("=" * 60)

# Parallel lists: index i of every list describes the same audio file.
mel_spectrograms = []
mfccs = []
flattened_features = []
labels = []
lyrics_list = []
metadata_list = []
failed_files = []

for file_info in tqdm(audio_files, desc="Processing audio files"):
    audio, sr = load_audio_file(file_info['path'])

    if audio is None:
        failed_files.append((file_info['path'], "Failed to load"))
        continue

    try:
        # Compute every representation BEFORE appending anything. Appending
        # as we go would leave the lists misaligned if a later extractor
        # raised after an earlier one had already been stored.
        mel_spec = extract_mel_spectrogram(audio, sr)    # 2D for CNN
        mfcc = extract_mfcc(audio, sr)                   # 2D for CNN
        flat_feat = extract_flattened_features(audio, sr)  # 1D for MLP
    except Exception as e:
        failed_files.append((file_info['path'], str(e)))
        continue

    mel_spectrograms.append(mel_spec)
    mfccs.append(mfcc)
    flattened_features.append(flat_feat)

    # Collect labels and metadata
    labels.append(file_info['genre'])
    lyrics_list.append(file_info['lyrics'])
    metadata_list.append({
        'language': file_info['language'],
        'genre': file_info['genre'],
        'filename': file_info['filename'],
        'file_id': file_info['file_id']
    })

# Every per-sample container must describe the same set of files.
assert (
    len(mel_spectrograms) == len(mfccs) == len(flattened_features)
    == len(labels) == len(lyrics_list) == len(metadata_list)
), "feature lists are misaligned"

# Convert to numpy arrays
mel_spectrograms = np.array(mel_spectrograms)
mfccs = np.array(mfccs)
flattened_features = np.array(flattened_features)
labels = np.array(labels)

print(f"\nSuccessfully processed: {len(mel_spectrograms)} files")
print(f"Failed to process: {len(failed_files)} files")
# List failures (if any) so they are not silently dropped from the dataset.
for failed_path, failure_reason in failed_files:
    print(f"  - {failed_path}: {failure_reason}")


EXTRACTING FEATURES


Processing audio files: 100%|██████████| 1766/1766 [09:09<00:00,  3.21it/s]



Successfully processed: 1766 files
Failed to process: 0 files


In [5]:
print("\n" + "=" * 60)
print("CREATING LYRICS EMBEDDINGS")
print("=" * 60)

def create_lyrics_embeddings(lyrics_list, max_features=100):
    """Create TF-IDF embeddings for lyrics.

    Args:
        lyrics_list: Iterable of lyric texts, one per sample. Entries that are
            not strings (e.g. None or NaN from a DataFrame) or that are blank
            are treated as empty documents.
        max_features: Upper bound on the TF-IDF vocabulary size.

    Returns:
        (embeddings, vectorizer): a dense array with one row per input entry
        and at most `max_features` columns, plus the fitted TfidfVectorizer.

    NOTE(review): stop_words='english' only removes English stop words, so
    lyrics in any other language are not stop-word filtered — confirm this is
    acceptable for the Bangla songs.
    """
    vectorizer = TfidfVectorizer(max_features=max_features, stop_words='english')
    # Replace anything that is not a non-blank string with a single space so
    # the vectorizer always receives text. The old truthiness check let
    # truthy non-strings such as NaN through, which the vectorizer rejects.
    lyrics_cleaned = [
        text if isinstance(text, str) and text.strip() else ' '
        for text in lyrics_list
    ]
    embeddings = vectorizer.fit_transform(lyrics_cleaned).toarray()
    return embeddings, vectorizer


lyrics_embeddings, tfidf_vectorizer = create_lyrics_embeddings(
    lyrics_list,
    max_features=CONFIG['lyrics_max_features']
)
print(f"Lyrics embeddings shape: {lyrics_embeddings.shape}")


CREATING LYRICS EMBEDDINGS
Lyrics embeddings shape: (1766, 100)


In [6]:
print("\n" + "=" * 60, "FEATURE EXTRACTION SUMMARY", "=" * 60, sep="\n")

# The two 2D feature stacks share the same report layout; only the heading and
# the name of the first axis differ.
for heading, stack, row_axis_name in (
    ("1. Mel Spectrograms (for CNN):", mel_spectrograms, "mel bands"),
    ("2. MFCCs (for CNN):", mfccs, "coefficients"),
):
    print(f"\n{heading}")
    print(f"   Shape: {stack.shape}")
    print(f"   Dimensions: {stack.shape[1]} ({row_axis_name}) x {stack.shape[2]} (time steps)")

print("\n3. Flattened Features (for MLP - backward compatible):")
print(f"   Shape: {flattened_features.shape}")
print(f"   Feature vector size: {flattened_features.shape[1]}")

print("\n4. Lyrics Embeddings (TF-IDF):")
print(f"   Shape: {lyrics_embeddings.shape}")

print(f"\n5. Number of samples: {len(labels)}")
print(f"   Number of unique genres: {len(np.unique(labels))}")

# Label distribution (most frequent genre first)
print("\nLabel distribution:")
label_counts = pd.Series(labels).value_counts()
for label, count in label_counts.items():
    print(f"  - {label}: {count}")

# Language distribution; any code other than 'bn' is reported as English.
languages = [entry['language'] for entry in metadata_list]
lang_counts = pd.Series(languages).value_counts()
print("\nLanguage distribution:")
for lang, count in lang_counts.items():
    display_name = {'bn': 'Bangla'}.get(lang, 'English')
    print(f"  - {display_name}: {count}")


FEATURE EXTRACTION SUMMARY

1. Mel Spectrograms (for CNN):
   Shape: (1766, 128, 128)
   Dimensions: 128 (mel bands) x 128 (time steps)

2. MFCCs (for CNN):
   Shape: (1766, 40, 128)
   Dimensions: 40 (coefficients) x 128 (time steps)

3. Flattened Features (for MLP - backward compatible):
   Shape: (1766, 370)
   Feature vector size: 370

4. Lyrics Embeddings (TF-IDF):
   Shape: (1766, 100)

5. Number of samples: 1766
   Number of unique genres: 8

Label distribution:
  - traditional: 260
  - hiphop: 258
  - pop: 250
  - rock: 249
  - metal: 232
  - disco: 195
  - jazz: 163
  - indie: 159

Language distribution:
  - Bangla: 913
  - English: 853


In [7]:

print("\n" + "=" * 60)
print("NORMALIZING FEATURES")
print("=" * 60)

# Normalize mel spectrograms.
# Each spectrogram is flattened to one row, so StandardScaler standardizes
# every (mel band, time step) position independently ACROSS the dataset. This
# is per-position normalization, not per-sample normalization.
mel_scaler = StandardScaler()
mel_flat = mel_spectrograms.reshape(len(mel_spectrograms), -1)
mel_normalized_flat = mel_scaler.fit_transform(mel_flat)
mel_normalized = mel_normalized_flat.reshape(mel_spectrograms.shape)

# Normalize MFCCs the same way (per coefficient/time-step position).
mfcc_scaler = StandardScaler()
mfcc_flat = mfccs.reshape(len(mfccs), -1)
mfcc_normalized_flat = mfcc_scaler.fit_transform(mfcc_flat)
mfcc_normalized = mfcc_normalized_flat.reshape(mfccs.shape)

# Normalize flattened features.
# Check for NaN or inf values first so the imputation below is visible.
print(f"\nNaN values in flattened features: {np.isnan(flattened_features).sum()}")
print(f"Inf values in flattened features: {np.isinf(flattened_features).sum()}")

# Replace inf with NaN, then fill every NaN with that column's mean.
features_clean = np.where(np.isinf(flattened_features), np.nan, flattened_features)
imputer = SimpleImputer(strategy='mean')
features_imputed = imputer.fit_transform(features_clean)

# Standardize each feature column to zero mean and unit variance.
flat_scaler = StandardScaler()
features_normalized = flat_scaler.fit_transform(features_imputed)

print("\nAfter normalization:")
print(f"  - Mel spectrograms mean: {mel_normalized.mean():.4f}, std: {mel_normalized.std():.4f}")
print(f"  - MFCCs mean: {mfcc_normalized.mean():.4f}, std: {mfcc_normalized.std():.4f}")
print(f"  - Flattened features mean: {features_normalized.mean():.4f}, std: {features_normalized.std():.4f}")


NORMALIZING FEATURES

NaN values in flattened features: 0
Inf values in flattened features: 0

After normalization:
  - Mel spectrograms mean: 0.0000, std: 1.0000
  - MFCCs mean: 0.0000, std: 1.0000
  - Flattened features mean: 0.0000, std: 1.0000


In [8]:
print("\n" + "=" * 60, "SAVING PREPROCESSED DATA", "=" * 60, sep="\n")

# Per-sample metadata table, with the genre label repeated as its own column.
metadata_df_out = pd.DataFrame(metadata_list)
metadata_df_out['label'] = labels

# Arrays written with np.save, keyed by output file name.
arrays_to_save = {
    # 2D features for the CNN
    'mel_spectrograms_raw.npy': mel_spectrograms,
    'mel_spectrograms_normalized.npy': mel_normalized,
    'mfccs_raw.npy': mfccs,
    'mfccs_normalized.npy': mfcc_normalized,
    # Flattened features (backward compatible with 2_vae_clustering.py)
    'features_raw.npy': flattened_features,
    'features_normalized.npy': features_normalized,
    # Lyrics embeddings and labels
    'lyrics_embeddings.npy': lyrics_embeddings,
    'labels.npy': labels,
}
for array_name, array_data in arrays_to_save.items():
    np.save(os.path.join(OUTPUT_PATH, array_name), array_data)

# Save metadata
metadata_df_out.to_csv(os.path.join(OUTPUT_PATH, 'metadata.csv'), index=False)

# Fitted transformers and the configuration, pickled for later reuse.
objects_to_pickle = {
    'mel_scaler.pkl': mel_scaler,
    'mfcc_scaler.pkl': mfcc_scaler,
    'flat_scaler.pkl': flat_scaler,
    'imputer.pkl': imputer,
    'tfidf_vectorizer.pkl': tfidf_vectorizer,
    'config.pkl': CONFIG,
}
for pickle_name, pickled_object in objects_to_pickle.items():
    with open(os.path.join(OUTPUT_PATH, pickle_name), 'wb') as f:
        pickle.dump(pickled_object, f)

print(f"\nFiles saved to: {OUTPUT_PATH}")
print("\nSaved files:")

# Human-readable listing of the outputs, grouped by the script that uses them.
saved_file_groups = [
    (
        "  FOR CONVOLUTIONAL VAE (3_advanced_vae_clustering.py):",
        [
            "mel_spectrograms_raw.npy",
            "mel_spectrograms_normalized.npy",
            "mfccs_raw.npy",
            "mfccs_normalized.npy",
            "lyrics_embeddings.npy",
        ],
    ),
    (
        "\n  FOR MLP-BASED VAE (2_vae_clustering.py - backward compatible):",
        [
            "features_raw.npy",
            "features_normalized.npy",
        ],
    ),
    (
        "\n  COMMON FILES:",
        [
            "labels.npy",
            "metadata.csv",
            "mel_scaler.pkl",
            "mfcc_scaler.pkl",
            "flat_scaler.pkl",
            "imputer.pkl",
            "tfidf_vectorizer.pkl",
            "config.pkl",
        ],
    ),
]
for group_heading, group_files in saved_file_groups:
    print(group_heading)
    for saved_name in group_files:
        print("    - " + saved_name)


SAVING PREPROCESSED DATA

Files saved to: f:\BRACU\Semester 12 Final\CSE425\FInal_project\processed_data2

Saved files:
  FOR CONVOLUTIONAL VAE (3_advanced_vae_clustering.py):
    - mel_spectrograms_raw.npy
    - mel_spectrograms_normalized.npy
    - mfccs_raw.npy
    - mfccs_normalized.npy
    - lyrics_embeddings.npy

  FOR MLP-BASED VAE (2_vae_clustering.py - backward compatible):
    - features_raw.npy
    - features_normalized.npy

  COMMON FILES:
    - labels.npy
    - metadata.csv
    - mel_scaler.pkl
    - mfcc_scaler.pkl
    - flat_scaler.pkl
    - imputer.pkl
    - tfidf_vectorizer.pkl
    - config.pkl
