# Enhanced Data Preprocessing for VAE Clustering

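This notebook preprocesses audio and lyrics data for the enhanced VAE clustering experiments. It prepares the inputs needed for:
- **Convolutional VAE**: Mel-spectrograms for CNN architecture
- **Hybrid Features**: Audio features + Lyrics embeddings
- **Multiple Clustering**: K-Means, Agglomerative Clustering, DBSCAN (run on the saved features; not performed in this notebook)
- **Evaluation Metrics**: Silhouette Score, Davies-Bouldin Index, Adjusted Rand Index (not computed in this notebook; the genre labels they need are saved here)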

## Output Files:
- `mel_spectrograms.npy` - Mel-spectrograms for Conv-VAE, shape (N, n_mels, time_frames), each min-max normalized to [0, 1]
- `audio_features.npy` - Standardized traditional audio features (for comparison)
- `lyrics_embeddings.npy` - Standardized lyrics embeddings (TF-IDF based)
- `hybrid_features.npy` - Combined audio + lyrics embeddings (standardized)
- `labels.npy` - Integer genre labels
- `genre_mapping.pkl` - Genre-to-label and label-to-genre mappings
- `preprocessing_info.pkl` - Fitted scalers, TF-IDF vectorizer, and preprocessing parameters


## Cell 1: Setup and Imports


In [None]:
# Install packages if needed (uncomment for Colab)
# !pip install scipy scikit-learn numpy pandas -q

import os
import numpy as np
import pandas as pd
import pickle
import warnings
warnings.filterwarnings('ignore')

from scipy.io import wavfile
from scipy import signal
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Seed NumPy's global random generator with a fixed value.
# NOTE(review): none of the visible cells draw from np.random directly --
# confirm whether a downstream step relies on this seed.
np.random.seed(42)

print("‚úÖ Libraries imported successfully!")


## Cell 2: Configuration


In [None]:
# Dataset configuration
# Each dataset directory is scanned for one sub-directory per genre, and each
# genre directory for .wav files. The metadata CSV is read for its 'ID' and
# 'lyrics' columns.
BANGLA_DATASET_DIR = r'E:\425 Project\Datasets\Bangla_Datasets'  # Change for Colab: '/content/Bangla_Datasets'
ENGLISH_DATASET_DIR = r'E:\425 Project\Datasets\English_Datasets'  # Change for Colab: '/content/English_Datasets'
METADATA_FILE = r'E:\425 Project\Datasets\updated_metadata.csv'  # Change for Colab: '/content/updated_metadata.csv'

# Audio processing parameters
TARGET_SR = 22050  # Target sample rate in Hz; every clip is resampled to this rate
DURATION = 3.0  # Clip length in seconds; audio is truncated or zero-padded to this length
N_MELS = 128  # Number of mel filterbanks for spectrogram (first axis of each saved spectrogram)
N_FFT = 2048  # FFT window size in samples
HOP_LENGTH = 512  # Hop length for STFT in samples (window overlap is N_FFT - HOP_LENGTH)

# Feature extraction parameters
# None = use all samples; otherwise only the first N files (sorted by name)
# of each genre folder are kept.
MAX_SAMPLES_PER_GENRE = None

# Lyrics embedding parameters
# Passed to TfidfVectorizer as max_features, so it is an upper bound on the
# embedding width rather than a guaranteed size.
LYRICS_EMBEDDING_DIM = 256

print("‚úÖ Configuration set!")


## Cell 3: Helper Functions


In [None]:
def load_wav_file(file_path, target_sr=22050, duration=3.0):
    """Load a WAV file as a fixed-length mono waveform.

    Integer PCM is scaled to [-1, 1], multi-channel audio is averaged to
    mono, the signal is resampled to ``target_sr`` when the file uses another
    rate, and the result is truncated or zero-padded to exactly
    ``int(target_sr * duration)`` samples.

    Args:
        file_path: Path of the WAV file to read.
        target_sr: Sample rate (Hz) of the returned signal.
        duration: Length of the returned clip in seconds.

    Returns:
        Tuple ``(audio, target_sr)``. If the file cannot be read or
        processed, the error is printed and an all-zero clip of the same
        length is returned so that a batch run keeps going.
    """
    target_length = int(target_sr * duration)
    try:
        sr, audio = wavfile.read(file_path)

        # Scale integer PCM while the on-disk dtype is still known. Averaging
        # the channels first would promote the data to float64 and the scaling
        # would silently be skipped for every stereo file.
        if audio.dtype == np.uint8:
            # 8-bit WAV data is unsigned and centred on 128.
            audio = (audio.astype(np.float32) - 128.0) / 128.0
        elif np.issubdtype(audio.dtype, np.signedinteger):
            full_scale = -float(np.iinfo(audio.dtype).min)  # 32768 for int16
            audio = audio.astype(np.float32) / full_scale

        # Convert to mono if the file has more than one channel
        if audio.ndim > 1:
            audio = np.mean(audio, axis=1)

        # Resample if needed
        if sr != target_sr:
            num_samples = int(len(audio) * target_sr / sr)
            audio = signal.resample(audio, num_samples)

        # Trim or zero-pad to the target duration
        if len(audio) > target_length:
            audio = audio[:target_length]
        else:
            audio = np.pad(audio, (0, target_length - len(audio)), mode='constant')

        return audio, target_sr
    except Exception as e:
        # Deliberate best effort: one unreadable file must not abort the run.
        print(f"Error loading {file_path}: {e}")
        return np.zeros(target_length), target_sr


def compute_mel_spectrogram(audio, sr=22050, n_mels=128, n_fft=2048, hop_length=512):
    """Compute a log-power mel-spectrogram using scipy.

    The power spectrum of a Hann-windowed STFT is pooled by ``n_mels``
    triangular filters whose edges are equally spaced on the mel scale
    between 0 Hz and the Nyquist frequency, then converted to decibels.

    Args:
        audio: 1-D waveform.
        sr: Sample rate of ``audio`` in Hz.
        n_mels: Number of mel bands (rows of the result).
        n_fft: STFT segment length in samples.
        hop_length: Step between successive STFT segments in samples.

    Returns:
        Array of shape ``(n_mels, n_frames)`` holding
        ``10 * log10(band_power + 1e-10)``; a band with no energy is -100 dB.
    """
    # STFT; freqs holds the centre frequency (Hz) of every FFT bin
    freqs, _, stft = signal.stft(
        audio, sr, nperseg=n_fft, noverlap=n_fft - hop_length, window='hann'
    )
    power = np.abs(stft) ** 2

    # Band edges equally spaced in mel (HTK formula), mapped back to Hz.
    # Spacing the edges linearly over FFT bins instead would give a
    # linear-frequency filterbank, not a mel one.
    mel_max = 2595.0 * np.log10(1.0 + (sr / 2.0) / 700.0)
    mel_edges = np.linspace(0.0, mel_max, n_mels + 2)
    hz_edges = 700.0 * (10.0 ** (mel_edges / 2595.0) - 1.0)

    # Triangular filters: rise from the lower edge to the centre, fall to the
    # upper edge, zero elsewhere. Built by broadcasting (n_mels, 1) edges
    # against the (1, n_freq_bins) frequency axis.
    lower = hz_edges[:-2, np.newaxis]
    center = hz_edges[1:-1, np.newaxis]
    upper = hz_edges[2:, np.newaxis]
    bin_freqs = freqs[np.newaxis, :]
    rising = (bin_freqs - lower) / (center - lower)
    falling = (upper - bin_freqs) / (upper - center)
    filter_bank = np.clip(np.minimum(rising, falling), 0.0, None)

    # Apply mel filterbank
    mel_power = np.dot(filter_bank, power)

    # Convert to log scale (dB); the small offset avoids log10(0)
    mel_spec_db = 10 * np.log10(mel_power + 1e-10)

    return mel_spec_db


def extract_audio_features(audio, sr=22050):
    """Extract a fixed-length vector of traditional audio features.

    Layout of the returned vector (133 values):
        * 64 values: per-band mean of a 64-band mel-spectrogram
        * 64 values: per-band standard deviation of that mel-spectrogram
        * mean and standard deviation of the STFT magnitude
        * mean spectral centroid in Hz
        * zero-crossing rate
        * mean signal energy

    Args:
        audio: 1-D waveform.
        sr: Sample rate of ``audio`` in Hz.

    Returns:
        1-D numpy array with the features in the order listed above.
    """
    features = []

    # Magnitude spectrogram (2048-sample segments, 512-sample hop)
    freqs, _, stft = signal.stft(audio, sr, nperseg=2048, noverlap=1536)
    magnitude = np.abs(stft)

    # Mel-spectrogram statistics
    mel_spec = compute_mel_spectrogram(audio, sr, n_mels=64)
    features.extend(np.mean(mel_spec, axis=1))  # Mean per mel band
    features.extend(np.std(mel_spec, axis=1))   # Std per mel band

    # Global magnitude statistics
    features.append(np.mean(magnitude))
    features.append(np.std(magnitude))

    # Spectral centroid: magnitude-weighted mean frequency of each frame,
    # averaged over frames. The small offset keeps silent frames at 0 Hz
    # instead of producing NaN, which would poison later standardization.
    frame_weight = np.sum(magnitude, axis=0)
    frame_centroid = np.sum(freqs[:, np.newaxis] * magnitude, axis=0) / (frame_weight + 1e-10)
    features.append(np.mean(frame_centroid))

    # Zero crossing rate: a full sign flip changes np.sign by 2, hence the / 2
    zcr = np.mean(np.abs(np.diff(np.sign(audio)))) / 2
    features.append(zcr)

    # Energy
    features.append(np.mean(audio ** 2))

    return np.array(features)


# Progress marker printed once the helper definitions in this cell have run.
print("‚úÖ Helper functions defined!")


## Cell 4: Load Dataset and Metadata


In [None]:
# Load metadata (for Bangla dataset)
print("Loading metadata...")
try:
    # Read the ID column as text. File IDs are taken from filenames and are
    # therefore strings; letting pandas parse numeric-looking IDs as integers
    # would make every lyrics lookup miss and would drop leading zeros.
    metadata_df = pd.read_csv(METADATA_FILE, dtype={'ID': str})
    print(f"‚úÖ Metadata loaded: {len(metadata_df)} entries")
    print(f"   Columns: {list(metadata_df.columns)}")

    # Create ID to lyrics mapping (for Bangla dataset); missing lyrics become ''
    id_to_lyrics = dict(zip(metadata_df['ID'], metadata_df['lyrics'].fillna('')))
    print(f"‚úÖ Lyrics mapping created: {len(id_to_lyrics)} entries")
except Exception as e:
    # Best effort: without metadata the run continues with no lyrics at all.
    print(f"‚ö†Ô∏è Warning: Could not load metadata file: {e}")
    id_to_lyrics = {}

# Load audio files from both datasets.
# Per-file accumulators: index i of every list describes the same audio file.
audio_files = []
file_ids = []
labels = []
dataset_types = []  # Track which dataset each file comes from
genre_to_label = {}
label_to_genre = {}


def _collect_dataset(dataset_dir, dataset_name, first_label, excluded_genres=()):
    """Register every genre folder of one dataset and queue its WAV files.

    Appends to the module-level accumulators (``audio_files``, ``file_ids``,
    ``labels``, ``dataset_types``) and extends ``genre_to_label`` and
    ``label_to_genre``. Genres are processed in sorted order and receive
    consecutive integer labels starting at ``first_label``.

    Args:
        dataset_dir: Directory whose sub-directories are genre folders.
        dataset_name: Tag stored in ``dataset_types`` for each queued file.
        first_label: Integer label given to the first genre.
        excluded_genres: Genre folder names to skip.

    Returns:
        Tuple ``(genres, next_label)``: the genre names that were processed
        and the first integer label that is still unused.
    """
    genres = sorted(
        d for d in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, d))
    )
    for skipped in excluded_genres:
        if skipped in genres:
            genres.remove(skipped)
            print(f"‚ö†Ô∏è  Excluded '{skipped}' genre (no lyrics available)")

    next_label = first_label
    for genre in genres:
        if genre in genre_to_label:
            # NOTE(review): a genre folder name shared by two datasets gets
            # two distinct labels, but genre_to_label can only keep the later
            # one. Confirm whether shared names should share a label instead.
            print(f"WARNING: genre name '{genre}' appears in more than one dataset; "
                  f"genre_to_label keeps only the {dataset_name} label")
        genre_to_label[genre] = next_label
        label_to_genre[next_label] = genre

        genre_path = os.path.join(dataset_dir, genre)
        # Compare the extension case-insensitively so '.WAV' files are kept.
        files = sorted(f for f in os.listdir(genre_path) if f.lower().endswith('.wav'))

        if MAX_SAMPLES_PER_GENRE:
            files = files[:MAX_SAMPLES_PER_GENRE]

        for file in files:
            audio_files.append(os.path.join(genre_path, file))
            file_ids.append(os.path.splitext(file)[0])  # Remove .wav extension
            labels.append(next_label)
            dataset_types.append(dataset_name)

        print(f"  {genre}: {len(files)} files")
        next_label += 1

    return genres, next_label


# Process Bangla dataset
print("\n" + "=" * 60)
print("Loading BANGLA dataset...")
print("=" * 60)

bangla_genres, label_offset = _collect_dataset(BANGLA_DATASET_DIR, 'bangla', 0)

# Process English dataset
print("\n" + "=" * 60)
print("Loading ENGLISH dataset...")
print("=" * 60)

# The jazz genre is left out because no lyrics are available for it.
english_genres, label_offset = _collect_dataset(
    ENGLISH_DATASET_DIR, 'english', label_offset, excluded_genres=('jazz',)
)

# Freeze the per-file lists as arrays so they can be compared element-wise.
labels = np.array(labels)
dataset_types = np.array(dataset_types)

# Gather the figures first, then print the report in one go.
summary_rule = "=" * 60
bangla_count = np.sum(dataset_types == 'bangla')
english_count = np.sum(dataset_types == 'english')
genre_names = ', '.join(sorted(genre_to_label))

print("\n" + summary_rule)
print("DATASET SUMMARY")
print(summary_rule)
print(f"‚úÖ Total samples: {len(audio_files)}")
print(f"   - Bangla: {bangla_count}")
print(f"   - English: {english_count}")
print(f"‚úÖ Number of genres: {len(genre_to_label)}")
print(f"‚úÖ Genres: {genre_names}")


## Cell 5: Extract Mel-Spectrograms (for Conv-VAE)


In [None]:
print("Extracting mel-spectrograms...")
total_files = len(audio_files)
mel_spectrograms = []

for file_number, file_path in enumerate(audio_files, start=1):
    # Report progress every 50 files
    if file_number % 50 == 0:
        print(f"  Processed {file_number}/{total_files} files...")

    audio, sr = load_wav_file(file_path, TARGET_SR, DURATION)
    mel_spec = compute_mel_spectrogram(audio, sr, N_MELS, N_FFT, HOP_LENGTH)

    # Min-max scale this spectrogram to [0, 1]; the small offset guards
    # against a zero range when the spectrogram is constant.
    spec_min = mel_spec.min()
    spec_max = mel_spec.max()
    mel_spec = (mel_spec - spec_min) / (spec_max - spec_min + 1e-8)

    mel_spectrograms.append(mel_spec)

# Stack the per-file spectrograms into one array
mel_spectrograms = np.array(mel_spectrograms)
print(f"\n‚úÖ Mel-spectrograms extracted!")
print(f"   Shape: {mel_spectrograms.shape} (samples, n_mels, time_frames)")
print(f"   Min: {mel_spectrograms.min():.4f}, Max: {mel_spectrograms.max():.4f}")


## Cell 6: Extract Audio Features


In [None]:
print("Extracting audio features...")
total_files = len(audio_files)
audio_features_list = []

for file_number, file_path in enumerate(audio_files, start=1):
    # Report progress every 50 files
    if file_number % 50 == 0:
        print(f"  Processed {file_number}/{total_files} files...")

    audio, sr = load_wav_file(file_path, TARGET_SR, DURATION)
    audio_features_list.append(extract_audio_features(audio, sr))

# One row per file, one column per feature
audio_features = np.array(audio_features_list)
print(f"\n‚úÖ Audio features extracted!")
print(f"   Shape: {audio_features.shape} (samples, features)")

# Standardize every feature column; the fitted scaler is kept so it can be
# stored alongside the data.
audio_scaler = StandardScaler()
audio_features_scaled = audio_scaler.fit_transform(audio_features)
print(f"‚úÖ Audio features standardized!")


## Cell 7: Extract Lyrics Embeddings


In [None]:
# Get lyrics for each file (only available for Bangla dataset)
print("Extracting lyrics for each file...")
# English files have no lyrics metadata, so they get an empty document.
lyrics_list = [
    id_to_lyrics.get(file_id, '') if dataset == 'bangla' else ''
    for file_id, dataset in zip(file_ids, dataset_types)
]

print(f"‚úÖ Lyrics extracted for {len(lyrics_list)} files")
files_with_lyrics = sum(1 for text in lyrics_list if text and text.strip())
print(f"   Files with lyrics: {files_with_lyrics} (Bangla only)")

# Create TF-IDF embeddings
print("\nCreating TF-IDF embeddings...")
vectorizer = TfidfVectorizer(
    max_features=LYRICS_EMBEDDING_DIM,
    ngram_range=(1, 2),  # Unigrams and bigrams
    min_df=2,  # Minimum document frequency
    stop_words=None,  # Keep all words (multilingual)
    # The default pattern is built on \w, which does not match Bengali
    # dependent vowel signs and other combining marks, so Bangla words would
    # be cut into fragments. Accept the whole Bengali block as token
    # characters; for text made only of \w characters the tokens are the
    # same as with the default pattern.
    token_pattern=r'(?u)[\w\u0980-\u09FF]{2,}',
)

try:
    lyrics_embeddings = vectorizer.fit_transform(lyrics_list).toarray()
    vocabulary_size = len(vectorizer.vocabulary_)
except ValueError as e:
    # Raised when no usable term exists, e.g. the metadata file could not be
    # loaded and every document is empty. Fall back to all-zero embeddings so
    # the remaining cells still run; the vectorizer stays unfitted.
    print(f"WARNING: could not build TF-IDF vocabulary ({e}); using zero embeddings")
    lyrics_embeddings = np.zeros((len(lyrics_list), LYRICS_EMBEDDING_DIM))
    vocabulary_size = 0

print(f"‚úÖ Lyrics embeddings created!")
print(f"   Shape: {lyrics_embeddings.shape} (samples, embedding_dim)")
print(f"   Vocabulary size: {vocabulary_size}")
print(f"   Note: English files will have zero embeddings (no lyrics metadata)")


## Cell 8: Create Hybrid Features


In [None]:
# Build the hybrid representation: audio features next to lyrics embeddings
print("Creating hybrid features (audio + lyrics)...")

# Bring the lyrics embeddings onto the same scale as the audio features
lyrics_scaler = StandardScaler()
lyrics_embeddings_scaled = lyrics_scaler.fit_transform(lyrics_embeddings)

# Join the two blocks column-wise: audio columns first, lyrics columns after
hybrid_features = np.concatenate(
    [audio_features_scaled, lyrics_embeddings_scaled], axis=1
)

# Standardize the joined matrix; the fitted scaler is kept for saving
hybrid_scaler = StandardScaler()
hybrid_features_scaled = hybrid_scaler.fit_transform(hybrid_features)

audio_dim = audio_features_scaled.shape[1]
lyrics_dim = lyrics_embeddings_scaled.shape[1]
hybrid_dim = hybrid_features_scaled.shape[1]

print(f"‚úÖ Hybrid features created!")
print(f"   Audio features dimension: {audio_dim}")
print(f"   Lyrics embeddings dimension: {lyrics_dim}")
print(f"   Hybrid features dimension: {hybrid_dim}")
print(f"   Shape: {hybrid_features_scaled.shape} (samples, features)")


## Cell 9: Save Preprocessed Data


In [None]:
# Write every preprocessed artifact to the current working directory
print("Saving preprocessed data...")

# NumPy arrays, saved in this order: mel-spectrograms, scaled audio features,
# scaled lyrics embeddings, scaled hybrid features, labels
arrays_to_save = [
    ('mel_spectrograms.npy', mel_spectrograms),
    ('audio_features.npy', audio_features_scaled),
    ('lyrics_embeddings.npy', lyrics_embeddings_scaled),
    ('hybrid_features.npy', hybrid_features_scaled),
    ('labels.npy', labels),
]
for array_filename, array_data in arrays_to_save:
    np.save(array_filename, array_data)
    print(f"‚úÖ Saved: {array_filename}")

# Genre mapping in both directions
genre_mapping = {'genre_to_label': genre_to_label, 'label_to_genre': label_to_genre}
with open('genre_mapping.pkl', 'wb') as f:
    pickle.dump(genre_mapping, f)
print("‚úÖ Saved: genre_mapping.pkl")

# Fitted scalers and vectorizer together with the parameters used above
preprocessing_info = {
    'audio_scaler': audio_scaler,
    'lyrics_scaler': lyrics_scaler,
    'hybrid_scaler': hybrid_scaler,
    'lyrics_vectorizer': vectorizer,
    'n_mels': N_MELS,
    'n_fft': N_FFT,
    'hop_length': HOP_LENGTH,
    'target_sr': TARGET_SR,
    'duration': DURATION,
    'audio_feature_dim': audio_features_scaled.shape[1],
    'lyrics_embedding_dim': lyrics_embeddings_scaled.shape[1],
    'hybrid_feature_dim': hybrid_features_scaled.shape[1]
}

with open('preprocessing_info.pkl', 'wb') as f:
    pickle.dump(preprocessing_info, f)
print("‚úÖ Saved: preprocessing_info.pkl")

# Closing report listing every file that was written
closing_rule = "=" * 60
print("\n" + closing_rule)
print("üìÅ ALL PREPROCESSED DATA SAVED SUCCESSFULLY!")
print(closing_rule)
print("\nFiles created:")
file_descriptions = [
    "mel_spectrograms.npy - For Convolutional VAE",
    "audio_features.npy - Traditional audio features",
    "lyrics_embeddings.npy - Lyrics TF-IDF embeddings",
    "hybrid_features.npy - Audio + Lyrics combined",
    "labels.npy - Genre labels",
    "genre_mapping.pkl - Genre mappings",
    "preprocessing_info.pkl - Preprocessing metadata",
]
for position, description in enumerate(file_descriptions, start=1):
    print(f"  {position}. {description}")
print("\n‚úÖ Ready for enhanced VAE clustering!")
