# Exploratory Data Analysis (EDA)
## Music Genre Classification Dataset

This notebook provides a comprehensive exploratory data analysis of the music genre classification dataset, including:
- **Dataset Overview**: File counts, languages, genres
- **Metadata Analysis**: Language/genre distributions, lyrics analysis
- **Audio File Analysis**: Sample rates, durations, file properties
- **Feature Analysis**: Mel-spectrograms, audio features, lyrics embeddings
- **Visualizations**: Distributions, correlations, sample spectrograms
- **Data Quality Checks**: Missing values, duplicates, outliers


## 1. Setup and Imports


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
from scipy.io import wavfile
from collections import Counter
import pickle

# Silence library warnings so the notebook output stays readable.
# NOTE(review): this is a blanket filter, so deprecation notices are hidden too.
warnings.filterwarnings('ignore')

# Set style
# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'.
# Older releases only know the un-prefixed name and raise OSError for an
# unknown style, so fall back instead of failing in the very first cell.
try:
    plt.style.use('seaborn-v0_8-darkgrid')
except OSError:
    plt.style.use('seaborn-darkgrid')
sns.set_palette("husl")

# Configuration
# Absolute, machine-specific locations of the raw audio folders, the metadata
# CSV and the output folder. Edit these when running on another machine.
BANGLA_DATASET_DIR = r'E:\425 Project\Datasets\Bangla_Datasets'
ENGLISH_DATASET_DIR = r'E:\425 Project\Datasets\English_Datasets'
METADATA_FILE = r'E:\425 Project\Datasets\updated_metadata.csv'
RESULTS_DIR = r'E:\425 Project\results'

# Create results directory if it doesn't exist
os.makedirs(RESULTS_DIR, exist_ok=True)

print("‚úÖ Libraries imported successfully!")
print(f"üìÅ Results will be saved to: {RESULTS_DIR}")


## 2. Dataset Overview


In [None]:
def _scan_dataset(dataset_dir):
    """Return ``(genres, file_counts)`` for one dataset directory.

    Every immediate sub-directory of *dataset_dir* is treated as a genre.
    ``genres`` is the sorted list of those directory names and ``file_counts``
    maps each genre to the number of entries whose name ends in '.wav'.
    """
    genres = sorted(
        entry for entry in os.listdir(dataset_dir)
        if os.path.isdir(os.path.join(dataset_dir, entry))
    )
    file_counts = {
        genre: sum(
            1 for name in os.listdir(os.path.join(dataset_dir, genre))
            if name.endswith('.wav')
        )
        for genre in genres
    }
    return genres, file_counts


def _print_dataset_summary(title, genres, file_counts):
    """Print genre names, the total file count and per-genre counts."""
    print(f"\nüìä {title} DATASET:")
    print(f"   Total genres: {len(genres)}")
    print(f"   Genres: {', '.join(genres)}")
    print(f"   Total files: {sum(file_counts.values())}")
    for genre, count in sorted(file_counts.items()):
        print(f"   - {genre}: {count} files")


# Load metadata
print("Loading metadata...")
metadata_df = pd.read_csv(METADATA_FILE)
print(f"‚úÖ Metadata loaded: {len(metadata_df)} entries")
print(f"\nMetadata columns: {list(metadata_df.columns)}")
print(f"\nFirst few rows:")
print(metadata_df.head())

# Get dataset structure
print("\n" + "="*60)
print("DATASET STRUCTURE")
print("="*60)

# Both datasets share the same <dataset>/<genre>/<file>.wav layout, so a
# single helper scans them; the resulting names are reused by later cells.
bangla_genres, bangla_file_counts = _scan_dataset(BANGLA_DATASET_DIR)
english_genres, english_file_counts = _scan_dataset(ENGLISH_DATASET_DIR)

_print_dataset_summary("BANGLA", bangla_genres, bangla_file_counts)
_print_dataset_summary("ENGLISH", english_genres, english_file_counts)

print(f"\nüìä OVERALL SUMMARY:")
total_files = sum(bangla_file_counts.values()) + sum(english_file_counts.values())
# A genre name present in both datasets is counted once.
total_genres = len(set(bangla_genres) | set(english_genres))
print(f"   Total audio files: {total_files}")
print(f"   Total unique genres: {total_genres}")
print(f"   Languages: Bangla (bn), English (en)")


## 3. Metadata Analysis


In [None]:
# Print a quick profile of the metadata table: size, columns, dtypes,
# per-column missing counts and pandas' describe() summary.
banner = "=" * 60
print(banner)
print("METADATA STATISTICS")
print(banner)

print("\nüìã Dataset Info:")
print(f"   Total entries: {len(metadata_df)}")
print(f"   Columns: {list(metadata_df.columns)}")

# Each section is a heading line followed by the pandas object on the next line.
print("\nüìã Data Types:", metadata_df.dtypes, sep="\n")
print("\nüìã Missing Values:", metadata_df.isnull().sum(), sep="\n")
print("\nüìã Basic Statistics:", metadata_df.describe(include='all'), sep="\n")


In [None]:
# Language distribution: print counts/percentages, then draw a pie chart and
# a bar chart side by side and save the figure to RESULTS_DIR.
rule = "=" * 60
print(f"\n{rule}")
print("LANGUAGE DISTRIBUTION")
print(rule)

language_counts = metadata_df['language'].value_counts()
n_entries = len(metadata_df)
print("\nLanguage counts:")
for lang, count in language_counts.items():
    share = (count / n_entries) * 100
    print(f"   {lang}: {count} ({share:.2f}%)")

# One colour per language, shared by both charts.
language_palette = sns.color_palette("husl", len(language_counts))

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, bar_ax = axes

# Left panel: pie chart
pie_ax.pie(
    language_counts.values,
    labels=language_counts.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=language_palette,
)
pie_ax.set_title('Language Distribution (Pie Chart)', fontsize=14, fontweight='bold')

# Right panel: bar chart with the count written above each bar
bar_ax.bar(language_counts.index, language_counts.values, color=language_palette)
bar_ax.set_title('Language Distribution (Bar Chart)', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Language', fontsize=12)
bar_ax.set_ylabel('Count', fontsize=12)
bar_ax.grid(axis='y', alpha=0.3)

for position, value in enumerate(language_counts.values):
    bar_ax.text(position, value + 10, str(value), ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
language_plot_path = os.path.join(RESULTS_DIR, 'language_distribution.png')
plt.savefig(language_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Language distribution plot saved!")


In [None]:
# Genre distribution: print counts/percentages, then draw a horizontal bar
# chart of every genre next to a pie chart of the ten most frequent ones.
separator = "=" * 60
print(f"\n{separator}")
print("GENRE DISTRIBUTION")
print(separator)

genre_counts = metadata_df['genre'].value_counts()
total_entries = len(metadata_df)
print("\nGenre counts:")
for genre, count in genre_counts.items():
    share = (count / total_entries) * 100
    print(f"   {genre}: {count} ({share:.2f}%)")

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Left panel: horizontal bars, one per genre, annotated with the raw count
bar_ax.barh(
    genre_counts.index,
    genre_counts.values,
    color=sns.color_palette("husl", len(genre_counts)),
)
bar_ax.set_title('Genre Distribution (Bar Chart)', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Count', fontsize=12)
bar_ax.set_ylabel('Genre', fontsize=12)
bar_ax.grid(axis='x', alpha=0.3)

for row, value in enumerate(genre_counts.values):
    bar_ax.text(value + 5, row, str(value), va='center', fontweight='bold')

# Right panel: pie chart restricted to the ten largest genres
# (value_counts() is already sorted in descending order).
top_genres = genre_counts.head(10)
pie_ax.pie(
    top_genres.values,
    labels=top_genres.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=sns.color_palette("husl", len(top_genres)),
)
pie_ax.set_title('Top 10 Genres Distribution', fontsize=14, fontweight='bold')

plt.tight_layout()
genre_plot_path = os.path.join(RESULTS_DIR, 'genre_distribution.png')
plt.savefig(genre_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Genre distribution plot saved!")


In [None]:
# Genre by Language: cross-tabulate the two metadata columns, print the
# table and save it as an annotated heatmap.
divider = "=" * 60
print(f"\n{divider}")
print("GENRE BY LANGUAGE")
print(divider)

genre_language = pd.crosstab(metadata_df['genre'], metadata_df['language'])
print("\nGenre-Language Cross-tabulation:")
print(genre_language)

# Heatmap: rows are genres, columns are languages, cells hold entry counts.
fig, ax = plt.subplots(figsize=(14, 8))
heatmap_options = {
    'annot': True,
    'fmt': 'd',
    'cmap': 'YlOrRd',
    'cbar_kws': {'label': 'Count'},
}
sns.heatmap(genre_language, ax=ax, **heatmap_options)
ax.set_title('Genre Distribution by Language (Heatmap)', fontsize=14, fontweight='bold')
ax.set_xlabel('Language', fontsize=12)
ax.set_ylabel('Genre', fontsize=12)
plt.tight_layout()
heatmap_path = os.path.join(RESULTS_DIR, 'genre_language_heatmap.png')
plt.savefig(heatmap_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Genre-Language heatmap saved!")


## 4. Lyrics Analysis (Bangla Dataset)


In [None]:
# Lyrics statistics for the Bangla ('bn') rows of the metadata: character and
# word counts, printed summaries, and a 2x2 figure of histograms and boxplots.
banner = "=" * 60
print(banner)
print("LYRICS ANALYSIS")
print(banner)

# Work on a copy so the derived columns do not leak into metadata_df.
bangla_metadata = metadata_df[metadata_df['language'] == 'bn'].copy()

# Missing lyrics are treated as empty strings, i.e. length 0 and 0 words.
lyrics_text = bangla_metadata['lyrics'].fillna('')
bangla_metadata['lyrics_length'] = lyrics_text.str.len()
bangla_metadata['lyrics_word_count'] = lyrics_text.str.split().str.len()
bangla_metadata['has_lyrics'] = lyrics_text.str.len() > 0

has_lyrics = bangla_metadata['has_lyrics']
print("\nüìù Lyrics Statistics:")
print(f"   Total Bangla entries: {len(bangla_metadata)}")
print(f"   Entries with lyrics: {has_lyrics.sum()}")
print(f"   Entries without lyrics: {(~has_lyrics).sum()}")

print("\nüìù Lyrics Length Statistics:")
print(bangla_metadata['lyrics_length'].describe())

print("\nüìù Word Count Statistics:")
print(bangla_metadata['lyrics_word_count'].describe())

fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Top row: histograms of the two length measures.
histogram_panels = [
    (axes[0, 0], 'lyrics_length', 'skyblue',
     'Distribution of Lyrics Length (Characters)', 'Character Count'),
    (axes[0, 1], 'lyrics_word_count', 'lightcoral',
     'Distribution of Word Count', 'Word Count'),
]
for panel, column, colour, title, x_label in histogram_panels:
    panel.hist(bangla_metadata[column], bins=50, color=colour, edgecolor='black', alpha=0.7)
    panel.set_title(title, fontsize=12, fontweight='bold')
    panel.set_xlabel(x_label, fontsize=10)
    panel.set_ylabel('Frequency', fontsize=10)
    panel.grid(alpha=0.3)

# Bottom row: per-genre boxplots, drawn only when there is more than one
# genre to compare. Both plots share one genre order (median character
# count, descending).
if len(bangla_metadata['genre'].unique()) > 1:
    median_length = bangla_metadata.groupby('genre')['lyrics_length'].median()
    genre_order = median_length.sort_values(ascending=False).index

    boxplot_panels = [
        (axes[1, 0], 'lyrics_length', 'Lyrics Length by Genre', 'Character Count'),
        (axes[1, 1], 'lyrics_word_count', 'Word Count by Genre', 'Word Count'),
    ]
    for panel, column, title, y_label in boxplot_panels:
        sns.boxplot(data=bangla_metadata, x='genre', y=column, order=genre_order, ax=panel)
        panel.set_title(title, fontsize=12, fontweight='bold')
        panel.set_xlabel('Genre', fontsize=10)
        panel.set_ylabel(y_label, fontsize=10)
        panel.tick_params(axis='x', rotation=45)
        panel.grid(alpha=0.3)

plt.tight_layout()
lyrics_plot_path = os.path.join(RESULTS_DIR, 'lyrics_analysis.png')
plt.savefig(lyrics_plot_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Lyrics analysis plots saved!")


## 5. Audio File Analysis


In [None]:
def _collect_audio_properties(dataset_dir, genres, language, sample_size, max_genres=3):
    """Read basic WAV properties for a bounded sample of one dataset.

    Only the first *max_genres* entries of *genres* are visited and at most
    ``sample_size // max_genres`` '.wav' files are tried per genre, so the
    result never holds more than *sample_size* records.

    Args:
        dataset_dir: Directory containing one sub-directory per genre.
        genres: Genre (sub-directory) names to sample from, in order.
        language: Language code stored in every record (e.g. 'bn' or 'en').
        sample_size: Upper bound on the number of records returned.
        max_genres: Number of leading genres to sample from.

    Returns:
        A list of dicts with the keys 'file', 'language', 'genre',
        'sample_rate', 'duration', 'file_size_mb', 'channels' and 'samples'.
        Files that cannot be read are reported on stdout and skipped.
    """
    per_genre = sample_size // max_genres
    records = []
    for genre in genres[:max_genres]:
        if len(records) >= sample_size:
            break
        genre_path = os.path.join(dataset_dir, genre)
        # os.listdir returns names in arbitrary order; sorting makes the
        # sampled subset identical from run to run.
        wav_names = sorted(
            name for name in os.listdir(genre_path) if name.endswith('.wav')
        )[:per_genre]

        for name in wav_names:
            if len(records) >= sample_size:
                break
            file_path = os.path.join(genre_path, name)
            # Best effort by design: one unreadable or malformed file must not
            # abort the whole survey, so any failure is reported and skipped.
            try:
                sr, audio = wavfile.read(file_path)
                # The first axis is time for both mono (1-D) and multi-channel
                # (2-D) data.
                n_samples = audio.shape[0]
                records.append({
                    'file': name,
                    'language': language,
                    'genre': genre,
                    'sample_rate': sr,
                    'duration': n_samples / sr,  # seconds
                    'file_size_mb': os.path.getsize(file_path) / (1024 * 1024),
                    'channels': 1 if audio.ndim == 1 else audio.shape[1],
                    'samples': n_samples,
                })
            except Exception as e:
                print(f"   Error reading {file_path}: {e}")
    return records


# Analyze audio file properties
print("="*60)
print("AUDIO FILE ANALYSIS")
print("="*60)

# Sample a subset of files for analysis (to avoid long processing time).
# This is an upper bound per dataset: the first 3 genres contribute at most
# sample_size // 3 files each.
sample_size = 100
print(f"\nüìä Analyzing audio file properties (sampling {sample_size} files per dataset)...")

bangla_properties = _collect_audio_properties(
    BANGLA_DATASET_DIR, bangla_genres, 'bn', sample_size)
english_properties = _collect_audio_properties(
    ENGLISH_DATASET_DIR, english_genres, 'en', sample_size)

# Kept as notebook-level names for any cell that reports the sample sizes.
bangla_files_analyzed = len(bangla_properties)
english_files_analyzed = len(english_properties)
audio_properties = bangla_properties + english_properties

audio_df = pd.DataFrame(audio_properties)
print(f"\n‚úÖ Analyzed {len(audio_df)} audio files")
if audio_df.empty:
    # describe() raises ValueError on a DataFrame without columns, which is
    # what an empty record list produces.
    print("   No audio files could be read; skipping the properties summary.")
else:
    print(f"\nüìä Audio Properties Summary:")
    print(audio_df.describe())


In [None]:
# Visualize audio properties in a 2x3 grid: three histograms on the top row,
# two per-language boxplots and a channel-count bar chart on the bottom row.
# Skipped entirely when no audio file was analysed.
if len(audio_df) > 0:
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))

    # Top row: (column, bins, colour, title, x-axis label)
    histogram_specs = [
        ('sample_rate', 20, 'steelblue', 'Sample Rate Distribution', 'Sample Rate (Hz)'),
        ('duration', 30, 'coral', 'Duration Distribution', 'Duration (seconds)'),
        ('file_size_mb', 30, 'mediumseagreen', 'File Size Distribution', 'File Size (MB)'),
    ]
    for panel, (column, n_bins, colour, title, x_label) in zip(axes[0], histogram_specs):
        panel.hist(audio_df[column], bins=n_bins, color=colour, edgecolor='black', alpha=0.7)
        panel.set_title(title, fontsize=12, fontweight='bold')
        panel.set_xlabel(x_label, fontsize=10)
        panel.set_ylabel('Frequency', fontsize=10)
        panel.grid(alpha=0.3)

    # Bottom row, first two panels: (column, title, y-axis label)
    boxplot_specs = [
        ('sample_rate', 'Sample Rate by Language', 'Sample Rate (Hz)'),
        ('duration', 'Duration by Language', 'Duration (seconds)'),
    ]
    for panel, (column, title, y_label) in zip(axes[1], boxplot_specs):
        sns.boxplot(data=audio_df, x='language', y=column, ax=panel)
        panel.set_title(title, fontsize=12, fontweight='bold')
        panel.set_xlabel('Language', fontsize=10)
        panel.set_ylabel(y_label, fontsize=10)
        panel.grid(alpha=0.3)

    # Bottom-right panel: how many files have each channel count.
    # The index is cast to str so the bars are treated as categories.
    channels_panel = axes[1, 2]
    channel_counts = audio_df['channels'].value_counts()
    channels_panel.bar(
        channel_counts.index.astype(str),
        channel_counts.values,
        color='plum',
        edgecolor='black',
        alpha=0.7,
    )
    channels_panel.set_title('Audio Channels Distribution', fontsize=12, fontweight='bold')
    channels_panel.set_xlabel('Number of Channels', fontsize=10)
    channels_panel.set_ylabel('Frequency', fontsize=10)
    channels_panel.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    audio_plot_path = os.path.join(RESULTS_DIR, 'audio_properties_analysis.png')
    plt.savefig(audio_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("\n‚úÖ Audio properties plots saved!")


## 6. Preprocessed Features Analysis


In [None]:
# Look for the preprocessed .npy feature files in the current working
# directory and load whichever ones are present into `loaded_features`.
header_rule = "=" * 60
print(header_rule)
print("PREPROCESSED FEATURES ANALYSIS")
print(header_rule)

# Feature name -> file name expected in the working directory.
preprocessed_files = dict(
    mel_spectrograms='mel_spectrograms.npy',
    audio_features='audio_features.npy',
    lyrics_embeddings='lyrics_embeddings.npy',
    hybrid_features='hybrid_features.npy',
    labels='labels.npy',
)

loaded_features = {}
for name, filename in preprocessed_files.items():
    filepath = os.path.join('.', filename)

    if not os.path.exists(filepath):
        print(f"‚ö†Ô∏è  File not found: {filename}")
        continue

    # A file that exists but cannot be loaded is reported, not fatal.
    try:
        loaded_features[name] = np.load(filepath)
        print(f"‚úÖ Loaded {name}: shape {loaded_features[name].shape}")
    except Exception as e:
        print(f"‚ö†Ô∏è  Could not load {filename}: {e}")

if loaded_features:
    print(f"\n‚úÖ Loaded {len(loaded_features)} preprocessed feature files")
else:
    print("\n‚ö†Ô∏è  No preprocessed features found. Run data_preprocessing.ipynb first.")


In [None]:
# Analyze mel-spectrograms: print global value statistics and save a 2x3
# figure of sample spectrograms, one per genre label where labels exist.
if 'mel_spectrograms' in loaded_features:
    mel_specs = loaded_features['mel_spectrograms']
    print("\n" + "="*60)
    print("MEL-SPECTROGRAM ANALYSIS")
    print("="*60)
    print(f"Shape: {mel_specs.shape} (samples, n_mels, time_frames)")
    print(f"Min value: {mel_specs.min():.4f}")
    print(f"Max value: {mel_specs.max():.4f}")
    print(f"Mean value: {mel_specs.mean():.4f}")
    print(f"Std value: {mel_specs.std():.4f}")

    # Visualize sample mel-spectrograms
    fig, axes = plt.subplots(2, 3, figsize=(16, 10))

    # Labels are optional. Without them the panels are still drawn from
    # random samples, rather than saving a figure of six empty axes.
    labels = loaded_features.get('labels')
    unique_labels = np.unique(labels) if labels is not None else []
    # A per-sample label lookup only makes sense for a flat label vector.
    can_lookup_label = labels is not None and labels.ndim == 1

    # Load genre mapping if available
    genre_mapping = {}
    if labels is not None and os.path.exists('genre_mapping.pkl'):
        # NOTE(review): pickle.load can execute arbitrary code; only open a
        # genre_mapping.pkl produced by this project's own preprocessing.
        with open('genre_mapping.pkl', 'rb') as f:
            mapping = pickle.load(f)
        genre_mapping = mapping.get('label_to_genre', {})

    for idx, ax in enumerate(axes.flat):
        label = None
        sample_idx = None

        # Prefer one panel per distinct label, in sorted label order.
        if idx < len(unique_labels):
            label_indices = np.where(labels == unique_labels[idx])[0]
            if len(label_indices) > 0:
                label = unique_labels[idx]
                sample_idx = np.random.choice(label_indices)

        # Fewer labels than panels (or no labels at all): fill the panel with
        # a random spectrogram and title it with its real label when known.
        if sample_idx is None:
            sample_idx = np.random.randint(0, len(mel_specs))
            if can_lookup_label and sample_idx < len(labels):
                label = labels[sample_idx]

        if label is None:
            genre_name = 'Unknown'
        else:
            genre_name = genre_mapping.get(label, f'Label {label}')

        spec = mel_specs[sample_idx]
        im = ax.imshow(spec, aspect='auto', origin='lower', cmap='viridis')
        # Titles show a 1-based sample number.
        ax.set_title(f'Sample {sample_idx + 1}\n{genre_name}', fontsize=10, fontweight='bold')
        ax.set_xlabel('Time Frames', fontsize=9)
        ax.set_ylabel('Mel Bands', fontsize=9)
        plt.colorbar(im, ax=ax, fraction=0.046)

    plt.tight_layout()
    plt.savefig(os.path.join(RESULTS_DIR, 'sample_mel_spectrograms.png'), dpi=300, bbox_inches='tight')
    plt.show()

    print(f"\n‚úÖ Sample mel-spectrograms plot saved!")


In [None]:
# Analyze the audio feature matrix: global statistics, per-feature summary,
# and a 2x2 figure (mean/std histograms, correlation matrix, top variances).
if 'audio_features' in loaded_features:
    audio_feat = loaded_features['audio_features']
    print("\n" + "=" * 60)
    print("AUDIO FEATURES ANALYSIS")
    print("=" * 60)
    print(f"Shape: {audio_feat.shape} (samples, features)")
    print(f"Min value: {audio_feat.min():.4f}")
    print(f"Max value: {audio_feat.max():.4f}")
    print(f"Mean value: {audio_feat.mean():.4f}")
    print(f"Std value: {audio_feat.std():.4f}")

    # One DataFrame column per feature.
    feat_df = pd.DataFrame(audio_feat)
    print("\nüìä Feature Statistics:")
    print(feat_df.describe())

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # Top row: histograms of the per-feature mean and standard deviation.
    mean_features = feat_df.mean(axis=0)
    std_features = feat_df.std(axis=0)
    summary_panels = [
        (axes[0, 0], mean_features, 'steelblue',
         'Distribution of Mean Feature Values', 'Mean Feature Value'),
        (axes[0, 1], std_features, 'coral',
         'Distribution of Std Feature Values', 'Std Feature Value'),
    ]
    for panel, values, colour, title, x_label in summary_panels:
        panel.hist(values, bins=30, color=colour, edgecolor='black', alpha=0.7)
        panel.set_title(title, fontsize=12, fontweight='bold')
        panel.set_xlabel(x_label, fontsize=10)
        panel.set_ylabel('Frequency', fontsize=10)
        panel.grid(alpha=0.3)

    # Bottom-left: correlation between features. With more than 50 features
    # a random subset of 20 is used so the matrix stays readable.
    if audio_feat.shape[1] <= 50:
        corr_source = audio_feat
        corr_title = 'Feature Correlation Matrix'
    else:
        sample_indices = np.random.choice(audio_feat.shape[1], 20, replace=False)
        corr_source = audio_feat[:, sample_indices]
        corr_title = 'Feature Correlation Matrix (20 sampled features)'
    corr_matrix = np.corrcoef(corr_source.T)
    corr_panel = axes[1, 0]
    im = corr_panel.imshow(corr_matrix, cmap='coolwarm', aspect='auto', vmin=-1, vmax=1)
    corr_panel.set_title(corr_title, fontsize=12, fontweight='bold')
    plt.colorbar(im, ax=corr_panel)

    # Bottom-right: the 20 features with the largest variance.
    feature_variance = feat_df.var(axis=0)
    top_var_indices = feature_variance.nlargest(20).index
    bar_positions = range(len(top_var_indices))
    variance_panel = axes[1, 1]
    variance_panel.barh(
        bar_positions,
        feature_variance[top_var_indices],
        color='mediumseagreen',
        edgecolor='black',
        alpha=0.7,
    )
    variance_panel.set_yticks(bar_positions)
    variance_panel.set_yticklabels([f'Feature {i}' for i in top_var_indices])
    variance_panel.set_title('Top 20 Features by Variance', fontsize=12, fontweight='bold')
    variance_panel.set_xlabel('Variance', fontsize=10)
    variance_panel.grid(axis='x', alpha=0.3)

    plt.tight_layout()
    features_plot_path = os.path.join(RESULTS_DIR, 'audio_features_analysis.png')
    plt.savefig(features_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("\n‚úÖ Audio features analysis plots saved!")


In [None]:
# Analyze the label vector: print how often each label occurs and save a
# bar chart plus pie chart of the distribution.
if 'labels' in loaded_features:
    labels = loaded_features['labels']
    print("\n" + "=" * 60)
    print("LABELS ANALYSIS")
    print("=" * 60)

    unique_labels, counts = np.unique(labels, return_counts=True)
    n_samples = len(labels)
    n_classes = len(unique_labels)
    print(f"Total samples: {n_samples}")
    print(f"Number of unique labels: {n_classes}")
    print("\nLabel distribution:")

    # Optional label -> genre-name mapping; labels without an entry are
    # shown as 'Label <value>'.
    genre_mapping = {}
    if os.path.exists('genre_mapping.pkl'):
        with open('genre_mapping.pkl', 'rb') as f:
            mapping = pickle.load(f)
            genre_mapping = mapping.get('label_to_genre', {})

    genre_names = [genre_mapping.get(l, f'Label {l}') for l in unique_labels]

    for genre_name, label, count in zip(genre_names, unique_labels, counts):
        percentage = (count / n_samples) * 100
        print(f"   {genre_name} (label {label}): {count} ({percentage:.2f}%)")

    # One colour per label, shared by both charts.
    label_palette = sns.color_palette("husl", n_classes)
    bar_positions = range(n_classes)

    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    bar_ax, pie_ax = axes

    # Left panel: bar chart with the count written just above each bar
    bar_ax.bar(bar_positions, counts, color=label_palette, edgecolor='black', alpha=0.7)
    bar_ax.set_xticks(bar_positions)
    bar_ax.set_xticklabels(genre_names, rotation=45, ha='right')
    bar_ax.set_title('Label Distribution (Bar Chart)', fontsize=12, fontweight='bold')
    bar_ax.set_xlabel('Genre', fontsize=10)
    bar_ax.set_ylabel('Count', fontsize=10)
    bar_ax.grid(axis='y', alpha=0.3)

    # Text sits 1% of the tallest bar above each bar top.
    text_offset = max(counts) * 0.01
    for position, value in enumerate(counts):
        bar_ax.text(position, value + text_offset, str(value),
                    ha='center', va='bottom', fontweight='bold', fontsize=8)

    # Right panel: pie chart
    pie_ax.pie(counts, labels=genre_names, autopct='%1.1f%%', startangle=90,
               colors=label_palette)
    pie_ax.set_title('Label Distribution (Pie Chart)', fontsize=12, fontweight='bold')

    plt.tight_layout()
    labels_plot_path = os.path.join(RESULTS_DIR, 'labels_distribution.png')
    plt.savefig(labels_plot_path, dpi=300, bbox_inches='tight')
    plt.show()

    print("\n‚úÖ Labels distribution plot saved!")


## 7. Data Quality Checks


In [None]:
# Data quality checks: missing metadata values, duplicate IDs, a spot check
# for missing audio files and the sample-count consistency of the loaded
# feature arrays. Findings accumulate in `quality_issues`.
section_rule = "=" * 60
print(section_rule)
print("DATA QUALITY CHECKS")
print(section_rule)

quality_issues = []

# --- Metadata ---
n_rows = len(metadata_df)
print("\nüìã Metadata Quality Checks:")
print(f"   Total entries: {n_rows}")
print("   Missing values:")
for col in metadata_df.columns:
    missing = metadata_df[col].isnull().sum()
    if missing <= 0:
        continue
    percentage = (missing / n_rows) * 100
    print(f"      - {col}: {missing} ({percentage:.2f}%)")
    quality_issues.append(f"Missing values in {col}: {missing} ({percentage:.2f}%)")

# duplicated() flags every repeat after the first occurrence of an ID.
duplicate_ids = metadata_df['ID'].duplicated().sum()
if duplicate_ids > 0:
    print(f"   ‚ö†Ô∏è  Duplicate IDs: {duplicate_ids}")
    quality_issues.append(f"Duplicate IDs: {duplicate_ids}")
else:
    print("   ‚úÖ No duplicate IDs")

# --- Audio files ---
print("\nüìÅ Audio Files Quality Checks:")
total_expected = sum(bangla_file_counts.values()) + sum(english_file_counts.values())
print(f"   Expected total files: {total_expected}")

# Spot check only: for the first two Bangla genres, look up the first ten
# metadata rows and test whether '<genre dir>/<ID>.wav' exists on disk.
missing_files = 0
for genre in bangla_genres[:2]:
    genre_path = os.path.join(BANGLA_DATASET_DIR, genre)
    in_genre = (metadata_df['language'] == 'bn') & (metadata_df['genre'] == genre)
    for song_id in metadata_df.loc[in_genre, 'ID'].head(10):
        expected_path = os.path.join(genre_path, f"{song_id}.wav")
        if not os.path.exists(expected_path):
            missing_files += 1

if missing_files > 0:
    print(f"   ‚ö†Ô∏è  Sample check found {missing_files} missing audio files")
    quality_issues.append(f"Missing audio files (sample): {missing_files}")
else:
    print("   ‚úÖ Sample check: No missing audio files found")

# --- Preprocessed features ---
# Every loaded array should agree on its first dimension (the sample count);
# zero-dimensional arrays are left out of the comparison.
if len(loaded_features) > 1:
    print("\nüìä Preprocessed Features Consistency:")
    feature_shapes = {
        name: feat.shape[0]
        for name, feat in loaded_features.items()
        if len(feat.shape) > 0
    }
    distinct_counts = set(feature_shapes.values())
    if len(distinct_counts) == 1:
        print(f"   ‚úÖ All features have consistent sample count: {next(iter(feature_shapes.values()))}")
    else:
        print("   ‚ö†Ô∏è  Inconsistent sample counts:")
        for name, count in feature_shapes.items():
            print(f"      - {name}: {count}")
        quality_issues.append("Inconsistent feature sample counts")

# --- Summary ---
print(f"\n{section_rule}")
print("QUALITY SUMMARY")
print(section_rule)
if quality_issues:
    print(f"‚ö†Ô∏è  Found {len(quality_issues)} potential issues:")
    for issue in quality_issues:
        print(f"   - {issue}")
else:
    print("‚úÖ No major quality issues detected!")


## 8. Summary and Conclusions


In [None]:
# Generate summary report: a recap of the figures computed in the earlier
# cells (dataset sizes, metadata coverage, loaded feature shapes).
print("="*60)
print("EDA SUMMARY REPORT")
print("="*60)

print("\nüìä DATASET OVERVIEW:")
print(f"   - Total audio files: {total_files}")
print(f"   - Languages: Bangla ({sum(bangla_file_counts.values())} files), English ({sum(english_file_counts.values())} files)")
print(f"   - Total genres: {total_genres}")
print(f"   - Metadata entries: {len(metadata_df)}")

print("\nüìã METADATA:")
# unique() keeps NaN when the language column has missing values, and
# str.join raises TypeError on a non-string item, so stringify each value.
language_names = ', '.join(map(str, metadata_df['language'].unique()))
print(f"   - Languages: {language_names}")
print(f"   - Genres: {len(metadata_df['genre'].unique())} unique genres")
if 'bn' in metadata_df['language'].values:
    # Bangla rows whose lyrics field is present and non-empty.
    bangla_with_lyrics = metadata_df[(metadata_df['language'] == 'bn') &
                                     (metadata_df['lyrics'].fillna('').str.len() > 0)]
    print(f"   - Bangla entries with lyrics: {len(bangla_with_lyrics)}")

print("\nüìÅ PREPROCESSED FEATURES:")
if len(loaded_features) > 0:
    for name, feat in loaded_features.items():
        print(f"   - {name}: {feat.shape}")
else:
    print("   - No preprocessed features found")

print("\n‚úÖ All analysis plots have been saved to the results directory!")
print(f"   Results directory: {RESULTS_DIR}")

print("\n" + "="*60)
print("EDA COMPLETE!")
print("="*60)
