# Native Language Identification - Data Exploration

This notebook explores the IndicAccentDb dataset for native language identification of Indian English speakers.

## Objectives:
1. Load and explore the dataset
2. Analyze language distribution
3. Define helpers for visualizing audio samples (waveform, spectrogram, mel-spectrogram)
4. Extract and visualize acoustic features
5. Prepare data for modeling

## 1. Import Required Libraries

In [None]:
# Standard libraries
import os
import sys
import logging
from pathlib import Path

# Add project root to path
# The parent of the current working directory is treated as the project root
# and placed first on sys.path so the `src.*` imports further down resolve.
# NOTE(review): this only works when the kernel's working directory is a
# direct child of the project root — confirm how the notebook is launched.
project_root = Path.cwd().parent
sys.path.insert(0, str(project_root))

# Data processing
import numpy as np
import pandas as pd
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Audio processing
import librosa
import librosa.display
import soundfile as sf

# Project modules
from src.data import IndicAccentDataLoader, AudioPreprocessor
from src.features import MFCCExtractor
from src.utils import load_config

# Configure plotting
# NOTE(review): the 'seaborn-v0_8-' style prefix presumably requires
# matplotlib >= 3.6 — confirm against the project's pinned version.
plt.style.use('seaborn-v0_8-darkgrid')
# Applied after the style sheet so the palette is not overridden by it.
sns.set_palette("husl")
%matplotlib inline

# Logging
# Configure the root logger at INFO and create a logger named after this
# notebook's module (`__name__`).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Reached only if every import and setup call above succeeded.
print("Libraries imported successfully!")

## 2. Load and Prepare Data

In [None]:
# Read the project settings that the remaining cells index into
config_path = '../configs/default.yaml'
config = load_config(config_path)
print("Configuration loaded successfully!")

# Echo two identifying fields as a quick sanity check of the loaded file
project_name = config['project_name']
print(f"Project: {project_name}")
dataset_name = config['data']['dataset_name']
print(f"Dataset: {dataset_name}")

In [None]:
# Gather the loader's constructor arguments, then build it in one call.
# All directories are given relative to the notebook's working directory.
loader_kwargs = dict(
    dataset_name=config['data']['dataset_name'],
    cache_dir='../data/raw',
    processed_dir='../data/processed',
    metadata_dir='../data/metadata',
)
loader = IndicAccentDataLoader(**loader_kwargs)

print("Data loader initialized!")

In [None]:
# Load dataset (this may take a few minutes)
# `dataset` holds whatever loader.load_dataset() returns; it is interpolated
# into the final message so its string form is visible as a quick sanity check.
print("Loading dataset from HuggingFace...")
dataset = loader.load_dataset()
print(f"Dataset loaded: {dataset}")

### Dataset Statistics

In [None]:
# Fetch the summary dict once; later plotting cells reuse `stats`
stats = loader.get_statistics()

rule = "=" * 60
print(rule)
print("DATASET STATISTICS")
print(rule)

# 'total_samples' is required; the speaker count falls back to 'N/A'
print(f"Total Samples: {stats['total_samples']}")
print(f"Number of Speakers: {stats.get('num_speakers', 'N/A')}")

# The language header is always printed, even if the breakdown is absent
print("\nLanguage Distribution:")
language_counts = stats.get('language_distribution', {})
for language, n_samples in language_counts.items():
    print(f"  {language}: {n_samples}")

# Optional sections: shown only when the loader reported them
if 'age_group_distribution' in stats:
    print("\nAge Group Distribution:")
    age_group_counts = stats['age_group_distribution']
    for age_group, n_samples in age_group_counts.items():
        print(f"  {age_group}: {n_samples}")

if 'duration_stats' in stats:
    print("\nAudio Duration Statistics:")
    duration_stats = stats['duration_stats']
    print(f"  Mean: {duration_stats['mean']:.2f}s")
    print(f"  Std:  {duration_stats['std']:.2f}s")
    print(f"  Min:  {duration_stats['min']:.2f}s")
    print(f"  Max:  {duration_stats['max']:.2f}s")

## 3. Visualize Data Distribution

In [None]:
# Bar chart: number of samples per native language
if 'language_distribution' in stats:
    language_distribution = stats['language_distribution']
    languages = list(language_distribution.keys())
    counts = list(language_distribution.values())

    fig, ax = plt.subplots(figsize=(12, 6))
    ax.bar(languages, counts, color='steelblue', alpha=0.8)

    ax.set_title('Distribution of Native Languages in Dataset', fontsize=14, fontweight='bold')
    ax.set_xlabel('Native Language', fontsize=12)
    ax.set_ylabel('Number of Samples', fontsize=12)
    ax.tick_params(axis='x', rotation=45)

    # Write each count centred just above the top of its bar
    for bar_position, bar_height in zip(range(len(counts)), counts):
        ax.text(bar_position, bar_height, str(bar_height), ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

In [None]:
# Plot age group and speech level distribution if available.
# One pie is drawn per distribution that is actually present (and non-empty)
# in `stats`, so a missing distribution no longer leaves a blank axes frame
# next to the remaining chart.
pie_panels = [
    (stats_key, panel_title)
    for stats_key, panel_title in (
        ('age_group_distribution', 'Age Group Distribution'),
        ('speech_level_distribution', 'Speech Level Distribution'),
    )
    if stats.get(stats_key)
]

if pie_panels:
    # 7 inches per panel keeps the original 14x5 figure when both are present.
    # squeeze=False always yields a 2-D axes array, even for a single panel.
    fig, axes = plt.subplots(
        1, len(pie_panels), figsize=(7 * len(pie_panels), 5), squeeze=False
    )

    for ax, (stats_key, panel_title) in zip(axes[0], pie_panels):
        distribution = stats[stats_key]
        ax.pie(
            list(distribution.values()),
            labels=list(distribution.keys()),
            autopct='%1.1f%%',
            startangle=90,
        )
        ax.set_title(panel_title)

    plt.tight_layout()
    plt.show()

## 4. Audio Sample Exploration

In [None]:
# Function to visualize audio sample
def plot_audio_analysis(audio, sr, title="Audio Analysis"):
    """Show a three-row figure for one clip: waveform, spectrogram, mel spectrogram.

    Args:
        audio: Audio samples; presumably a mono 1-D array (it is passed
            straight to librosa.stft and librosa.feature.melspectrogram).
        sr: Sample rate used for the time axis and the frequency axes.
        title: Prefix placed in front of each panel's title.

    Returns:
        None. The figure is displayed with plt.show().
    """
    fig, (wave_ax, spec_ax, mel_ax) = plt.subplots(3, 1, figsize=(14, 10))

    # Row 1: amplitude against time in seconds (sample index / sample rate)
    seconds = np.arange(len(audio)) / sr
    wave_ax.plot(seconds, audio)
    wave_ax.set_xlabel('Time (s)')
    wave_ax.set_ylabel('Amplitude')
    wave_ax.set_title(f'{title} - Waveform')
    wave_ax.grid(True, alpha=0.3)

    # Row 2: STFT magnitude in dB, referenced to the loudest bin
    magnitude = np.abs(librosa.stft(audio))
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)
    spec_img = librosa.display.specshow(
        magnitude_db, sr=sr, x_axis='time', y_axis='hz', ax=spec_ax
    )
    spec_ax.set_title(f'{title} - Spectrogram')
    fig.colorbar(spec_img, ax=spec_ax, format='%+2.0f dB')

    # Row 3: 128-band mel power spectrogram in dB, referenced to its peak
    mel_power = librosa.feature.melspectrogram(y=audio, sr=sr, n_mels=128)
    mel_db = librosa.power_to_db(mel_power, ref=np.max)
    mel_img = librosa.display.specshow(
        mel_db, sr=sr, x_axis='time', y_axis='mel', ax=mel_ax
    )
    mel_ax.set_title(f'{title} - Mel Spectrogram')
    fig.colorbar(mel_img, ax=mel_ax, format='%+2.0f dB')

    plt.tight_layout()
    plt.show()

print("Audio visualization function defined!")

In [None]:
# Placeholder for per-language sample visualisation.
# No audio is loaded here: the layout of the dataset's audio and language
# fields is not assumed, so the cell only prints how to call
# plot_audio_analysis once a concrete clip is available.
usage_hints = (
    "To visualize audio samples, run the following after loading specific samples:",
    "audio, sr = librosa.load('path_to_audio.wav', sr=16000)",
    "plot_audio_analysis(audio, sr, title='Sample Audio')",
)
for hint in usage_hints:
    print(hint)

## 5. Feature Extraction Demo

In [None]:
# Build the MFCC extractor from the configuration: the sample rate lives
# under config['data'], every other option under config['features']['mfcc'].
sample_rate = config['data']['sample_rate']
mfcc_cfg = config['features']['mfcc']
mfcc_option_names = ('n_mfcc', 'n_fft', 'hop_length', 'use_deltas', 'use_delta_deltas')

mfcc_extractor = MFCCExtractor(
    sample_rate=sample_rate,
    **{option: mfcc_cfg[option] for option in mfcc_option_names},
)

# Report the settings as the extractor itself exposes them
print("MFCC extractor initialized!")
print(f"Number of MFCC coefficients: {mfcc_extractor.n_mfcc}")
print(f"Using deltas: {mfcc_extractor.use_deltas}")
print(f"Using delta-deltas: {mfcc_extractor.use_delta_deltas}")

In [None]:
# Generate sample audio for demo
# Take the sample rate from the configuration, not a hard-coded 16000, so the
# synthetic clip matches the rate mfcc_extractor was built with and the `sr`
# that the MFCC plotting cell passes to specshow.
sr = config['data']['sample_rate']
duration = 3  # seconds

# Seeded generator: the white-noise clip (and therefore the MFCC output) is
# identical on every Restart & Run All, unlike the global unseeded RNG.
rng = np.random.default_rng(config['seed'])
# int(...) guards against a sample rate stored as a float in the YAML file.
sample_audio = rng.standard_normal(int(sr * duration)) * 0.1

# Extract MFCCs
mfccs = mfcc_extractor.extract(sample_audio)

print(f"Sample audio shape: {sample_audio.shape}")
print(f"Extracted MFCC features shape: {mfccs.shape}")
print(f"Feature dimensions: {mfccs.shape[0]} coefficients × {mfccs.shape[1]} frames")

In [None]:
# Heat-map of the demo MFCC matrix, using the explicit figure/axes interface
fig, ax = plt.subplots(figsize=(14, 6))
mfcc_img = librosa.display.specshow(
    mfccs, sr=sr, hop_length=mfcc_extractor.hop_length, x_axis='time', ax=ax
)
fig.colorbar(mfcc_img, ax=ax, format='%+2.0f')
ax.set_title('MFCC Features (Demo)')
ax.set_ylabel('MFCC Coefficients')
ax.set_xlabel('Time')
plt.tight_layout()
plt.show()

## 6. Data Splitting Strategy

In [None]:
# Partition the dataset using the fractions and seed from the configuration
print("Creating data splits...")
data_cfg = config['data']
splits = loader.create_splits(
    train_size=data_cfg['train_split'],
    val_size=data_cfg['val_split'],
    test_size=data_cfg['test_split'],
    seed=config['seed'],
)

# Report each partition's size, then the combined total
print("\nData Splits Created:")
n_train = len(splits['train'])
print(f"  Training samples:   {n_train}")
n_val = len(splits['val'])
print(f"  Validation samples: {n_val}")
n_test = len(splits['test'])
print(f"  Test samples:       {n_test}")
print(f"  Total:              {n_train + n_val + n_test}")

## 7. Summary and Next Steps

### Key Findings:
1. Dataset loaded successfully from HuggingFace
2. Multiple Indian languages represented
3. Both adult and child speakers (if applicable)
4. Word and sentence level utterances (if applicable)
5. MFCC feature extraction demonstrated on synthetic noise (not yet on dataset audio)

### Next Steps:
1. **Feature Extraction Notebook** (`02_feature_extraction.ipynb`):
   - Extract MFCC features from all samples
   - Extract HuBERT embeddings
   - Save processed features

2. **Model Training Notebook** (`03_training_and_eval.ipynb`):
   - Train CNN, BiLSTM, and Transformer models
   - Compare MFCC vs HuBERT performance
   - Perform layer-wise analysis

3. **Application Development**:
   - Build cuisine recommendation system
   - Test on real audio samples