# In [None]:
# Required Libraries
import pandas as pd
import os
import librosa
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter

# Constants
# Dataset root directory; main() passes it to load_dataset(), which looks for
# the metadata file inside it.
DATA_PATH = 'path_to_common_voice_data'  # Replace with your actual dataset path

# Load the Dataset
def load_dataset(data_path):
    """
    Read the dataset's metadata table from ``<data_path>/metadata.csv``.

    Args:
        data_path: Directory expected to contain ``metadata.csv``.

    Returns:
        The metadata as a pandas DataFrame, or ``None`` (after printing a
        notice) when the metadata file does not exist.
    """
    csv_file = os.path.join(data_path, 'metadata.csv')  # Adjust to actual metadata filename

    # Guard clause: nothing to read, so tell the user and signal it with None.
    if not os.path.exists(csv_file):
        print("Metadata file not found.")
        return None

    frame = pd.read_csv(csv_file)
    print(f"Loaded dataset with {len(frame)} samples.")
    return frame

# Basic Dataset Information
def dataset_info(dataset):
    """
    Print a per-language summary of the metadata table.

    Reports, in order: the distinct values of the 'language' column, the
    number of rows per language, and the number of distinct 'speaker_id'
    values per language. Nothing is returned.
    """
    print("\n--- Dataset Info ---")

    language_column = dataset['language']
    print(f"Languages available: {language_column.unique()}")

    samples_per_language = language_column.value_counts()
    print(f"Total samples per language: \n{samples_per_language}")

    speakers_per_language = dataset.groupby('language')['speaker_id'].nunique()
    print(f"Total speakers per language: \n{speakers_per_language}")

# Analyze Phonetic Transcriptions
def analyze_phonetic_transcriptions(dataset):
    """
    Count whitespace-separated phonemes over all transcriptions and plot the
    most frequent ones.

    Args:
        dataset: Table-like object whose 'phonetic_transcription' entry is an
            iterable of transcription strings (e.g. a DataFrame column).
            Entries that are not strings (such as the NaN pandas uses for
            empty CSV cells) are ignored.

    Returns:
        A ``Counter`` mapping each phoneme to its number of occurrences.
        The top 10 are printed; the top 20 are shown as a bar chart unless
        no phonemes were found at all.
    """
    phoneme_counts = Counter()
    for transcription in dataset['phonetic_transcription']:
        # Missing cells arrive as float NaN / None, which have no .split();
        # skip them instead of crashing the whole analysis.
        if isinstance(transcription, str):
            phoneme_counts.update(transcription.split())

    print("\n--- Phoneme Distribution ---")
    print(f"Top 10 Phonemes: {phoneme_counts.most_common(10)}")

    # Nothing to draw: avoid producing an empty chart.
    if not phoneme_counts:
        return phoneme_counts

    # Plotting phoneme frequency (rank once, reuse for both axes)
    top_phonemes = phoneme_counts.most_common(20)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=[phoneme for phoneme, _ in top_phonemes],
                y=[count for _, count in top_phonemes])
    plt.title("Top 20 Phonemes in Dataset")
    plt.xlabel("Phoneme")
    plt.ylabel("Frequency")
    plt.show()

    return phoneme_counts

# Phoneme-to-Phonological Mapping
def map_phonemes_to_phonological_features(phoneme_counts):
    """
    Mock mapping from phonemes to phonological features.

    Builds one row per phoneme in ``phoneme_counts`` (phoneme, openness,
    frontness, stress, frequency), prints the first ten rows, and shows a
    count plot of openness split by frontness. Phonemes without an entry in
    the lookup table get 'unknown' for every feature.
    """
    # For example purposes; replace with actual mapping logic
    feature_table = {
        'a': {'openness': 'open', 'frontness': 'front', 'stress': 'unstressed'},
        'e': {'openness': 'mid', 'frontness': 'front', 'stress': 'unstressed'},
        # Add mappings for other phonemes
    }
    # Used for every phoneme the table does not know about.
    fallback = {'openness': 'unknown', 'frontness': 'unknown', 'stress': 'unknown'}

    rows = [
        (phoneme, feats['openness'], feats['frontness'], feats['stress'], count)
        for phoneme, count in phoneme_counts.items()
        for feats in (feature_table.get(phoneme, fallback),)
    ]

    column_names = ['Phoneme', 'Openness', 'Frontness', 'Stress', 'Frequency']
    phonological_df = pd.DataFrame(rows, columns=column_names)

    print("\n--- Phoneme to Phonological Feature Mapping ---")
    print(phonological_df.head(10))

    # Visualization of features
    sns.countplot(data=phonological_df, x="Openness", hue="Frontness")
    plt.title("Phonological Feature Distribution by Openness and Frontness")
    plt.show()

# Execution Pipeline
def main():
    """
    Run the exploratory analysis end to end.

    Loads the metadata from ``DATA_PATH``; if it cannot be loaded, returns
    without doing anything else. Otherwise prints the dataset summary, runs
    the phoneme distribution analysis, and maps the phoneme counts to
    phonological features.
    """
    dataset = load_dataset(DATA_PATH)
    if dataset is None:
        return

    dataset_info(dataset)
    analyze_phonetic_transcriptions(dataset)

    # Map phonemes to phonological features for analysis.
    # Non-string entries (e.g. NaN from empty CSV cells) have no .split(),
    # so they are skipped rather than allowed to abort the run.
    phoneme_counts = Counter(
        phoneme
        for transcript in dataset['phonetic_transcription']
        if isinstance(transcript, str)
        for phoneme in transcript.split()
    )
    map_phonemes_to_phonological_features(phoneme_counts)

# Run Analysis
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()


# In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.metrics import accuracy_score

# Placeholder classes for dataset, TTS model, and evaluation metrics

class TTS_Dataset(Dataset):
    """
    Map-style dataset for multispeaker TTS with phonological features.

    Holds three parallel, index-aligned collections; item ``idx`` is a dict
    with the tensors built from entry ``idx`` of each collection.
    """

    def __init__(self, phonological_data, audio_features, speaker_ids):
        # Stored as given; conversion to tensors happens lazily per item.
        self.phonological_data = phonological_data
        self.audio_features = audio_features
        self.speaker_ids = speaker_ids

    def __len__(self):
        # The phonological collection defines the dataset size.
        return len(self.phonological_data)

    def __getitem__(self, idx):
        phonological = torch.tensor(self.phonological_data[idx], dtype=torch.float32)
        audio = torch.tensor(self.audio_features[idx], dtype=torch.float32)
        # int64 so the ID can be used directly as an embedding index.
        speaker = torch.tensor(self.speaker_ids[idx], dtype=torch.int64)
        return {'phonological': phonological, 'audio': audio, 'speaker_id': speaker}

class MultispeakerTTSModel(nn.Module):
    """
    A baseline multispeaker TTS model.
    Replace this with a real model such as FastSpeech2 or Tacotron2.

    Args:
        phonological_dim: Size of the phonological feature vector.
        speaker_emb_dim: Size of the learned speaker embedding.
        audio_dim: Size of the predicted audio feature vector.
        num_speakers: Number of rows in the speaker embedding table; speaker
            IDs passed to ``forward`` must lie in ``[0, num_speakers)``.
            Defaults to 100.
    """
    def __init__(self, phonological_dim, speaker_emb_dim, audio_dim, num_speakers=100):
        super().__init__()
        self.phonological_encoder = nn.Linear(phonological_dim, 128)
        self.speaker_encoder = nn.Embedding(num_embeddings=num_speakers, embedding_dim=speaker_emb_dim)

        # TTS Layers
        self.fc1 = nn.Linear(128 + speaker_emb_dim, 256)
        self.fc2 = nn.Linear(256, audio_dim)

    def forward(self, phonological, speaker_id):
        """
        Predict audio features from phonological features and speaker IDs.

        Args:
            phonological: Float tensor whose last dimension is ``phonological_dim``.
            speaker_id: Integer tensor of speaker indices; its shape must match
                the leading dimensions of ``phonological`` so the two
                encodings can be concatenated.

        Returns:
            Tensor whose last dimension is ``audio_dim``.
        """
        phonological_encoded = self.phonological_encoder(phonological)
        speaker_emb = self.speaker_encoder(speaker_id)

        # Condition on the speaker by concatenating along the feature axis.
        combined = torch.cat([phonological_encoded, speaker_emb], dim=-1)
        x = torch.relu(self.fc1(combined))
        audio_out = self.fc2(x)
        return audio_out

# Initialize model
# Example sizes: phonological feature inputs, speaker embedding, and audio
# features (e.g. MFCC or Mel-spectrogram bins).
phonological_dim, speaker_emb_dim, audio_dim = 20, 32, 80
model = MultispeakerTTSModel(
    phonological_dim=phonological_dim,
    speaker_emb_dim=speaker_emb_dim,
    audio_dim=audio_dim,
)

# Training setup
def train_model(model, train_loader, num_epochs=10, lr=0.001):
    """
    Train ``model`` with Adam on an MSE objective.

    Args:
        model: Module called as ``model(phonological, speaker_id)``.
        train_loader: Iterable of batches; each batch is a mapping with the
            keys 'phonological', 'audio' and 'speaker_id'. It is iterated
            once per epoch.
        num_epochs: Number of passes over ``train_loader``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        A list with the mean batch loss of every epoch (``nan`` for an epoch
        in which the loader yielded no batches). The same value is printed
        after each epoch.
    """
    criterion = nn.MSELoss()  # Loss for TTS task
    optimizer = optim.Adam(model.parameters(), lr=lr)

    model.train()
    epoch_losses = []
    for epoch in range(num_epochs):
        total_loss = 0.0
        # Count batches while iterating instead of calling len(): this also
        # works for loaders without __len__ and exposes the empty case.
        num_batches = 0
        for batch in train_loader:
            phonological = batch['phonological']
            audio = batch['audio']
            speaker_id = batch['speaker_id']

            optimizer.zero_grad()
            outputs = model(phonological, speaker_id)
            loss = criterion(outputs, audio)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            num_batches += 1

        # An empty loader would otherwise raise ZeroDivisionError here.
        avg_loss = total_loss / num_batches if num_batches else float('nan')
        epoch_losses.append(avg_loss)
        print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    return epoch_losses

# Evaluation on MOS and WER
def evaluate_model(model, test_loader):
    """
    Run the model over ``test_loader`` and report the average MOS and the WER.

    Args:
        model: Module called as ``model(phonological, speaker_id)``; it is
            switched to eval mode.
        test_loader: Iterable of batches; each batch is a mapping with the
            keys 'phonological', 'audio' and 'speaker_id'.

    Returns:
        ``(avg_mos, wer)`` after printing both, or ``None`` (after printing a
        notice) when ``test_loader`` yields no batches.
    """
    model.eval()
    mos_scores = []
    predicted_texts = []
    actual_texts = []

    for batch in test_loader:
        phonological = batch['phonological']
        audio = batch['audio']
        speaker_id = batch['speaker_id']

        # Inference only: no gradients needed.
        with torch.no_grad():
            output = model(phonological, speaker_id)

        # Placeholder MOS calculation
        mos_scores.append(calculate_mos(output, audio))

        # Placeholder WER calculation
        predicted_texts.append(phoneme_to_text(output))  # Convert output spectrogram to text
        actual_texts.append(phoneme_to_text(audio))  # Convert actual audio to text

    # With no batches there is nothing to average; avoid the NaN mean and
    # the division by zero in the WER computation.
    if not mos_scores:
        print("No evaluation batches found; skipping MOS/WER computation.")
        return None

    avg_mos = np.mean(mos_scores)
    wer = calculate_wer(predicted_texts, actual_texts)
    print(f"Average MOS: {avg_mos:.2f}")
    print(f"Word Error Rate (WER): {wer:.2%}")
    return avg_mos, wer

# Placeholder MOS and WER functions (to replace with actual calculations)
def calculate_mos(pred, target):
    """Placeholder MOS: ignores both arguments and returns a fixed score."""
    dummy_score = 4.5  # Dummy score
    return dummy_score

def phoneme_to_text(audio_feature):
    """Placeholder transcription: ignores its argument and returns a fixed string."""
    dummy_text = "dummy"  # Dummy text
    return dummy_text

def calculate_wer(pred_texts, actual_texts):
    """
    Placeholder error rate: the fraction of reference texts whose paired
    prediction is not an exact match.

    Note this is an utterance-level mismatch rate, not a word-level edit
    distance. Pairs are formed positionally; if ``pred_texts`` is shorter
    than ``actual_texts`` the unpaired references are not counted as errors.

    Args:
        pred_texts: Sequence of predicted texts.
        actual_texts: Sequence of reference texts.

    Returns:
        ``mismatches / len(actual_texts)``, or ``0.0`` when there are no
        reference texts (instead of dividing by zero).
    """
    total = len(actual_texts)
    if total == 0:
        return 0.0
    errors = sum(pred != actual for pred, actual in zip(pred_texts, actual_texts))
    return errors / total

# Data Preparation
# Example data: 100 randomly generated samples standing in for real features.
phonological_data = np.random.rand(100, phonological_dim)
audio_features = np.random.rand(100, audio_dim)
# Random integer speaker IDs drawn from 0..99.
speaker_ids = np.random.randint(0, 100, 100)

# Data loaders: first 80 samples for training, the remaining 20 for testing.
train_data = TTS_Dataset(phonological_data[:80], audio_features[:80], speaker_ids[:80])
test_data = TTS_Dataset(phonological_data[80:], audio_features[80:], speaker_ids[80:])
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16)

# Training and Evaluation (runs immediately when this cell/file is executed)
train_model(model, train_loader, num_epochs=10)
evaluate_model(model, test_loader)
