In [17]:
# ============================================================
# Vietnamese Speaker Profiling - Kaggle Training
# ============================================================
!git clone https://github.com/VuThanhLam124/Profiling_gender_dialect.git

Cloning into 'Profiling_gender_dialect'...
remote: Enumerating objects: 66, done.[K
remote: Counting objects: 100% (66/66), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 66 (delta 30), reused 57 (delta 21), pack-reused 0 (from 0)[K
Receiving objects: 100% (66/66), 55.35 KiB | 5.03 MiB/s, done.
Resolving deltas: 100% (30/30), done.


In [35]:
cd Profiling_gender_dialect

/home/Profiling_gender_dialect


In [36]:
ls

app.py    eval.py      infer.py    prepare_data.py  requirements.txt
[0m[01;34mconfigs[0m/  finetune.py  [01;34mnotebooks[0m/  README.md        [01;34msrc[0m/


In [20]:
# Install dependencies
!pip install -r requirements.txt

Collecting scikit-learn>=1.3.0 (from -r requirements.txt (line 8))
  Downloading scikit_learn-1.7.2-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (11 kB)
Collecting audiomentations==0.35.0 (from -r requirements.txt (line 9))
  Downloading audiomentations-0.35.0-py3-none-any.whl.metadata (10 kB)
Collecting mlflow>=2.10.0 (from -r requirements.txt (line 15))
  Downloading mlflow-3.6.0-py3-none-any.whl.metadata (31 kB)
Collecting librosa>=0.10.0 (from -r requirements.txt (line 4))
  Downloading librosa-0.10.2.post1-py3-none-any.whl.metadata (8.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->-r requirements.txt (line 1))
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->-r requirements.txt (line 1))
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12

In [37]:
# Inspect the dataset layout and the training metadata
import os
import pandas as pd

VISPEECH_ROOT = "/kaggle/input/vispeech"

print("Dataset structure:")
for entry in os.listdir(VISPEECH_ROOT):
    entry_path = os.path.join(VISPEECH_ROOT, entry)
    if not os.path.isdir(entry_path):
        # Plain file at the dataset root
        print(f"  {entry}")
        continue
    print(f"  {entry}/")
    # Preview at most three entries per sub-directory
    preview = os.listdir(entry_path)[:3]
    for child in preview:
        print(f"    - {child}")

# Load the training metadata and report its columns, size and first rows
print("\nMetadata columns:")
meta_path = os.path.join(VISPEECH_ROOT, "metadata/trainset.csv")
df = pd.read_csv(meta_path)
column_names = list(df.columns)
print(f"  Columns: {column_names}")
print(f"  Samples: {df.shape[0]}")
print("\nFirst 3 rows:")
first_rows = df.head(3)
print(first_rows.to_string())

Dataset structure:
  trainset/
    - ViSpeech_00569.mp3
    - ViSpeech_04453.mp3
    - ViSpeech_03028.mp3
  metadata/
    - trainset.csv
    - clean_testset.csv
    - noisy_testset.csv
  noisy_testset/
    - ViSpeech_10402.mp3
    - ViSpeech_10495.mp3
    - ViSpeech_10020.mp3
  clean_testset/
    - ViSpeech_09610.mp3
    - ViSpeech_09244.mp3
    - ViSpeech_09148.mp3

Metadata columns:
  Columns: ['audio_name', 'dialect', 'gender', 'speaker']
  Samples: 8166

First 3 rows:
           audio_name  dialect gender  speaker
0  ViSpeech_00001.mp3  Central   Male  SPK0001
1  ViSpeech_00002.mp3  Central   Male  SPK0001
2  ViSpeech_00003.mp3  Central   Male  SPK0001


In [38]:
# Create Kaggle-specific config
# NOTE: `learning_rate` is written with an explicit decimal point (5.0e-5).
# YAML 1.1 loaders such as PyYAML resolve a bare `5e-5` as a string rather than
# a float, so the dotted form keeps the value numeric regardless of which YAML
# loader the training script uses.
config_content = """
# Finetune Configuration for Kaggle
# Architecture: WavLM + Attentive Pooling + LayerNorm

# Model
model:
  name: "microsoft/wavlm-base-plus"
  hidden_size: 768
  num_genders: 2
  num_dialects: 3
  dropout: 0.1
  head_hidden_dim: 256

# Training
training:
  batch_size: 32
  learning_rate: 5.0e-5
  num_epochs: 15
  warmup_ratio: 0.125
  weight_decay: 0.0125
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 2

# Loss
loss:
  dialect_weight: 3.0

# MLflow Configuration
mlflow:
  enabled: false  # Disable on Kaggle

# Dataset paths - Kaggle specific
data:
  # Raw dataset paths (for prepare_data.py)
  vispeech_root: "/kaggle/input/vispeech"
  train_meta: "/kaggle/input/vispeech/metadata/trainset.csv"
  train_audio: "/kaggle/input/vispeech/trainset"
  clean_test_meta: "/kaggle/input/vispeech/metadata/clean_testset.csv"
  clean_test_audio: "/kaggle/input/vispeech/clean_testset"
  noisy_test_meta: "/kaggle/input/vispeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/kaggle/input/vispeech/noisy_testset"
  val_split: 0.15
  
  # Extracted features paths (for finetune.py)
  train_dir: "/kaggle/working/datasets/ViSpeech/train"
  val_dir: "/kaggle/working/datasets/ViSpeech/val"

# Audio Processing
audio:
  sampling_rate: 16000
  max_duration: 5

# Output
output:
  dir: "/kaggle/working/output"
  save_total_limit: 2
  metric_for_best_model: "dialect_acc"

# Early Stopping
early_stopping:
  patience: 3
  threshold: 0.0025

# Label Mappings
labels:
  gender:
    Male: 0
    Female: 1
  dialect:
    North: 0
    Central: 1
    South: 2

# Reproducibility
seed: 42
"""

# Write the config, replacing any existing file. The path is relative, so the
# kernel's working directory must be the repository root. The encoding is set
# explicitly so the result does not depend on the platform default.
with open("configs/finetune.yaml", "w", encoding="utf-8") as f:
    f.write(config_content)

print("Config file created: configs/finetune.yaml")

Config file created: configs/finetune.yaml


In [39]:
# Extract training features (~85% of trainset)
!python prepare_data.py \
    --dataset vispeech \
    --config configs/finetune.yaml \
    --output_dir /kaggle/working/datasets/ViSpeech/train \
    --split train

2025-11-28 01:24:37.415751: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764293077.604606     209 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764293077.604606     209 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764293077.659527     209 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
E0000 00:00:1764293077.659527     209 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeErr

In [40]:
# Extract validation features (~15% of trainset)
!python prepare_data.py \
    --dataset vispeech \
    --config configs/finetune.yaml \
    --output_dir /kaggle/working/datasets/ViSpeech/val \
    --split val

2025-11-28 01:33:08.574144: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764293588.601269     250 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764293588.608780     250 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
E0000 00:00:1764293588.601269     250 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764293588.608780     250 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeErr

In [41]:
# Count the feature files produced for the train and val splits
import os

print("Extracted features:")
for split in ('train', 'val'):
    split_root = f"/kaggle/working/datasets/ViSpeech/{split}"
    if not os.path.exists(split_root):
        # Nothing is reported for a split whose output directory is missing
        continue
    feature_root = os.path.join(split_root, 'features')
    if os.path.exists(feature_root):
        file_count = len(os.listdir(feature_root))
    else:
        file_count = 0
    print(f"  {split}: {file_count} files")

Extracted features:
  train: 7137 files
  val: 1029 files


In [42]:
# Train model
# Runs the repository's finetune.py with the Kaggle config written earlier in
# this notebook. That config sets the output directory to
# /kaggle/working/output, which is where the following cell looks for
# `best_model`.
!python finetune.py --config configs/finetune.yaml

2025-11-28 01:38:04.914687: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764293884.939605     276 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764293884.947971     276 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
E0000 00:00:1764293884.947971     276 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has

In [43]:
# List the files of the saved best model together with their sizes
import os

output_model_dir = "/kaggle/working/output/best_model"
if not os.path.exists(output_model_dir):
    print("Model not found. Check training logs.")
else:
    print(f"Model saved at: {output_model_dir}")
    print("Files:")
    for filename in os.listdir(output_model_dir):
        n_bytes = os.path.getsize(os.path.join(output_model_dir, filename))
        # Bytes -> mebibytes
        size = n_bytes / 1024 / 1024
        print(f"  - {filename} ({size:.2f} MB)")

Model saved at: /kaggle/working/output/best_model
Files:
  - training_args.bin (0.01 MB)
  - model.safetensors (3.89 MB)


In [44]:
# Extract test features (optional - if you want to evaluate)
!python prepare_data.py \
    --dataset vispeech \
    --config configs/finetune.yaml \
    --output_dir /kaggle/working/datasets/ViSpeech/clean_test \
    --split clean_test

2025-11-28 01:40:08.889729: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764294008.913982     535 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764294008.921047     535 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
E0000 00:00:1764294008.913982     535 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764294008.921047     535 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeErr

In [45]:
!python prepare_data.py \
    --dataset vispeech \
    --config configs/finetune.yaml \
    --output_dir /kaggle/working/datasets/ViSpeech/noisy_test \
    --split noisy_test

2025-11-28 01:42:08.123574: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764294128.148840     561 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764294128.156260     561 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
E0000 00:00:1764294128.148840     561 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764294128.156260     561 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeErr

In [None]:
# ============================================================
# EVALUATION ON TEST SETS
# ============================================================
# This cell only defines the dataset wrapper and the evaluation/reporting
# helpers; nothing is evaluated until later cells call them.
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from pathlib import Path

# Add project to path
# NOTE(review): this assumes the repository was cloned under /kaggle/working.
# The `cd` cell output recorded in this notebook shows
# /home/Profiling_gender_dialect instead - confirm the path for the environment
# in use, otherwise the `src` import below relies on the current directory.
import sys
sys.path.insert(0, '/kaggle/working/Profiling_gender_dialect')

# Project-local model class, instantiated in the model loading cell.
from src.models import ClassificationHeadModel

# Dataset class for pre-extracted features
class FeatureDataset(Dataset):
    """Dataset over pre-extracted feature files described by a metadata CSV.

    `data_dir` must contain a `metadata.csv` file and a `features/`
    sub-directory. The CSV is read for the columns `feature_name` (file name
    inside `features/`, loaded with `np.load`), `gender_label` and
    `dialect_label`.
    """

    def __init__(self, data_dir):
        root = Path(data_dir)
        self.data_dir = root
        self.feature_dir = root / 'features'
        self.df = pd.read_csv(root / 'metadata.csv')
        print(f"Loaded {len(self.df)} samples from {data_dir}")

    def __len__(self):
        """Number of rows in the metadata table."""
        return self.df.shape[0]

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Keys: `input_features` (float32 tensor built from the stored array),
        `gender_labels` and `dialect_labels` (int64 scalar tensors).
        """
        record = self.df.iloc[idx]
        array = np.load(self.feature_dir / record['feature_name'])
        sample = {'input_features': torch.from_numpy(array).float()}
        for task in ('gender', 'dialect'):
            sample[f'{task}_labels'] = torch.tensor(record[f'{task}_label'], dtype=torch.long)
        return sample

def evaluate_model(model, dataloader, device):
    """Run `model` over every batch and collect predictions and labels.

    Args:
        model: object with `eval()` that is callable as
            `model(input_features=...)` and returns a mapping with
            `gender_logits` and `dialect_logits`.
        dataloader: iterable of batches, each a mapping with the tensors
            `input_features`, `gender_labels` and `dialect_labels`. The label
            tensors are converted with `.numpy()` directly, so they must be on
            the CPU.
        device: device the input features are moved to before the forward pass.

    Returns:
        dict with the numpy arrays `gender_preds`, `dialect_preds`,
        `gender_labels` and `dialect_labels`; predictions are the argmax over
        the last logits dimension.
    """
    model.eval()
    collected = {
        'gender_preds': [],
        'dialect_preds': [],
        'gender_labels': [],
        'dialect_labels': [],
    }
    tasks = ('gender', 'dialect')

    # Inference only: no gradients are tracked
    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['input_features'].to(device)
            targets = {task: batch[f'{task}_labels'] for task in tasks}

            outputs = model(input_features=inputs)
            predicted = {
                task: outputs[f'{task}_logits'].argmax(dim=-1).cpu().numpy()
                for task in tasks
            }

            for task in tasks:
                collected[f'{task}_preds'].extend(predicted[task])
            for task in tasks:
                collected[f'{task}_labels'].extend(targets[task].numpy())

    return {key: np.array(values) for key, values in collected.items()}

def print_results(results, dataset_name):
    """Print accuracy, weighted F1, per-class reports and confusion matrices.

    Args:
        results: mapping with the arrays `gender_labels`, `gender_preds`,
            `dialect_labels` and `dialect_preds` (as returned by
            `evaluate_model`).
        dataset_name: label used in the printed header and in the returned dict.

    Returns:
        dict with `dataset`, `gender_acc`, `gender_f1`, `dialect_acc` and
        `dialect_f1`; all metric values are percentages.
    """
    # Class names in class-id order, following the label mapping in the
    # notebook's config (Male=0, Female=1; North=0, Central=1, South=2).
    gender_names = ['Male', 'Female']
    dialect_names = ['North', 'Central', 'South']
    gender_ids = list(range(len(gender_names)))
    dialect_ids = list(range(len(dialect_names)))

    print(f"\n{'='*60}")
    print(f"RESULTS ON {dataset_name.upper()}")
    print(f"{'='*60}")

    gender_acc = accuracy_score(results['gender_labels'], results['gender_preds']) * 100
    gender_f1 = f1_score(results['gender_labels'], results['gender_preds'], average='weighted') * 100
    dialect_acc = accuracy_score(results['dialect_labels'], results['dialect_preds']) * 100
    dialect_f1 = f1_score(results['dialect_labels'], results['dialect_preds'], average='weighted') * 100

    print(f"\nGender  - Accuracy: {gender_acc:.2f}%  |  F1: {gender_f1:.2f}%")
    print(f"Dialect - Accuracy: {dialect_acc:.2f}%  |  F1: {dialect_f1:.2f}%")

    # `labels` is passed explicitly: without it, scikit-learn infers the class
    # set from the data, so a class that is absent from both labels and
    # predictions makes `target_names` mismatch (ValueError) and changes the
    # confusion-matrix shape. `zero_division=0` reports 0 for such empty
    # classes instead of emitting warnings.
    print("\n--- Gender Classification Report ---")
    print(classification_report(results['gender_labels'], results['gender_preds'],
                               labels=gender_ids, target_names=gender_names,
                               digits=4, zero_division=0))

    print("--- Dialect Classification Report ---")
    print(classification_report(results['dialect_labels'], results['dialect_preds'],
                               labels=dialect_ids, target_names=dialect_names,
                               digits=4, zero_division=0))

    print("Gender Confusion Matrix:")
    print(confusion_matrix(results['gender_labels'], results['gender_preds'], labels=gender_ids))

    print("\nDialect Confusion Matrix:")
    print(confusion_matrix(results['dialect_labels'], results['dialect_preds'], labels=dialect_ids))

    return {
        'dataset': dataset_name,
        'gender_acc': gender_acc,
        'gender_f1': gender_f1,
        'dialect_acc': dialect_acc,
        'dialect_f1': dialect_f1
    }

# Signals that the definitions above were registered; no evaluation has run yet.
print("Evaluation functions loaded!")

In [None]:
# Load trained model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Device: {device}")

# Build the model with the same values as the model/loss sections of the
# config written earlier in this notebook, so the checkpoint's state dict fits.
model = ClassificationHeadModel(
    hidden_size=768,
    num_genders=2,
    num_dialects=3,
    dropout=0.1,
    head_hidden_dim=256,
    dialect_loss_weight=3.0
)

# Load best checkpoint
checkpoint_dir = "/kaggle/working/output/best_model"
if not os.path.isdir(checkpoint_dir):
    # Fail loudly: continuing would evaluate randomly initialised weights.
    raise FileNotFoundError(f"Checkpoint not found at {checkpoint_dir}")

# Trainer bookkeeping files share the .bin/.pt suffixes but hold no model
# weights. The checkpoint listed earlier in this notebook contains
# `training_args.bin` next to `model.safetensors`, so a plain suffix match
# would pick the wrong file.
non_weight_files = {'training_args.bin', 'optimizer.pt', 'scheduler.pt', 'scaler.pt'}
safetensors_path = os.path.join(checkpoint_dir, 'model.safetensors')
pickle_candidates = sorted(
    name for name in os.listdir(checkpoint_dir)
    if name.endswith(('.bin', '.pt')) and name not in non_weight_files
)

if os.path.exists(safetensors_path):
    from safetensors.torch import load_file
    state_dict = load_file(safetensors_path)
    model.load_state_dict(state_dict)
    print("Loaded from model.safetensors")
elif pickle_candidates:
    # Prefer the conventional file name when several candidates exist.
    chosen = 'pytorch_model.bin' if 'pytorch_model.bin' in pickle_candidates else pickle_candidates[0]
    model_path = os.path.join(checkpoint_dir, chosen)
    # weights_only=True restricts unpickling to tensors and plain containers,
    # which is all a state dict needs.
    state_dict = torch.load(model_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    print(f"Loaded model from: {model_path}")
else:
    raise FileNotFoundError(f"No model weights file found in {checkpoint_dir}")

model.to(device)
model.eval()
print("Model loaded and ready for evaluation!")

In [None]:
# Evaluate on the clean test set (requires its extracted features on disk)
clean_test_dir = "/kaggle/working/datasets/ViSpeech/clean_test"

if not os.path.exists(clean_test_dir):
    print(f"Clean test features not found at {clean_test_dir}")
    print("Run the extract test features cell first!")
else:
    clean_dataset = FeatureDataset(clean_test_dir)
    clean_loader = DataLoader(
        clean_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2,
    )

    clean_results = evaluate_model(model, clean_loader, device)
    # The returned metrics are read by the summary cell
    clean_metrics = print_results(clean_results, "Clean Test Set")

In [None]:
# Evaluate on the noisy test set (requires its extracted features on disk)
noisy_test_dir = "/kaggle/working/datasets/ViSpeech/noisy_test"

if not os.path.exists(noisy_test_dir):
    print(f"Noisy test features not found at {noisy_test_dir}")
    print("Run the extract test features cell first!")
else:
    noisy_dataset = FeatureDataset(noisy_test_dir)
    noisy_loader = DataLoader(
        noisy_dataset,
        batch_size=32,
        shuffle=False,
        num_workers=2,
    )

    noisy_results = evaluate_model(model, noisy_loader, device)
    # The returned metrics are read by the summary cell
    noisy_metrics = print_results(noisy_results, "Noisy Test Set")

In [None]:
# Summary Table - Compare with Baseline (PACLIC 2024 - ResNet34)
banner = "=" * 70
print("\n" + banner)
print("COMPARISON WITH BASELINE (PACLIC 2024 - ResNet34)")
print(banner)

# Baseline results from PACLIC 2024, keyed by task and then by test set
baseline = {
    'gender': {'clean': 95.35, 'noisy': 88.71},
    'dialect': {'clean': 59.49, 'noisy': 45.67}
}

# One row per (task, test set) pair; stays empty when no metrics are available
results_data = []

if 'clean_metrics' not in dir() or 'noisy_metrics' not in dir():
    print("Run evaluation cells first!")
else:
    metrics_by_set = {'clean': clean_metrics, 'noisy': noisy_metrics}
    for task, baseline_by_set in baseline.items():
        for test_set, baseline_val in baseline_by_set.items():
            our_val = metrics_by_set[test_set][f'{task}_acc']
            delta = our_val - baseline_val
            # Only strictly positive differences get an explicit plus sign
            if delta > 0:
                delta_str = f"+{delta:.2f}"
            else:
                delta_str = f"{delta:.2f}"

            results_data.append({
                'Task': task.capitalize(),
                'Test Set': test_set.capitalize(),
                'Baseline (ResNet34)': f"{baseline_val:.2f}%",
                'Our Model (WavLM)': f"{our_val:.2f}%",
                'Delta': delta_str
            })

    df_results = pd.DataFrame(results_data)
    print(df_results.to_string(index=False))

    print("\n" + banner)
    print("SUMMARY")
    print(banner)
    for title, metrics in (("Clean Test", clean_metrics), ("Noisy Test", noisy_metrics)):
        print(f"{title} - Gender: {metrics['gender_acc']:.2f}% | Dialect: {metrics['dialect_acc']:.2f}%")