In [2]:
# ============================================================
# Vietnamese Speaker Profiling - Kaggle Training
# ============================================================
!git clone https://github.com/VuThanhLam124/Profiling_gender_dialect.git

fatal: destination path 'Profiling_gender_dialect' already exists and is not an empty directory.


In [None]:
# !rm -rf Profiling_gender_dialect
# !git clone https://github.com/VuThanhLam124/Profiling_gender_dialect.git

Cloning into 'Profiling_gender_dialect'...
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Enumerating objects: 98, done.[K
remote: Counting objects: 100% (98/98), done.[K
remote: Compressing objects: 100% (62/62), done.[K
remote: Total 98 (delta 51), reused 82 (delta 35), pack-reused 0 (from 0)[K
Receiving objects: 100% (98/98), 107.13 KiB | 1.47 MiB/s, done.
Resolving deltas: 100% (51/51), done.
remote: Total 98 (delta 51), reused 82 (delta 35), pack-reused 0 (from 0)[K
Receiving objects: 100% (98/98), 107.13 KiB | 1.47 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [2]:
cd Profiling_gender_dialect

/kaggle/working/Profiling_gender_dialect


In [3]:
ls

app.py               eval.py      [0m[01;34mnotebooks[0m/       requirements.txt
compare_encoders.py  finetune.py  prepare_data.py  [01;34msrc[0m/
[01;34mconfigs[0m/             infer.py     README.md


In [4]:
# Install dependencies
!pip install -r requirements.txt



In [5]:
# Check dataset structure and metadata columns
import os
import pandas as pd

VISPEECH_ROOT = "/kaggle/input/vispeech"

print("Dataset structure:")
for item in os.listdir(VISPEECH_ROOT):
    item_path = os.path.join(VISPEECH_ROOT, item)
    if os.path.isdir(item_path):
        print(f"  {item}/")
        for subitem in os.listdir(item_path)[:3]:
            print(f"    - {subitem}")
    else:
        print(f"  {item}")

# Check metadata format
print("\n" + "=" * 50)
print("METADATA INFO:")
print("=" * 50)
meta_path = os.path.join(VISPEECH_ROOT, "metadata/trainset.csv")
df = pd.read_csv(meta_path)
print(f"Columns: {list(df.columns)}")
print(f"Total samples: {len(df)}")
print(f"\nFirst 3 rows:")
print(df.head(3).to_string())

# Check column names for finetune.py compatibility
print("\n" + "=" * 50)
print("COLUMN MAPPING CHECK:")
print("=" * 50)
required_cols = {
    'audio_name': ['audio_name', 'filename', 'file', 'path'],
    'gender': ['gender', 'sex'],
    'dialect': ['dialect', 'accent', 'region'],
    'speaker': ['speaker', 'speaker_id', 'spk_id']
}

for target, candidates in required_cols.items():
    found = None
    for col in candidates:
        if col in df.columns:
            found = col
            break
    if found:
        print(f"[OK] {target}: found as '{found}'")
    else:
        print(f"[MISSING] {target}: NOT FOUND (need one of {candidates})")

Dataset structure:
  trainset/
    - ViSpeech_00569.mp3
    - ViSpeech_04453.mp3
    - ViSpeech_03028.mp3
  metadata/
    - trainset.csv
    - clean_testset.csv
    - noisy_testset.csv
  noisy_testset/
    - ViSpeech_10402.mp3
    - ViSpeech_10495.mp3
    - ViSpeech_10020.mp3
  clean_testset/
    - ViSpeech_09610.mp3
    - ViSpeech_09244.mp3
    - ViSpeech_09148.mp3

METADATA INFO:
Columns: ['audio_name', 'dialect', 'gender', 'speaker']
Total samples: 8166

First 3 rows:
           audio_name  dialect gender  speaker
0  ViSpeech_00001.mp3  Central   Male  SPK0001
1  ViSpeech_00002.mp3  Central   Male  SPK0001
2  ViSpeech_00003.mp3  Central   Male  SPK0001

COLUMN MAPPING CHECK:
[OK] audio_name: found as 'audio_name'
[OK] gender: found as 'gender'
[OK] dialect: found as 'dialect'
[OK] speaker: found as 'speaker'


In [7]:
# ============================================================
# TEST 3 ENCODER TYPES (5 epochs each)
# ============================================================
import yaml
import os

ENCODERS_TO_TEST = [
    # "microsoft/wavlm-base-plus",
    # "facebook/hubert-base-ls960",
    "facebook/wav2vec2-base",
    # "facebook/wav2vec2-large-960h"
]

base_config = """
# Test Configuration - Full Model Finetuning
model:
  name: "{encoder_name}"
  num_genders: 2
  num_dialects: 3
  dropout: 0.15
  head_hidden_dim: 256
  freeze_encoder: false 

training:
  batch_size: 32
  gradient_accumulation_steps: 4
  learning_rate: 5e-5
  num_epochs: 5  # Quick test
  warmup_ratio: 0.1
  weight_decay: 0.01
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 2

loss:
  dialect_weight: 2.5

mlflow:
  enabled: false

data:
  train_meta: "/kaggle/input/vispeech/metadata/trainset.csv"
  train_audio: "/kaggle/input/vispeech/trainset"
  val_split: 0.15

audio:
  sampling_rate: 16000
  max_duration: 5

augmentation:
  enabled: true
  prob: 0.8

output:
  dir: "/kaggle/working/output_{encoder_short}"
  save_total_limit: 1
  metric_for_best_model: "dialect_acc"

early_stopping:
  patience: 3
  threshold: 0.0001

labels:
  gender:
    Male: 0
    Female: 1
  dialect:
    North: 0
    Central: 1
    South: 2

seed: 42
"""

results = {}

for encoder in ENCODERS_TO_TEST:
    encoder_short = encoder.split("/")[-1]
    print("\n" + "=" * 70)
    print(f"TESTING: {encoder}")
    print("=" * 70)
    
    # Create config for this encoder
    config_content = base_config.format(
        encoder_name=encoder,
        encoder_short=encoder_short
    )
    
    config_path = f"configs/test_{encoder_short}.yaml"
    with open(config_path, "w") as f:
        f.write(config_content)
    
    print(f"Config saved: {config_path}")
    print(f"freeze_encoder: false (full finetuning)")
    print(f"Training 5 epochs...")
    
    # Run training
    exit_code = os.system(f"python finetune.py --config {config_path}")
    
    if exit_code == 0:
        results[encoder] = "SUCCESS"
        print(f"\n{encoder_short}: Training completed!")
    else:
        results[encoder] = f"FAILED (exit code: {exit_code})"
        print(f"\n{encoder_short}: Training failed!")

# Summary
print("\n" + "=" * 70)
print("TEST SUMMARY")
print("=" * 70)
for encoder, status in results.items():
    print(f"  {encoder}: {status}")


TESTING: facebook/wav2vec2-large-960h
Config saved: configs/test_wav2vec2-large-960h.yaml
freeze_encoder: false (full finetuning)
Training 5 epochs...


2025-11-28 04:22:51.285819: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764303771.307524    1482 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764303771.314207    1482 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'
Attri

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.0}


  4%|▍         | 50/1120 [02:32<52:04,  2.92s/it] 

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 0.0, 'epoch': 0.22}


  5%|▍         | 51/1120 [02:35<52:15,  2.93s/it]Traceback (most recent call last):
  File "/kaggle/working/Profiling_gender_dialect/finetune.py", line 506, in <module>
    main(args.config)
  File "/kaggle/working/Profiling_gender_dialect/finetune.py", line 463, in main
    trainer.train()
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2206, in train
    return inner_training_loop(
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 2548, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/transformers/trainer.py", line 3797, in training_step
    self.accelerator.backward(loss, **kwargs)
  File "/usr/local/lib/python3.11/dist-packages/accelerate/accelerator.py", line 2574, in backward
    self.scaler.scale(loss).backward(**kwargs)
 


wav2vec2-large-960h: Training failed!

TEST SUMMARY
  facebook/wav2vec2-large-960h: FAILED (exit code: 256)


In [43]:
# Check saved model
import os

output_model_dir = "/kaggle/working/output/best_model"
if os.path.exists(output_model_dir):
    print(f"Model saved at: {output_model_dir}")
    print("Files:")
    for f in os.listdir(output_model_dir):
        size = os.path.getsize(os.path.join(output_model_dir, f)) / 1024 / 1024
        print(f"  - {f} ({size:.2f} MB)")
else:
    print("Model not found. Check training logs.")

Model saved at: /kaggle/working/output/best_model
Files:
  - training_args.bin (0.01 MB)
  - model.safetensors (3.89 MB)


In [None]:
# ============================================================
# EVALUATION - Evaluate best model on test sets  
# ============================================================
# Sử dụng raw audio mode (không cần extract features)
!python eval.py \
    --checkpoint /kaggle/working/output/best_model \
    --config configs/finetune.yaml \
    --test_name clean_test \
    --test_name2 noisy_test \
    --output_dir /kaggle/working/output/evaluation