Tôi sẽ sử dụng file này: import lên Kaggle rồi chạy để tận dụng GPU và RAM của Kaggle.

In [None]:
# Fetch the project sources; the next cell changes into the cloned directory.
!git clone https://github.com/VuThanhLam124/Profiling_gender_dialect.git
# Install the ffmpeg system package non-interactively (-y).
# NOTE(review): presumably needed for audio decoding - confirm against the repo.
!apt-get install -y ffmpeg

In [None]:
# Work from the repository root: later cells use relative paths such as
# src/models.py, configs/ and finetune.py.
# The explicit %cd magic is used because bare `cd` depends on automagic, which
# IPython only applies to single-line cells and which can be disabled.
%cd Profiling_gender_dialect

In [None]:
# Install the repository's own requirements first.
!pip install -r requirements.txt
# Then pin the Hugging Face packages to fixed versions (-q reduces pip output).
!pip install -q transformers==4.44.0 accelerate==0.33.0 datasets==2.21.0
# Remaining packages: audio loading, augmentation, experiment tracking and
# the safetensors checkpoint format.
!pip install -q librosa soundfile audiomentations==0.35.0 wandb safetensors

In [None]:
# ============================================================
# FIX: Patch models.py for attention mask size mismatch
# ============================================================
# Replaces one snippet of src/models.py in place. The replacement builds a new
# mask whenever the hidden-state length differs from the attention-mask length
# before calling the attentive pooling layer.
# NOTE(review): the replacement mask is all ones, so padded frames take part
# in the pooling - confirm that this is acceptable.
import logging

# This cell runs before the training cell that configures logging, so the
# logger must be created here; otherwise the status messages below raise
# NameError right after the file has been rewritten. basicConfig is a no-op
# when the root logger already has handlers, so re-running is harmless.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("training")

# Path is relative to the repository root (the notebook changes into it earlier).
MODELS_PATH = "src/models.py"

# Replacement snippet. Indentation and blank-line whitespace must stay exactly
# as written so that the "already patched" check below can match it.
fix_code = '''
        # Get hidden states from either raw audio or pre-extracted features
        if input_features is not None:
            # Use pre-extracted features directly
            hidden_states = input_features
        elif input_values is not None:
            # Extract features from encoder
            hidden_states = self._encode(input_values, attention_mask)
        else:
            raise ValueError("Either input_values or input_features must be provided")
        
        # Create proper attention mask for hidden states (encoder downsamples audio)
        # Hidden states have different sequence length than input audio
        if attention_mask is not None and hidden_states.shape[1] != attention_mask.shape[1]:
            # Create new mask based on hidden states length
            batch_size, seq_len, _ = hidden_states.shape
            pooled_mask = torch.ones(batch_size, seq_len, device=hidden_states.device)
        else:
            pooled_mask = attention_mask
        
        # Attentive pooling
        pooled, attn_weights = self.attentive_pooling(hidden_states, pooled_mask)
'''

# Snippet expected in the unpatched file; it is matched byte-for-byte.
old_code = '''
        # Get hidden states from either raw audio or pre-extracted features
        if input_features is not None:
            # Use pre-extracted features directly
            hidden_states = input_features
        elif input_values is not None:
            # Extract features from encoder
            hidden_states = self._encode(input_values, attention_mask)
        else:
            raise ValueError("Either input_values or input_features must be provided")
        
        # Attentive pooling
        pooled, attn_weights = self.attentive_pooling(hidden_states, attention_mask)
'''

with open(MODELS_PATH, "r", encoding="utf-8") as f:
    content = f.read()

if old_code in content:
    content = content.replace(old_code, fix_code)
    with open(MODELS_PATH, "w", encoding="utf-8") as f:
        f.write(content)
    logger.info("Patched src/models.py - Fixed attention mask size mismatch")
elif fix_code in content:
    # Re-running the cell is safe: the replacement is already in place.
    logger.info("src/models.py already patched")
else:
    # Neither snippet matched, so the file was left untouched.
    logger.warning("src/models.py not patched: expected code not found (different version?)")

In [None]:
# ============================================================
# Train 3 ENCODER TYPES (20 epochs each)
# ============================================================
# For every encoder in ENCODERS_TO_TEST this cell renders a YAML config from
# `base_config`, writes it to configs/train_<encoder_short>.yaml and runs
# finetune.py on it as a child process. The outcome of each run is collected
# in `results` and summarised at the end.
import logging
import os
import subprocess

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("training")

ENCODERS_TO_TEST = [
    "microsoft/wavlm-base-plus",
    "facebook/hubert-base-ls960",
    "facebook/wav2vec2-base",
]

# Config template; {encoder_name} and {encoder_short} are filled by str.format,
# so the template must not contain any other curly braces.
base_config = """
model:
  name: "{encoder_name}"
  num_genders: 2
  num_dialects: 3
  dropout: 0.15
  head_hidden_dim: 256
  freeze_encoder: false 

training:
  batch_size: 32
  gradient_accumulation_steps: 4
  learning_rate: 3.5e-5
  num_epochs: 20  
  warmup_ratio: 0.1
  weight_decay: 0.01
  gradient_clip: 1.0
  lr_scheduler: "linear"
  fp16: true
  dataloader_num_workers: 2

loss:
  dialect_weight: 2.5

wandb:
  enabled: false

data:
  source: "vispeech"
  vispeech_root: "/kaggle/input/vispeech"
  train_meta: "/kaggle/input/vispeech/metadata/trainset.csv"
  train_audio: "/kaggle/input/vispeech/trainset"
  clean_test_meta: "/kaggle/input/vispeech/metadata/clean_testset.csv"
  clean_test_audio: "/kaggle/input/vispeech/clean_testset"
  noisy_test_meta: "/kaggle/input/vispeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/kaggle/input/vispeech/noisy_testset"
  val_split: 0.15

audio:
  sampling_rate: 16000
  max_duration: 5

augmentation:
  enabled: true
  prob: 0.8

output:
  dir: "/kaggle/working/output_{encoder_short}"
  save_total_limit: 1
  metric_for_best_model: "dialect_acc"

early_stopping:
  patience: 3
  threshold: 0.00001

labels:
  gender:
    Male: 0
    Female: 1
  dialect:
    North: 0
    Central: 1
    South: 2

seed: 42
"""

# Maps the full encoder name to a human-readable outcome string.
results = {}

# The generated configs are written here; make sure the directory exists.
os.makedirs("configs", exist_ok=True)

for encoder in ENCODERS_TO_TEST:
    # Last path component of the hub id, e.g. "wavlm-base-plus".
    encoder_short = encoder.split("/")[-1]
    logger.info("=" * 70)
    logger.info(f"TRAINING: {encoder}")
    logger.info("=" * 70)

    config_content = base_config.format(
        encoder_name=encoder,
        encoder_short=encoder_short
    )

    config_path = f"configs/train_{encoder_short}.yaml"
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(config_content)

    logger.info(f"Config: {config_path}")
    logger.info(f"Output: /kaggle/working/output_{encoder_short}")

    # subprocess.run with an argument list avoids shell parsing and exposes
    # the real exit code; os.system returns an encoded wait status on Unix
    # (exit 1 shows up as 256). A negative value means killed by a signal.
    exit_code = subprocess.run(
        ["python", "finetune.py", "--config", config_path],
        check=False,
    ).returncode

    if exit_code == 0:
        results[encoder] = "SUCCESS"
        logger.info(f"{encoder_short}: Training completed")
    else:
        # Keep going so one failing encoder does not block the others.
        results[encoder] = f"FAILED (exit code: {exit_code})"
        logger.error(f"{encoder_short}: Training failed")

logger.info("=" * 70)
logger.info("TRAINING SUMMARY")
logger.info("=" * 70)
for encoder, status in results.items():
    logger.info(f"  {encoder}: {status}")

Evaluate on the ViSpeech dataset

In [None]:
# ============================================================
# CHECK SAVED MODELS
# ============================================================
# Logs every entry of each encoder's best_model directory with its size in MB,
# followed by the per-encoder total. Uses the `logger` created in the
# training cell.
import os

ENCODERS = ["wavlm-base-plus", "hubert-base-ls960", "wav2vec2-base"]

for banner_line in ("=" * 70, "SAVED MODELS", "=" * 70):
    logger.info(banner_line)

for encoder in ENCODERS:
    model_dir = f"/kaggle/working/output_{encoder}/best_model"

    # Nothing to list when training did not leave a best_model directory.
    if not os.path.exists(model_dir):
        logger.warning(f"{encoder}: Model not found")
        continue

    logger.info(f"{encoder}:")
    total_mb = 0
    for entry in sorted(os.listdir(model_dir)):
        # Bytes -> MB, one entry at a time, accumulated in listing order.
        entry_mb = os.path.getsize(os.path.join(model_dir, entry)) / 1024 / 1024
        total_mb += entry_mb
        logger.info(f"  {entry}: {entry_mb:.1f} MB")
    logger.info(f"  Total: {total_mb:.1f} MB")

In [None]:
# ============================================================
# EVALUATE ALL MODELS ON CLEAN & NOISY TEST SETS
# ============================================================
# Runs eval.py once per test set for every encoder that has a best_model
# directory and records SUCCESS only when every run exits with code 0.
# Uses the `logger` created in the training cell.
import os
import subprocess

ENCODERS = ["wavlm-base-plus", "hubert-base-ls960", "wav2vec2-base"]

# (value passed to --test_name, message logged before the run)
TEST_SETS = [
    ("clean_test", "Clean test set..."),
    ("noisy_test", "Noisy test set..."),
]

# Maps the short encoder name to a human-readable outcome string.
eval_results = {}

for encoder in ENCODERS:
    model_dir = f"/kaggle/working/output_{encoder}/best_model"
    config_path = f"configs/train_{encoder}.yaml"

    if not os.path.exists(model_dir):
        logger.warning(f"{encoder}: Model not found, skipping eval")
        continue

    logger.info("=" * 70)
    logger.info(f"EVALUATING: {encoder}")
    logger.info("=" * 70)

    # Track each test set separately so that a failure on the first run is
    # not hidden by a successful second run.
    failed_tests = []
    for test_name, message in TEST_SETS:
        logger.info(message)
        # An argument list avoids shell parsing; returncode is the real exit
        # code (negative when the process was killed by a signal).
        exit_code = subprocess.run(
            [
                "python", "eval.py",
                "--checkpoint", model_dir,
                "--config", config_path,
                "--test_name", test_name,
                "--output_dir", f"/kaggle/working/output_{encoder}/eval",
            ],
            check=False,
        ).returncode
        if exit_code != 0:
            logger.error(f"{encoder}: {test_name} evaluation failed (exit code: {exit_code})")
            failed_tests.append(test_name)

    if not failed_tests:
        eval_results[encoder] = "SUCCESS"
    else:
        eval_results[encoder] = f"FAILED ({', '.join(failed_tests)})"

logger.info("=" * 70)
logger.info("EVALUATION SUMMARY")
logger.info("=" * 70)
for encoder, status in eval_results.items():
    logger.info(f"  {encoder}: {status}")

In [None]:
# ============================================================
# SAVE MODELS TO KAGGLE OUTPUT
# ============================================================
# Copies each encoder's best_model directory, and then its eval directory,
# into /kaggle/working/final_models/<encoder>. An existing destination is
# removed first so the copy always starts clean. Uses the `logger` created in
# the training cell.
import shutil
import os

OUTPUT_DIR = "/kaggle/working/final_models"
os.makedirs(OUTPUT_DIR, exist_ok=True)

ENCODERS = ["wavlm-base-plus", "hubert-base-ls960", "wav2vec2-base"]


def _replace_tree(source, target):
    """Copy directory `source` to `target`, deleting `target` first if present."""
    if os.path.exists(target):
        shutil.rmtree(target)
    shutil.copytree(source, target)


for banner_line in ("=" * 70, "COPYING MODELS TO OUTPUT", "=" * 70):
    logger.info(banner_line)

# First pass: model checkpoints.
for encoder in ENCODERS:
    checkpoint_dir = f"/kaggle/working/output_{encoder}/best_model"
    if not os.path.exists(checkpoint_dir):
        logger.warning(f"Not found: {encoder}")
        continue
    _replace_tree(checkpoint_dir, f"{OUTPUT_DIR}/{encoder}")
    logger.info(f"Copied: {encoder}")

# Second pass: evaluation results. This runs after all checkpoints because
# replacing <encoder>/ above also removes any previously copied eval folder.
for encoder in ENCODERS:
    eval_dir = f"/kaggle/working/output_{encoder}/eval"
    if not os.path.exists(eval_dir):
        continue
    _replace_tree(eval_dir, f"{OUTPUT_DIR}/{encoder}/eval")
    logger.info(f"Copied eval: {encoder}")

logger.info(f"All models saved to: {OUTPUT_DIR}")

In [None]:
# ============================================================
# LIST FINAL OUTPUT
# ============================================================
# Logs the directory tree under OUTPUT_DIR, indenting two spaces per nesting
# level and showing each file's size in MB. Uses the `logger` created in the
# training cell.
import os

OUTPUT_DIR = "/kaggle/working/final_models"

for banner_line in ("=" * 70, "FINAL OUTPUT STRUCTURE", "=" * 70):
    logger.info(banner_line)

for dirpath, _subdirs, filenames in os.walk(OUTPUT_DIR):
    # Depth = number of path separators left after removing the root prefix.
    depth = dirpath.replace(OUTPUT_DIR, '').count(os.sep)
    dir_pad = '  ' * depth
    file_pad = dir_pad + '  '

    logger.info(f"{dir_pad}{os.path.basename(dirpath)}/")
    for filename in filenames:
        size_mb = os.path.getsize(os.path.join(dirpath, filename)) / 1024 / 1024
        logger.info(f"{file_pad}{filename} ({size_mb:.1f} MB)")