Tôi sẽ sử dụng file này: import lên Kaggle rồi chạy để tận dụng GPU và RAM của Kaggle.

In [None]:
# Clone the project repository into the current working directory.
!git clone https://github.com/VuThanhLam124/Profiling_gender_dialect.git
# Install the ffmpeg system package (presumably needed for audio decoding — confirm).
!apt-get install -y ffmpeg

In [None]:
cd Profiling_gender_dialect

In [None]:
# Install the repository's own requirements first.
!pip install -r requirements.txt
# Then pin the Hugging Face stack; these run after requirements.txt, so the
# versions pinned here are the ones that end up installed.
!pip install -q transformers==4.44.0 accelerate==0.33.0 datasets==2.21.0
# Audio I/O, augmentation, experiment tracking and checkpoint serialization packages.
!pip install -q librosa soundfile audiomentations==0.35.0 wandb safetensors

In [None]:
# ============================================================
# FINETUNE WITH ViMD DATASET
# ============================================================
# Builds the YAML training configuration and writes it to
# configs/vimd_train.yaml, the path later passed to finetune.py / eval.py.
import logging
import os

# `logger` was used throughout the notebook but never defined (NameError).
# Setting the level on the logger itself keeps INFO messages visible even
# when the root logger was already configured by the environment.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("vimd_notebook")
logger.setLevel(logging.INFO)

ENCODER = "facebook/wav2vec2-base"
encoder_short = ENCODER.split("/")[-1]

# SECURITY: never hard-code the API key in the notebook. Provide it through
# the WANDB_API_KEY environment variable (e.g. populated from a Kaggle
# secret). The key that used to be committed here must be treated as leaked
# and revoked.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
# Only enable WandB logging when a key is actually available.
wandb_enabled = "true" if WANDB_API_KEY else "false"

# Single source of truth for the values that are both written to the config
# and reported in the log lines below.
BATCH_SIZE = 8
GRAD_ACCUM_STEPS = 4
WANDB_PROJECT = "vimd-speaker-profiling"

# NOTE(review): learning_rate is written as "2.0e-5" instead of "2e-5".
# PyYAML (YAML 1.1) resolves "2e-5" as a string, not a float; the dotted
# form has the same value and is parsed as a float. Confirm which loader
# finetune.py uses.
# NOTE(review): the API key is still written into the config file on disk
# because the config schema has an `api_key` field; avoid publishing the
# generated configs/ directory.
vimd_config = f"""
model:
  name: "{ENCODER}"
  num_genders: 2
  num_dialects: 3
  dropout: 0.2
  head_hidden_dim: 256
  freeze_encoder: false

training:
  batch_size: {BATCH_SIZE}
  gradient_accumulation_steps: {GRAD_ACCUM_STEPS}
  learning_rate: 2.0e-5
  num_epochs: 10
  warmup_ratio: 0.1
  weight_decay: 0.01
  gradient_clip: 0.5
  lr_scheduler: "cosine"
  fp16: true
  dataloader_num_workers: 2

loss:
  dialect_weight: 2.5

wandb:
  enabled: {wandb_enabled}
  api_key: "{WANDB_API_KEY}"
  project: "{WANDB_PROJECT}"
  run_name: "{encoder_short}"

data:
  source: "vimd"
  vimd_path: "/kaggle/input/vimd-dataset"

audio:
  sampling_rate: 16000
  max_duration: 5

augmentation:
  enabled: true
  prob: 0.8

output:
  dir: "/kaggle/working/output_vimd"
  save_total_limit: 1
  metric_for_best_model: "dialect_acc"

early_stopping:
  patience: 5
  threshold: 0.001

labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2

seed: 42
"""

config_path = "configs/vimd_train.yaml"
# Make sure the target directory exists so the write cannot fail on a
# checkout that lacks configs/.
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, "w", encoding="utf-8") as config_file:
    config_file.write(vimd_config)

logger.info(f"Config saved: {config_path}")
logger.info(f"Encoder: {ENCODER}")
logger.info(
    f"Batch size: {BATCH_SIZE}, Gradient accumulation: {GRAD_ACCUM_STEPS} "
    f"(effective batch: {BATCH_SIZE * GRAD_ACCUM_STEPS})"
)
if WANDB_API_KEY:
    logger.info(f"WandB: enabled, project={WANDB_PROJECT}")
else:
    logger.warning("WandB: disabled (WANDB_API_KEY is not set)")

In [None]:
# ============================================================
# START TRAINING
# ============================================================
# Runs finetune.py with the generated config. Relies on ENCODER and
# config_path defined by the config cell earlier in the notebook.
import logging
import subprocess
import sys

# `logger` was never defined in the notebook; configure it locally so this
# cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("vimd_notebook")
logger.setLevel(logging.INFO)

logger.info("=" * 70)
logger.info(f"TRAINING: {ENCODER}")
logger.info("=" * 70)

# subprocess.run(...).returncode is the real process exit code, whereas
# os.system() returns an encoded wait status on Unix (e.g. 256 for exit 1).
# sys.executable guarantees the script runs under the same interpreter the
# packages were installed into, and the argument list avoids shell parsing.
exit_code = subprocess.run(
    [sys.executable, "finetune.py", "--config", config_path],
    check=False,
).returncode

if exit_code == 0:
    logger.info("Training completed successfully")
else:
    logger.error(f"Training failed with exit code: {exit_code}")

Evaluate with the ViMD dataset

In [None]:
# ============================================================
# CHECK SAVED MODEL
# ============================================================
# Lists every entry of the best-model directory with its size in MB.
import logging
import os

# `logger` was never defined in the notebook; configure it locally so this
# cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("vimd_notebook")
logger.setLevel(logging.INFO)


def _path_size_bytes(path):
    """Return the size of a file, or the summed size of all files under a directory."""
    if not os.path.isdir(path):
        return os.path.getsize(path)
    # os.path.getsize() on a directory only reports the directory entry
    # itself, so walk it to count the contained files.
    return sum(
        os.path.getsize(os.path.join(root, name))
        for root, _dirs, names in os.walk(path)
        for name in names
    )


model_dir = "/kaggle/working/output_vimd/best_model"

logger.info("=" * 70)
logger.info("SAVED MODEL")
logger.info("=" * 70)

# isdir (not exists) so a stray file at this path is reported as missing
# instead of making os.listdir() raise.
if os.path.isdir(model_dir):
    total_size = 0
    for entry in sorted(os.listdir(model_dir)):
        size = _path_size_bytes(os.path.join(model_dir, entry)) / 1024 / 1024
        total_size += size
        logger.info(f"  {entry}: {size:.1f} MB")
    logger.info(f"  Total: {total_size:.1f} MB")
else:
    logger.warning("Model not found")

In [None]:
# ============================================================
# EVALUATE MODEL ON ViMD TEST SET
# ============================================================
# Runs eval.py against the saved best model, writing results under
# /kaggle/working/output_vimd/eval. Skipped when no model was saved.
import logging
import os
import subprocess
import sys

# `logger` was never defined in the notebook; configure it locally so this
# cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("vimd_notebook")
logger.setLevel(logging.INFO)

model_dir = "/kaggle/working/output_vimd/best_model"
config_path = "configs/vimd_train.yaml"

if not os.path.exists(model_dir):
    logger.error("Model not found, skipping eval")
else:
    logger.info("=" * 70)
    logger.info("EVALUATING ON ViMD TEST SET")
    logger.info("=" * 70)

    # returncode is the real exit code; os.system() would return an encoded
    # wait status on Unix. sys.executable keeps the same interpreter as the
    # notebook kernel, and the argument list avoids shell parsing.
    exit_code = subprocess.run(
        [
            sys.executable,
            "eval.py",
            "--checkpoint", model_dir,
            "--config", config_path,
            "--test_name", "vimd_test",
            "--output_dir", "/kaggle/working/output_vimd/eval",
        ],
        check=False,
    ).returncode

    if exit_code == 0:
        logger.info("Evaluation completed successfully")
    else:
        logger.error(f"Evaluation failed with exit code: {exit_code}")

In [None]:
# ============================================================
# DISPLAY CONFUSION MATRIX
# ============================================================
# Shows the confusion-matrix image and logs the accuracy figures stored in
# results.json, both read from the evaluation output directory.
import json
import logging
import os

from IPython.display import Image, display

# `logger` was never defined in the notebook; configure it locally so this
# cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("vimd_notebook")
logger.setLevel(logging.INFO)

eval_dir = "/kaggle/working/output_vimd/eval"

# Display confusion matrix plot
cm_plot = os.path.join(eval_dir, "confusion_matrix_vimd_test_set.png")
if os.path.exists(cm_plot):
    logger.info("Confusion Matrix Plot:")
    display(Image(filename=cm_plot))
else:
    logger.warning(f"Confusion matrix plot not found at: {cm_plot}")

# Load and display results JSON
results_file = os.path.join(eval_dir, "results.json")
if os.path.exists(results_file):
    with open(results_file, "r", encoding="utf-8") as results_fh:
        results = json.load(results_fh)

    # The loop indexes each entry by 'dataset', 'gender_acc' and 'dialect_acc'.
    for r in results:
        logger.info("=" * 70)
        logger.info(f"Dataset: {r['dataset']}")
        logger.info(f"Gender Accuracy: {r['gender_acc']:.2f}%")
        logger.info(f"Dialect Accuracy: {r['dialect_acc']:.2f}%")
        logger.info("=" * 70)
else:
    # Report a missing results file the same way as a missing plot instead
    # of silently printing nothing.
    logger.warning(f"Results file not found at: {results_file}")

In [None]:
# ============================================================
# SAVE MODEL TO KAGGLE OUTPUT
# ============================================================
# Copies the best model and the evaluation artefacts into
# /kaggle/working/final_model_vimd, replacing any previous copy.
import logging
import os
import shutil

# `logger` was never defined in the notebook; configure it locally so this
# cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("vimd_notebook")
logger.setLevel(logging.INFO)


def _replace_tree(src, dst):
    """Copy directory `src` to `dst`, deleting any existing `dst` first."""
    if os.path.exists(dst):
        # Remove the stale copy so files deleted from src do not linger.
        shutil.rmtree(dst)
    shutil.copytree(src, dst)


OUTPUT_DIR = "/kaggle/working/final_model_vimd"
os.makedirs(OUTPUT_DIR, exist_ok=True)

model_dir = "/kaggle/working/output_vimd/best_model"
eval_dir = "/kaggle/working/output_vimd/eval"

logger.info("=" * 70)
logger.info("COPYING MODEL TO OUTPUT")
logger.info("=" * 70)

if os.path.exists(model_dir):
    dst_dir = f"{OUTPUT_DIR}/best_model"
    _replace_tree(model_dir, dst_dir)
    logger.info(f"Copied model to: {dst_dir}")
else:
    logger.warning("Model not found")

# Evaluation output is optional: it is copied only when the eval cell ran.
if os.path.exists(eval_dir):
    dst_eval = f"{OUTPUT_DIR}/eval"
    _replace_tree(eval_dir, dst_eval)
    logger.info(f"Copied eval to: {dst_eval}")

logger.info(f"All files saved to: {OUTPUT_DIR}")

In [None]:
# ============================================================
# LIST FINAL OUTPUT
# ============================================================
# Logs the directory tree under the final output directory, one entry per
# line, with file sizes in MB.
import logging
import os

# `logger` was never defined in the notebook; configure it locally so this
# cell can be run on its own.
logging.basicConfig(level=logging.INFO, format="%(message)s")
logger = logging.getLogger("vimd_notebook")
logger.setLevel(logging.INFO)

OUTPUT_DIR = "/kaggle/working/final_model_vimd"

logger.info("=" * 70)
logger.info("FINAL OUTPUT STRUCTURE")
logger.info("=" * 70)

if os.path.exists(OUTPUT_DIR):
    for root, dirs, files in os.walk(OUTPUT_DIR):
        # Sorting in place makes os.walk descend in a stable order, so the
        # listing is the same on every run.
        dirs.sort()
        # Depth below OUTPUT_DIR = number of path separators left after
        # stripping the OUTPUT_DIR prefix; it drives the indentation.
        level = root.replace(OUTPUT_DIR, '').count(os.sep)
        indent = '  ' * level
        logger.info(f"{indent}{os.path.basename(root)}/")
        sub_indent = '  ' * (level + 1)
        for file_name in sorted(files):
            size = os.path.getsize(os.path.join(root, file_name)) / 1024 / 1024
            logger.info(f"{sub_indent}{file_name} ({size:.1f} MB)")
else:
    logger.warning("Output directory not found")