Tôi sẽ sử dụng file này: import lên Kaggle rồi chạy để tận dụng GPU và RAM của Kaggle.

In [None]:
# Fetch the project sources and install the ffmpeg system package
# (-y answers apt-get's confirmation prompt automatically).
# NOTE(review): ffmpeg is presumably needed for audio decoding - confirm.
!git clone https://github.com/VuThanhLam124/Profiling_gender_dialect.git
!apt-get install -y ffmpeg

In [None]:
# Work from the cloned repository root: later cells use relative paths
# (requirements.txt, configs/, finetune.py, eval.py, src/).
cd Profiling_gender_dialect

In [None]:
# Install the repository's requirements first, then the explicitly pinned
# transformers / accelerate / datasets versions and the audio + logging
# libraries (-q keeps pip output quiet).
# NOTE(review): the pinned installs run last, so they presumably win if
# requirements.txt asks for different versions - confirm.
!pip install -r requirements.txt
!pip install -q transformers==4.44.0 accelerate==0.33.0 datasets==2.21.0
!pip install -q librosa soundfile audiomentations==0.35.0 wandb safetensors

In [None]:
# ============================================================
# Train 3 ENCODER TYPES (20 epochs each)
# ============================================================
# For every encoder in ENCODERS_TO_TEST this cell renders a YAML config from
# `base_config`, writes it to configs/train_<encoder>.yaml, runs finetune.py
# on it as a child process and records SUCCESS / FAILED per encoder.
import os
import logging
import subprocess

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("training")

ENCODERS_TO_TEST = [
    "microsoft/wavlm-base-plus",
    "facebook/hubert-base-ls960",
    "facebook/wav2vec2-base",
]

# Secrets must not live in the notebook: the Weights & Biases key is read from
# the WANDB_API_KEY environment variable. Any key that was previously
# committed here should be treated as leaked and revoked.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "")
# Without a key, W&B logging is switched off in the generated config.
# NOTE(review): assumes finetune.py honours `wandb.enabled: false` - confirm.
WANDB_ENABLED = bool(WANDB_API_KEY)
if not WANDB_ENABLED:
    logger.warning("WANDB_API_KEY is not set; WandB logging is disabled for these runs")

# YAML template; the {placeholders} are filled in per encoder via str.format.
base_config = """
model:
  name: "{encoder_name}"
  num_genders: 2
  num_dialects: 3
  dropout: 0.25
  head_hidden_dim: 256
  freeze_encoder: false 

training:
  batch_size: 32
  gradient_accumulation_steps: 4
  learning_rate: 2e-5
  num_epochs: 20  
  warmup_ratio: 0.1
  weight_decay: 0.01
  gradient_clip: 0.5
  lr_scheduler: "cosine"
  fp16: true
  dataloader_num_workers: 2

loss:
  dialect_weight: 3

wandb:
  enabled: {wandb_enabled}
  api_key: "{wandb_key}"
  project: "vispeech-speaker-profiling"
  run_name: "{encoder_short}"

data:
  source: "vispeech"
  vispeech_root: "/kaggle/input/vispeech"
  train_meta: "/kaggle/input/vispeech/metadata/trainset.csv"
  train_audio: "/kaggle/input/vispeech/trainset"
  clean_test_meta: "/kaggle/input/vispeech/metadata/clean_testset.csv"
  clean_test_audio: "/kaggle/input/vispeech/clean_testset"
  noisy_test_meta: "/kaggle/input/vispeech/metadata/noisy_testset.csv"
  noisy_test_audio: "/kaggle/input/vispeech/noisy_testset"
  val_split: 0.15

audio:
  sampling_rate: 16000
  max_duration: 5

augmentation:
  enabled: true
  prob: 0.8

output:
  dir: "/kaggle/working/output_{encoder_short}"
  save_total_limit: 1
  metric_for_best_model: "dialect_acc"

early_stopping:
  patience: 5
  threshold: 0.001

labels:
  gender:
    Male: 0
    Female: 1
  dialect:
    North: 0
    Central: 1
    South: 2

seed: 42
"""

# The generated configs are written under configs/; make sure it exists.
os.makedirs("configs", exist_ok=True)

# Maps full encoder name -> "SUCCESS" or "FAILED (exit code: N)".
results = {}

for encoder in ENCODERS_TO_TEST:
    # "microsoft/wavlm-base-plus" -> "wavlm-base-plus"; used in file and run names.
    encoder_short = encoder.split("/")[-1]
    logger.info("=" * 70)
    logger.info(f"TRAINING: {encoder}")
    logger.info("=" * 70)

    config_content = base_config.format(
        encoder_name=encoder,
        encoder_short=encoder_short,
        wandb_key=WANDB_API_KEY,
        wandb_enabled="true" if WANDB_ENABLED else "false",
    )

    config_path = f"configs/train_{encoder_short}.yaml"
    with open(config_path, "w", encoding="utf-8") as f:
        f.write(config_content)

    logger.info(f"Config: {config_path}")
    logger.info(f"Output: /kaggle/working/output_{encoder_short}")
    logger.info(f"WandB: {'enabled' if WANDB_ENABLED else 'disabled'}, project=vispeech-speaker-profiling")

    # subprocess.run gives the child's real exit code (os.system returns an
    # encoded wait status on Unix) and the argument list avoids shell parsing.
    exit_code = subprocess.run(
        ["python", "finetune.py", "--config", config_path]
    ).returncode

    if exit_code == 0:
        results[encoder] = "SUCCESS"
        logger.info(f"{encoder_short}: Training completed")
    else:
        results[encoder] = f"FAILED (exit code: {exit_code})"
        logger.error(f"{encoder_short}: Training failed")

logger.info("=" * 70)
logger.info("TRAINING SUMMARY")
logger.info("=" * 70)
for encoder, status in results.items():
    logger.info(f"  {encoder}: {status}")

## Evaluate BEFORE Training (Pretrained Models - No Finetuning)
Đánh giá tất cả pretrained models (chưa finetune) trên clean_testset để có baseline so sánh với sau khi train.

In [None]:
# ============================================================
# EVALUATE PRETRAINED MODELS (BEFORE FINETUNING)
# ============================================================
# Baseline for the before/after comparison: each pretrained encoder is wrapped
# in MultiTaskSpeakerModel and scored on the clean test set, one clip at a
# time. Results are stored in `pretrained_results`, keyed by the short
# encoder name.
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoFeatureExtractor
import librosa
import os

from src.models import MultiTaskSpeakerModel

logger.info("=" * 70)
logger.info("EVALUATING PRETRAINED MODELS (BEFORE FINETUNING)")
logger.info("=" * 70)

# Load test metadata
test_meta_path = "/kaggle/input/vispeech/metadata/clean_testset.csv"
test_audio_dir = "/kaggle/input/vispeech/clean_testset"
test_df = pd.read_csv(test_meta_path)

logger.info(f"Test samples: {len(test_df)}")

# Label mappings
gender_map = {'Male': 0, 'Female': 1}
dialect_map = {'North': 0, 'Central': 1, 'South': 2}

sampling_rate = 16000
max_duration = 5
# Every clip is padded / centre-cropped to exactly this many samples.
max_length = sampling_rate * max_duration

# The device does not depend on the encoder, so pick it once.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the first few per-sample failures are logged in full to keep the
# output readable; the rest are counted and summarised.
max_logged_failures = 5

# Store results for all encoders
pretrained_results = {}

for encoder in ENCODERS_TO_TEST:
    encoder_short = encoder.split("/")[-1]

    logger.info("-" * 50)
    logger.info(f"Evaluating: {encoder}")

    # Initialize pretrained model (random classification heads)
    model = MultiTaskSpeakerModel(
        model_name=encoder,
        num_genders=2,
        num_dialects=3,
        dropout=0.25,
        head_hidden_dim=256
    )
    model.to(device)
    model.eval()

    # Load feature extractor
    feature_extractor = AutoFeatureExtractor.from_pretrained(encoder)

    # Evaluate
    gender_preds, gender_labels = [], []
    dialect_preds, dialect_labels = [], []
    skipped = 0

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=encoder_short):
        try:
            audio_path = os.path.join(test_audio_dir, row['audio_name'])

            # Load audio
            audio, _ = librosa.load(audio_path, sr=sampling_rate, mono=True)

            # Trim silence
            audio, _ = librosa.effects.trim(audio, top_db=20)

            # Peak-normalize; the epsilon avoids dividing by zero on silence
            if len(audio) > 0:
                audio = audio / (np.max(np.abs(audio)) + 1e-8)

            # Pad (at the end) or truncate (keep the centre) to max_length
            if len(audio) < max_length:
                audio = np.pad(audio, (0, max_length - len(audio)))
            else:
                start = (len(audio) - max_length) // 2
                audio = audio[start:start + max_length]

            # Extract features
            inputs = feature_extractor(
                audio,
                sampling_rate=sampling_rate,
                return_tensors="pt",
                padding=True
            )

            # Predict
            with torch.no_grad():
                outputs = model(inputs.input_values.to(device))
                g_pred = torch.argmax(outputs['gender_logits'], dim=-1).cpu().item()
                d_pred = torch.argmax(outputs['dialect_logits'], dim=-1).cpu().item()

            # Get labels
            # NOTE(review): unknown label strings fall back to class 0 - confirm
            # this is intended rather than skipping the row.
            g_label = gender_map.get(row['gender'], 0)
            d_label = dialect_map.get(row['dialect'], 0)

            gender_preds.append(g_pred)
            gender_labels.append(g_label)
            dialect_preds.append(d_pred)
            dialect_labels.append(d_label)

        except Exception as e:
            # Best effort: one bad clip must not abort the whole evaluation,
            # but failures are reported instead of being hidden.
            skipped += 1
            if skipped <= max_logged_failures:
                logger.warning(f"  Skipping {row.get('audio_name', '<unknown>')}: {e!r}")
            continue

    # Clear GPU memory before moving on (also on the early-exit path below)
    del model
    torch.cuda.empty_cache()

    if skipped:
        logger.warning(f"  {encoder_short}: skipped {skipped}/{len(test_df)} samples due to errors")

    if not gender_labels:
        # Metrics are undefined without any evaluated sample.
        logger.error(f"  {encoder_short}: no sample could be evaluated, metrics not computed")
        continue

    # Calculate metrics
    gender_acc = accuracy_score(gender_labels, gender_preds)
    gender_f1 = f1_score(gender_labels, gender_preds, average='weighted')
    dialect_acc = accuracy_score(dialect_labels, dialect_preds)
    dialect_f1 = f1_score(dialect_labels, dialect_preds, average='weighted')

    pretrained_results[encoder_short] = {
        'gender_acc': gender_acc,
        'gender_f1': gender_f1,
        'dialect_acc': dialect_acc,
        'dialect_f1': dialect_f1
    }

    logger.info(f"  Gender Acc: {gender_acc*100:.2f}% | Dialect Acc: {dialect_acc*100:.2f}%")

# Summary table
logger.info("=" * 70)
logger.info("PRETRAINED MODELS BASELINE (BEFORE FINETUNING)")
logger.info("=" * 70)
logger.info(f"{'Encoder':<25} {'Gender Acc':>12} {'Dialect Acc':>12}")
logger.info("-" * 51)

for encoder_short, metrics in pretrained_results.items():
    logger.info(f"{encoder_short:<25} {metrics['gender_acc']*100:>11.2f}% {metrics['dialect_acc']*100:>11.2f}%")

logger.info("-" * 51)
logger.info("(Random heads = ~50% gender, ~33% dialect expected)")

Evaluate the finetuned models on the ViSpeech test sets

In [None]:
# ============================================================
# CHECK SAVED MODELS
# ============================================================
# For each encoder, list the entries of its best_model directory with their
# individual sizes and the running total, all in MB.
import os

ENCODERS = ["wavlm-base-plus", "hubert-base-ls960", "wav2vec2-base"]

logger.info("=" * 70)
logger.info("SAVED MODELS")
logger.info("=" * 70)

for encoder in ENCODERS:
    model_dir = f"/kaggle/working/output_{encoder}/best_model"

    # Guard clause: nothing to list when training produced no checkpoint.
    if not os.path.exists(model_dir):
        logger.warning(f"{encoder}: Model not found")
        continue

    logger.info(f"{encoder}:")
    total_mb = 0
    for entry in sorted(os.listdir(model_dir)):
        entry_path = os.path.join(model_dir, entry)
        entry_mb = os.path.getsize(entry_path) / 1024 / 1024
        total_mb += entry_mb
        logger.info(f"  {entry}: {entry_mb:.1f} MB")
    logger.info(f"  Total: {total_mb:.1f} MB")

In [None]:
# ============================================================
# EVALUATE ALL MODELS ON CLEAN & NOISY TEST SETS
# ============================================================
# Runs eval.py once per test set for every encoder that has a saved
# best_model, and records in `eval_results` whether BOTH runs succeeded.
import os
import subprocess

ENCODERS = ["wavlm-base-plus", "hubert-base-ls960", "wav2vec2-base"]
eval_results = {}

# (value passed to --test_name, progress message) for each evaluation run.
test_runs = [
    ("clean_test", "Clean test set..."),
    ("noisy_test", "Noisy test set..."),
]

for encoder in ENCODERS:
    model_dir = f"/kaggle/working/output_{encoder}/best_model"
    config_path = f"configs/train_{encoder}.yaml"

    if not os.path.exists(model_dir):
        logger.warning(f"{encoder}: Model not found, skipping eval")
        continue

    logger.info("=" * 70)
    logger.info(f"EVALUATING: {encoder}")
    logger.info("=" * 70)

    # Track each run separately so a failed clean-test run is not masked by a
    # successful noisy-test run.
    failed_runs = []
    for test_name, message in test_runs:
        logger.info(message)
        # Argument list instead of a shell string: no quoting issues, and
        # returncode is the child's real exit code.
        completed = subprocess.run([
            "python", "eval.py",
            "--checkpoint", model_dir,
            "--config", config_path,
            "--test_name", test_name,
            "--output_dir", f"/kaggle/working/output_{encoder}/eval",
        ])
        if completed.returncode != 0:
            failed_runs.append(test_name)
            logger.error(f"{encoder}: {test_name} evaluation failed (exit code: {completed.returncode})")

    if not failed_runs:
        eval_results[encoder] = "SUCCESS"
    else:
        eval_results[encoder] = f"FAILED ({', '.join(failed_runs)})"

logger.info("=" * 70)
logger.info("EVALUATION SUMMARY")
logger.info("=" * 70)
for encoder, status in eval_results.items():
    logger.info(f"  {encoder}: {status}")

In [None]:
# ============================================================
# COMPARE: BEFORE vs AFTER FINETUNING (ALL ENCODERS)
# ============================================================
# Scores every saved finetuned model on the clean test set with the same
# per-clip preprocessing as the pretrained baseline, then prints a
# before/after table and the encoder with the highest dialect accuracy.
# Reads `pretrained_results` produced by the baseline cell.
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoFeatureExtractor
import librosa
import os

from src.models import MultiTaskSpeakerModel
from src.utils import load_model_checkpoint

logger.info("=" * 70)
logger.info("EVALUATING FINETUNED MODELS (AFTER TRAINING)")
logger.info("=" * 70)

# Load test metadata
test_meta_path = "/kaggle/input/vispeech/metadata/clean_testset.csv"
test_audio_dir = "/kaggle/input/vispeech/clean_testset"
test_df = pd.read_csv(test_meta_path)

# Label mappings
gender_map = {'Male': 0, 'Female': 1}
dialect_map = {'North': 0, 'Central': 1, 'South': 2}

sampling_rate = 16000
# 5 seconds of audio; every clip is padded / centre-cropped to this length.
max_length = sampling_rate * 5

# Chosen once and used both for loading the checkpoint and for inference, so
# the cell also works on a CPU-only session.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Only the first few per-sample failures are logged in full.
max_logged_failures = 5

# Store finetuned results
finetuned_results = {}

ENCODERS = ["wavlm-base-plus", "hubert-base-ls960", "wav2vec2-base"]
ENCODER_FULL_NAMES = {
    "wavlm-base-plus": "microsoft/wavlm-base-plus",
    "hubert-base-ls960": "facebook/hubert-base-ls960",
    "wav2vec2-base": "facebook/wav2vec2-base"
}

for encoder_short in ENCODERS:
    model_dir = f"/kaggle/working/output_{encoder_short}/best_model"
    encoder_full = ENCODER_FULL_NAMES[encoder_short]

    if not os.path.exists(model_dir):
        logger.warning(f"{encoder_short}: Model not found, skipping")
        continue

    logger.info("-" * 50)
    logger.info(f"Evaluating finetuned: {encoder_short}")

    # Load finetuned model
    model = MultiTaskSpeakerModel(
        model_name=encoder_full,
        num_genders=2,
        num_dialects=3,
        dropout=0.25,
        head_hidden_dim=256
    )
    # str(device) is "cuda" or "cpu", matching the string the loader was
    # previously given, but no longer assumes a GPU is present.
    model = load_model_checkpoint(model, model_dir, str(device))

    model.to(device)
    model.eval()

    # Load feature extractor from saved model
    feature_extractor = AutoFeatureExtractor.from_pretrained(model_dir)

    # Evaluate
    gender_preds, gender_labels = [], []
    dialect_preds, dialect_labels = [], []
    skipped = 0

    for _, row in tqdm(test_df.iterrows(), total=len(test_df), desc=encoder_short):
        try:
            audio_path = os.path.join(test_audio_dir, row['audio_name'])
            audio, _ = librosa.load(audio_path, sr=sampling_rate, mono=True)
            audio, _ = librosa.effects.trim(audio, top_db=20)

            # Peak-normalize; the epsilon avoids dividing by zero on silence
            if len(audio) > 0:
                audio = audio / (np.max(np.abs(audio)) + 1e-8)

            # Pad (at the end) or truncate (keep the centre) to max_length
            if len(audio) < max_length:
                audio = np.pad(audio, (0, max_length - len(audio)))
            else:
                start = (len(audio) - max_length) // 2
                audio = audio[start:start + max_length]

            inputs = feature_extractor(
                audio, sampling_rate=sampling_rate,
                return_tensors="pt", padding=True
            )

            with torch.no_grad():
                outputs = model(inputs.input_values.to(device))
                g_pred = torch.argmax(outputs['gender_logits'], dim=-1).cpu().item()
                d_pred = torch.argmax(outputs['dialect_logits'], dim=-1).cpu().item()

            # NOTE(review): unknown label strings fall back to class 0 - confirm
            # this is intended rather than skipping the row.
            g_label = gender_map.get(row['gender'], 0)
            d_label = dialect_map.get(row['dialect'], 0)

            gender_preds.append(g_pred)
            gender_labels.append(g_label)
            dialect_preds.append(d_pred)
            dialect_labels.append(d_label)

        except Exception as e:
            # Best effort: keep going, but report failures instead of hiding them.
            skipped += 1
            if skipped <= max_logged_failures:
                logger.warning(f"  Skipping {row.get('audio_name', '<unknown>')}: {e!r}")
            continue

    # Free the model before moving on (also on the early-exit path below)
    del model
    torch.cuda.empty_cache()

    if skipped:
        logger.warning(f"  {encoder_short}: skipped {skipped}/{len(test_df)} samples due to errors")

    if not gender_labels:
        # Metrics are undefined without any evaluated sample.
        logger.error(f"  {encoder_short}: no sample could be evaluated, metrics not computed")
        continue

    finetuned_results[encoder_short] = {
        'gender_acc': accuracy_score(gender_labels, gender_preds),
        'gender_f1': f1_score(gender_labels, gender_preds, average='weighted'),
        'dialect_acc': accuracy_score(dialect_labels, dialect_preds),
        'dialect_f1': f1_score(dialect_labels, dialect_preds, average='weighted')
    }

    logger.info(f"  Gender Acc: {finetuned_results[encoder_short]['gender_acc']*100:.2f}% | Dialect Acc: {finetuned_results[encoder_short]['dialect_acc']*100:.2f}%")

# ============================================================
# COMPARISON TABLE: BEFORE vs AFTER
# ============================================================
logger.info("")
logger.info("=" * 90)
logger.info("COMPARISON: PRETRAINED vs FINETUNED (All Encoders)")
logger.info("=" * 90)

logger.info(f"{'Encoder':<20} | {'--- Gender Acc ---':^25} | {'--- Dialect Acc ---':^25}")
logger.info(f"{'':<20} | {'Before':>8} {'After':>8} {'Δ':>7} | {'Before':>8} {'After':>8} {'Δ':>7}")
logger.info("-" * 90)

for encoder_short in ENCODERS:
    if encoder_short not in finetuned_results:
        continue

    # An encoder without a baseline entry is shown with 0 as "before".
    before = pretrained_results.get(encoder_short, {'gender_acc': 0, 'dialect_acc': 0})
    after = finetuned_results[encoder_short]

    g_before = before['gender_acc'] * 100
    g_after = after['gender_acc'] * 100
    g_delta = g_after - g_before

    d_before = before['dialect_acc'] * 100
    d_after = after['dialect_acc'] * 100
    d_delta = d_after - d_before

    # The "+" format flag keeps the sign attached to the number and the
    # columns aligned for both positive and negative deltas.
    logger.info(f"{encoder_short:<20} | {g_before:>7.2f}% {g_after:>7.2f}% {g_delta:>+6.2f}% | {d_before:>7.2f}% {d_after:>7.2f}% {d_delta:>+6.2f}%")

logger.info("-" * 90)

# Find best model (by dialect accuracy); nothing to pick if no model was evaluated
if finetuned_results:
    best_encoder = max(finetuned_results.keys(), key=lambda x: finetuned_results[x]['dialect_acc'])
    best_metrics = finetuned_results[best_encoder]

    logger.info(f"BEST MODEL: {best_encoder}")
    logger.info(f"  Gender Accuracy:  {best_metrics['gender_acc']*100:.2f}%")
    logger.info(f"  Dialect Accuracy: {best_metrics['dialect_acc']*100:.2f}%")
else:
    logger.warning("No finetuned model was evaluated; skipping best-model selection")
logger.info("=" * 90)

In [None]:
# ============================================================
# SAVE MODELS TO KAGGLE OUTPUT
# ============================================================
# Gathers each encoder's best_model directory, and then its eval directory,
# under /kaggle/working/final_models/<encoder>/, replacing earlier copies.
import shutil
import os

OUTPUT_DIR = "/kaggle/working/final_models"
os.makedirs(OUTPUT_DIR, exist_ok=True)

ENCODERS = ["wavlm-base-plus", "hubert-base-ls960", "wav2vec2-base"]

logger.info("=" * 70)
logger.info("COPYING MODELS TO OUTPUT")
logger.info("=" * 70)

# First pass: model checkpoints.
for encoder in ENCODERS:
    checkpoint_src = f"/kaggle/working/output_{encoder}/best_model"
    checkpoint_dst = f"{OUTPUT_DIR}/{encoder}"

    if not os.path.exists(checkpoint_src):
        logger.warning(f"Not found: {encoder}")
        continue

    # copytree needs a fresh destination, so drop any previous copy first.
    if os.path.exists(checkpoint_dst):
        shutil.rmtree(checkpoint_dst)
    shutil.copytree(checkpoint_src, checkpoint_dst)
    logger.info(f"Copied: {encoder}")

# Second pass: evaluation outputs, placed inside each encoder's folder.
for encoder in ENCODERS:
    results_src = f"/kaggle/working/output_{encoder}/eval"
    if not os.path.exists(results_src):
        continue

    results_dst = f"{OUTPUT_DIR}/{encoder}/eval"
    if os.path.exists(results_dst):
        shutil.rmtree(results_dst)
    shutil.copytree(results_src, results_dst)
    logger.info(f"Copied eval: {encoder}")

logger.info(f"All models saved to: {OUTPUT_DIR}")

In [None]:
# ============================================================
# LIST FINAL OUTPUT
# ============================================================
# Logs the directory tree under OUTPUT_DIR, indenting two spaces per level
# and showing every file's size in MB.
import os

OUTPUT_DIR = "/kaggle/working/final_models"

logger.info("=" * 70)
logger.info("FINAL OUTPUT STRUCTURE")
logger.info("=" * 70)

for current_dir, _subdirs, filenames in os.walk(OUTPUT_DIR):
    # Depth = number of path separators left after removing the OUTPUT_DIR prefix.
    depth = current_dir.replace(OUTPUT_DIR, '').count(os.sep)
    dir_indent = '  ' * depth
    file_indent = '  ' * (depth + 1)

    logger.info(f"{dir_indent}{os.path.basename(current_dir)}/")
    for filename in filenames:
        file_path = os.path.join(current_dir, filename)
        size_mb = os.path.getsize(file_path) / 1024 / 1024
        logger.info(f"{file_indent}{filename} ({size_mb:.1f} MB)")