Tôi sẽ sử dụng file này, sau đó import lên Kaggle và chạy để sử dụng GPU và RAM của Kaggle.

In [None]:
# Fetch the project sources into the current working directory.
!git clone https://github.com/VuThanhLam124/Profiling_gender_dialect.git
# Install the ffmpeg system package (-y answers the confirmation prompt automatically).
!apt-get install -y ffmpeg

In [None]:
cd Profiling_gender_dialect

In [None]:
# Install the dependencies listed in the cloned repository's requirements.txt.
!pip install -r requirements.txt
# Pin the Hugging Face stack; this runs after requirements.txt, so these
# pinned versions are the ones left installed.
!pip install -q transformers==4.44.0 accelerate==0.33.0 datasets==2.21.0
# Additional audio, augmentation, experiment-tracking and checkpoint-format packages.
!pip install -q librosa soundfile audiomentations==0.35.0 wandb safetensors

In [None]:
# ============================================================
# FINETUNE WITH ViMD DATASET
# ============================================================
# Builds the YAML training configuration passed to finetune.py and writes it
# to configs/vimd_train.yaml. Later cells reuse `logger`, `ENCODER`,
# `encoder_short`, `WANDB_API_KEY` and `config_path` defined here.
import os
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger("training")

ENCODER = "vinai/PhoWhisper-base"
# Last path component of the model id, used as the W&B run name.
encoder_short = ENCODER.split("/")[-1]

# SECURITY: the W&B API key is a credential and must not live in the notebook
# source. It is read from the WANDB_API_KEY environment variable instead
# (export it before running this cell, e.g. from a notebook secret). Any key
# that was previously committed in this file should be revoked.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY", "").strip()
wandb_enabled = bool(WANDB_API_KEY)
if not wandb_enabled:
    logger.warning("WANDB_API_KEY is not set; W&B logging is disabled for this run")
# YAML boolean literal for the `wandb.enabled` field below.
wandb_enabled_yaml = "true" if wandb_enabled else "false"

vimd_config = f"""
model:
  name: "{ENCODER}"
  num_genders: 2
  num_dialects: 3
  dropout: 0.25
  head_hidden_dim: 512
  freeze_encoder: false

training:
  batch_size: 32
  gradient_accumulation_steps: 4
  learning_rate: 2.5e-5
  num_epochs: 30
  warmup_ratio: 0.15
  weight_decay: 0.015
  gradient_clip: 0.5
  lr_scheduler: "cosine"
  fp16: true
  dataloader_num_workers: 2

loss:
  dialect_weight: 3

wandb:
  enabled: {wandb_enabled_yaml}
  api_key: "{WANDB_API_KEY}"
  project: "vimd-speaker-profiling"
  run_name: "{encoder_short}"

data:
  source: "vimd"
  vimd_path: "/kaggle/input/vimd-dataset"

audio:
  sampling_rate: 16000
  max_duration: 5

augmentation:
  enabled: true
  prob: 0.75

output:
  dir: "/kaggle/working/output_vimd"
  save_total_limit: 1
  metric_for_best_model: "dialect_acc"

early_stopping:
  patience: 5
  threshold: 0.001

labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2

seed: 42
"""

config_path = "configs/vimd_train.yaml"
# Create the target directory first so the write cannot fail on a fresh checkout.
os.makedirs(os.path.dirname(config_path), exist_ok=True)
with open(config_path, "w", encoding="utf-8") as f:
    f.write(vimd_config)

logger.info(f"Config saved: {config_path}")
logger.info(f"Encoder: {ENCODER}")
logger.info("Batch size: 32, Gradient accumulation: 4 (effective batch: 32x4)")
logger.info(f"WandB: {'enabled' if wandb_enabled else 'disabled'}, project=vimd-speaker-profiling")

In [None]:
# ============================================================
# START TRAINING
# ============================================================
# Runs finetune.py as a child process using the config file written by the
# configuration cell (`config_path`) and reports how the process ended.
import subprocess

logger.info("=" * 70)
logger.info(f"TRAINING: {ENCODER}")
logger.info("=" * 70)

# An argument list avoids shell parsing of the config path, and `returncode`
# is the child's real exit code (os.system returns an encoded wait status on
# POSIX, e.g. 256 for exit code 1).
exit_code = subprocess.run(
    ["python", "finetune.py", "--config", config_path]
).returncode

if exit_code == 0:
    logger.info("Training completed successfully")
else:
    logger.error(f"Training failed with exit code: {exit_code}")

Eval with ViMD dataset

In [None]:
# ============================================================
# CHECK SAVED MODEL
# ============================================================
# Logs every entry of the best-model directory with its size in MB, followed
# by the summed size; warns when the directory does not exist.
import os

model_dir = "/kaggle/working/output_vimd/best_model"

banner = "=" * 70
logger.info(banner)
logger.info("SAVED MODEL")
logger.info(banner)

if not os.path.exists(model_dir):
    logger.warning("Model not found")
else:
    total_size = 0
    for entry in sorted(os.listdir(model_dir)):
        entry_path = os.path.join(model_dir, entry)
        # bytes -> MB
        entry_mb = os.path.getsize(entry_path) / 1024 / 1024
        total_size += entry_mb
        logger.info(f"  {entry}: {entry_mb:.1f} MB")
    logger.info(f"  Total: {total_size:.1f} MB")

In [None]:
# ============================================================
# EVALUATE MODEL ON ViMD TEST SET
# ============================================================
# Runs eval.py on the best checkpoint produced by training and reports how
# the process ended. Skipped when the checkpoint directory is missing.
import os
import subprocess

model_dir = "/kaggle/working/output_vimd/best_model"
config_path = "configs/vimd_train.yaml"

if not os.path.exists(model_dir):
    logger.error("Model not found, skipping eval")
else:
    logger.info("=" * 70)
    logger.info("EVALUATING ON ViMD TEST SET")
    logger.info("=" * 70)

    # An argument list avoids shell parsing of the paths, and `returncode` is
    # the child's real exit code (os.system returns an encoded wait status on
    # POSIX).
    exit_code = subprocess.run(
        [
            "python", "eval.py",
            "--checkpoint", model_dir,
            "--config", config_path,
            "--test_name", "vimd_test",
            "--output_dir", "/kaggle/working/output_vimd/eval",
        ]
    ).returncode

    if exit_code == 0:
        logger.info("Evaluation completed successfully")
    else:
        logger.error(f"Evaluation failed with exit code: {exit_code}")

In [None]:
# ============================================================
# SAVE MODEL TO KAGGLE OUTPUT
# ============================================================
# Mirrors the best checkpoint and the evaluation directory into OUTPUT_DIR,
# replacing any copy left there by an earlier run.
import shutil
import os


def _replace_tree(src, dst):
    """Copy directory `src` to `dst`, deleting an existing `dst` first."""
    if os.path.exists(dst):
        shutil.rmtree(dst)
    shutil.copytree(src, dst)


OUTPUT_DIR = "/kaggle/working/final_model_vimd"
os.makedirs(OUTPUT_DIR, exist_ok=True)

model_dir = "/kaggle/working/output_vimd/best_model"
eval_dir = "/kaggle/working/output_vimd/eval"

banner = "=" * 70
logger.info(banner)
logger.info("COPYING MODEL TO OUTPUT")
logger.info(banner)

# The checkpoint is expected to exist, so its absence is reported.
if not os.path.exists(model_dir):
    logger.warning("Model not found")
else:
    dst_dir = f"{OUTPUT_DIR}/best_model"
    _replace_tree(model_dir, dst_dir)
    logger.info(f"Copied model to: {dst_dir}")

# Evaluation output is optional; a missing directory is skipped silently.
if os.path.exists(eval_dir):
    dst_eval = f"{OUTPUT_DIR}/eval"
    _replace_tree(eval_dir, dst_eval)
    logger.info(f"Copied eval to: {dst_eval}")

logger.info(f"All files saved to: {OUTPUT_DIR}")

In [None]:
# ============================================================
# LIST FINAL OUTPUT
# ============================================================
# Logs the directory tree under OUTPUT_DIR, indenting by nesting depth and
# showing each file's size in MB.
import os

OUTPUT_DIR = "/kaggle/working/final_model_vimd"

banner = "=" * 70
logger.info(banner)
logger.info("FINAL OUTPUT STRUCTURE")
logger.info(banner)

if not os.path.exists(OUTPUT_DIR):
    logger.warning("Output directory not found")
else:
    for root, _dirs, files in os.walk(OUTPUT_DIR):
        # Depth = number of path separators below OUTPUT_DIR.
        depth = root.replace(OUTPUT_DIR, '').count(os.sep)
        logger.info(f"{'  ' * depth}{os.path.basename(root)}/")
        file_indent = '  ' * (depth + 1)
        for name in files:
            size_mb = os.path.getsize(os.path.join(root, name)) / 1024 / 1024
            logger.info(f"{file_indent}{name} ({size_mb:.1f} MB)")

In [None]:
# ============================================================
# RESUME TRAINING (ADDITIONAL TRAINING)
# ============================================================
# Writes a second config that points `resume_from_checkpoint` at the best
# checkpoint of the first run, launches finetune.py with it, and on success
# copies the new best model into the final output directory.
# Relies on `logger`, `ENCODER`, `encoder_short` and `WANDB_API_KEY` from the
# configuration cell.
import os
import shutil
import subprocess

# Checkpoint to resume from
CHECKPOINT_DIR = "/kaggle/working/output_vimd/best_model"

# Number of additional epochs to train
ADDITIONAL_EPOCHS = 10

logger.info("=" * 70)
logger.info("RESUME TRAINING")
logger.info("=" * 70)

if not os.path.exists(CHECKPOINT_DIR):
    logger.error(f"Checkpoint not found: {CHECKPOINT_DIR}")
else:
    # Build a new config that resumes from the checkpoint.
    # The learning rate is written as `1.0e-5` rather than `1e-5`: YAML 1.1
    # loaders such as PyYAML only resolve scientific notation to a float when
    # the mantissa contains a decimal point, and otherwise yield a string.
    # NOTE(review): finetune.py's loader is not visible here; `1.0e-5` is
    # valid for every YAML loader, so the change is safe either way.
    resume_config = f"""
model:
  name: "{ENCODER}"
  num_genders: 2
  num_dialects: 3
  dropout: 0.25
  head_hidden_dim: 512
  freeze_encoder: false

training:
  batch_size: 32
  gradient_accumulation_steps: 4
  learning_rate: 1.0e-5  # Learning rate thấp hơn khi resume
  num_epochs: {ADDITIONAL_EPOCHS}
  warmup_ratio: 0.05  # Warmup ngắn hơn
  weight_decay: 0.015
  gradient_clip: 0.5
  lr_scheduler: "cosine"
  fp16: true
  dataloader_num_workers: 2
  resume_from_checkpoint: "{CHECKPOINT_DIR}"  # Resume từ checkpoint

loss:
  dialect_weight: 3

wandb:
  enabled: true
  api_key: "{WANDB_API_KEY}"
  project: "vimd-speaker-profiling"
  run_name: "{encoder_short}-resume"

data:
  source: "vimd"
  vimd_path: "/kaggle/input/vimd-dataset"

audio:
  sampling_rate: 16000
  max_duration: 5

augmentation:
  enabled: true
  prob: 0.75

output:
  dir: "/kaggle/working/output_vimd_resume"
  save_total_limit: 1
  metric_for_best_model: "dialect_acc"

early_stopping:
  patience: 5
  threshold: 0.001

labels:
  gender:
    Male: 0
    Female: 1
    0: 0
    1: 1
  dialect:
    North: 0
    Central: 1
    South: 2

seed: 42
"""

    resume_config_path = "configs/vimd_resume.yaml"
    os.makedirs(os.path.dirname(resume_config_path), exist_ok=True)
    # The config contains non-ASCII comments, so the encoding is explicit.
    with open(resume_config_path, "w", encoding="utf-8") as f:
        f.write(resume_config)

    logger.info(f"Resume config saved: {resume_config_path}")
    logger.info(f"Resume from: {CHECKPOINT_DIR}")
    logger.info(f"Additional epochs: {ADDITIONAL_EPOCHS}")
    logger.info("Learning rate: 1e-5 (reduced for fine-tuning)")

    # Start resume training. An argument list avoids shell parsing, and
    # `returncode` is the child's real exit code (os.system returns an
    # encoded wait status on POSIX).
    exit_code = subprocess.run(
        ["python", "finetune.py", "--config", resume_config_path]
    ).returncode

    if exit_code == 0:
        logger.info("Resume training completed successfully!")
        # Copy new best model, replacing any copy from an earlier run
        new_model_dir = "/kaggle/working/output_vimd_resume/best_model"
        if os.path.exists(new_model_dir):
            dst = "/kaggle/working/final_model_vimd/best_model_resumed"
            if os.path.exists(dst):
                shutil.rmtree(dst)
            shutil.copytree(new_model_dir, dst)
            logger.info(f"Resumed model saved to: {dst}")
    else:
        logger.error(f"Resume training failed with exit code: {exit_code}")

In [None]:
# ============================================================
# EVALUATE RESUMED MODEL
# ============================================================
# Runs eval.py on the checkpoint produced by the resume run, copies the
# evaluation output into the final output directory and logs the headline
# accuracies read from evaluation_results.json.
import json
import os
import shutil
import subprocess


def _format_metric(value):
    """Return `value` with four decimals, or 'N/A' when it is missing or not a number."""
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return f"{value:.4f}"
    return "N/A"


# Model produced by the additional training run
resumed_model_dir = "/kaggle/working/output_vimd_resume/best_model"
config_path = "configs/vimd_resume.yaml"

logger.info("=" * 70)
logger.info("EVALUATING RESUMED MODEL ON ViMD TEST SET")
logger.info("=" * 70)

if not os.path.exists(resumed_model_dir):
    logger.error(f"Resumed model not found: {resumed_model_dir}")
else:
    eval_output_dir = "/kaggle/working/output_vimd_resume/eval"

    # An argument list avoids shell parsing of the paths, and `returncode` is
    # the child's real exit code (os.system returns an encoded wait status on
    # POSIX).
    exit_code = subprocess.run(
        [
            "python", "eval.py",
            "--checkpoint", resumed_model_dir,
            "--config", config_path,
            "--test_name", "vimd_test",
            "--output_dir", eval_output_dir,
        ]
    ).returncode

    if exit_code == 0:
        logger.info("Evaluation completed successfully!")

        # Copy eval results to final output (only if eval.py actually wrote
        # the directory; copytree would raise on a missing source).
        if os.path.isdir(eval_output_dir):
            dst_eval = "/kaggle/working/final_model_vimd/eval_resumed"
            if os.path.exists(dst_eval):
                shutil.rmtree(dst_eval)
            shutil.copytree(eval_output_dir, dst_eval)
            logger.info(f"Eval results saved to: {dst_eval}")
        else:
            logger.warning(f"Eval output directory not found: {eval_output_dir}")

        # Print evaluation results
        eval_file = os.path.join(eval_output_dir, "evaluation_results.json")
        if os.path.exists(eval_file):
            with open(eval_file, "r", encoding="utf-8") as f:
                results = json.load(f)
            logger.info("=" * 50)
            logger.info("RESULTS:")
            # A missing key previously crashed here: the 'N/A' fallback string
            # cannot be formatted with ':.4f'.
            logger.info(f"  Gender Accuracy: {_format_metric(results.get('gender_accuracy'))}")
            logger.info(f"  Dialect Accuracy: {_format_metric(results.get('dialect_accuracy'))}")
            logger.info("=" * 50)
    else:
        logger.error(f"Evaluation failed with exit code: {exit_code}")

In [None]:
# ============================================================
# LOAD AND PREPARE LSVSC DATASET
# ============================================================
# Downloads the LSVSC dataset, selects its "test" split (falling back to
# "validation") into `test_split`, and logs the fields of the first sample.
import json
import numpy as np
from datasets import load_dataset

banner = "=" * 70
logger.info(banner)
logger.info("LOADING LSVSC DATASET")
logger.info(banner)

# Load dataset
ds = load_dataset("doof-ferb/LSVSC")
logger.info(f"Dataset splits: {ds.keys()}")
logger.info(f"Dataset info: {ds}")

# Prefer the test split, then validation; None when neither exists
test_split = ds.get("test", ds.get("validation"))

if test_split is not None:
    logger.info(f"Using split with {len(test_split)} samples")

    # Show sample structure
    if len(test_split) > 0:
        sample = test_split[0]
        logger.info(f"Sample keys: {sample.keys()}")
        for key, val in sample.items():
            # Sequences are summarised by length instead of being dumped in full
            if isinstance(val, (list, np.ndarray)):
                logger.info(f"  {key}: {type(val).__name__} with shape {len(val)}")
            else:
                logger.info(f"  {key}: {val}")
else:
    logger.error("No test or validation split found")

In [None]:
# ============================================================
# EVALUATE ON LSVSC DATASET
# ============================================================
# Runs the fine-tuned multi-task model over `test_split` (set by the LSVSC
# loading cell) one sample at a time, logs gender and dialect metrics, and
# writes a JSON summary to the final output directory.
import torch
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import librosa
import os
import json

logger.info("=" * 70)
logger.info("EVALUATING ON LSVSC DATASET")
logger.info("=" * 70)

ENCODER = "vinai/PhoWhisper-base"
# Model checkpoint path
model_checkpoint = "/kaggle/input/fine-tune-vimd/final_model_vimd/best_model_resumed"

# Class names in class-id order, as scored by this cell.
# NOTE(review): the training config written earlier in this notebook declares
# `Male: 0, Female: 1`, while this evaluation scores Female=0 / Male=1.
# Confirm which id order the checkpoint was trained with; if it is Male=0,
# swap the ids returned by _gender_id and the order of GENDER_NAMES.
GENDER_NAMES = ["Female", "Male"]
DIALECT_NAMES = ["North", "Central", "South"]

# Cap on per-sample warnings so a systematic failure does not flood the log.
MAX_LOGGED_ERRORS = 5


def _gender_id(label):
    """Map a gender label string to its class id (Female=0, Male=1).

    Raises:
        ValueError: if the label names neither gender.
    """
    text = label.lower()
    # "female" must be tested first because it contains "male".
    if "female" in text:
        return 0
    if "male" in text:
        return 1
    raise ValueError(f"Unknown gender label: {label!r}")


def _dialect_id(label):
    """Map a dialect label string to its class id (North=0, Central=1, South=2).

    Raises:
        ValueError: if the label matches none of the three dialects; such
            samples are skipped instead of being silently scored as South.
    """
    text = label.lower()
    for keyword, class_id in (("northern", 0), ("central", 1), ("southern", 2)):
        if keyword in text:
            return class_id
    raise ValueError(f"Unknown dialect label: {label!r}")


if not os.path.exists(model_checkpoint):
    logger.error(f"Model not found: {model_checkpoint}")
elif test_split is None:
    logger.error("No LSVSC split available; run the dataset loading cell first")
else:
    from src.models import MultiTaskSpeakerModel
    from safetensors.torch import load_file

    # Load model with correct architecture (head_hidden_dim=512 from training config)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = MultiTaskSpeakerModel(ENCODER, head_hidden_dim=512)

    safetensors_path = os.path.join(model_checkpoint, "model.safetensors")
    checkpoint = load_file(safetensors_path)
    model.load_state_dict(checkpoint)
    model.to(device)
    model.eval()

    logger.info(f"Model loaded successfully from: {model_checkpoint}")

    # Pick the feature extractor that matches the encoder family
    is_whisper = "whisper" in ENCODER.lower()
    if is_whisper:
        from transformers import WhisperFeatureExtractor
        processor = WhisperFeatureExtractor.from_pretrained(ENCODER)
    else:
        from transformers import Wav2Vec2FeatureExtractor
        processor = Wav2Vec2FeatureExtractor.from_pretrained(ENCODER)

    # Prepare LSVSC data
    sampling_rate = 16000
    # Whisper inputs are padded or truncated to exactly 30 seconds of samples
    whisper_length = sampling_rate * 30

    gender_preds = []
    dialect_preds = []
    gender_gts = []
    dialect_gts = []
    failed_samples = 0

    logger.info(f"Processing {len(test_split)} samples...")

    # Process each sample
    for idx, sample in enumerate(test_split):
        if idx % 500 == 0:
            logger.info(f"  Processing sample {idx}/{len(test_split)}")

        try:
            # Parse ground truth first so samples with unusable labels are
            # rejected before any inference work is done.
            gender_gt = _gender_id(sample["gender"])
            dialect_gt = _dialect_id(sample["dialect"])

            # Get audio - the 'audio' field is a dict with the waveform and its rate
            audio_dict = sample["audio"]
            waveform = np.array(audio_dict["array"], dtype=np.float32)
            sr = audio_dict["sampling_rate"]

            # Resample if needed
            if sr != sampling_rate:
                waveform = librosa.resample(waveform, orig_sr=sr, target_sr=sampling_rate)

            # Process audio based on model type
            if is_whisper:
                if len(waveform) < whisper_length:
                    waveform = np.pad(waveform, (0, whisper_length - len(waveform)))
                else:
                    waveform = waveform[:whisper_length]

                inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt")
                input_tensor = inputs.input_features.to(device)
            else:
                inputs = processor(waveform, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
                input_tensor = inputs.input_values.to(device)

            # Inference
            with torch.no_grad():
                outputs = model(input_tensor)
                gender_logits = outputs['gender_logits']
                dialect_logits = outputs['dialect_logits']

            gender_pred = gender_logits.argmax(dim=-1).item()
            dialect_pred = dialect_logits.argmax(dim=-1).item()
        except Exception as e:
            # Best-effort evaluation: skip the sample, but count every failure
            # and log the first few regardless of where they occur.
            failed_samples += 1
            if failed_samples <= MAX_LOGGED_ERRORS:
                logger.warning(f"Error processing sample {idx}: {e}")
            continue

        # Only reached when both label parsing and inference succeeded, so
        # the four lists always stay aligned.
        gender_preds.append(gender_pred)
        dialect_preds.append(dialect_pred)
        gender_gts.append(gender_gt)
        dialect_gts.append(dialect_gt)

    if failed_samples:
        logger.warning(f"Skipped {failed_samples} of {len(test_split)} samples due to errors")

    # Calculate metrics
    logger.info("=" * 70)
    logger.info("LSVSC EVALUATION RESULTS")
    logger.info("=" * 70)

    gender_acc = None
    dialect_acc = None
    cm_gender = None
    cm_dialect = None

    gender_ids = list(range(len(GENDER_NAMES)))
    dialect_ids = list(range(len(DIALECT_NAMES)))

    if gender_gts and gender_preds:
        gender_acc = accuracy_score(gender_gts, gender_preds)
        logger.info(f"\nGender Classification (Female=0, Male=1):")
        logger.info(f"  Accuracy: {gender_acc:.4f}")
        logger.info(f"  Samples: {len(gender_gts)}")
        # Explicit `labels` keeps the matrix/report shape fixed even when a
        # class never appears in the data or the predictions.
        cm_gender = confusion_matrix(gender_gts, gender_preds, labels=gender_ids)
        logger.info(f"  Confusion Matrix:\n{cm_gender}")
        gender_report = classification_report(
            gender_gts, gender_preds,
            labels=gender_ids, target_names=GENDER_NAMES, zero_division=0,
        )
        logger.info(f"\n{gender_report}")
    else:
        logger.warning("No gender ground truth data")

    if dialect_gts and dialect_preds:
        dialect_acc = accuracy_score(dialect_gts, dialect_preds)
        logger.info(f"\nDialect Classification (North=0, Central=1, South=2):")
        logger.info(f"  Accuracy: {dialect_acc:.4f}")
        logger.info(f"  Samples: {len(dialect_gts)}")
        cm_dialect = confusion_matrix(dialect_gts, dialect_preds, labels=dialect_ids)
        logger.info(f"  Confusion Matrix:\n{cm_dialect}")
        dialect_report = classification_report(
            dialect_gts, dialect_preds,
            labels=dialect_ids, target_names=DIALECT_NAMES, zero_division=0,
        )
        logger.info(f"\n{dialect_report}")
    else:
        logger.warning("No dialect ground truth data")

    # Save results
    results = {
        "dataset": "LSVSC",
        "model_checkpoint": model_checkpoint,
        "encoder": ENCODER,
        "num_samples_total": len(test_split),
        "num_samples_processed": len(gender_gts),
        "num_samples_failed": len(test_split) - len(gender_gts),
        "gender_accuracy": float(gender_acc) if gender_acc is not None else None,
        "dialect_accuracy": float(dialect_acc) if dialect_acc is not None else None,
        "gender_confusion_matrix": cm_gender.tolist() if cm_gender is not None else None,
        "dialect_confusion_matrix": cm_dialect.tolist() if cm_dialect is not None else None,
    }

    results_file = "/kaggle/working/final_model_vimd/lsvsc_evaluation.json"
    os.makedirs(os.path.dirname(results_file), exist_ok=True)
    with open(results_file, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    logger.info(f"\nResults saved to: {results_file}")