In [None]:
!pip install librosa soundfile wandb

In [None]:
!pip install audiomentations==0.35.0
# phần này phải tải đúng phiên bản này, nếu không sẽ dễ lỗi do từ các bản sau con audio augment này xài numpy 2.0 dễ conflict với bọn hugging face

import wandb
wandb.login(key="your_wandb_api_key_here")
print("WandB logged in successfully!")
print("Done")

In [None]:
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import librosa
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from transformers import (
    Wav2Vec2FeatureExtractor,
    WavLMModel,
    Trainer,
    TrainingArguments,
    EarlyStoppingCallback
)
from torch.utils.data import Dataset
from audiomentations import Compose, AddGaussianNoise, TimeStretch, PitchShift, Shift, Gain
import random
import warnings
warnings.filterwarnings('ignore')

print("All libraries imported successfully")

# 2. CONFIGURATION
class Config:
    """Paths and hyper-parameters shared by every cell of the notebook.

    All values are class attributes; `config` below is the instance the rest of
    the notebook reads from.
    """

    # Paths
    # Dataset root. Set the VISPEECH_ROOT environment variable instead of
    # editing the placeholder (the placeholder text means "fill in the correct
    # path here").
    VISPEECH_ROOT = Path(os.environ.get('VISPEECH_ROOT', 'điền đường dẫn đúng vào đây'))
    TRAIN_META = VISPEECH_ROOT / 'metadata/trainset.csv'
    CLEAN_TEST_META = VISPEECH_ROOT / 'metadata/clean_testset.csv'
    NOISY_TEST_META = VISPEECH_ROOT / 'metadata/noisy_testset.csv'
    TRAIN_AUDIO = VISPEECH_ROOT / 'trainset'
    CLEAN_TEST_AUDIO = VISPEECH_ROOT / 'clean_testset'
    NOISY_TEST_AUDIO = VISPEECH_ROOT / 'noisy_testset'

    # Model
    MODEL_NAME = "microsoft/wavlm-base-plus"

    # Audio Processing
    SAMPLING_RATE = 16000  # Hz
    MAX_DURATION = 5       # seconds kept per clip after padding/cropping

    # Training
    BATCH_SIZE = 32
    LEARNING_RATE = 5e-5
    NUM_EPOCHS = 15
    WARMUP_RATIO = 0.125
    WEIGHT_DECAY = 0.0125
    GRADIENT_CLIP = 1.0

    # Data Augmentation
    AUGMENT_PROB = 0.8  # probability that a training clip is augmented at all

    # Loss weighting
    DIALECT_LOSS_WEIGHT = 3.0  # multiplier on the dialect loss in the summed loss

    # Label mappings (metadata string -> class id)
    GENDER_MAP = {'Male': 0, 'Female': 1}
    DIALECT_MAP = {'North': 0, 'Central': 1, 'South': 2}

    # Output
    # Kept as a plain string because later cells concatenate onto it.
    OUTPUT_DIR = os.environ.get('SPEAKER_PROFILING_OUTPUT_DIR', '/kaggle/working/speaker-profiling')

    # WandB
    WANDB_PROJECT = "speaker-profiling-vispeech"

    # Reproducibility
    SEED = 42


config = Config()

def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with `seed`."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed every RNG before any stochastic work happens.
set_seed(config.SEED)

# Echo the effective configuration so the run is self-describing in the output.
summary_lines = [
    "\nCONFIGURATION",
    f"Model: {config.MODEL_NAME}",
    f"Architecture: WavLM + Attentive Pooling + LayerNorm",
    f"Sampling Rate: {config.SAMPLING_RATE} Hz",
    f"Max Duration: {config.MAX_DURATION}s",
    f"Batch Size: {config.BATCH_SIZE}",
    f"Learning Rate: {config.LEARNING_RATE}",
    f"Epochs: {config.NUM_EPOCHS}",
    f"Weight Decay: {config.WEIGHT_DECAY}",
    f"Augmentation Prob: {config.AUGMENT_PROB}",
    f"Dialect Loss Weight: {config.DIALECT_LOSS_WEIGHT}x",
    f"WandB Project: {config.WANDB_PROJECT}",
]
for summary_line in summary_lines:
    print(summary_line)


In [None]:
# 3. DATA LOADING & PREPARATION

def load_and_prepare_data():
    """Load the metadata CSVs and build a speaker-disjoint train/validation split.

    Reads the train, clean-test and noisy-test metadata files named in `config`,
    adds integer `gender_label` / `dialect_label` columns via
    `config.GENDER_MAP` / `config.DIALECT_MAP`, and holds out 15% of the
    training speakers (not utterances) for validation.

    Returns:
        tuple: `(train_data, val_data, clean_test_df, noisy_test_df)` DataFrames.

    Raises:
        ValueError: if a `gender` or `dialect` value has no entry in the label
            maps (it would otherwise silently become a NaN label).
        AssertionError: if a speaker ends up in both train and validation.
    """
    print("\nLoading metadata...")
    train_df = pd.read_csv(config.TRAIN_META)
    clean_test_df = pd.read_csv(config.CLEAN_TEST_META)
    noisy_test_df = pd.read_csv(config.NOISY_TEST_META)

    label_specs = (
        ('gender', 'gender_label', config.GENDER_MAP),
        ('dialect', 'dialect_label', config.DIALECT_MAP),
    )
    named_frames = (
        ('train', train_df),
        ('clean test', clean_test_df),
        ('noisy test', noisy_test_df),
    )
    for split_name, df in named_frames:
        for source_col, label_col, mapping in label_specs:
            df[label_col] = df[source_col].map(mapping)
            # Series.map leaves NaN for values missing from the mapping; fail
            # loudly here instead of training on invalid labels.
            unmapped = df.loc[df[label_col].isna(), source_col].unique().tolist()
            if unmapped:
                raise ValueError(
                    f"Unmapped '{source_col}' values in {split_name} metadata: {unmapped}; "
                    f"expected one of {list(mapping)}"
                )

    # Split on speakers so that no voice is shared between train and validation.
    unique_speakers = train_df['speaker'].unique()
    train_speakers, val_speakers = train_test_split(
        unique_speakers,
        test_size=0.15,
        random_state=config.SEED,
        shuffle=True
    )

    train_data = train_df[train_df['speaker'].isin(train_speakers)].reset_index(drop=True)
    val_data = train_df[train_df['speaker'].isin(val_speakers)].reset_index(drop=True)

    print(f"\nData loaded:")
    print(f"Train: {len(train_data):,} samples ({len(train_speakers)} speakers)")
    print(f"Validation: {len(val_data):,} samples ({len(val_speakers)} speakers)")
    print(f"Clean Test: {len(clean_test_df):,} samples")
    print(f"Noisy Test: {len(noisy_test_df):,} samples")

    assert len(set(train_speakers) & set(val_speakers)) == 0, "Speaker leakage detected!"
    print("No speaker leakage between train/val")

    return train_data, val_data, clean_test_df, noisy_test_df

train_df, val_df, clean_test_df, noisy_test_df = load_and_prepare_data()


# 4. DATA AUGMENTATION

class AudioAugmentation:
    """Callable that, with probability `augment_prob`, runs a waveform through
    a fixed chain of audiomentations transforms and otherwise returns it as is."""

    def __init__(self, sampling_rate=16000, augment_prob=0.8):
        self.sampling_rate = sampling_rate
        self.augment_prob = augment_prob

        # Each transform also has its own probability `p` inside the chain.
        transform_chain = [
            AddGaussianNoise(min_amplitude=0.001, max_amplitude=0.015, p=0.5),
            TimeStretch(min_rate=0.8, max_rate=1.2, leave_length_unchanged=False, p=0.5),
            PitchShift(min_semitones=-4, max_semitones=4, p=0.5),
            Shift(min_shift=-0.5, max_shift=0.5, p=0.3),
            Gain(min_gain_db=-12, max_gain_db=12, p=0.5),
        ]
        self.augment = Compose(transform_chain)

    def __call__(self, audio):
        """Return the augmented waveform, or `audio` itself when the draw says skip."""
        should_augment = random.random() < self.augment_prob
        return (
            self.augment(samples=audio, sample_rate=self.sampling_rate)
            if should_augment
            else audio
        )


# 5. DATASET CLASS

class ViSpeechDataset(Dataset):
    """Dataset that turns metadata rows into fixed-length model inputs plus labels.

    Each item is a dict with `input_values` (output of the feature extractor for
    one clip), `gender_labels` and `dialect_labels` (scalar long tensors).

    Args:
        dataframe: metadata with `audio_name`, `gender_label`, `dialect_label` columns.
        audio_dir: directory containing the audio files named in `audio_name`.
        feature_extractor: callable applied to the waveform in `__getitem__`.
        sampling_rate: rate the audio is resampled to when loaded.
        max_duration: clip length in seconds after padding/cropping.
        is_training: enables augmentation and random (instead of centre) cropping.
    """

    def __init__(self, dataframe, audio_dir, feature_extractor,
                 sampling_rate=16000, max_duration=5, is_training=True):
        self.df = dataframe.reset_index(drop=True)
        self.audio_dir = Path(audio_dir)
        self.feature_extractor = feature_extractor
        self.sampling_rate = sampling_rate
        self.max_length = int(sampling_rate * max_duration)  # in samples
        self.is_training = is_training

        if is_training:
            self.augmentation = AudioAugmentation(sampling_rate, augment_prob=config.AUGMENT_PROB)
            print(f"Augmentation ENABLED (prob={config.AUGMENT_PROB})")
        else:
            self.augmentation = None
            print(f"Augmentation DISABLED")

    def __len__(self):
        return len(self.df)

    def _fit_to_max_length(self, audio):
        """Zero-pad at the end, or crop, so the result has exactly `max_length` samples.

        Crops start at a random offset in training and are centred otherwise.
        """
        excess = len(audio) - self.max_length
        if excess < 0:
            return np.pad(audio, (0, -excess))
        if self.is_training:
            start = np.random.randint(0, excess + 1)
        else:
            start = excess // 2
        return audio[start:start + self.max_length]

    def load_audio(self, audio_name):
        """Load one clip as a peak-normalised waveform of `max_length` samples.

        Pipeline: load/resample to mono -> trim leading/trailing silence ->
        optional augmentation (training only) -> peak normalisation -> pad/crop.
        On any failure the error is printed and a silent float32 clip is
        returned so that a single bad file does not abort the run.
        """
        audio_path = self.audio_dir / audio_name

        try:
            audio, _ = librosa.load(audio_path, sr=self.sampling_rate, mono=True)
            audio, _ = librosa.effects.trim(audio, top_db=20)

            if self.is_training and self.augmentation is not None:
                audio = self.augmentation(audio)

            # NOTE(review): peak normalisation runs after augmentation, so a pure
            # gain change is presumably cancelled here - confirm this is intended.
            audio = audio / (np.max(np.abs(audio)) + 1e-8)

            return self._fit_to_max_length(audio)

        except Exception as e:
            print(f"Error loading {audio_path}: {e}")
            # float32 to match the dtype produced by the normal loading path
            return np.zeros(self.max_length, dtype=np.float32)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        audio = self.load_audio(row['audio_name'])

        inputs = self.feature_extractor(
            audio,
            sampling_rate=self.sampling_rate,
            return_tensors="pt",
            padding=True
        )

        return {
            # drop the batch axis added by return_tensors="pt"
            'input_values': inputs.input_values.squeeze(0),
            'gender_labels': torch.tensor(row['gender_label'], dtype=torch.long),
            'dialect_labels': torch.tensor(row['dialect_label'], dtype=torch.long)
        }


In [None]:
# 6. ATTENTIVE POOLING MODULE

class AttentivePooling(nn.Module):
    """Attention-weighted pooling along dimension 1 of the input.

    A small MLP scores every position; the scores are soft-maxed over
    dimension 1 and used as weights for a weighted sum of the inputs.

    Args:
        hidden_size: size of the last dimension of the input.
    """

    def __init__(self, hidden_size):
        super().__init__()
        self.attention = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1, bias=False)
        )

    def forward(self, x, mask=None):
        """Pool `x` over dimension 1.

        Args:
            x: input tensor; expected layout is (batch, time, hidden).
            mask: optional (batch, time) tensor; positions equal to 0 are
                excluded from the pooling.

        Returns:
            tuple: `(pooled, weights)` - the weighted sum over dimension 1 and
            the attention weights with the trailing singleton axis removed.
        """
        attn_scores = self.attention(x)

        if mask is not None:
            excluded = mask.unsqueeze(-1) == 0
            # Use the most negative value representable in the score dtype: a
            # literal -1e9 cannot be represented in float16 and makes
            # masked_fill raise under fp16 mixed precision.
            attn_scores = attn_scores.masked_fill(
                excluded, torch.finfo(attn_scores.dtype).min
            )

        attn_weights = F.softmax(attn_scores, dim=1)
        pooled = torch.sum(x * attn_weights, dim=1)

        return pooled, attn_weights.squeeze(-1)


# 7. MULTI-TASK MODEL

class MultiTaskSpeakerModel(nn.Module):
    """WavLM encoder with attentive pooling and two classification heads.

    The pooled, layer-normalised encoder output feeds a gender head and a
    (deeper) dialect head. When both label tensors are given, the loss is
    `gender_loss + config.DIALECT_LOSS_WEIGHT * dialect_loss`.

    Args:
        model_name: checkpoint passed to `WavLMModel.from_pretrained`.
        num_genders: number of gender classes.
        num_dialects: number of dialect classes.
        freeze_encoder: if True, disable gradients for all encoder parameters.
        dropout: dropout probability used after pooling and inside the heads.
        head_hidden_dim: width of the first hidden layer of each head.
    """

    def __init__(self, model_name, num_genders=2, num_dialects=3,
                 freeze_encoder=False, dropout=0.1, head_hidden_dim=256):
        super().__init__()

        self.wavlm = WavLMModel.from_pretrained(model_name)

        if freeze_encoder:
            for param in self.wavlm.parameters():
                param.requires_grad = False
            print("Encoder FROZEN")

        hidden_size = self.wavlm.config.hidden_size

        self.attentive_pooling = AttentivePooling(hidden_size)
        self.layer_norm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout)

        self.gender_head = nn.Sequential(
            nn.Linear(hidden_size, head_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim, num_genders)
        )

        self.dialect_head = nn.Sequential(
            nn.Linear(hidden_size, head_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim, head_hidden_dim // 2),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(head_hidden_dim // 2, num_dialects)
        )

        print(f"Architecture: WavLM + Attentive Pooling + LayerNorm")
        print(f"Hidden size: {hidden_size}")
        print(f"Head hidden dim: {head_hidden_dim}")
        print(f"Dropout: {dropout}")

    def forward(self, input_values, attention_mask=None,
                gender_labels=None, dialect_labels=None):
        """Run the encoder and both heads.

        Args:
            input_values: batch of waveforms for the encoder.
            attention_mask: optional sample-level mask matching `input_values`.
            gender_labels / dialect_labels: optional class-id tensors; the loss
                is only computed when both are provided.

        Returns:
            dict with `loss` (or None), `gender_logits`, `dialect_logits` and
            `attention_weights` from the pooling layer.
        """
        outputs = self.wavlm(input_values, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state

        # The attention mask is defined per input sample, while the hidden
        # states have one entry per encoder frame. Convert it to a frame-level
        # mask before pooling; passing it through unchanged cannot line up with
        # the pooled axis.
        frame_mask = None
        if attention_mask is not None:
            frame_mask = self.wavlm._get_feature_vector_attention_mask(
                hidden_states.shape[1], attention_mask
            )

        pooled, attn_weights = self.attentive_pooling(hidden_states, frame_mask)
        pooled = self.layer_norm(pooled)
        pooled = self.dropout(pooled)

        gender_logits = self.gender_head(pooled)
        dialect_logits = self.dialect_head(pooled)

        loss = None
        if gender_labels is not None and dialect_labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            gender_loss = loss_fct(gender_logits, gender_labels)
            dialect_loss = loss_fct(dialect_logits, dialect_labels)
            # The dialect term is up-weighted by the configured factor.
            loss = gender_loss + config.DIALECT_LOSS_WEIGHT * dialect_loss

        return {
            'loss': loss,
            'gender_logits': gender_logits,
            'dialect_logits': dialect_logits,
            'attention_weights': attn_weights
        }


# 8. CUSTOM TRAINER

class MultiTaskTrainer(Trainer):
    """Trainer for batches that carry `gender_labels` and `dialect_labels`.

    Both label tensors are popped from the batch and passed to the model, which
    returns the combined loss and one set of logits per task.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Return the model's combined loss (and its outputs when requested).

        Extra keyword arguments supplied by the Trainer are accepted and ignored.
        """
        gender_labels = inputs.pop("gender_labels")
        dialect_labels = inputs.pop("dialect_labels")

        outputs = model(
            input_values=inputs["input_values"],
            gender_labels=gender_labels,
            dialect_labels=dialect_labels
        )

        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Evaluate one batch without gradients.

        Returns:
            tuple: `(loss, (gender_logits, dialect_logits), (gender_labels,
            dialect_labels))`, or `(loss, None, None)` when
            `prediction_loss_only` is set.
        """
        # Same preparation the training path applies before compute_loss
        # (moves tensors to the model's device); harmless if already done.
        inputs = self._prepare_inputs(inputs)

        gender_labels = inputs.pop("gender_labels")
        dialect_labels = inputs.pop("dialect_labels")

        with torch.no_grad():
            outputs = model(
                input_values=inputs["input_values"],
                gender_labels=gender_labels,
                dialect_labels=dialect_labels
            )
            loss = outputs["loss"].detach()

        if prediction_loss_only:
            return (loss, None, None)

        return (
            loss,
            (outputs["gender_logits"], outputs["dialect_logits"]),
            (gender_labels, dialect_labels)
        )


# 9. METRICS

def compute_metrics(pred):
    """Compute accuracy and weighted F1 for both tasks from a prediction object.

    `pred.predictions` and `pred.label_ids` are each unpacked as a
    (gender, dialect) pair. Returns a dict with `gender_acc`, `gender_f1`,
    `dialect_acc`, `dialect_f1` and `combined_f1` (mean of the two F1 scores).
    """
    gender_logits, dialect_logits = pred.predictions
    gender_labels, dialect_labels = pred.label_ids

    per_task = (
        ('gender', gender_logits, gender_labels),
        ('dialect', dialect_logits, dialect_labels),
    )

    metrics = {}
    for task, logits, labels in per_task:
        predicted = np.argmax(logits, axis=-1)
        metrics[f'{task}_acc'] = accuracy_score(labels, predicted)
        metrics[f'{task}_f1'] = f1_score(labels, predicted, average='weighted')

    metrics['combined_f1'] = (metrics['gender_f1'] + metrics['dialect_f1']) / 2
    return metrics


# 10. INITIALIZE MODEL & DATASETS
print("INITIALIZING MODEL")

print("\nLoading feature extractor...")
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(config.MODEL_NAME)
print("Feature extractor loaded")


def _build_dataset(banner, frame, audio_dir, training):
    """Print `banner`, then build a ViSpeechDataset with the shared audio settings."""
    print(banner)
    return ViSpeechDataset(
        frame, audio_dir, feature_extractor,
        sampling_rate=config.SAMPLING_RATE,
        max_duration=config.MAX_DURATION,
        is_training=training
    )


print("\nCreating datasets...")
# Validation rows come from the training metadata, so they share its audio dir.
train_dataset = _build_dataset("Training dataset:", train_df, config.TRAIN_AUDIO, True)
val_dataset = _build_dataset("Validation dataset:", val_df, config.TRAIN_AUDIO, False)
clean_test_dataset = _build_dataset(
    "Clean test dataset:", clean_test_df, config.CLEAN_TEST_AUDIO, False
)
noisy_test_dataset = _build_dataset(
    "Noisy test dataset:", noisy_test_df, config.NOISY_TEST_AUDIO, False
)

print("\nLoading model...")
model = MultiTaskSpeakerModel(
    config.MODEL_NAME,
    num_genders=2,
    num_dialects=3,
    freeze_encoder=False,
    dropout=0.1,
    head_hidden_dim=256
)

# Parameter bookkeeping: everything trainable that is not part of the encoder
# counts as "new" (pooling, layer norm and the two heads).
param_info = [(p.numel(), p.requires_grad) for p in model.parameters()]
total_params = sum(size for size, _ in param_info)
trainable_params = sum(size for size, is_trainable in param_info if is_trainable)
encoder_trainable = sum(p.numel() for p in model.wavlm.parameters() if p.requires_grad)
new_params = trainable_params - encoder_trainable

print(f"\nModel loaded: {config.MODEL_NAME}")
print(f"Total parameters: {total_params:,}")
print(f"Trainable parameters: {trainable_params:,}")
print(f"New parameters added: {new_params:,}")


In [None]:
# 11. INITIALIZE WANDB
print("\nINITIALIZING WANDB")

# Hyper-parameters and dataset sizes recorded alongside the run.
run_config = {
    "model_name": config.MODEL_NAME,
    "architecture": "WavLM + Attentive Pooling + LayerNorm",
    "sampling_rate": config.SAMPLING_RATE,
    "max_duration": config.MAX_DURATION,
    "batch_size": config.BATCH_SIZE,
    "learning_rate": config.LEARNING_RATE,
    "num_epochs": config.NUM_EPOCHS,
    "warmup_ratio": config.WARMUP_RATIO,
    "weight_decay": config.WEIGHT_DECAY,
    "gradient_clip": config.GRADIENT_CLIP,
    "augment_prob": config.AUGMENT_PROB,
    "dialect_loss_weight": config.DIALECT_LOSS_WEIGHT,
    "head_hidden_dim": 256,
    "train_samples": len(train_dataset),
    "val_samples": len(val_dataset),
    "clean_test_samples": len(clean_test_dataset),
    "noisy_test_samples": len(noisy_test_dataset),
}
run_tags = ["speaker-profiling", "wavlm", "multi-task", "vietnamese"]

wandb.init(
    project=config.WANDB_PROJECT,
    name="wavlm-attentive-layernorm",
    config=run_config,
    tags=run_tags,
)

print(f"WandB initialized: {wandb.run.url}")


# 12. TRAINING ARGUMENTS
training_args = TrainingArguments(
    output_dir=config.OUTPUT_DIR,
    # Evaluate and checkpoint once per epoch.
    eval_strategy='epoch',
    save_strategy='epoch',

    learning_rate=config.LEARNING_RATE,
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    num_train_epochs=config.NUM_EPOCHS,
    weight_decay=config.WEIGHT_DECAY,
    warmup_ratio=config.WARMUP_RATIO,
    max_grad_norm=config.GRADIENT_CLIP,

    lr_scheduler_type='linear',

    # Model selection is driven by validation dialect accuracy.
    load_best_model_at_end=True,
    metric_for_best_model='dialect_acc',
    greater_is_better=True,
    save_total_limit=3,

    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are constructed.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,

    logging_steps=50,
    logging_first_step=True,
    report_to='wandb',

    # The dataset yields custom keys (gender_labels / dialect_labels) that must
    # reach the trainer untouched.
    remove_unused_columns=False,
    seed=config.SEED,
)

# Stop when the selection metric has not improved by more than the threshold
# for 3 consecutive evaluations.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.0025
)

print("Training arguments configured")


In [None]:
# 13. TRAINER & TRAINING
print("\nTRAINING")

trainer = MultiTaskTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

# Rough step counts (floor division, hence the "~" in the output).
steps_per_epoch = len(train_dataset) // config.BATCH_SIZE
total_steps = steps_per_epoch * config.NUM_EPOCHS

print("\nStarting training...")
print(f"Architecture: WavLM + Attentive Pooling + LayerNorm")
print(f"Epochs: {config.NUM_EPOCHS}")
print(f"Steps per epoch: ~{steps_per_epoch}")
print(f"Total steps: ~{total_steps}")
print(f"WandB tracking: {wandb.run.url}")

trainer.train()

print("\nTraining completed!")


In [None]:
# 14. EVALUATION
def evaluate_and_report(trainer, dataset, dataset_name):
    """Evaluate the trained model on `dataset`, print a report and log it to WandB.

    Args:
        trainer: trainer whose `predict` returns (gender, dialect) pairs for
            both `predictions` and `label_ids`.
        dataset: dataset to evaluate.
        dataset_name: prefix used in the printed header and the WandB keys.

    Returns:
        dict: `gender_acc`, `gender_f1`, `dialect_acc`, `dialect_f1`, all in percent.
    """
    print(f"\nEVALUATING ON {dataset_name.upper()}")

    results = trainer.predict(dataset)
    gender_logits, dialect_logits = results.predictions
    gender_labels, dialect_labels = results.label_ids

    gender_preds = np.argmax(gender_logits, axis=-1)
    dialect_preds = np.argmax(dialect_logits, axis=-1)

    # Class ids and display names come from the label maps used to build the
    # labels. Passing the ids explicitly keeps the reports and confusion
    # matrices full-size even when a class is missing from this split.
    gender_ids = sorted(config.GENDER_MAP.values())
    gender_names = sorted(config.GENDER_MAP, key=config.GENDER_MAP.get)
    dialect_ids = sorted(config.DIALECT_MAP.values())
    dialect_names = sorted(config.DIALECT_MAP, key=config.DIALECT_MAP.get)

    # Metrics are reported in percent.
    gender_acc = accuracy_score(gender_labels, gender_preds) * 100
    dialect_acc = accuracy_score(dialect_labels, dialect_preds) * 100
    gender_f1 = f1_score(gender_labels, gender_preds, average='weighted') * 100
    dialect_f1 = f1_score(dialect_labels, dialect_preds, average='weighted') * 100

    print(f"\nOverall Metrics:")
    print(f"Gender -> Accuracy: {gender_acc:.2f}% | F1: {gender_f1:.2f}%")
    print(f"Dialect -> Accuracy: {dialect_acc:.2f}% | F1: {dialect_f1:.2f}%")

    print(f"\nGender Classification Report:")
    print(classification_report(gender_labels, gender_preds,
                               labels=gender_ids,
                               target_names=gender_names,
                               digits=4))

    print(f"\nDialect Classification Report:")
    print(classification_report(dialect_labels, dialect_preds,
                               labels=dialect_ids,
                               target_names=dialect_names,
                               digits=4))

    print(f"\nGender Confusion Matrix:")
    gender_cm = confusion_matrix(gender_labels, gender_preds, labels=gender_ids)
    print(gender_cm)

    print(f"\nDialect Confusion Matrix:")
    dialect_cm = confusion_matrix(dialect_labels, dialect_preds, labels=dialect_ids)
    print(dialect_cm)

    wandb.log({
        f"{dataset_name}/gender_acc": gender_acc,
        f"{dataset_name}/gender_f1": gender_f1,
        f"{dataset_name}/dialect_acc": dialect_acc,
        f"{dataset_name}/dialect_f1": dialect_f1,
        f"{dataset_name}/combined_f1": (gender_f1 + dialect_f1) / 2,
    })

    # Hand plain Python lists to the plotting helper rather than numpy arrays.
    wandb.log({
        f"{dataset_name}/gender_confusion_matrix": wandb.plot.confusion_matrix(
            probs=None,
            y_true=np.asarray(gender_labels).tolist(),
            preds=np.asarray(gender_preds).tolist(),
            class_names=gender_names
        ),
        f"{dataset_name}/dialect_confusion_matrix": wandb.plot.confusion_matrix(
            probs=None,
            y_true=np.asarray(dialect_labels).tolist(),
            preds=np.asarray(dialect_preds).tolist(),
            class_names=dialect_names
        )
    })

    return {
        'gender_acc': gender_acc,
        'gender_f1': gender_f1,
        'dialect_acc': dialect_acc,
        'dialect_f1': dialect_f1
    }

clean_results = evaluate_and_report(trainer, clean_test_dataset, "clean_test")
noisy_results = evaluate_and_report(trainer, noisy_test_dataset, "noisy_test")


# 15. COMPARISON WITH BASELINE
# Reference accuracies (percent) of the baseline named in the header printed below.
baseline_results = {
    'gender': {'clean': 98.73, 'noisy': 98.14},
    'dialect': {'clean': 81.47, 'noisy': 74.80}
}

# Our accuracies arranged in the same task -> test-set layout as the baseline.
results_by_split = {'clean': clean_results, 'noisy': noisy_results}
our_results = {
    task: {split: split_results[f'{task}_acc']
           for split, split_results in results_by_split.items()}
    for task in ('gender', 'dialect')
}

print("\nCOMPARISON WITH BASELINE (PACLIC 2024 - ResNet34)")
print(f"\n{'Task':<15} {'Test Set':<12} {'Baseline':<12} {'Our Model':<12} {'Delta':<12}")
print("-" * 60)

comparison_data = []
task_split_pairs = [(task, split) for task in ('gender', 'dialect') for split in ('clean', 'noisy')]
for task, test_set in task_split_pairs:
    baseline_val = baseline_results[task][test_set]
    our_val = our_results[task][test_set]
    delta = our_val - baseline_val
    delta_str = f"{delta:+.2f}%"

    print(f"{task.capitalize():<15} {test_set.capitalize():<12} "
          f"{baseline_val:<12.2f} {our_val:<12.2f} {delta_str:<12}")

    comparison_data.append([task, test_set, baseline_val, our_val, delta])

comparison_table = wandb.Table(
    data=comparison_data,
    columns=["Task", "Test Set", "Baseline", "Our Model", "Delta"]
)
wandb.log({"baseline_comparison": comparison_table})

print("\nSUMMARY:")
# The delta is the last column of every comparison row.
avg_improvement = sum(row[4] for row in comparison_data) / len(comparison_data)
print(f"Average Improvement: {avg_improvement:+.2f}%")

summary_metrics = {"summary/avg_improvement": avg_improvement}
for split in ('clean', 'noisy'):
    for task in ('gender', 'dialect'):
        summary_metrics[f"summary/{split}_{task}_delta"] = (
            our_results[task][split] - baseline_results[task][split]
        )
wandb.log(summary_metrics)


In [None]:
# 16. SAVE MODEL
print("\nSAVING MODEL")

# Model weights and feature extractor go into the same directory.
output_dir = '/'.join((config.OUTPUT_DIR, 'best_model'))
trainer.save_model(output_dir)
feature_extractor.save_pretrained(output_dir)

print(f"Model saved to: {output_dir}")

# Upload the saved directory to WandB as a model artifact.
artifact_spec = {
    "name": "speaker-profiling-model",
    "type": "model",
    "description": "WavLM + Attentive Pooling + LayerNorm for Speaker Profiling",
}
artifact = wandb.Artifact(**artifact_spec)
artifact.add_dir(output_dir)
wandb.log_artifact(artifact)

print(f"Model artifact logged to WandB")

wandb.finish()

closing_lines = (
    "\nPIPELINE COMPLETED",
    f"Architecture: WavLM + Attentive Pooling + LayerNorm",
    "Check WandB dashboard for detailed metrics!",
)
for closing_line in closing_lines:
    print(closing_line)
