In [1]:
import json
import os
import random
from dataclasses import asdict, dataclass
from typing import Any, Dict, Tuple

from datetime import datetime
import audiomentations
import librosa
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import transformers
from datasets import Dataset
from prettytable import PrettyTable
from sklearn.model_selection import StratifiedKFold, train_test_split
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import DataLoader
from torchmetrics import MetricCollection
from torchmetrics.classification import MulticlassAccuracy, MulticlassF1Score, MulticlassPrecision, MulticlassRecall
from tqdm import tqdm
from transformers import AutoFeatureExtractor, AutoModelForAudioClassification, AutoTokenizer
from transformers.feature_extraction_utils import BatchFeature
from transformers import AutoFeatureExtractor, WhisperForAudioClassification
from datasets import load_dataset

transformers.logging.set_verbosity_error()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
@dataclass
class Config:
    # data args
    train_csv: str = "/scratch/network/mk8574/audio_sentiment_challenge/data/train.csv"
    test_csv: str = "/scratch/network/mk8574/audio_sentiment_challenge/data/test.csv"

    # model args
    pretrained_name: str = "jonatasgrosman/wav2vec2-large-xlsr-53-english"
    
    train_serial = datetime.now().strftime("%Y%m%d_%H%M%S") + "|" + pretrained_name.replace("/", "|")

    # k-fold
    k_fold_num: int = 0  # if you want to use k-fold validation, set positive integer value.
    k_fold_idx: int = 1

    # save dir
    save_dir: str = f"/scratch/network/mk8574/audio_sentiment_challenge/baseline_mk/results/{train_serial}/"

    # hparams
    seed: int = 42
    lr: float = 5e-4
    batch_size: int = 4
    gradient_accumulate_step: int = 4  # total batch size = batch_size * gradient_accumulate_step
    max_epoch: int = batch_size * gradient_accumulate_step
    early_stop_patience = 5

In [3]:
config = Config()

if not os.path.exists(config.save_dir):
    os.makedirs(config.save_dir)

with open(os.path.join(config.save_dir, "config.json"), "w") as config_file:
    json.dump(asdict(config), config_file, indent=4, sort_keys=False)

In [4]:
feature_extractor = AutoFeatureExtractor.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
model = WhisperForAudioClassification.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")
# tokenizer = AutoTokenizer.from_pretrained("sanchit-gandhi/whisper-medium-fleurs-lang-id")

In [5]:
seed = config.seed

random.seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
# torch.backends.cudnn.deterministic = True
# torch.backends.cudnn.benchmark = False

In [6]:
ds = load_dataset("audiofolder", data_dir = '../data')

Resolving data files: 100%|██████████| 5002/5002 [00:00<00:00, 626191.16it/s]
Resolving data files: 100%|██████████| 1882/1882 [00:00<00:00, 607439.79it/s]


In [7]:
ds['train']

Dataset({
    features: ['audio', 'label'],
    num_rows: 5001
})

In [8]:
sample = next(iter(ds['train']))
print(sample)

{'audio': {'path': '/scratch/network/mk8574/audio_sentiment_challenge/data/train/TRAIN_0000.wav', 'array': array([0.00750732, 0.00820923, 0.00793457, ..., 0.        , 0.        ,
       0.        ]), 'sampling_rate': 16000}, 'label': None}


In [9]:
len(sample['audio']['array'])

37904

In [10]:
inputs = feature_extractor(sample['audio']['array'], sampling_rate = sample['audio']['sampling_rate'])

input_features = torch.Tensor(np.array(inputs.input_features))
print(input_features)

tensor([[[ 0.3318,  0.4619,  0.3513,  ..., -1.2950, -1.2950, -1.2950],
         [ 0.0703,  0.2434,  0.3929,  ..., -1.2950, -1.2950, -1.2950],
         [ 0.4198,  0.3761,  0.4405,  ..., -1.2950, -1.2950, -1.2950],
         ...,
         [-0.5193, -0.6057, -0.6061,  ..., -1.2950, -1.2950, -1.2950],
         [-0.5937, -0.6416, -0.6173,  ..., -1.2950, -1.2950, -1.2950],
         [-0.5799, -0.6215, -0.5754,  ..., -1.2950, -1.2950, -1.2950]]])


In [11]:
with torch.no_grad():
    logits = model(input_features).logits

predicted_class_ids = torch.argmax(logits).item()
predicted_label = model.config.id2label[predicted_class_ids]
predicted_label

'Hebrew'

In [12]:
model.config.id2label = {i:i for i in range(6)}
model.config.label2id = model.config.id2label

In [13]:
model.config.num_labels = 6

In [14]:
from transformers import TrainingArguments, Trainer
import evaluate

training_args = TrainingArguments(output_dir = "test_trainer", report_to = 'none')
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    
    return metric.compute(predictions = predictions, references = labels)

Using the latest cached version of the module from /home/mk8574/.cache/huggingface/modules/evaluate_modules/metrics/evaluate-metric--accuracy/f887c0aab52c2d38e1f8a215681126379eca617f96c447638f751434e8e65b14 (last modified on Wed Nov 22 21:05:41 2023) since it couldn't be found locally at evaluate-metric--accuracy, or remotely on the Hugging Face Hub.


In [15]:
class MyDataSet(Dataset):
    def __init__(self, df, feature_extractor, mode='train', transforms=None):
        self.df = df
        self.feature_extractor = feature_extractor
        self.mode = mode
        self.transforms = transforms
        
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        path = os.path.join('../data', self.df['path'][idx][2:])
        
        waveform, sample_rate = librosa.load(path)
        sr = self.feature_extractor.sampling_rate
        if sr != sample_rate:
            waveform = librosa.resample(waveform, orig_sr=sample_rate, target_sr=sr)
        
        if self.transforms is not None:
            waveform = self.transforms(samples=np.array(waveform, dtype=np.float32), sample_rate=sr)
        
        input_values = self.feature_extractor(torch.Tensor(waveform), sampling_rate=sr, return_tensors="pt", padding=True).input_features
        
        if self.mode != 'test':
            label = self.df['label'][idx]
            return input_values.squeeze(), label
        else:
            return input_values.squeeze()

In [16]:
train_df = pd.read_csv('../data/train.csv')
train_df, valid_df = train_test_split(train_df, test_size=0.2, random_state=42)

train_df.reset_index(drop=True, inplace=True)
valid_df.reset_index(drop=True, inplace=True)

train_loader = MyDataSet(train_df, feature_extractor)
valid_loader = MyDataSet(valid_df, feature_extractor)

In [17]:
# model[0]

In [18]:
class DataModule:
    def __init__(
        self,
        feature_extractor: AutoFeatureExtractor,
        transforms: list = None,
    ):
        self.feature_extractor = feature_extractor
        self.transforms = transforms

    def to_dataset(self, df: pd.DataFrame) -> Dataset:
        def load_waveform(row):
            waveform, sample_rate = librosa.load(row["path"])
            waveform = librosa.resample(
                waveform,
                orig_sr=sample_rate,
                target_sr=self.feature_extractor.sampling_rate,
            )
            row["waveform"] = waveform

            return row

        dataset = Dataset.from_pandas(df)
        dataset = dataset.map(load_waveform, num_proc=4)
        if "label" in dataset.column_names:
            dataset = dataset.rename_column("label", "labels")

        return dataset

    def apply_transforms(self, dataset: Dataset) -> Dataset:
        def apply_transforms(batch):
            waveforms = [self.transforms(samples=np.array(waveform, dtype=np.float32), sample_rate=self.feature_extractor.sampling_rate) for waveform in batch["waveform"]]
            batch["waveform"] = waveforms

            return batch

        if self.transforms:
            dataset = dataset.with_transform(apply_transforms)

        return dataset

    def collate_fn(self, batch: list) -> BatchFeature:
        if hasattr(self.feature_extractor, "nb_max_frames"):
            padding = "max_length"
        else:
            padding = "longest"

        waveforms = [data["waveform"] for data in batch]
        model_inputs = self.feature_extractor(
            waveforms,
            sampling_rate=self.feature_extractor.sampling_rate,
            padding=padding,
            return_tensors="pt",
        )

        if "labels" in batch[0]:
            labels = [data["labels"] for data in batch]
            model_inputs["labels"] = torch.tensor(labels)

        return model_inputs

In [19]:
class MetricScore(nn.Module):
    def __init__(self):
        super().__init__()

        metrics = MetricCollection(
            {
                "accuracy": MulticlassAccuracy(num_classes=6),
                "recall": MulticlassRecall(num_classes=6),
                "precision": MulticlassPrecision(num_classes=6),
                "f1": MulticlassF1Score(num_classes=6),
            }
        )
        self.train_metrics = metrics.clone()
        self.valid_metrics = metrics.clone()

        self.train_losses = []
        self.valid_losses = []

    def add_train_metrics(self, logits: torch.Tensor, labels: torch.Tensor):
        self.train_metrics.update(logits, labels)

    def add_valid_metrics(self, logits: torch.Tensor, labels: torch.Tensor):
        self.valid_metrics.update(logits, labels)

    def add_train_loss(self, loss: torch.Tensor):
        self.train_losses.append(loss.item())

    def add_valid_loss(self, loss: torch.Tensor):
        self.valid_losses.append(loss.item())

    def compute_train(self) -> Dict[str, Any]:
        scores = self.train_metrics.compute()
        for metric_key, score in scores.items():
            if isinstance(score, torch.Tensor):
                scores[metric_key] = score.item()
        scores.update({"loss": np.mean(self.train_losses)})

        return scores

    def compute_valid(self) -> Dict[str, Any]:
        scores = self.valid_metrics.compute()
        for metric_key, score in scores.items():
            if isinstance(score, torch.Tensor):
                scores[metric_key] = score.item()
        scores.update({"loss": np.mean(self.valid_losses)})

        return scores

    def reset(self):
        self.train_metrics.reset()
        self.valid_metrics.reset()

        self.train_losses = []
        self.valid_losses = []

    def print_summary(self, epoch_idx: int):
        train_result = self.compute_train()
        valid_result = self.compute_valid()

        assert list(train_result.keys()) == list(valid_result.keys())

        pt = PrettyTable()
        pt.field_names = [f"epoch {epoch_idx}"] + list(train_result.keys())

        train_row = ["train"]
        for score in train_result.values():
            train_row.append(round(score, 3))
        pt.add_row(train_row)

        valid_row = ["valid"]
        for score in valid_result.values():
            valid_row.append(round(score, 3))
        pt.add_row(valid_row)

        print(pt, end="\n\n")

In [20]:
def fit(
    model: AutoModelForAudioClassification,
    train_loader: DataLoader,
    valid_loader: DataLoader,
    max_epoch: int = 64,
    lr: float = 5e-4,
    gradient_accumulate_step: int = 1,
    early_stop_patience: int = 5,
) -> dict:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = model.to(device)
    metric_scores = MetricScore().to(device)

    optimizer = AdamW(
        [{"params": module.parameters(), "lr": lr if name == "classifier" else lr * 0.1} for name, module in model.named_children()],
        weight_decay=0.1,
    )
    lr_scheduler = ReduceLROnPlateau(optimizer, mode="max", factor=0.5, patience=3)

    # run finetune
    best_score = 0.0
    best_state_dict = None

    early_stop_count = 0
    for epoch_idx in range(1, max_epoch):
        with torch.set_grad_enabled(True), tqdm(total=len(train_loader), desc=f"[Epoch {epoch_idx}/{max_epoch}] training", leave=False) as pbar:
            model.train()
            for step_idx, batch in enumerate(train_loader, 1):
                batch = batch.to(device)
                
                # print(batch['input_features'].shape)
                output = model(**batch)
                loss = output.loss
                loss.backward()

                if step_idx % gradient_accumulate_step == 0 or step_idx == len(train_loader):
                    optimizer.step()
                    optimizer.zero_grad()            

                metric_scores.add_train_loss(loss)
                metric_scores.add_train_metrics(output.logits, batch.labels)

                pbar.update()
                pbar.set_postfix({"train loss": loss.item()})

        with torch.set_grad_enabled(False), tqdm(total=len(valid_loader), desc=f"[Epoch {epoch_idx}/{max_epoch}] validation", leave=False) as pbar:
            model.eval()
            for batch in valid_loader:
                batch = batch.to(device)

                output = model(**batch)
                loss = output.loss

                metric_scores.add_valid_loss(loss)
                metric_scores.add_valid_metrics(output.logits, batch.labels)

                pbar.update()
                pbar.set_postfix({"valid loss": loss.item()})

        epoch_score = metric_scores.compute_valid()["accuracy"]
        metric_scores.print_summary(epoch_idx=epoch_idx)
        metric_scores.reset()

        lr_scheduler.step(epoch_score)

        if epoch_score > best_score:
            best_score = epoch_score
            best_state_dict = model.state_dict()
            early_stop_count = 0
        else:
            early_stop_count += 1
            if early_stop_count == early_stop_patience:
                print("*** EARLY STOPPED ***")
                break
    
    return best_state_dict

In [21]:
@torch.no_grad()
def predict(
    model: AutoModelForAudioClassification,
    feature_extractor: AutoFeatureExtractor,
    test_dataset: Dataset,
    batch_size: int = 16,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model.to(device)
    model.eval()

    predict_logits = {"id": []}
    predict_logits.update({class_id: [] for class_id in range(6)})

    predict_class = {"id": [], "label": []}

    for batch_idx in tqdm(range(0, len(test_dataset), batch_size), desc="prediction"):
        bs, bi = batch_idx, batch_idx + batch_size
        batch = test_dataset[bs:bi]

        if hasattr(feature_extractor, "nb_max_frames"):
            padding = "max_length"
        else:
            padding = "longest"

        model_inputs = feature_extractor(
            batch["waveform"],
            sampling_rate=feature_extractor.sampling_rate,
            padding=padding,
            return_tensors="pt",
        ).to(device)

        model_output = model(**model_inputs)

        batch_logits = model_output.logits.cpu()
        batch_predict = model_output.logits.argmax(dim=-1).cpu()

        predict_logits["id"] += batch["id"]
        for class_id in range(6):
            predict_logits[class_id] += batch_logits[:, class_id].tolist()

        predict_class["id"] += batch["id"]
        predict_class["label"] += batch_predict.tolist()

    predict_logits = pd.DataFrame(predict_logits)
    predict_class = pd.DataFrame(predict_class)

    return predict_logits, predict_class

In [22]:
transforms = audiomentations.OneOf(
    [
        audiomentations.AddGaussianNoise(p=0.75),
        audiomentations.PitchShift(p=0.75),
        audiomentations.PeakingFilter(p=0.75),
        audiomentations.SevenBandParametricEQ(p=0.75),
        audiomentations.BandPassFilter(p=0.75),
        audiomentations.BandStopFilter(p=0.75),
        audiomentations.AirAbsorption(p=0.75),
        audiomentations.ClippingDistortion(p=0.75),
        audiomentations.HighPassFilter(p=0.75),
        audiomentations.HighShelfFilter(p=0.75),
        audiomentations.Limiter(p=0.75),
        audiomentations.LowPassFilter(p=0.75),
        audiomentations.LowShelfFilter(p=0.75),
    ]
)

In [23]:
# create train data and valid data
df = pd.read_csv(config.train_csv)
df["path"] = df["path"].apply(lambda x: os.path.join(os.path.dirname(config.train_csv), x[2:]))

if config.k_fold_num > 0:
    skf = StratifiedKFold(n_splits=config.k_fold_num)
    train_indices, valid_indices = list(skf.split(df, df["label"]))[config.k_fold_idx]
    train_df, valid_df = df.iloc[train_indices], df.iloc[valid_indices]
else:
    train_df, valid_df = train_test_split(df, train_size=0.9, stratify=df["label"], random_state=seed)

# create test data
test_df = pd.read_csv(config.test_csv)
test_df["path"] = test_df["path"].apply(lambda x: os.path.join(os.path.dirname(config.test_csv), x[2:]))

In [24]:
data_module = DataModule(
    feature_extractor=feature_extractor,
    transforms=transforms,
)

# train_loader = DataLoader(
#     train_, 
#     # train_dataset,
#     batch_size=config.batch_size,
#     shuffle=True,
#     collate_fn=data_module.collate_fn,
# )
# valid_loader = DataLoader(
#     ds,
#     batch_size=config.batch_size,
#     shuffle=False,
#     collate_fn=data_module.collate_fn,
# )

In [25]:
best_state_dict = fit(
    model=model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    max_epoch=config.max_epoch,
    lr=config.lr,
    gradient_accumulate_step=config.gradient_accumulate_step,
    early_stop_patience=config.early_stop_patience,
)

model.load_state_dict(best_state_dict)
model.save_pretrained(os.path.join(config.save_dir, "best_model"))
feature_extractor.save_pretrained(os.path.join(config.save_dir, "best_model"))

                                                               

AttributeError: 'MyDataSet' object has no attribute '_indices'