In [None]:
import os

import wandb

# Never commit the API key with the notebook: take it from the environment
# (e.g. a WANDB_API_KEY secret exported before this cell runs). When the
# variable is unset, key=None lets wandb fall back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

In [None]:
!pip install transformers datasets evaluate accelerate librosa
!pip install --upgrade gdown

In [None]:
!pip install datasets==2.14.6
!pip install pandas==1.5.3

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import gc
import os
from glob import glob

import librosa
import matplotlib.pyplot as plt
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from datasets import load_dataset, load_metric
from scipy.stats import spearmanr
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    recall_score,
    precision_score,
    accuracy_score,
    ConfusionMatrixDisplay,
    f1_score
)
from sklearn.model_selection import train_test_split
# from tqdm import tqdm
from tqdm.notebook import tqdm
from transformers import (
    AutoFeatureExtractor,
    AutoModelForAudioClassification,
    TrainingArguments,
    Trainer,
    AdamW,
    EarlyStoppingCallback,
    set_seed
)

# Random seed; passed as random_state to both train_test_split calls below.
SEED=3

import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input/crema-d/CREMA-D-master/AudioMP3'):
#     for filename in filenames:
#         print(filename)
# Root directory for everything this notebook writes: split CSVs, the
# preprocessed dataset and the training checkpoints.
save_path = "/kaggle/working"
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Prepare Data

In [None]:
data = []

# glob() returns paths in arbitrary, filesystem-dependent order. Sort them so
# the rows of `df` (and therefore the seeded train/dev/test split) are the
# same on every run.
wav_paths = sorted(glob("/kaggle/input/d/return0root/crema-d/CREMA-D/AudioWAV/*.wav"))

for path in tqdm(wav_paths):
    # File stems have four '_'-separated fields: actor id, sentence, emotion, level.
    name = os.path.splitext(os.path.basename(path))[0]
    actor_id, sentence, emotion, level = name.split('_')
    # Decode the clip once purely as an integrity check: an unreadable file
    # raises here and aborts the cell. The samples are not kept, only the path.
    librosa.load(path, sr=16000)
    data.append({
        "file": path,
        "actor_id": actor_id,
        "sentence": sentence,
        "label": emotion,
        "level": level
    })
df = pd.DataFrame(data)

In [None]:
# Rebuild `df` from the collected `data` records (same construction as the
# last line of the loading cell above, so this cell can be re-run on its own).
df = pd.DataFrame(data)

In [None]:
# Show the first two rows of the file index.
df.head(2)

In [None]:
# Annotation tables that ship next to the audio:
#   SentenceFilenames.csv    - list of movie files used in the study
#   finishedEmoResponses.csv - the first emotional response, with timing
#   finishedResponses.csv    - the final emotional responses with emotion levels, with repeated
#                              and practice responses removed; used to tabulate the votes
df_sentence, df_first_resp, df_final_resp = (
    pd.read_csv('/kaggle/input/d/return0root/crema-d/CREMA-D/SentenceFilenames.csv'),
    pd.read_csv('/kaggle/input/d/return0root/crema-d/CREMA-D/finishedEmoResponses.csv'),
    pd.read_csv('/kaggle/input/d/return0root/crema-d/CREMA-D/finishedResponses.csv', low_memory=False),
)

In [None]:
# Frequency of each `numTries` value in the first-response table.
df_first_resp['numTries'].value_counts()

In [None]:
# Frequency of each `numTries` value in the final-response table.
df_final_resp['numTries'].value_counts()

In [None]:
# 70 % train, then the remaining 30 % halved into dev and test.
# Both cuts are stratified on the emotion label and use the same seed.
train_df, holdout_df = train_test_split(
    df,
    test_size=0.3,
    random_state=SEED,
    stratify=df["label"],
)
dev_df, test_df = train_test_split(
    holdout_df,
    test_size=0.5,
    random_state=SEED,
    stratify=holdout_df["label"],
)

# Give every split a fresh 0..n-1 index.
train_df, dev_df, test_df = (
    part.reset_index(drop=True) for part in (train_df, dev_df, test_df)
)

# Write each split to CSV (all columns kept) and report its shape.
for part, csv_path in (
    (train_df, f"{save_path}/train.csv"),
    (dev_df, f"{save_path}/dev.csv"),
    (test_df, f"{save_path}/test.csv"),
):
    part.to_csv(csv_path, encoding="utf-8", index=False)

for part in (train_df, dev_df, test_df):
    print(part.shape)

In [None]:
# Split name -> CSV written by the previous cell.
data_files = dict(
    train=f"{save_path}/train.csv",
    validation=f"{save_path}/dev.csv",
    test=f"{save_path}/test.csv",
)

dataset = load_dataset("csv", data_files=data_files)
train_dataset, dev_dataset, test_dataset = (
    dataset[split] for split in ("train", "validation", "test")
)

print(dataset)

# Alphabetically ordered label names found in the training split.
label_list = train_dataset.unique('label')
label_list.sort()

In [None]:
# Base = 90M parameters; Large = 300M parameters

# Hub identifier of the checkpoint that the from_pretrained calls below load.
model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english" # pre-trained on multi-lingual speech, fine-tuned on English

# Feel free to look for and experiment with other models on the Hugging Face Hub: https://huggingface.co/

In [None]:
feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)

# Classification model with one output per label found in the training split;
# the id <-> name mappings follow the order of label_list.
model = AutoModelForAudioClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(train_dataset.unique("label")),
    label2id=dict(zip(label_list, range(len(label_list)))),
    id2label=dict(enumerate(label_list)),
)
model.freeze_feature_encoder()

In [None]:
def label_to_id(label, label_list):
    """Translate a label into its position in ``label_list``.

    Returns the index of the first match, ``-1`` when the label is not in a
    non-empty ``label_list``, and the label itself, unchanged, when
    ``label_list`` is empty.
    """
    if not len(label_list):
        return label
    try:
        return label_list.index(label)
    except ValueError:
        return -1
def prepare_example(example):
    """Load one row's audio and replace its label with an integer id.

    Reads ``example["file"]`` at the feature extractor's sampling rate, then
    stores the samples, the sampling rate and the clip duration in seconds
    on the row. The label is mapped through the global ``label_list``.
    """
    target_sr = feature_extractor.sampling_rate
    waveform, loaded_sr = librosa.load(example["file"], sr=target_sr)
    example["audio"] = waveform
    example["sampling_rate"] = loaded_sr
    example["duration_in_seconds"] = len(waveform) / feature_extractor.sampling_rate
    example["label"] = label_to_id(example["label"], label_list)
    return example
def preprocess_function(examples):
    """Run the feature extractor over the ``audio`` column of a batch and return its output."""
    return feature_extractor(
        examples["audio"],
        sampling_rate=feature_extractor.sampling_rate,
    )

In [None]:
# 1) decode every clip and encode its label (the path column is dropped afterwards);
# 2) run the feature extractor with a batch of one, i.e. one clip per call.
dataset = (
    dataset
    .map(prepare_example, remove_columns=['file'])
    .map(preprocess_function, batched=True, batch_size=1)
)

In [None]:
# delete processed data
# !rm -rf /kaggle/working/data/preprocessed

In [None]:
# Persist the preprocessed splits under save_path; the Train section reloads
# them from this same directory.
dataset.save_to_disk(f"{save_path}/data/preprocessed/")

## Train

In [None]:
from datasets import load_from_disk

dataset = load_from_disk(f"{save_path}/data/preprocessed/")
train_dataset = dataset["train"]
dev_dataset = dataset["validation"]
test_dataset = dataset["test"]


print(dataset)

# prepare_example() replaced the emotion names in `label` with integer ids, so
# the reloaded column can no longer provide the names. Taking
# sorted(unique('label')) here would turn label_list into [0, 1, ...] and the
# id2label / label2id mappings given to the model would lose the emotion names.
stored_labels = sorted(train_dataset.unique('label'))
if stored_labels and all(isinstance(value, str) for value in stored_labels):
    # Dataset still carries string labels: keep the original behaviour.
    label_list = stored_labels
else:
    # Recover the names from the raw training split; this yields the same
    # sorted list the ids were assigned from.
    label_list = sorted(pd.read_csv(f"{save_path}/train.csv")["label"].unique().tolist())
    if stored_labels != list(range(len(label_list))):
        raise ValueError(
            f"Label ids on disk {stored_labels} do not match the {len(label_list)} "
            f"label names found in {save_path}/train.csv"
        )
label_list

In [None]:
# Print allocated / reserved CUDA memory, release the cached blocks, then print both again.
for stage in ("before", "after"):
    print(torch.cuda.memory_allocated())
    print(torch.cuda.memory_reserved())
    if stage == "before":
        torch.cuda.empty_cache()

In [None]:
# Batch size = per_device_train_batch_size * gradient_accumulation_steps
# Parameters to tune: learning rate, epochs, (batch size)
# More details on hyperparameter tuning in https://github.com/google-research/tuning_playbook

def compute_metrics(pred):
    """Score a set of predictions.

    Takes the arg-max over axis 1 of ``pred.predictions`` as the predicted
    class and compares it with ``pred.label_ids``.

    Returns:
        dict with ``accuracy`` plus macro-averaged ``precision``, ``recall``
        and ``f1``.
    """
    y_true = pred.label_ids
    y_pred = np.argmax(pred.predictions, axis=1)

    metrics = {"accuracy": accuracy_score(y_true, y_pred)}
    macro_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for metric_name, scorer in macro_scorers:
        metrics[metric_name] = scorer(y_true, y_pred, average='macro')
    return metrics


learning_rates = [1e-3, 1e-4, 1e-5] # first round
num_epochs = 5
# learning_rates = [1.5e-4, 1e-4, 0.5e-4] # second round
# num_epochs = 10
# learning_rates = [1.25e-4, 1.5e-4, 1.75e-4] # third round
# num_epochs = 15
evaluations = []

model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english" # pre-trained on multi-lingual speech, fine-tuning on English

feature_extractor=AutoFeatureExtractor.from_pretrained(model_name_or_path)

for lr in learning_rates:
    torch.cuda.empty_cache()
  # 🐝 1️⃣ Start a new run to track this script
    with wandb.init(
        # Set the project where this run will be logged
        project="SER",
        entity="black-noodles",
        # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
        name=f"{model_name_or_path}_{lr}_{num_epochs}_cosine", 
        # Track hyperparameters and run metadata
        config={
        "learning_rate": lr,
        "architecture": model_name_or_path,
        "dataset": "CREMA-D",
        "epochs": num_epochs,
    }):
        # renew model
        model=AutoModelForAudioClassification.from_pretrained(model_name_or_path,
                                              num_labels=len(train_dataset.unique("label")),
                                              label2id={label: i for i, label in enumerate(label_list)},
                                              id2label={i: label for i, label in enumerate(label_list)}
                                              )
        model.freeze_feature_encoder()
        
        # start training
        training_args = TrainingArguments(
            output_dir=f"{save_path}/{model_name_or_path}-speech-emotion-recognition",
            per_device_train_batch_size=32, # require more GPU memory, this set can exploit 16GB memory
            gradient_accumulation_steps=4,
            per_device_eval_batch_size=32,
            num_train_epochs=num_epochs,
            warmup_ratio=0.1,
            learning_rate=lr,
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            save_total_limit=2,
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            greater_is_better=True,
            push_to_hub=False,
            gradient_checkpointing=True,
            fp16=True,
            report_to=None,
            lr_scheduler_type="cosine"
        )

        # optimizer = AdamW(model.parameters(), lr = lr) 
        # lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=5e-6, verbose=True)


        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
            tokenizer=feature_extractor,
            # optimizers= (optimizer, lr_scheduler),
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
        )


        trainer.train()

        predictions = trainer.predict(dev_dataset)
        
        result = compute_metrics(predictions)

        wandb.log(result)
        evaluations.append(result['f1'])
      
  # Mark the run as finished
wandb.finish()



In [None]:
best_lr = learning_rates[np.argmax(evaluations)]

learning_rates = [best_lr*1.5, best_lr, best_lr*0.5] # second round
num_epochs = 10
evaluations = []

model_name_or_path = "jonatasgrosman/wav2vec2-large-xlsr-53-english" # pre-trained on multi-lingual speech, fine-tuning on English

feature_extractor=AutoFeatureExtractor.from_pretrained(model_name_or_path)

for lr in learning_rates:
    torch.cuda.empty_cache()
  # 🐝 1️⃣ Start a new run to track this script
    with wandb.init(
        # Set the project where this run will be logged
        project="SER",
        entity="black-noodles",
        # We pass a run name (otherwise it’ll be randomly assigned, like sunshine-lollypop-10)
        name=f"{model_name_or_path}_{lr}_{num_epochs}_cosine", 
        # Track hyperparameters and run metadata
        config={
        "learning_rate": lr,
        "architecture": model_name_or_path,
        "dataset": "CREMA-D",
        "epochs": num_epochs,
    }):
        # renew model
        model=AutoModelForAudioClassification.from_pretrained(model_name_or_path,
                                              num_labels=len(train_dataset.unique("label")),
                                              label2id={label: i for i, label in enumerate(label_list)},
                                              id2label={i: label for i, label in enumerate(label_list)}
                                              )
        model.freeze_feature_encoder()
        
        # start training
        training_args = TrainingArguments(
            output_dir=f"{save_path}/{model_name_or_path}-speech-emotion-recognition",
            per_device_train_batch_size=32, # require more GPU memory, this set can exploit 16GB memory
            gradient_accumulation_steps=4,
            per_device_eval_batch_size=32,
            num_train_epochs=num_epochs,
            warmup_ratio=0.1,
            learning_rate=lr,
            evaluation_strategy = "epoch",
            save_strategy = "epoch",
            save_total_limit=2,
            logging_steps=10,
            load_best_model_at_end=True,
            metric_for_best_model='f1',
            greater_is_better=True,
            push_to_hub=False,
            gradient_checkpointing=True,
            fp16=True,
            report_to=None,
            lr_scheduler_type="cosine"
        )

        # optimizer = AdamW(model.parameters(), lr = lr) 
        # lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5, min_lr=5e-6, verbose=True)


        trainer = Trainer(
            model=model,
            args=training_args,
            compute_metrics=compute_metrics,
            train_dataset=train_dataset,
            eval_dataset=dev_dataset,
            tokenizer=feature_extractor,
            # optimizers= (optimizer, lr_scheduler),
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
        )


        trainer.train()

        predictions = trainer.predict(dev_dataset)
        
        result = compute_metrics(predictions)

        wandb.log(result)
        evaluations.append(result['f1'])
      
  # Mark the run as finished
wandb.finish()