# Evaluation notebook for [CryCeleb2023 challenge](https://huggingface.co/spaces/competitions/CryCeleb2023)

## This notebook does the following:
- Download the CryCeleb data from Hugging Face.
- Download a pretrained SpeechBrain model from Hugging Face.
- Compute embeddings.
- Compute similarity scores for pairs of embeddings.
- Compute the equal error rate of the scores and visualize results.
- Produce `my_submission.csv`, which can be uploaded to the competition platform.

### Imports

In [None]:
# For Colab - uncomment and run the following to set up the repo
# !pip install speechbrain
# !git clone https://github.com/Ubenwa/cryceleb2023.git
# %cd cryceleb2023

In [None]:
%%capture

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import speechbrain as sb
import torch
from huggingface_hub import hf_hub_download
from IPython.display import display
from speechbrain.dataio.dataio import read_audio
from speechbrain.pretrained import EncoderClassifier, SpeakerRecognition
from speechbrain.utils.metric_stats import EER
from tqdm.notebook import tqdm

from crybrain import download_data

# Local directory that the later cells read metadata.csv, the pair lists, the
# sample submission, and the audio files from.
dataset_path = "data"

### Data

In [None]:
# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required so that the data download in the next cell
# is authorized to fetch the challenge files — confirm.
from huggingface_hub import notebook_login

notebook_login()

In [None]:
# Fetch the challenge data using the repo's crybrain.download_data helper.
# The following cells expect metadata.csv, the pair CSVs, and the audio files
# under `dataset_path`.
# NOTE(review): the helper's exact behavior (what it downloads, whether it
# skips files that already exist) is not visible here — confirm.
download_data(dataset_path)

In [None]:
# Read the segment metadata. Identifier columns are forced to strings so pandas
# does not reinterpret them as numbers.
metadata = pd.read_csv(
    f"{dataset_path}/metadata.csv", dtype={"baby_id": str, "chronological_index": str}
)
dev_metadata = metadata.loc[metadata["split"] == "dev"].copy()

# Read the sample submission (its scores are uniform random).
sample_submission = pd.read_csv(f"{dataset_path}/sample_submission.csv")

# Read the verification pairs. Both pair files use the same string dtype for the
# baby IDs: the scoring step looks the IDs up in dictionaries keyed by the
# string-typed `baby_id` from `metadata`, so an ID parsed as a number would not match.
pair_id_dtypes = {"baby_id_B": str, "baby_id_D": str}
dev_pairs = pd.read_csv(f"{dataset_path}/dev_pairs.csv", dtype=pair_id_dtypes)
test_pairs = pd.read_csv(f"{dataset_path}/test_pairs.csv", dtype=pair_id_dtypes)

# Preview the first rows of each table under an enlarged caption.
caption_style = [{"selector": "caption", "props": [("font-size", "20px")]}]
for caption, frame in [
    ("metadata", metadata),
    ("dev_pairs", dev_pairs),
    ("test_pairs", test_pairs),
    ("sample_submission", sample_submission),
]:
    display(frame.head().style.set_caption(caption).set_table_styles(caption_style))

### Initialize encoder

One way to verify whether the two recordings in a pair come from the same baby is to concatenate all the cry segments of each recording, compute the embedding of each concatenated cry, and compute the cosine similarity between the two embeddings.

Let's load the model

In [None]:
!rm -rf spkrec-ecapa-voxceleb
# Load the pretrained ECAPA speaker-verification model from the Hugging Face Hub
# into the local `spkrec-ecapa-voxceleb` directory.
encoder = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="spkrec-ecapa-voxceleb",
    # Use the GPU when PyTorch can see one and fall back to the CPU otherwise,
    # so the cell runs unmodified on machines without CUDA.
    run_opts={"device": "cuda" if torch.cuda.is_available() else "cpu"},
)

In [None]:
# you can also plug in your encoder weights if you fine-tuned this model locally
# !rm spkrec-ecapa-voxceleb/embedding_model.ckpt
# !cp experiments/ecapa_voxceleb_ft_basic/ckpts/CKPT+epoch-4_valacc-0.57/embedding_model.ckpt spkrec-ecapa-voxceleb

# encoder = SpeakerRecognition.from_hparams(
#    source="speechbrain/spkrec-ecapa-voxceleb",
#    savedir="spkrec-ecapa-voxceleb",
#    run_opts={"device": "cuda"},  # comment out if no GPU available
# )

#### Compute Encodings

Change runtime type to GPU if using Colab

In [None]:
%%time
# Load every dev segment from disk as a NumPy waveform.
dev_metadata["cry"] = dev_metadata["file_name"].map(
    lambda file_name: read_audio(f"{dataset_path}/{file_name}").numpy()
)

# Join the segments of each (baby_id, period) group into a single waveform.
dev_cries = dev_metadata.groupby(["baby_id", "period"])["cry"].agg(
    lambda segments: np.concatenate(segments.values)
)
cry_dict = {group_key: {"cry": waveform} for group_key, waveform in dev_cries.items()}

# Attach the encoder embedding of each concatenated waveform to its entry.
for entry in tqdm(cry_dict.values()):
    entry["cry_encoded"] = encoder.encode_batch(torch.tensor(entry["cry"]), normalize=False)

#### Compute Similarity Between Encodings

In [None]:
def compute_cosine_similarity_score(row, cry_dict):
    """Return the cosine similarity between the two encoded cries of one pair.

    Args:
        row: Mapping-like pair record providing ``"baby_id_B"`` and ``"baby_id_D"``.
        cry_dict: Mapping from ``(baby_id, period)`` tuples to dicts that hold a
            ``"cry_encoded"`` tensor. The ``"B"`` entry is looked up for
            ``baby_id_B`` and the ``"D"`` entry for ``baby_id_D``.

    Returns:
        The similarity as a Python float. It is computed over the last tensor
        dimension and must reduce to a single element.

    Raises:
        KeyError: If either ``(baby_id, period)`` key is missing from ``cry_dict``.
    """
    embedding_b = cry_dict[(row["baby_id_B"], "B")]["cry_encoded"]
    embedding_d = cry_dict[(row["baby_id_D"], "D")]["cry_encoded"]
    return torch.nn.functional.cosine_similarity(embedding_b, embedding_d, dim=-1).item()


# Score every dev pair row by row; DataFrame.apply forwards the extra keyword
# argument to the scoring function.
dev_pairs["score"] = dev_pairs.apply(
    compute_cosine_similarity_score, axis=1, cry_dict=cry_dict
)
display(dev_pairs.head())

In [None]:
def compute_eer_and_plot_verification_scores(pairs_df):
    """Compute the equal error rate of the pair scores and plot their histogram.

    The histogram shows the score distribution per label, each normalized on
    its own, with the EER threshold drawn as a dashed red vertical line.

    Args:
        pairs_df: DataFrame with a ``score`` column and a ``label`` column.
            Rows with ``label == 1`` are treated as positive pairs and rows
            with ``label == 0`` as negative pairs.

    Returns:
        Tuple ``(eer, threshold)`` as returned by SpeechBrain's ``EER``.
    """
    # Select rows and column in one .loc call rather than chained indexing.
    positive_scores = pairs_df.loc[pairs_df["label"] == 1, "score"].values
    negative_scores = pairs_df.loc[pairs_df["label"] == 0, "score"].values
    eer, threshold = EER(torch.tensor(positive_scores), torch.tensor(negative_scores))

    ax = sns.histplot(pairs_df, x="score", hue="label", stat="percent", common_norm=False)
    ax.set_title(f"EER={round(eer, 4)} - Thresh={round(threshold, 4)}")
    # axvline expects a scalar x position. Draw on the histogram's own axes so
    # the line cannot land on a different "current" axes.
    ax.axvline(x=threshold, color="red", ls="--")
    return eer, threshold


# Evaluate the baseline on the labelled dev pairs: this draws the score
# histogram and keeps the EER and its threshold for later inspection.
eer, threshold = compute_eer_and_plot_verification_scores(pairs_df=dev_pairs)

The above plot displays the histogram of scores for positive (same baby) and negative (different baby) dev_pairs.\
A perfect verifier would assign a higher score to every positive pair than to any negative pair.\
Your task is to come up with a scoring system that maximizes the separation between the two distributions, as measured by the EER (lower is better).\
You can change the encoder module, the aggregation of cry segments, the similarity metric, or come up with a completely different process!\
You will be evaluated on test_pairs.csv, for which ground truth labels are not provided.




Score the test_pairs and submit:



In [None]:
%%time
# Keep only the test split; copy so that adding a column leaves `metadata` untouched.
is_test_row = metadata["split"] == "test"
test_metadata = metadata.loc[is_test_row].copy()

# Load every test segment from disk as a NumPy waveform.
test_metadata["cry"] = test_metadata["file_name"].map(
    lambda file_name: read_audio(f"{dataset_path}/{file_name}").numpy()
)

# Join the segments of each (baby_id, period) group into a single waveform.
test_cries = test_metadata.groupby(["baby_id", "period"])["cry"].agg(
    lambda segments: np.concatenate(segments.values)
)
cry_dict_test = {group_key: {"cry": waveform} for group_key, waveform in test_cries.items()}

# Attach the encoder embedding of each concatenated waveform to its entry.
for entry in tqdm(cry_dict_test.values()):
    entry["cry_encoded"] = encoder.encode_batch(torch.tensor(entry["cry"]), normalize=False)

# Score every test pair row by row; DataFrame.apply forwards the extra keyword
# argument to the scoring function.
test_pairs["score"] = test_pairs.apply(
    compute_cosine_similarity_score, axis=1, cry_dict=cry_dict_test
)
display(test_pairs.head())

In [None]:
# The uploaded file has to follow the layout of 'sample_submission.csv' exactly,
# so only the pair id and its score are written, without the DataFrame index.
submission_columns = ["id", "score"]
my_submission = test_pairs[submission_columns]
my_submission.to_csv("my_submission.csv", index=False)
display(my_submission.head())

You can now download `my_submission.csv` and submit it to the challenge!