In [None]:
# Enable the IPython autoreload extension; mode 2 reloads imported modules
# before each cell execution, so edits to local source files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Detecting audio issues in a condition monitoring dataset (audio)
This notebook aims to detect issues in a **condition monitoring** dataset using **audio data**. As a basis, it uses the DCASE challenge dataset, where the goal is to detect whether a machine is in a defective state or not.

To run the example, install the **dependencies** as follows:

In [None]:
!pip install datasets

## Step 1: Download the dataset

In [None]:
# Import datasets to download the data
import datasets

In [None]:
# Fetch the "dev" configuration of the enriched DCASE23 task 2 dataset
# (all splits, non-streaming) and turn it into a pandas DataFrame.
dataset = datasets.load_dataset(
    "renumics/dcase23-task2-enriched",
    "dev",
    split="all",
    streaming=False,
)
df = dataset.to_pandas()

In [None]:
# Sample the dataset randomly to make the example run faster.
# A fixed random_state keeps the subset (and therefore the detected slices)
# reproducible across runs; capping n at len(df) avoids the ValueError that
# DataFrame.sample raises when asked for more rows than are available.
df = df.sample(n=min(1000, len(df)), random_state=42)

## Step 2: Detect problematic data slices based on audio data

In [None]:
# The imports
from sklearn.metrics import accuracy_score
from renumics.spotlight import Audio
from sliceguard import SliceGuard

In [None]:
# Detect problematic slices using the audio files referenced in the "path" column.
# The audio is represented by general purpose audio embeddings
# (pretrained model trained on Audioset).
# NOTE(review): "label" and "dev_train_lof_anomaly" are presumably the ground-truth
# and prediction columns scored with accuracy_score — confirm against the SliceGuard API.
sg = SliceGuard()
issues = sg.find_issues(
    df, ["path"], "label", "dev_train_lof_anomaly", accuracy_score,
    metric_mode="max", min_support=5, min_drop=0.2,
)

In [None]:
# Show the detected issues in Renumics Spotlight, telling it to treat
# the "path" column as audio.
spotlight_dtypes = {"path": Audio}
sg.report(spotlight_dtype=spotlight_dtypes)