# Audio Mining Notebook, Spring Rotation 2025, SALT Lab
Steven Dillmann, Stanford University, stevendi@stanford.edu

In [27]:
# External imports
from datasets import load_dataset
from datasets import concatenate_datasets
import os
from huggingface_hub import login
from IPython.display import Audio, display
import requests
import soundfile as sf
import io
from dotenv import load_dotenv
from cartesia import Cartesia
from cartesia.voice_changer.client import VoiceChangerClient
import wizmap 
from sentence_transformers import SentenceTransformer
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_distances
import matplotlib.pyplot as plt
import umap
import plotly.express as px
import pandas as pd
import dash
from dash import dcc, html
from dash.dependencies import Input, Output

import numpy as np
# Compatibility shim: re-create `np.Infinity` as a synonym for `np.inf`.
# NOTE(review): presumably needed because newer NumPy releases no longer ship this
# alias while some library used in this notebook still references it — confirm
# which one, and drop the shim once it is no longer required.
np.Infinity = np.inf

# Internal imports
from utils.voice_changer import VoiceChanger

# API Keys
# Read credentials from the environment after loading any .env file.
load_dotenv()
hf_key = os.getenv("HF_API_KEY")
cartesia_key = os.getenv("CARTESIA_API_KEY")

# Fail early with an actionable message: assigning None into os.environ below
# would otherwise raise an opaque "str expected, not NoneType" TypeError.
if cartesia_key is None:
    raise RuntimeError(
        "CARTESIA_API_KEY is not set; add it to your environment or .env file."
    )

# Authenticate with the Hugging Face Hub using the configured token.
login(hf_key)

# Re-export the Cartesia key through the process environment.
# NOTE(review): presumably so Cartesia client code can pick it up from
# os.environ — confirm against utils.voice_changer.
os.environ["CARTESIA_API_KEY"] = cartesia_key


## 1. Data

In [2]:
# === Load Data ===

# Fetch every split of the GigaSpeech "xs" configuration and merge them into one dataset
gs_splits = load_dataset("speechcolab/gigaspeech", "xs", trust_remote_code=True)
gs = concatenate_datasets(list(gs_splits.values()))

# Keep only rows whose source path mentions "podcast", then order them by
# source file and segment id
gs = gs.filter(
    lambda row: "podcast" in row["original_full_path"].lower()
).sort(["original_full_path", "segment_id"])

# Filter certain sentences
def filter_sentences(example):
    """Decide whether a segment should be kept in the dataset.

    Parameters
    ----------
    example : mapping
        A dataset row; only its "text" entry is read.

    Returns
    -------
    bool
        False when the whole transcript, compared case-insensitively, is one
        of the non-speech tags listed below; True otherwise.
    """
    # Optional filters, currently disabled:
    #   - length:   reject transcripts longer than 50 or shorter than 3 characters
    #   - keywords: reject transcripts containing "cartesia", "gpt" or "chatgpt"
    non_speech_tags = ("<music>", "<other>", "<noise>", "<sil>")
    transcript = example["text"].lower()
    return transcript not in non_speech_tags
# Drop the non-speech segments, then print a summary of what is left
gs = gs.filter(filter_sentences)
print(gs)

# Preview the first few segments (path, segment id, playable audio).
# The count is capped at the dataset size so `select` is never handed
# out-of-range indices when fewer than 20 rows survive the filters.
n_preview = min(20, len(gs))
for example in gs.select(range(n_preview)):
    print(example["original_full_path"])
    print(example["segment_id"])
    display(Audio(example["audio"]["array"], rate=example["audio"]["sampling_rate"]))

Dataset({
    features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
    num_rows: 13598
})
audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000000


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000001


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000002


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000003


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000004


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000005


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000006


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000007


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000008


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000009


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000010


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000011


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000013


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000014


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000015


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000017


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000018


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000019


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000020


audio/podcast/P0000/POD1000000004.opus
POD1000000004_S0000021


## 2. Embeddings

In [3]:
# === Sentence-BERT Embeddings ===

# Encode every transcript with the MiniLM sentence encoder, requesting normalized vectors
sentences = [text for text in gs["text"]]
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(sentences, normalize_embeddings=True)

# Project the embeddings with UMAP (cosine metric, fixed random_state for repeatability)
reducer = umap.UMAP(random_state=42, metric="cosine")
coords = reducer.fit_transform(embeddings)


  warn(
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [28]:
import plotly.express as px
import pandas as pd
import ipywidgets as widgets
from IPython.display import display

# Assemble the plotting table: the first two UMAP coordinates plus per-segment metadata
df = pd.DataFrame({
    "x": coords[:, 0],
    "y": coords[:, 1],
    "sentence": gs["text"],
    **{
        column: gs[column]
        for column in ("original_full_path", "speaker", "segment_id", "title", "url", "category")
    },
})
color_columns = ["original_full_path", "category"]


# Selector for the column that drives the point colors
dropdown = widgets.Dropdown(
    description="Color by:",
    options=color_columns,
    value="original_full_path",
    layout=widgets.Layout(width="300px"),
    style={"description_width": "initial"},
)

# Figure factory
def create_figure(color_by):
    """Build the scatter plot of `df`, colored by the `color_by` column.

    Parameters
    ----------
    color_by : str
        Name of the `df` column handed to Plotly as the color grouping.

    Returns
    -------
    The configured Plotly figure (not yet shown).
    """
    # Show every metadata field on hover, but hide the raw plot coordinates
    hover_fields = dict.fromkeys(
        ["sentence", "original_full_path", "speaker", "segment_id", "title", "url", "category"],
        True,
    )
    hover_fields.update(x=False, y=False)

    scatter = px.scatter(
        df, x="x", y="y", color=color_by, hover_data=hover_fields, opacity=0.7
    )
    scatter.update_traces(marker={"size": 3})
    scatter.update_layout(
        title="Sentence-BERT embeddings (UMAP)",
        width=1200,
        height=800,
        showlegend=True,  # alternative: (color_by == "category")
    )
    return scatter

# Draw the initial figure inside an Output widget so the dropdown callback can replace it
fig_display = widgets.Output()
fig = create_figure(dropdown.value)
with fig_display:
    fig.show()

# Callback
def update_plot(change):
    """Redraw the scatter plot for the newly selected color column.

    `change` is the change dictionary delivered to the observer; only its
    "new" entry (the selected column name) is read.
    """
    with fig_display:
        # Clear the previous figure, then render the replacement into the same widget
        fig_display.clear_output(wait=True)
        refreshed = create_figure(change["new"])
        refreshed.show()

# Call update_plot whenever the dropdown's "value" trait changes
dropdown.observe(update_plot, names="value")

# Show widgets: the selector first, then the output area holding the figure
display(dropdown, fig_display)


Dropdown(description='Color by:', layout=Layout(width='300px'), options=('original_full_path', 'category'), st…

Output()

In [None]:
# === Your data ===
# Plotting table for the Dash app: the first two UMAP coordinates plus per-segment metadata
df = pd.DataFrame({
    "x": coords[:, 0],
    "y": coords[:, 1],
    "sentence": gs["text"],
    **{
        column: gs[column]
        for column in ("original_full_path", "speaker", "segment_id", "title", "url", "category")
    },
})
color_columns = ["original_full_path", "category"]

# === Initialize app ===
app = dash.Dash(__name__)

# Page layout: a color-column selector followed by the scatter plot it controls
app.layout = html.Div(
    children=[
        dcc.Dropdown(
            id="color-dropdown",
            options=[dict(label=column, value=column) for column in color_columns],
            value="original_full_path",
            style={"width": "300px"},
        ),
        dcc.Graph(id="scatter-plot"),
    ]
)

# === Callback ===
@app.callback(
    Output("scatter-plot", "figure"),
    Input("color-dropdown", "value"),
)
def update_figure(color_by):
    """Return the scatter plot of `df`, colored by the column chosen in the dropdown.

    Parameters
    ----------
    color_by : str
        Current value of the 'color-dropdown' component; used as the Plotly
        color grouping column.

    Returns
    -------
    The Plotly figure assigned to the 'scatter-plot' graph.
    """
    # Show every metadata field on hover, but hide the raw plot coordinates
    hover_fields = dict.fromkeys(
        ["sentence", "original_full_path", "speaker", "segment_id", "title", "url", "category"],
        True,
    )
    hover_fields.update(x=False, y=False)

    scatter = px.scatter(
        df, x="x", y="y", color=color_by, hover_data=hover_fields, opacity=0.7
    )
    scatter.update_traces(marker={"size": 3})
    scatter.update_layout(
        title="Sentence-BERT embeddings (UMAP)",
        width=1200,
        height=800,
        showlegend=True,  # alternative: (color_by == "category")
    )
    return scatter

# Launch the Dash app with debug mode enabled when this code runs as the main module
if __name__ == '__main__':
    app.run(debug=True)

# Open http://127.0.0.1:8050/ in browser


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


