# Audio Mining Notebook, Spring Rotation 2025, SALT Lab
Steven Dillmann, Stanford University, stevendi@stanford.edu

In [None]:
# External imports
from datasets import load_dataset
from datasets import concatenate_datasets
import os
from huggingface_hub import login
from IPython.display import Audio, display
import requests
import soundfile as sf
import io
from dotenv import load_dotenv

# Internal imports
from utils.voice_changer import VoiceChanger

# Credentials: load_dotenv() populates the process environment (typically
# from a local .env file); each token is None when its variable is unset.
load_dotenv()
hf_token = os.environ.get("HF_API_KEY")
cartesia_token = os.environ.get("CARTESIA_API_KEY")


## 1. Data

#### Load Data

In [2]:
# === Load Data ===
# Load the "xs" configuration of GigaSpeech, then merge every returned split
# into one flat dataset so rows can be indexed without caring about splits.
gs_splits = load_dataset("speechcolab/gigaspeech", "xs", trust_remote_code=True)
gs = concatenate_datasets(list(gs_splits.values()))
print(gs)

Dataset({
    features: ['segment_id', 'speaker', 'text', 'audio', 'begin_time', 'end_time', 'audio_id', 'title', 'url', 'source', 'category', 'original_full_path'],
    num_rows: 41758
})


#### Display Audio Data 

In [3]:
# === Display Audio Example ===

# Choose example. The row is fetched once and reused: each `gs[...]` row
# access decodes the audio column again, so indexing the dataset separately
# for every field repeats that work.
example_id = 1
example = gs[example_id]
audio_input = example["audio"]
transcription = example["text"]
full_path = example["original_full_path"]

# Display the audio
display(Audio(audio_input["array"], rate=audio_input["sampling_rate"]))
print("Transcription:", transcription)
print("Original Full Path:", full_path)

# Save the audio to a .wav file (used as input by the voice-changer cell)
sf.write("audio.wav", audio_input["array"], audio_input["sampling_rate"])

# Load the audio file
# audio, samplerate = sf.read("audio.wav")
# print("Audio shape:", audio.shape)
# print("Sample rate:", samplerate)

Transcription: SIX TOMATOES <PERIOD>
Original Full Path: audio/audiobook/P0011/AUD0000001043.opus


#### Stitch Audio Data

In [4]:
# === Stitch Audio ===

# Filter for the same full path. Only the path column is passed to the
# predicate (input_columns), so the filter does not have to load and decode
# the audio of every row just to compare a string.
df_path = gs.filter(
    lambda path: path == full_path,
    input_columns=["original_full_path"],
)
print("Number of examples with the same full path:", len(df_path))

# Order the segments by id and preview the first few of them
df_path = df_path.sort("segment_id")
n_preview = min(10, len(df_path))  # avoid IndexError on recordings with < 10 segments
for i in range(n_preview):
    segment = df_path[i]  # fetch the row once instead of once per field
    print(segment["segment_id"])
    print(segment["text"])
    display(Audio(segment["audio"]["array"], rate=segment["audio"]["sampling_rate"]))

Number of examples with the same full path: 1057
AUD0000001043_S0000006
BUT BEYOND THIS POINT THE SITUATION WAS BITTER <PERIOD>


AUD0000001043_S0000007
HIS TWO KIDS <COMMA> BUBS <COMMA> SEVEN <COMMA> AND EVELYN <COMMA> NINE CLAD IN SPACE-SUITS THAT WERE SLIGHTLY OVERSIZE TO ALLOW FOR THE GROWTH OF YOUNG BODIES WERE BOTH BAWLING <PERIOD>


AUD0000001043_S0000008
HE COULD HEAR THEM THROUGH HIS OXYGEN-HELMET RADIOPHONES <PERIOD>


AUD0000001043_S0000010
AT HIS ELBOW <COMMA> HIS WIFE <COMMA> ROSE <COMMA> HER HEART-SHAPED FACE AND GREY EYES FRAMED BY THE WIDE FACE-WINDOW OF HER ARMOR <COMMA>


AUD0000001043_S0000011
WAS TRYING DESPERATELY TO CHOKE BACK TEARS <COMMA> AND BE BRAVE <PERIOD>


AUD0000001043_S0000012
REMEMBER WE'VE GOT TO MAKE GOOD HERE <COMMA> JOHNNY <COMMA>


AUD0000001043_S0000013
SHE WAS SAYING <PERIOD>


AUD0000001043_S0000014
REMEMBER WHAT THE HOMESTEADERS OFFICE PEOPLE TOLD US THAT WITH MODERN EQUIPMENT AND THE RIGHT FRAME OF MIND <COMMA>


AUD0000001043_S0000015
LIFE CAN BE NICE OUT HERE <PERIOD>


AUD0000001043_S0000016
IT'S WORKED ON OTHER ASTEROIDS <PERIOD>


## 2. Voice Changer

In [11]:
# === Change Voice ===
# Run the voice changer (configured from its YAML file) on the example clip
# saved earlier as audio.wav; the result is written to audio_changed.wav.
input_audio_path, output_audio_path = "audio.wav", "audio_changed.wav"
vc = VoiceChanger(config_path="utils/configs/voice_changer_config.yaml")
vc.run_pipeline(input_audio_path, output_audio_path)

✅ Voice transformation successful.


In [12]:
# Convert the first few segments of the recording, one output file per segment
df_path = df_path.sort("segment_id")
n_segments = min(10, len(df_path))  # avoid IndexError on recordings with < 10 segments
for i in range(n_segments):
    segment = df_path[i]  # fetch the row once instead of once per field
    print(segment["segment_id"])
    print(segment["text"])
    audio_input = segment["audio"]
    # Save the segment to a file, then run the voice changer on that file
    segment_path = f"audio{i}.wav"
    sf.write(segment_path, audio_input["array"], audio_input["sampling_rate"])
    vc.run_pipeline(segment_path, f"audio_changed{i}.wav")


AUD0000001043_S0000006
BUT BEYOND THIS POINT THE SITUATION WAS BITTER <PERIOD>
✅ Voice transformation successful.


AUD0000001043_S0000007
HIS TWO KIDS <COMMA> BUBS <COMMA> SEVEN <COMMA> AND EVELYN <COMMA> NINE CLAD IN SPACE-SUITS THAT WERE SLIGHTLY OVERSIZE TO ALLOW FOR THE GROWTH OF YOUNG BODIES WERE BOTH BAWLING <PERIOD>
✅ Voice transformation successful.


AUD0000001043_S0000008
HE COULD HEAR THEM THROUGH HIS OXYGEN-HELMET RADIOPHONES <PERIOD>
✅ Voice transformation successful.


AUD0000001043_S0000010
AT HIS ELBOW <COMMA> HIS WIFE <COMMA> ROSE <COMMA> HER HEART-SHAPED FACE AND GREY EYES FRAMED BY THE WIDE FACE-WINDOW OF HER ARMOR <COMMA>
✅ Voice transformation successful.


AUD0000001043_S0000011
WAS TRYING DESPERATELY TO CHOKE BACK TEARS <COMMA> AND BE BRAVE <PERIOD>
✅ Voice transformation successful.


AUD0000001043_S0000012
REMEMBER WE'VE GOT TO MAKE GOOD HERE <COMMA> JOHNNY <COMMA>
✅ Voice transformation successful.


AUD0000001043_S0000013
SHE WAS SAYING <PERIOD>
✅ Voice transformation successful.


AUD0000001043_S0000014
REMEMBER WHAT THE HOMESTEADERS OFFICE PEOPLE TOLD US THAT WITH MODERN EQUIPMENT AND THE RIGHT FRAME OF MIND <COMMA>
✅ Voice transformation successful.


AUD0000001043_S0000015
LIFE CAN BE NICE OUT HERE <PERIOD>
✅ Voice transformation successful.


AUD0000001043_S0000016
IT'S WORKED ON OTHER ASTEROIDS <PERIOD>
✅ Voice transformation successful.
