## Extract Pauses
- V1 on 9/27/2025 11am ET

In [None]:
# Install ffmpeg, pydub, boto3
!pip install ffmpeg-python imageio[ffmpeg]
!pip install pydub
!pip install boto3

In [None]:
import boto3
from google.colab import userdata
import io
import imageio_ffmpeg
from IPython.display import Audio
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
from pydub import AudioSegment
from pydub.silence import detect_silence

In [None]:
# S3 base path
# NOTE(review): no other cell visible in this notebook reads `audio_path`;
# the fetch cells use separate `bucket` / `prefix` values — confirm whether
# this variable is still needed.
audio_path = "s3://asrelder-data/common_voice/clips/"

In [None]:
# Open a boto3 session using the credentials stored in Colab's user secrets,
# then create the S3 client used by the rest of the notebook
bucket = "asrelder-data"
aws_credentials = {
    "aws_access_key_id": userdata.get("AWS_ACCESS"),
    "aws_secret_access_key": userdata.get("AWS_SECRET"),
}
session = boto3.Session(region_name="us-east-1", **aws_credentials)
s3 = session.client("s3")

In [None]:
# Point pydub at the ffmpeg executable reported by imageio_ffmpeg
# NOTE(review): `ffprobe` is set to the same ffmpeg executable, not to an
# ffprobe binary — confirm this is intentional.
ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
AudioSegment.converter = ffmpeg_exe
AudioSegment.ffmpeg = ffmpeg_exe
AudioSegment.ffprobe = ffmpeg_exe

#### Download 'common_voices_sample1.csv' from Capstone Drive

In [None]:
# Download common_voices_sample1.csv
file_id = "1RwWlFOMBBOmimmLRLj-4cr43pgEn_vGj"
download_url = f"https://drive.google.com/uc?id={file_id}"
!gdown {download_url} -O sample.csv

In [None]:
# Load the downloaded sample into a DataFrame, report its shape,
# and preview the first two rows
sample_df = pd.read_csv("sample.csv")
print("sample_df.shape:", sample_df.shape)
sample_df.head(2)

In [None]:
# Smoke test: fetch one known clip from S3 with boto3
# (`prefix` defined here is also read by get_longest_silence further down)
prefix = "common_voice/clips/"
filename = "common_voice_en_20131778.mp3"
obj = s3.get_object(Bucket=bucket, Key=f"{prefix}{filename}")
audio_bytes = io.BytesIO(obj["Body"].read())
print(f"Loaded {filename} into memory: {len(audio_bytes.getvalue())} bytes")

# Decode the clip to report its length in seconds
# (rewind first so decoding starts at the beginning of the buffer)
audio_bytes.seek(0)
clip = AudioSegment.from_file(audio_bytes, format="mp3")
print(f"Duration in seconds: {clip.duration_seconds}")

# Rewind again and hand the raw bytes to the notebook audio player
audio_bytes.seek(0)
Audio(data=audio_bytes.read(), rate=16000)

In [None]:
def resolve_s3_key(fname: str, bucket: str, prefix: str) -> str | None:
    """
    Locate the S3 object key for a clip filename.

    Surrounding whitespace is stripped from `fname`. If the name already
    carries an extension, only `prefix + fname` is probed. Otherwise each of
    the common audio extensions (.mp3, .wav, .flac, .m4a, .ogg) is appended
    in that order and probed.

    Returns the first key for which `head_object` succeeds, or None when no
    candidate does. Uses the module-level `s3` client.

    Note: every `ClientError` raised by `head_object` is treated as
    "this candidate does not exist", whatever its error code.
    """
    cleaned = fname.strip()
    stem, suffix = os.path.splitext(cleaned)

    # Build the ordered list of keys to probe
    if suffix:
        candidates = [prefix + cleaned]
    else:
        candidates = [
            prefix + stem + audio_ext
            for audio_ext in (".mp3", ".wav", ".flac", ".m4a", ".ogg")
        ]

    for candidate in candidates:
        try:
            s3.head_object(Bucket=bucket, Key=candidate)
        except s3.exceptions.ClientError:
            continue
        return candidate

    return None

In [None]:
def get_longest_silence(
    fname: str,
    *,
    min_silence_len: int = 200,
    silence_margin_db: float = 16,
) -> tuple[float | None, float | None]:
    """
    Find the longest silent stretch in one audio clip stored in S3.

    The clip is located with `resolve_s3_key` (using the module-level
    `bucket` and `prefix`), downloaded into memory through the module-level
    `s3` client, decoded with pydub, and scanned with `detect_silence`.

    Args:
        fname: Clip filename, with or without an extension.
        min_silence_len: Minimum length, in milliseconds, for a stretch to
            count as silence.
        silence_margin_db: How far below the clip's average loudness
            (`clip.dBFS`) the silence threshold is placed.

    Returns:
        A `(start, length)` tuple, both in seconds, for the longest silence.
        `(None, 0.0)` when the clip decodes but no silence is detected.
        `(None, None)` when the file cannot be found or any error occurs;
        a message is printed in that case instead of raising.
    """
    try:
        key = resolve_s3_key(fname, bucket, prefix)
        if key is None:
            print(f"No matching file in S3 for {fname}")
            return None, None

        obj = s3.get_object(Bucket=bucket, Key=key)
        audio_bytes = io.BytesIO(obj["Body"].read())

        # Decode using the resolved key's own extension: the resolver can
        # return .wav/.flac/.m4a/.ogg keys, which must not be forced to mp3.
        # With no extension, pass None so ffmpeg detects the format itself.
        audio_format = os.path.splitext(key)[1].lstrip(".").lower() or None
        clip = AudioSegment.from_file(audio_bytes, format=audio_format)

        silences = detect_silence(
            clip,
            min_silence_len=min_silence_len,
            silence_thresh=clip.dBFS - silence_margin_db,
        )
        if not silences:
            return None, 0.0

        # Compare lengths in integer milliseconds and convert to seconds only
        # at the end, so float rounding cannot affect which span wins a tie
        # or the reported length.
        start_ms, end_ms = max(silences, key=lambda span: span[1] - span[0])
        return start_ms / 1000, (end_ms - start_ms) / 1000
    except Exception as e:
        # Best-effort batch processing: report the failure and keep going.
        print(f"Error processing {fname}: {e}")
        return None, None

In [None]:
# Compute the longest pause for every clip path and store the two results
# as new columns
def _pause_columns(clip_path):
    """Wrap the (timestamp, length) pair in a Series so apply() expands it into two columns."""
    return pd.Series(get_longest_silence(clip_path))


sample_df[["pause_timestamp", "pause_len"]] = sample_df["path"].apply(_pause_columns)

In [None]:
# Summary statistics
# The column is bound to a name first: indexing with double quotes inside a
# double-quoted f-string is a SyntaxError on Python versions before 3.12.
pause_lengths = sample_df["pause_len"]
print(f"# clips succesfully fetched pause: {pause_lengths.notna().sum()}")
print(f"# clips unable to fetch pause: {pause_lengths.isna().sum()}")
print()
print(f"Longest pause: {pause_lengths.max():.2f}")
print(f"Shortest pause: {pause_lengths.min():.2f}")
print(f"Median pause: {pause_lengths.median():.2f}")
print(f"Mean average pause: {pause_lengths.mean():.2f}")

In [None]:
# Graph pause distribution (0.25-second bins)
pause_data = sample_df["pause_len"].dropna()

if pause_data.empty:
    # max() of an empty Series is NaN, which np.arange cannot turn into
    # bin edges, so skip the plot instead of raising.
    print("No pause lengths available to plot")
else:
    bins = np.arange(0, pause_data.max() + 0.25, 0.25)

    plt.figure(figsize=(10,3))
    pause_data.hist(bins=bins, edgecolor=None)
    # Tick marks sit on the bin edges, so reuse the same array
    plt.xticks(bins, rotation=0)
    plt.xlabel("Pause length (seconds)")
    plt.ylabel("Frequency")
    plt.title("Distribution of pause lengths")
    plt.grid(False)
    plt.show()

In [None]:
# Write pause data to csv
# Only rows with a strictly positive pause length are exported, which drops
# both missing values and zero-length pauses.
output_columns = [
    "client_id",
    "path",
    "sentence_id",
    "age",
    "gender",
    "accents",
    "pause_timestamp",
    "pause_len",
]
has_pause = sample_df["pause_len"] > 0
non_null_pause_len = sample_df[has_pause]
non_null_pause_len[output_columns].to_csv("pause_lengths.csv")