In [1]:
from pydub import AudioSegment
import os

In [2]:
path = "~/data/musicnet/music/022021.mp3"
path = os.path.expanduser(path)

In [3]:
# Load MP3 file
audio = AudioSegment.from_mp3(path)

# Export as WAV
audio.export("converted_audio.wav", format="wav")

<_io.BufferedRandom name='converted_audio.wav'>

In [10]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import numpy as np
from pydub import AudioSegment, silence
import librosa
import base64
from io import BytesIO

# Function to convert MP3 to audio array
def load_audio(file_path):
    """Load audio file and return samples and sample rate"""
    audio = AudioSegment.from_file(file_path)
    samples = np.array(audio.get_array_of_samples())
    if audio.channels == 2:
        samples = samples.reshape((-1, 2))
        # For stereo, let's use the average of both channels
        samples = np.mean(samples, axis=1)
    
    return samples, audio.frame_rate, audio.duration_seconds

# Function to detect non-silent segments
def detect_recordings(audio_segment, min_silence_len=500, silence_thresh=-40):
    """Detect non-silent segments in audio"""
    non_silent_ranges = silence.detect_nonsilent(
        audio_segment, 
        min_silence_len=min_silence_len, 
        silence_thresh=silence_thresh
    )
    return non_silent_ranges

# Function to create interactive plot
def create_interactive_audio_plot(samples, sample_rate, duration, segments):
    """Create an interactive Plotly visualization of the audio"""
    # Create time axis
    time = np.linspace(0, duration, num=len(samples))
    
    # Create subplots: waveform on top, recording density on bottom
    fig = make_subplots(
        rows=2, cols=1,
        subplot_titles=('Audio Waveform', 'Recording Density per 10-Second Segment'),
        vertical_spacing=0.1,
        row_heights=[0.7, 0.3]
    )
    
    # Add waveform trace
    fig.add_trace(
        go.Scatter(
            x=time, 
            y=samples,
            mode='lines',
            line=dict(color='royalblue', width=1),
            name='Waveform',
            hovertemplate='Time: %{x:.2f}s<br>Amplitude: %{y}<extra></extra>'
        ),
        row=1, col=1
    )
    
    # Calculate recording density per 10-second segment
    segment_duration = 10  # seconds
    num_segments = int(np.ceil(duration / segment_duration))
    densities = []
    
    for i in range(num_segments):
        start_time = i * segment_duration
        end_time = min((i + 1) * segment_duration, duration)
        
        # Count recordings in this segment
        count = 0
        for seg_start, seg_end in segments:
            seg_start_s = seg_start / 1000  # convert ms to seconds
            seg_end_s = seg_end / 1000
            # Check if recording overlaps with this time segment
            if not (seg_end_s <= start_time or seg_start_s >= end_time):
                count += 1
                
        densities.append(count)
    
    # Add density bar chart
    segment_starts = [i * segment_duration for i in range(num_segments)]
    segment_centers = [i * segment_duration + segment_duration/2 for i in range(num_segments)]
    
    fig.add_trace(
        go.Bar(
            x=segment_centers,
            y=densities,
            width=segment_duration * 0.8,
            name='Recordings per 10s',
            marker_color='coral',
            hovertemplate='Segment: %{x:.1f}s<br>Recordings: %{y}<extra></extra>'
        ),
        row=2, col=1
    )
    
    # Add vertical lines for segment boundaries
    for i in range(1, num_segments):
        fig.add_vline(
            x=i * segment_duration, 
            line=dict(color="gray", width=1, dash="dash"),
            row=1, col=1
        )
        fig.add_vline(
            x=i * segment_duration, 
            line=dict(color="gray", width=1, dash="dash"),
            row=2, col=1
        )
    
    # Add rectangles for non-silent segments
    for seg_start, seg_end in segments:
        seg_start_s = seg_start / 1000  # convert ms to seconds
        seg_end_s = seg_end / 1000
        fig.add_vrect(
            x0=seg_start_s, x1=seg_end_s,
            fillcolor="lightgreen", opacity=0.2,
            line_width=0, row=1, col=1
        )
    
    # Update layout
    fig.update_layout(
        height=800,
        title_text="Audio Analysis Dashboard",
        showlegend=False,
        hovermode='x unified'
    )
    
    # Update axes
    fig.update_xaxes(title_text="Time (seconds)", row=1, col=1)
    fig.update_xaxes(title_text="Time (seconds)", row=2, col=1)
    fig.update_yaxes(title_text="Amplitude", row=1, col=1)
    fig.update_yaxes(title_text="Number of Recordings", row=2, col=1)
    
    return fig, densities

# Function to clip audio into 10-second segments
def clip_audio_segments(audio, output_prefix="segment"):
    """Clip audio into 10-second segments and save them"""
    segment_length = 10 * 1000  # 10 seconds in milliseconds
    total_duration = len(audio)
    segments = []
    
    for i, start_time in enumerate(range(0, total_duration, segment_length)):
        end_time = min(start_time + segment_length, total_duration)
        segment = audio[start_time:end_time]
        filename = f"{output_prefix}_{i+1}.wav"
        segment.export(filename, format="wav")
        segments.append(filename)
    
    return segments

# Main execution
def analyze_audio(file_path):
    """Main function to analyze audio file"""
    # Load audio
    samples, sample_rate, duration = load_audio(file_path)
    
    # Load audio for silence detection (pydub)
    audio_pydub = AudioSegment.from_file(file_path)
    
    # Detect recordings (non-silent segments)
    recordings = detect_recordings(audio_pydub)
    
    # Create interactive plot
    fig, densities = create_interactive_audio_plot(
        samples, sample_rate, duration, recordings
    )
    
    # Clip audio into 10-second segments
    segments = clip_audio_segments(audio_pydub)
    
    return fig, densities, segments, duration



In [11]:
fig, densities, segments, duration = analyze_audio(path)
    
# Save plot as HTML file instead of showing it
fig.write_html("audio_analysis.html")
print("Interactive plot saved as 'audio_analysis.html'. Open this file in your browser to view.")

# Print results
print(f"Audio duration: {duration:.2f} seconds")
print(f"Number of 10-second segments: {len(densities)}")
for i, density in enumerate(densities):
    print(f"Segment {i+1}: {density} recordings")

print(f"\nClipped audio saved as: {segments}")

Interactive plot saved as 'audio_analysis.html'. Open this file in your browser to view.
Audio duration: 30.00 seconds
Number of 10-second segments: 4
Segment 1: 1 recordings
Segment 2: 1 recordings
Segment 3: 1 recordings
Segment 4: 1 recordings

Clipped audio saved as: ['segment_1.wav', 'segment_2.wav', 'segment_3.wav', 'segment_4.wav']


In [12]:
len(segments[0])

13

In [13]:
segments[0]

'segment_1.wav'