### Install Dependencies

In [2]:
%pip install gtts librosa soundfile pydub scipy numpy -q

[33mDEPRECATION: Loading egg at /Users/saikrishna/anaconda3/lib/python3.11/site-packages/chainercv-0.13.1-py3.11-macosx-11.1-arm64.egg is deprecated. pip 23.3 will enforce this behaviour change. A possible replacement is to use pip for package installation..[0m[33m
[0mNote: you may need to restart the kernel to use updated packages.


### Text-to-Audio Pipeline
Converts text into an MP3 file (via gTTS) and saves a matching `.txt` transcript next to it.

In [None]:
import os
from pathlib import Path
from gtts import gTTS
from typing import List, Optional


class TextToAudioPipeline:
    """Pipeline for converting text to MP3 audio files.

    Every successful conversion writes two sibling files into ``output_dir``:
    ``<name>.mp3`` (speech synthesised with gTTS) and ``<name>.txt`` (a copy of
    the source text).
    """

    def __init__(self, output_dir: str = "./outputs/"):
        """Store the output directory and create it if it does not exist."""
        self.output_dir = Path(output_dir)
        self._init_output_dir()

    def _init_output_dir(self):
        """Ensures the output directory exists."""
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def _get_unique_filename(self, base_name: str) -> str:
        """
        Returns a file stem that collides with no existing audio/transcript pair.

        Only the stem of ``base_name`` is used (directories and the final
        extension are dropped). A stem is considered taken when either
        ``<stem>.mp3`` or ``<stem>.txt`` already exists in the output directory.
        Example: 'test' -> 'test_v1' if 'test.mp3' or 'test.txt' exists.
        """
        stem = Path(base_name).stem
        candidate_stem = stem
        counter = 1

        while any(
            (self.output_dir / f"{candidate_stem}{suffix}").exists()
            for suffix in (".mp3", ".txt")
        ):
            candidate_stem = f"{stem}_v{counter}"
            counter += 1

        return candidate_stem

    def convert(self, input_text: str, output_filename: str = "file_save") -> Optional[Path]:
        """Converts text to an MP3 and saves a transcript copy.

        Args:
            input_text: text to synthesise; ``None``, empty or whitespace-only
                text is rejected.
            output_filename: desired file stem; a ``_vN`` suffix is appended
                when that stem is already taken.

        Returns:
            Path of the written MP3, or ``None`` when the input is empty or
            the conversion fails (the failure is printed, not raised).
        """
        if not input_text or not input_text.strip():
            print("Error: Input text is empty.")
            return None

        unique_stem = self._get_unique_filename(output_filename)
        audio_path = self.output_dir / f"{unique_stem}.mp3"
        text_path = self.output_dir / f"{unique_stem}.txt"

        try:
            print(f"Processing: '{unique_stem}'...")

            # Generate and save audio
            tts = gTTS(text=input_text, lang='en', slow=False)
            tts.save(str(audio_path))

            # Save transcript
            text_path.write_text(input_text, encoding="utf-8")
        except Exception as e:
            print(f"Failed to process {unique_stem}: {e}")
            # A failed run can leave an empty or partial file behind. Remove it
            # so the stem is not permanently reserved and later runs are not
            # pushed to a '_vN' name. Cleanup is best-effort by design.
            for leftover in (audio_path, text_path):
                try:
                    leftover.unlink()
                except OSError:
                    pass
            return None

        print(f"\t[✓] Transcript: {text_path.absolute()}")
        print(f"\t[✓] Audio:      {audio_path.absolute()}\n")

        return audio_path

    def batch_convert(self, input_texts: List[str], file_names: Optional[List[str]] = None) -> List[Path]:
        """Batch convert list of texts to audio files.

        Args:
            input_texts: texts to convert, in order.
            file_names: optional stems, one per text. When omitted (or empty)
                every text uses ``convert``'s default name and relies on the
                automatic ``_vN`` suffix.

        Returns:
            Paths of the conversions that succeeded; failed items are skipped,
            so the list may be shorter than ``input_texts``.

        Raises:
            AssertionError: if ``file_names`` is given with a different length
                than ``input_texts``.
        """
        if file_names:
            # Raised explicitly (same exception type as before) so the check
            # is not stripped when Python runs with -O.
            if len(input_texts) != len(file_names):
                raise AssertionError("Texts and filenames must have same length")
            converted = [self.convert(text, name) for text, name in zip(input_texts, file_names)]
        else:
            converted = [self.convert(text) for text in input_texts]

        return [path for path in converted if path]


# Cell-completion marker: printed once the TextToAudioPipeline definition has executed.
print("✓ TextToAudioPipeline class loaded successfully!")

✓ TextToAudioPipeline class loaded successfully!


### Audio Permutation Pipeline
Apply various audio transformations: pitch shifting, speed changes, and reverb.

In [None]:
import librosa
import soundfile as sf
import numpy as np
from scipy.signal import fftconvolve
from pydub import AudioSegment
import shutil
from pathlib import Path
from typing import Optional


class AudioPermutationPipeline:
    """Pipeline for applying audio transformations and effects.

    ``process`` loads one audio file and writes each requested variant
    (pitch up/down, speed up/down, reverb) as an MP3 into
    ``<output_dir>/<input stem>/``, next to a copy of the input file.
    """

    def __init__(self, output_dir: str = "./outputs/"):
        """Store the output directory and create it if it does not exist."""
        self.output_dir = Path(output_dir)
        self._init_output_dir()

    def _init_output_dir(self):
        """Ensures the output directory exists."""
        self.output_dir.mkdir(parents=True, exist_ok=True)

    @staticmethod
    def shift_pitch(data, sr, n_steps=0):
        """
        Shifts pitch (without changing duration).
        - positive 'n_steps' increases pitch
        - negative 'n_steps' decreases pitch
        """
        return librosa.effects.pitch_shift(y=data, sr=sr, n_steps=n_steps)

    @staticmethod
    def stretch_time(data, rate=1.0):
        """
        Changes speed (without changing pitch).
        - 'rate' > 1.0 increases speed
        - 'rate' < 1.0 decreases speed
        """
        return librosa.effects.time_stretch(y=data, rate=rate)

    @staticmethod
    def apply_reverb(data, sr, room_size=0.5, wet_dry=0.3, seed=None):
        """
        Adds reverb effect to audio.
        - 'room_size' controls the reverb length (0.0 to 1.0); the impulse
          response lasts ``room_size * 2`` seconds
        - 'wet_dry' mixes between original and reverb (0.0 = dry, 1.0 = wet)
        - 'seed' (optional) makes the noise-based impulse response
          reproducible; when None the global NumPy random state is used

        The impulse response is exponentially decaying white noise. The result
        is peak-normalised to 1.0 and has the same length as 'data'.
        Assumes 'data' is a 1-D (mono) signal, as returned by librosa.load
        with its default settings.
        """
        data = np.asarray(data)
        reverb_duration = room_size * 2.0
        ir_length = int(reverb_duration * sr)

        if ir_length > 0 and data.size > 0:
            t = np.linspace(0, reverb_duration, ir_length)
            decay = np.exp(-3.0 * t / reverb_duration)
            if seed is None:
                noise = np.random.randn(ir_length)
            else:
                noise = np.random.default_rng(seed).standard_normal(ir_length)
            impulse = decay * noise * 0.1

            # mode='same' would return the *centre* of the convolution, which
            # moves the reverb tail ahead of the sound (pre-echo). Taking the
            # head of the full convolution keeps the effect causal and
            # time-aligned with the dry signal.
            reverb_signal = fftconvolve(data, impulse, mode='full')[:data.shape[0]]
        else:
            # Zero-length impulse response or empty input: nothing to add.
            reverb_signal = np.zeros(data.shape, dtype=float)

        output = (1 - wet_dry) * data + wet_dry * reverb_signal

        # Skip normalisation for silent/empty audio to avoid 0/0 -> NaN.
        peak = np.max(np.abs(output)) if output.size else 0.0
        if peak > 0:
            output = output / peak

        return output

    @staticmethod
    def save_as_mp3(data, sr, output_path):
        """Save audio data as MP3 file.

        The samples are first written to a temporary WAV file next to the
        target (``<stem>_temp.wav``) and then transcoded at 192 kbit/s. The
        temporary file is removed even when writing or transcoding fails.
        """
        output_path = Path(output_path)
        # Build the temp name from the stem: a plain str.replace('.mp3', ...)
        # leaves the name unchanged when the target has no '.mp3' suffix, and
        # the cleanup below would then delete the finished output.
        temp_wav = output_path.with_name(f"{output_path.stem}_temp.wav")

        try:
            sf.write(str(temp_wav), data, sr)
            audio = AudioSegment.from_wav(str(temp_wav))
            audio.export(str(output_path), format='mp3', bitrate='192k')
        finally:
            if temp_wav.exists():
                temp_wav.unlink()

    def process(self,
                input_path: str,
                pitch_increase: Optional[float] = None,
                pitch_decrease: Optional[float] = None,
                speed_increase: Optional[float] = None,
                speed_decrease: Optional[float] = None,
                reverb_room_size: Optional[float] = None):
        """
        Process audio file with various transformations.

        Each argument left as None is skipped; every other one produces one
        MP3 in ``<output_dir>/<input stem>/``.

        Args:
            input_path: path to input audio file
            pitch_increase: positive number of semitones to increase pitch
            pitch_decrease: negative number of semitones to decrease pitch
            speed_increase: speed factor > 1.0 to increase speed
            speed_decrease: speed factor < 1.0 to decrease speed
            reverb_room_size: reverb room size (0.0 to 1.0)

        Returns:
            None. Progress and the missing-file error are printed.

        Raises:
            AssertionError: if an argument is outside its documented range.
        """
        source = Path(input_path) if input_path else None
        if source is None or not source.exists():
            print(f"Error: File '{input_path}' not found.")
            return

        # Validate inputs. Raised explicitly (same exception type as before)
        # so the checks are not stripped when Python runs with -O.
        checks = (
            (pitch_increase is None or pitch_increase > 0,
             "pitch_increase must be positive"),
            (pitch_decrease is None or pitch_decrease < 0,
             "pitch_decrease must be negative"),
            (speed_increase is None or speed_increase > 1.0,
             "speed_increase must be > 1.0"),
            (speed_decrease is None or (0 < speed_decrease < 1.0),
             "speed_decrease must be between 0 and 1"),
            (reverb_room_size is None or (0 < reverb_room_size <= 1.0),
             "reverb_room_size must be between 0 and 1"),
        )
        for is_valid, message in checks:
            if not is_valid:
                raise AssertionError(message)

        # Create output subdirectory
        base_name = source.stem
        output_subdir = self.output_dir / base_name
        output_subdir.mkdir(parents=True, exist_ok=True)

        # Copy input file
        print(f"Copying input file to output directory...")
        original_copy = output_subdir / source.name
        if original_copy.exists() and original_copy.samefile(source):
            # The input already lives in its own output folder; copying a file
            # onto itself would raise shutil.SameFileError.
            print(f"\tInput is already in the output directory: {original_copy}\n")
        else:
            shutil.copy2(input_path, original_copy)
            print(f"\tCopied to: {original_copy}\n")

        # Load audio
        print(f"Loading audio file ('{input_path}')...")
        data, sr = librosa.load(input_path, sr=None)

        # One row per variant: (requested value, file suffix, progress line,
        # deferred transformation). The lambdas run only for requested rows.
        variants = (
            (pitch_increase, "pitch_up",
             f"\tGenerating pitch increase by {pitch_increase} semitones...",
             lambda: self.shift_pitch(data, sr, n_steps=pitch_increase)),
            (pitch_decrease, "pitch_down",
             f"\tGenerating pitch decrease by {pitch_decrease} semitones...",
             lambda: self.shift_pitch(data, sr, n_steps=pitch_decrease)),
            (speed_increase, "speed_up",
             f"\tGenerating increased speed by factor of {speed_increase}...",
             lambda: self.stretch_time(data, rate=speed_increase)),
            (speed_decrease, "speed_down",
             f"\tGenerating decreased speed by factor of {speed_decrease}...",
             lambda: self.stretch_time(data, rate=speed_decrease)),
            (reverb_room_size, "reverb",
             f"\tGenerating reverb (room size: {reverb_room_size})...",
             lambda: self.apply_reverb(data, sr, room_size=reverb_room_size)),
        )

        # Apply transformations
        for requested, suffix, banner, render in variants:
            if requested is None:
                continue
            print(banner)
            output_path = output_subdir / f"{base_name}_{suffix}.mp3"
            self.save_as_mp3(render(), sr, output_path)
            print(f"\t\tSaved: {output_path}\n")

        print(f"Done! All files saved to: {output_subdir}\n")


# Cell-completion marker: printed once the AudioPermutationPipeline definition has executed.
print("✓ AudioPermutationPipeline class loaded successfully!")

✓ AudioPermutationPipeline class loaded successfully!


## Usage

In [5]:
# Initialize pipelines
# Both pipelines share one output root, defined once so it cannot drift apart.
OUTPUT_DIR = "./outputs/"

text_pipeline = TextToAudioPipeline(output_dir=OUTPUT_DIR)
audio_pipeline = AudioPermutationPipeline(output_dir=OUTPUT_DIR)

print("Pipelines initialized!\n")

Pipelines initialized!



### Pipeline 1: Single Text-to-Audio Conversion

In [6]:
# Single conversion
sample_text = """Hello, World! This is an audio file. Now I will read some random text.
Paragraphs are the building blocks of papers. Many students define 
paragraphs in terms of length: a paragraph is a group of at least five 
sentences, a paragraph is half a page long, etc. In reality, though, the 
unity and coherence of ideas among sentences is what constitutes a paragraph."""

# convert() returns the Path of the written MP3, or None when conversion fails.
# If 'sample_output.mp3' or 'sample_output.txt' already exists, the files are
# saved under 'sample_output_v1', 'sample_output_v2', ... instead.
audio_file = text_pipeline.convert(sample_text, "sample_output")
print(f"Generated audio file: {audio_file}")

Processing: 'sample_output'...
	[✓] Transcript: /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/sample_output.txt
	[✓] Audio:      /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/sample_output.mp3

Generated audio file: outputs/sample_output.mp3


### Pipeline 1: Batch Text-to-Audio Conversion

In [7]:
# Batch conversion
texts = [
    "This is the first audio message.",
    "This is the second audio message.",
    "This is the third audio message."
]

# One output name per text; batch_convert requires both lists to have the same length.
filenames = ["message_1", "message_2", "message_3"]

# Only successful conversions are returned, so the count can be smaller than len(texts).
audio_files = text_pipeline.batch_convert(texts, filenames)
print(f"\nGenerated {len(audio_files)} audio files.")

Processing: 'message_1'...
	[✓] Transcript: /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/message_1.txt
	[✓] Audio:      /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/message_1.mp3

Processing: 'message_2'...
	[✓] Transcript: /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/message_2.txt
	[✓] Audio:      /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/message_2.mp3

Processing: 'message_3'...
	[✓] Transcript: /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/message_3.txt
	[✓] Audio:      /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/message_3.mp3


Generated 3 audio files.


### Pipeline 2: Audio Permutations

In [None]:
# Apply transformations to the audio file produced by the single-conversion cell.
# `audio_file` is used instead of a hard-coded path because convert() saves under
# a versioned name (e.g. sample_output_v1.mp3) when the notebook is re-run, and a
# fixed path would then silently process a stale file.
if audio_file:
    audio_pipeline.process(
        str(audio_file),  # Use the file we just created
        pitch_increase=4,
        pitch_decrease=-4,
        speed_increase=1.5,
        speed_decrease=0.5,
        reverb_room_size=0.5
    )
else:
    print("Failed to generate audio file.")

Copying input file to output directory...
	Copied to: outputs/sample_output/sample_output.mp3

Loading audio file ('./outputs/sample_output.mp3')...
	Generating pitch increase by 4 semitones...
		Saved: outputs/sample_output/sample_output_pitch_up.mp3

	Generating pitch decrease by -4 semitones...
		Saved: outputs/sample_output/sample_output_pitch_down.mp3

	Generating increased speed by factor of 1.5...
		Saved: outputs/sample_output/sample_output_speed_up.mp3

	Generating decreased speed by factor of 0.5...
		Saved: outputs/sample_output/sample_output_speed_down.mp3

	Generating reverb (room size: 0.5)...
		Saved: outputs/sample_output/sample_output_reverb.mp3

Done! All files saved to: outputs/sample_output



### Linking Pipelines: Generate Text-to-Audio, then Generate Audio Permutations

In [9]:
# Chain both pipelines together
print("STEP 1: Converting text to audio...")

text = "The quick brown fox jumps over the lazy dog. This is a test of the audio processing pipeline."
# convert() returns the Path of the new MP3, or None if the conversion failed.
audio_path = text_pipeline.convert(text, "complete_pipeline_test")

print("\nSTEP 2: Applying audio effects...")

if audio_path:
    # speed_decrease is not passed, so no slowed-down variant is generated here.
    audio_pipeline.process(
        str(audio_path),
        pitch_increase=5,
        pitch_decrease=-5,
        speed_increase=2.0,
        reverb_room_size=0.7
    )
    # NOTE(review): process() returns None and only prints on a missing file,
    # so this message is shown regardless of what process() reported.
    print("\n✓ Complete pipeline finished successfully!")
else:
    print("Failed to generate audio file.")

STEP 1: Converting text to audio...
Processing: 'complete_pipeline_test'...
	[✓] Transcript: /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/complete_pipeline_test.txt
	[✓] Audio:      /Users/saikrishna/Desktop/eccv_audio/eccv_audio/outputs/complete_pipeline_test.mp3


STEP 2: Applying audio effects...
Copying input file to output directory...
	Copied to: outputs/complete_pipeline_test/complete_pipeline_test.mp3

Loading audio file ('outputs/complete_pipeline_test.mp3')...
	Generating pitch increase by 5 semitones...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_pitch_up.mp3

	Generating pitch decrease by -5 semitones...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_pitch_down.mp3

	Generating increased speed by factor of 2.0...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_speed_up.mp3

	Generating reverb (room size: 0.7)...
		Saved: outputs/complete_pipeline_test/complete_pipeline_test_reverb.mp3

Done! All files saved to: ou