In [2]:
import os
import torch
import torchaudio
from process_sml import (
    AudioDatasetFolder, Compose, RandomTimeCrop, RandomTimeStretch,
    RandomPitchShift, RandomNoise, RandomDistortion, RandomVolume,
    compute_waveform, compute_spectrogram
)
import torch
import torchaudio
import sounddevice as sd


In [None]:

# Directory and file paths used to cache the processed waveform and spectrogram.
root = "pre_saved_tensors"
waveform_cache_path = f"{root}/cached_waveform.pt"
spec_cache_path = f"{root}/cached_spec.pt"

# exist_ok=True keeps this cell re-runnable: a plain os.mkdir raises
# FileExistsError once the directory exists, which would make the cache branch
# below unreachable on every run after the first.
os.makedirs(root, exist_ok=True)

if os.path.exists(waveform_cache_path) and os.path.exists(spec_cache_path):
    # Both cached tensors are present: reuse them instead of decoding the MP3.
    print("Loading from cache...")
    waveform = torch.load(waveform_cache_path)
    spec = torch.load(spec_cache_path)
else:
    print("Processing from raw MP3...")
    # NOTE(review): absolute, machine-specific path; it only resolves on the
    # original author's machine.
    source_mp3_path = r"C:\Users\rifat\Downloads\Music\super-saw-bass-37512.mp3"
    waveform, sample_rate = torchaudio.load(source_mp3_path)

    # Bring the audio to 16 kHz if it was recorded at a different rate.
    if sample_rate != 16000:
        resampler = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)
        waveform = resampler(waveform)
        sample_rate = 16000

    # Take first 5 seconds (slicing along the last axis).
    num_samples = sample_rate * 5
    waveform = waveform[:, :num_samples]

    # Persist the trimmed waveform, then the spectrogram derived from it.
    torch.save(waveform, waveform_cache_path)
    spec = compute_spectrogram(waveform)
    torch.save(spec, spec_cache_path)

# Show shape
print(f"Shape of 5-second noise spectrogram: {spec.abs().shape}")


In [6]:

# Reload the cached spectrogram and keep only its element-wise absolute value.
spec = torch.load("pre_saved_tensors/cached_spec.pt").abs()
print(f"shape of the computed spectogram without megnititude : {spec.shape}")

# Turn the spectrogram back into a waveform using the project helper.
wav = compute_waveform(spec)

# Restrict a waveform to a playable layout: 1D, or 2D with at most two rows.
def prepare_for_playback(wav: torch.Tensor) -> torch.Tensor:
    """Return ``wav`` in a shape suitable for playback.

    Args:
        wav: A 1D tensor, or a 2D tensor whose first axis is treated as channels.

    Returns:
        ``wav`` unchanged when it is 1D or has at most two rows; otherwise only
        its first two rows.

    Raises:
        ValueError: If ``wav`` is neither 1D nor 2D.
    """
    rank = wav.dim()
    if rank not in (1, 2):
        raise ValueError("Unexpected waveform shape")

    # Only 2D input can carry surplus channels; keep the first two of them.
    if rank == 2 and wav.size(0) > 2:
        return wav[:2]
    return wav


# prepare_for_playback is already defined earlier in this cell, so it is not
# redefined here, and the playback steps run once instead of twice.

# Fix shape for playback
wav = prepare_for_playback(wav)

# Convert to numpy; detach() so a tensor that tracks gradients can be converted.
wav_np = wav.detach().cpu().numpy()
if wav_np.ndim == 2:
    wav_np = wav_np.T  # (channels, time) → (time, channels)

# Play and block until playback has finished.
sd.play(wav_np, samplerate=16000)
sd.wait()


shape of the computed spectogram without megnititude : torch.Size([2, 1025, 157])


In [8]:
# torchaudio.save needs a 2D (channels, time) tensor, but prepare_for_playback
# can hand back a 1D signal, so add the channel axis in that case. The tensor is
# also detached and moved to CPU, mirroring the conversion done before playback.
wav_to_save = wav.unsqueeze(0) if wav.dim() == 1 else wav
torchaudio.save("output.wav", wav_to_save.detach().cpu(), sample_rate=16000)


In [None]:
class RandomNoise:
    """Augmentation that adds Gaussian noise to a waveform at a fixed SNR.

    Note: defining this class in the notebook rebinds the ``RandomNoise`` name
    that the first cell imports from ``process_sml``.
    """

    def __init__(self, snr_db: float = 10.0):
        """
        Args:
            snr_db (float): Desired Signal-to-Noise Ratio in dB. Lower = noisier.
        """
        self.snr_db = snr_db

    def __call__(self, waveform: torch.Tensor) -> torch.Tensor:
        """Return ``waveform`` with freshly sampled Gaussian noise mixed in.

        Args:
            waveform: Tensor whose last axis is time; any number of leading
                axes (none, channels, batch x channels, ...) is accepted.

        Returns:
            The result of ``torchaudio.functional.add_noise`` for this waveform.
        """
        # Generate Gaussian noise with the same shape
        noise = torch.randn_like(waveform)

        # add_noise requires one SNR value per leading index, i.e. an snr tensor
        # of shape waveform.shape[:-1]. A fixed shape of (1,) only satisfies
        # that for 2D input and is rejected for 1D or batched 3D waveforms.
        snr = torch.full(
            waveform.shape[:-1], float(self.snr_db), device=waveform.device
        )

        return torchaudio.functional.add_noise(waveform, noise, snr)


In [7]:
import os
import torchaudio
import torchaudio.transforms as T
import torch

# Parameters for the concatenation code below.
# input_dir: directory whose .mp3/.wav entries are loaded and joined.
input_dir = "sample_noise"
# output_file: destination of the concatenated audio.
# NOTE(review): "output.wav" is also the path written by the earlier
# torchaudio.save("output.wav", wav, ...) cell, so whichever cell runs last
# overwrites the other's file — confirm this is intended.
output_file = "output.wav"
# Sample rate every clip is resampled to, and the rate passed when saving.
target_sample_rate = 16000  # Change as needed
# Channel count passed to process_audio for every clip.
target_channels = 1         # Mono output

# Helper function to ensure uniform sample rate and channels
def process_audio(file_path, target_sr, target_channels):
    """Load an audio file and normalise its channel count and sample rate.

    Args:
        file_path: Path handed to ``torchaudio.load``.
        target_sr: Sample rate the returned waveform should have.
        target_channels: Number of channels the returned waveform should have.

    Returns:
        The loaded waveform with ``target_channels`` rows, resampled to
        ``target_sr`` when the file's own rate differs.

    Raises:
        ValueError: If ``target_channels`` is smaller than 1.
    """
    if target_channels < 1:
        raise ValueError(f"target_channels must be >= 1, got {target_channels}")

    waveform, sample_rate = torchaudio.load(file_path)

    # Match the channel count: average everything down to one channel, then
    # duplicate that channel when more than one is requested. Averaging alone
    # always yields a single channel, which is wrong for target_channels > 1.
    if waveform.shape[0] != target_channels:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
        if target_channels > 1:
            waveform = waveform.repeat(target_channels, 1)

    # Resample if needed
    if sample_rate != target_sr:
        resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sr)
        waveform = resampler(waveform)

    return waveform

# Load every audio file in input_dir (in sorted name order) and collect the
# normalised waveforms.
all_waveforms = []

for filename in sorted(os.listdir(input_dir)):
    # Compare the extension case-insensitively so files such as "X.MP3" or
    # "X.WAV" are not silently skipped.
    if not filename.lower().endswith(('.mp3', '.wav')):
        continue
    filepath = os.path.join(input_dir, filename)
    print(f"Processing: {filepath}")
    audio = process_audio(filepath, target_sample_rate, target_channels)
    all_waveforms.append(audio)

# torch.cat rejects an empty list with an unrelated-looking error; fail early
# with a message that names the actual problem.
if not all_waveforms:
    raise FileNotFoundError(f"No .mp3 or .wav files found in {input_dir!r}")

# Concatenate all waveforms along the time axis
concatenated = torch.cat(all_waveforms, dim=1)

# Save to output .wav file
torchaudio.save(output_file, concatenated, sample_rate=target_sample_rate)

print(f"\n✅ All audio files concatenated and saved to {output_file}")


Processing: sample_noise\noise_01.wav
Processing: sample_noise\noise_02.mp3
Processing: sample_noise\noise_03.mp3
Processing: sample_noise\noise_05.mp3
Processing: sample_noise\noise_06.mp3
Processing: sample_noise\noise_08.mp3
Processing: sample_noise\noise_09.mp3
Processing: sample_noise\noise_10.mp3
Processing: sample_noise\noise_11.mp3
Processing: sample_noise\noise_12.mp3
Processing: sample_noise\noise_13.mp3
Processing: sample_noise\noise_14.mp3
Processing: sample_noise\noise_15.mp3
Processing: sample_noise\noise_16.mp3
Processing: sample_noise\noise_17.mp3
Processing: sample_noise\noise_18.mp3
Processing: sample_noise\noise_19.mp3
Processing: sample_noise\noise_20.mp3

✅ All audio files concatenated and saved to output.wav


In [1]:
!pip install pydub

