In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Directories

In [1]:
# Directories
VOXCELEB_DIR = "/kaggle/input/voxceleb1train/wav"
MUSAN_DIR = "/kaggle/input/musan-dataset/musan"
RIRS_DIR = "/kaggle/input/rirs-noises/RIRS_NOISES"
OUTPUT_DIR = "/kaggle/working/augmented_voxceleb"
OUTPUT_FILE = "/kaggle/working/pretrain_voxceleb.pt"  # Binary dataset file

# Here’s a PyTorch-based script that will:  

1. Load the **VoxCeleb** dataset.  
2. Apply **random augmentations** using the **MUSAN (music, speech, noise)** and **RIRS** datasets.  
3. Save the **augmented dataset** as a new set of preprocessed audio files for direct training.  

---

### **Setup Requirements**
Ensure you have the necessary Python libraries installed:  

```bash
pip install torchaudio torch numpy librosa soundfile
```

Also, ensure you have:  
- **VoxCeleb dataset** (`.wav` files).  
- **MUSAN dataset** (music, speech, noise).  
- **RIRS_NOISES dataset** (impulse responses).  

---

### **Code Implementation**
```python
import os
import torch
import torchaudio
import random
import numpy as np
import soundfile as sf
import librosa

# Directories
VOXCELEB_DIR = "/path/to/voxceleb"
MUSAN_DIR = "/path/to/musan"
RIRS_DIR = "/path/to/rirs_noises"
OUTPUT_DIR = "/path/to/augmented_voxceleb"

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load list of available augmentation files
def get_files(directory, ext=".wav"):
    return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(ext)]

# Load augmentation files
musan_music_files = get_files(os.path.join(MUSAN_DIR, "music"))
musan_speech_files = get_files(os.path.join(MUSAN_DIR, "speech"))
musan_noise_files = get_files(os.path.join(MUSAN_DIR, "noise"))
rirs_files = get_files(os.path.join(RIRS_DIR, "simulated_rirs"))

# Load an audio file
def load_audio(file, target_sr=16000):
    waveform, sr = torchaudio.load(file)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
    return waveform

# Apply augmentation
def apply_augmentation(wav, aug_file):
    aug_wav = load_audio(aug_file)
    min_len = min(wav.shape[1], aug_wav.shape[1])
    aug_wav = aug_wav[:, :min_len]  # Trim augmentation to match main audio length
    wav = wav[:, :min_len]
    
    # Apply mixing with random scaling
    mix_ratio = random.uniform(0.1, 0.5)
    return (1 - mix_ratio) * wav + mix_ratio * aug_wav

# Process all VoxCeleb files
voxceleb_files = get_files(VOXCELEB_DIR)
for idx, file in enumerate(voxceleb_files):
    try:
        wav = load_audio(file)

        # Randomly choose 2 augmentations
        aug_choices = random.sample([
            random.choice(musan_music_files),
            random.choice(musan_speech_files),
            random.choice(musan_noise_files),
            random.choice(rirs_files)
        ], k=2)

        # Apply augmentations
        for aug_file in aug_choices:
            wav = apply_augmentation(wav, aug_file)

        # Save the augmented audio
        output_file = os.path.join(OUTPUT_DIR, os.path.basename(file))
        sf.write(output_file, wav.numpy().squeeze(), 16000)

        print(f"[{idx+1}/{len(voxceleb_files)}] Processed: {file} -> {output_file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

print("Augmentation Complete. Augmented files saved in:", OUTPUT_DIR)
```

---

### **How It Works**
1. **Loads** VoxCeleb audio files.  
2. **Randomly selects** 2 augmentation sources from **MUSAN (music, speech, noise)** and **RIRS_NOISES (reverberations)**.  
3. **Applies augmentation** by mixing the original waveform with the selected augmentations.  
4. **Saves** the augmented dataset for future training.  

---

### **Next Steps**
- Use the saved files directly for training without needing MUSAN and RIRS datasets.  
- Modify `mix_ratio` to control augmentation strength.  
- Add more augmentation strategies like **time-stretching, pitch shifting, and background noise variations** for robustness.  

Let me know if you need modifications!

## Code Implementation

In [None]:
import os
import torch
import torchaudio
import random
import numpy as np
import soundfile as sf
import librosa

# Directories
VOXCELEB_DIR = "/kaggle/input/voxceleb1train/wav"
MUSAN_DIR = "/kaggle/input/musan-dataset/musan"
RIRS_DIR = "/kaggle/input/rirs-noises/RIRS_NOISES"
OUTPUT_DIR = "/kaggle/working/augmented_voxceleb"

# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Load list of available augmentation files
def get_files(directory, ext=".wav"):
    return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(ext)]

# Load augmentation files
musan_music_files = get_files(os.path.join(MUSAN_DIR, "music"))
musan_speech_files = get_files(os.path.join(MUSAN_DIR, "speech"))
musan_noise_files = get_files(os.path.join(MUSAN_DIR, "noise"))
rirs_files = get_files(os.path.join(RIRS_DIR, "simulated_rirs"))

# Load an audio file
def load_audio(file, target_sr=16000):
    waveform, sr = torchaudio.load(file)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
    return waveform

# Apply augmentation
def apply_augmentation(wav, aug_file):
    aug_wav = load_audio(aug_file)
    min_len = min(wav.shape[1], aug_wav.shape[1])
    aug_wav = aug_wav[:, :min_len]  # Trim augmentation to match main audio length
    wav = wav[:, :min_len]
    
    # Apply mixing with random scaling
    mix_ratio = random.uniform(0.1, 0.5)
    return (1 - mix_ratio) * wav + mix_ratio * aug_wav

# Process all VoxCeleb files
voxceleb_files = get_files(VOXCELEB_DIR)
for idx, file in enumerate(voxceleb_files):
    try:
        wav = load_audio(file)

        # Randomly choose 2 augmentations
        aug_choices = random.sample([
            random.choice(musan_music_files),
            random.choice(musan_speech_files),
            random.choice(musan_noise_files),
            random.choice(rirs_files)
        ], k=2)

        # Apply augmentations
        for aug_file in aug_choices:
            wav = apply_augmentation(wav, aug_file)

        # Save the augmented audio
        output_file = os.path.join(OUTPUT_DIR, os.path.basename(file))
        sf.write(output_file, wav.numpy().squeeze(), 16000)

        print(f"[{idx+1}/{len(voxceleb_files)}] Processed: {file} -> {output_file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

print("Augmentation Complete. Augmented files saved in:", OUTPUT_DIR)


# Got it! Instead of saving preprocessed audio files, I'll store the augmented dataset as a **binary file** (PyTorch `.pt` format), which you can directly load for training.  

This will:  
- Load **VoxCeleb** files.  
- Apply **random augmentations** using **MUSAN (music, speech, noise)** and **RIRS**.  
- Store the dataset as a **binary tensor file** (`.pt`) for direct use in training.  

---

### **Code Implementation**
```python
import os
import torch
import torchaudio
import random
import numpy as np

# Directories
VOXCELEB_DIR = "/path/to/voxceleb"
MUSAN_DIR = "/path/to/musan"
RIRS_DIR = "/path/to/rirs_noises"
OUTPUT_FILE = "/path/to/pretrain_voxceleb.pt"  # Binary dataset file

# Load augmentation files
def get_files(directory, ext=".wav"):
    return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(ext)]

musan_music_files = get_files(os.path.join(MUSAN_DIR, "music"))
musan_speech_files = get_files(os.path.join(MUSAN_DIR, "speech"))
musan_noise_files = get_files(os.path.join(MUSAN_DIR, "noise"))
rirs_files = get_files(os.path.join(RIRS_DIR, "simulated_rirs"))

# Load an audio file
def load_audio(file, target_sr=16000):
    waveform, sr = torchaudio.load(file)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
    return waveform

# Apply augmentation
def apply_augmentation(wav, aug_file):
    aug_wav = load_audio(aug_file)
    min_len = min(wav.shape[1], aug_wav.shape[1])
    aug_wav = aug_wav[:, :min_len]  # Trim augmentation to match main audio length
    wav = wav[:, :min_len]
    
    # Apply mixing with random scaling
    mix_ratio = random.uniform(0.1, 0.5)
    return (1 - mix_ratio) * wav + mix_ratio * aug_wav

# Process all VoxCeleb files and store in memory
dataset = []
voxceleb_files = get_files(VOXCELEB_DIR)

for idx, file in enumerate(voxceleb_files):
    try:
        wav = load_audio(file)

        # Randomly choose 2 augmentations
        aug_choices = random.sample([
            random.choice(musan_music_files),
            random.choice(musan_speech_files),
            random.choice(musan_noise_files),
            random.choice(rirs_files)
        ], k=2)

        # Apply augmentations
        for aug_file in aug_choices:
            wav = apply_augmentation(wav, aug_file)

        # Store as tensor with filename metadata
        dataset.append((wav, file))

        print(f"[{idx+1}/{len(voxceleb_files)}] Processed: {file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

# Save as binary file
torch.save(dataset, OUTPUT_FILE)
print("Dataset saved at:", OUTPUT_FILE)
```

---

### **How to Use the Saved Dataset**
```python
# Load preprocessed dataset
dataset = torch.load("/path/to/pretrain_voxceleb.pt")

# Access first audio sample
audio_tensor, filename = dataset[0]
print("Loaded Audio Shape:", audio_tensor.shape)
print("Original Filename:", filename)
```

---

### **Advantages**
✅ **Fast Training** – No need to reprocess audio each time.  
✅ **Efficient Storage** – Saves all data in a compact binary format.  
✅ **Metadata Retention** – Keeps original filenames for tracking.  

Let me know if you need further refinements!

## Code Implementation

In [3]:
import os
import torch
import torchaudio
import random
import numpy as np

# Directories
# VOXCELEB_DIR = "/path/to/voxceleb"
# MUSAN_DIR = "/path/to/musan"
# RIRS_DIR = "/path/to/rirs_noises"
# OUTPUT_FILE = "/kaggle/working/pretrain_voxceleb.pt"  # Binary dataset file

# Load augmentation files
def get_files(directory, ext=".wav"):
    return [os.path.join(directory, f) for f in os.listdir(directory) if f.endswith(ext)]

musan_music_files = get_files(os.path.join(MUSAN_DIR, "music"))
musan_speech_files = get_files(os.path.join(MUSAN_DIR, "speech"))
musan_noise_files = get_files(os.path.join(MUSAN_DIR, "noise"))
rirs_files = get_files(os.path.join(RIRS_DIR, "simulated_rirs"))

# Load an audio file
def load_audio(file, target_sr=16000):
    waveform, sr = torchaudio.load(file)
    if sr != target_sr:
        waveform = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sr)(waveform)
    return waveform

# Apply augmentation
def apply_augmentation(wav, aug_file):
    aug_wav = load_audio(aug_file)
    min_len = min(wav.shape[1], aug_wav.shape[1])
    aug_wav = aug_wav[:, :min_len]  # Trim augmentation to match main audio length
    wav = wav[:, :min_len]
    
    # Apply mixing with random scaling
    mix_ratio = random.uniform(0.1, 0.5)
    return (1 - mix_ratio) * wav + mix_ratio * aug_wav

# Process all VoxCeleb files and store in memory
dataset = []
voxceleb_files = get_files(VOXCELEB_DIR)

for idx, file in enumerate(voxceleb_files):
    try:
        wav = load_audio(file)

        # Randomly choose 2 augmentations
        aug_choices = random.sample([
            random.choice(musan_music_files),
            random.choice(musan_speech_files),
            random.choice(musan_noise_files),
            random.choice(rirs_files)
        ], k=2)

        # Apply augmentations
        for aug_file in aug_choices:
            wav = apply_augmentation(wav, aug_file)

        # Store as tensor with filename metadata
        dataset.append((wav, file))

        print(f"[{idx+1}/{len(voxceleb_files)}] Processed: {file}")

    except Exception as e:
        print(f"Error processing {file}: {e}")

# Save as binary file
torch.save(dataset, OUTPUT_FILE)
print("Dataset saved at:", OUTPUT_FILE)

Dataset saved at: /kaggle/working/pretrain_voxceleb.pt


## How to Use the Saved Dataset

In [None]:
# Load preprocessed dataset
dataset = torch.load("/kaggle/working/pretrain_voxceleb.pt")

# Access first audio sample
audio_tensor, filename = dataset[0]
print("Loaded Audio Shape:", audio_tensor.shape)
print("Original Filename:", filename)

# Now, I'll provide a **two-step process**:  

1. **Preprocessing & Feature Extraction**  
   - Load the augmented dataset from the binary `.pt` file.  
   - Extract necessary features (**MFCC, Mel spectrogram, Spectral Contrast, etc.**).  
   - Save extracted features as a **binary tensor file (`.pt`)** for training.  

2. **Loading & Using Preprocessed Features for Training**  
   - Load the preprocessed features.  
   - Use them directly in training.  

---

## **Step 1: Preprocessing & Feature Extraction**
```python
import torch
import torchaudio
import torchaudio.transforms as T
import os

# Paths
AUGMENTED_DATASET_FILE = "/path/to/pretrain_voxceleb.pt"  # Augmented dataset
FEATURES_OUTPUT_FILE = "/path/to/voxceleb_features.pt"  # Extracted features

# Define feature extraction
def extract_features(wav, sample_rate=16000):
    """Extracts various audio features from waveform."""
    # Convert to mono
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)

    # MFCC (Mel-Frequency Cepstral Coefficients)
    mfcc_transform = T.MFCC(sample_rate=sample_rate, n_mfcc=40)
    mfcc = mfcc_transform(wav)

    # Mel Spectrogram
    mel_spec_transform = T.MelSpectrogram(sample_rate=sample_rate, n_mels=64)
    mel_spec = mel_spec_transform(wav)

    # Spectral Contrast
    spectral_contrast = torchaudio.functional.spectral_centroid(wav, sample_rate)

    # Normalize all features
    mfcc = (mfcc - mfcc.mean()) / (mfcc.std() + 1e-6)
    mel_spec = (mel_spec - mel_spec.mean()) / (mel_spec.std() + 1e-6)
    spectral_contrast = (spectral_contrast - spectral_contrast.mean()) / (spectral_contrast.std() + 1e-6)

    return mfcc, mel_spec, spectral_contrast

# Load augmented dataset
dataset = torch.load(AUGMENTED_DATASET_FILE)
processed_features = []

for idx, (wav, filename) in enumerate(dataset):
    try:
        # Extract features
        mfcc, mel_spec, spectral_contrast = extract_features(wav)

        # Store as tuple (features, label/filename)
        processed_features.append((mfcc, mel_spec, spectral_contrast, filename))

        print(f"[{idx+1}/{len(dataset)}] Processed Features for: {filename}")

    except Exception as e:
        print(f"Error processing {filename}: {e}")

# Save extracted features
torch.save(processed_features, FEATURES_OUTPUT_FILE)
print("Feature extraction complete. Features saved at:", FEATURES_OUTPUT_FILE)
```

---

## **Step 2: Loading & Using Features for Training**
```python
# Load extracted features
features_dataset = torch.load("/path/to/voxceleb_features.pt")

# Example usage
for mfcc, mel_spec, spectral_contrast, filename in features_dataset:
    print(f"File: {filename}")
    print("MFCC Shape:", mfcc.shape)
    print("Mel Spectrogram Shape:", mel_spec.shape)
    print("Spectral Contrast Shape:", spectral_contrast.shape)
    break  # Print first sample
```

---

### **Advantages of This Approach**
✅ **Faster Training** – No need to re-extract features each time.  
✅ **Compact Storage** – Saves all extracted features efficiently.  
✅ **Direct Training Ready** – Just load and pass the features into your model.  

Let me know if you need more modifications!

## Step 1: Preprocessing & Feature Extraction

In [None]:
import torch
import torchaudio
import torchaudio.transforms as T
import os

# Paths
AUGMENTED_DATASET_FILE = "/kaggle/working/pretrain_voxceleb.pt"  # Augmented dataset
FEATURES_OUTPUT_FILE = "/kaggle/working/voxceleb_features.pt"  # Extracted features

# Define feature extraction
def extract_features(wav, sample_rate=16000):
    """Extracts various audio features from waveform."""
    # Convert to mono
    if wav.shape[0] > 1:
        wav = torch.mean(wav, dim=0, keepdim=True)

    # MFCC (Mel-Frequency Cepstral Coefficients)
    mfcc_transform = T.MFCC(sample_rate=sample_rate, n_mfcc=40)
    mfcc = mfcc_transform(wav)

    # Mel Spectrogram
    mel_spec_transform = T.MelSpectrogram(sample_rate=sample_rate, n_mels=64)
    mel_spec = mel_spec_transform(wav)

    # Spectral Contrast
    spectral_contrast = torchaudio.functional.spectral_centroid(wav, sample_rate)

    # Normalize all features
    mfcc = (mfcc - mfcc.mean()) / (mfcc.std() + 1e-6)
    mel_spec = (mel_spec - mel_spec.mean()) / (mel_spec.std() + 1e-6)
    spectral_contrast = (spectral_contrast - spectral_contrast.mean()) / (spectral_contrast.std() + 1e-6)

    return mfcc, mel_spec, spectral_contrast

# Load augmented dataset
dataset = torch.load(AUGMENTED_DATASET_FILE)
processed_features = []

for idx, (wav, filename) in enumerate(dataset):
    try:
        # Extract features
        mfcc, mel_spec, spectral_contrast = extract_features(wav)

        # Store as tuple (features, label/filename)
        processed_features.append((mfcc, mel_spec, spectral_contrast, filename))

        print(f"[{idx+1}/{len(dataset)}] Processed Features for: {filename}")

    except Exception as e:
        print(f"Error processing {filename}: {e}")

# Save extracted features
torch.save(processed_features, FEATURES_OUTPUT_FILE)
print("Feature extraction complete. Features saved at:", FEATURES_OUTPUT_FILE)


## Step 2: Loading & Using Features for Training

In [None]:
# Load extracted features
features_dataset = torch.load("/kaggle/working/voxceleb_features.pt")

# Example usage
for mfcc, mel_spec, spectral_contrast, filename in features_dataset:
    print(f"File: {filename}")
    print("MFCC Shape:", mfcc.shape)
    print("Mel Spectrogram Shape:", mel_spec.shape)
    print("Spectral Contrast Shape:", spectral_contrast.shape)
    break  # Print first sample
