Drive Configuration

In [None]:
# Setup
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/ForestSounds
!pip install librosa torchaudio matplotlib

Conversion to spectrograms

In [None]:
import librosa
import numpy as np
from pathlib import Path
import os
import matplotlib.pyplot as plt

# Parameters
SAMPLE_RATE = 22050   # Hz; passed to librosa.load so every file is resampled to it
N_FFT = 2048          # FFT window length (samples) for the mel-spectrogram
HOP_LENGTH = 512      # samples between successive spectrogram frames
N_MELS = 256          # number of mel bands per spectrogram
DURATION = 4  # secs/clip

# Paths
driveRawAudioPath = '/content/drive/MyDrive/ForestSounds/raw_audio'
outputFolderPath = '/content/drive/MyDrive/ForestSounds/training_spectrograms'
os.makedirs(outputFolderPath, exist_ok=True)

# File extensions (lower-case) that are treated as audio input.
AUDIO_EXTENSIONS = {".wav", ".mp3", ".flac", ".ogg"}

audioFiles = []

print("Check raw_audio directory...")
path = Path(driveRawAudioPath)

if path.exists():
    print(f"Directory exists: {driveRawAudioPath}")
    # One recursive walk instead of one per extension. The suffix is compared
    # case-insensitively so files such as "REC01.WAV" are not silently skipped,
    # and the result is sorted so the spec_NNNNN numbering is the same on
    # every run over the same folder.
    audioFiles = sorted(
        candidate for candidate in path.rglob("*")
        if candidate.is_file() and candidate.suffix.lower() in AUDIO_EXTENSIONS
    )
    print(f"Found {len(audioFiles)} audio file(s).")
    if len(audioFiles) == 0:
        print("No audio files found.")
    else:
        print(f"Files found: {[f.name for f in audioFiles]}")
else:
    print(f"Directory not found at {driveRawAudioPath}")

# Convert every discovered recording into fixed-length, normalised
# log-mel spectrogram clips saved as .npy files.
if not audioFiles:
    print("\n Stopped. No valid audio files were found to process.")

else:
    print("\n Starting spectrogram creation...")

    clipCount = 0
    for audioFile in audioFiles:
        print(f"Processing: {audioFile.name}...")
        try:
            y, sr = librosa.load(audioFile, sr=SAMPLE_RATE)
            samplesPerClip = SAMPLE_RATE * DURATION
            numClips = len(y) // samplesPerClip
            print(f"  -> Split into {numClips} clips.")

            # Step through the waveform in non-overlapping windows; a trailing
            # remainder shorter than one clip is dropped.
            for startSample in range(0, numClips * samplesPerClip, samplesPerClip):
                clip = y[startSample:startSample + samplesPerClip]

                # Mel-spectrogram in dB relative to the clip's own peak
                melSpec = librosa.feature.melspectrogram(
                    y=clip,
                    sr=sr,
                    n_fft=N_FFT,
                    hop_length=HOP_LENGTH,
                    n_mels=N_MELS,
                )
                logMelSpec = librosa.power_to_db(melSpec, ref=np.max)

                # Map [-80, 0] dB onto [-1, 1]
                logMelSpec = np.clip(logMelSpec, -80, 0) / 80 * 2 + 1

                outputPath = os.path.join(outputFolderPath, f"spec_{clipCount:05d}.npy")
                np.save(outputPath, logMelSpec)
                clipCount += 1

        except Exception as e:
            # Best effort: report the failing file and move on to the next one.
            print(f"Error processing {audioFile.name}: {e}")

    print(f"\n Created {clipCount} total spectrogram clips.")
    print(f"Saved in: {outputFolderPath}")

Build GAN model

In [None]:
import torch.nn as nn
import torch

# Use the GPU when the runtime exposes one; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")

# Random noise into a spectrogram
class Generator(nn.Module):
    """DCGAN-style generator.

    Takes latent noise of shape (N, latentDim, 1, 1) and up-samples it through
    five transposed convolutions (1 -> 4 -> 8 -> 16 -> 32 -> 64 pixels) into a
    single-channel (N, 1, 64, 64) map. The final Tanh keeps values in [-1, 1].
    """

    def __init__(self, latentDim=100):
        super().__init__()

        # Project the latent vector to a 4x4 feature map.
        layers = [
            nn.ConvTranspose2d(latentDim, 512, kernel_size=4, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
        ]

        # Each stage doubles the spatial size and halves the channel count.
        for inChannels, outChannels in ((512, 256), (256, 128), (128, 64)):
            layers += [
                nn.ConvTranspose2d(inChannels, outChannels, kernel_size=4, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(outChannels),
                nn.ReLU(inplace=True),
            ]

        # Final doubling down to one output channel.
        layers += [
            nn.ConvTranspose2d(64, 1, kernel_size=4, stride=2, padding=1, bias=False),
            nn.Tanh(),
        ]

        # Kept as a flat Sequential named `main` so state_dict keys
        # ("main.0.weight", ...) stay the same for saved checkpoints.
        self.main = nn.Sequential(*layers)

    def forward(self, input):
        generated = self.main(input)
        return generated

# Discriminator (real or fake)
class Discriminator(nn.Module):
    """DCGAN-style discriminator.

    Reduces a (N, 1, 64, 64) input through four stride-2 convolutions
    (64 -> 32 -> 16 -> 8 -> 4 pixels) and a final 4x4 convolution to one
    sigmoid score per sample, returned as a flat tensor of shape (N,).
    """

    def __init__(self):
        super().__init__()

        # First stage has no batch norm.
        layers = [
            nn.Conv2d(1, 64, kernel_size=4, stride=2, padding=1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
        ]

        # Each stage halves the spatial size and doubles the channel count.
        for inChannels, outChannels in ((64, 128), (128, 256), (256, 512)):
            layers += [
                nn.Conv2d(inChannels, outChannels, kernel_size=4, stride=2, padding=1, bias=False),
                nn.BatchNorm2d(outChannels),
                nn.LeakyReLU(0.2, inplace=True),
            ]

        # Collapse the 4x4 map to a single probability.
        layers += [
            nn.Conv2d(512, 1, kernel_size=4, stride=1, padding=0, bias=False),
            nn.Sigmoid(),
        ]

        self.main = nn.Sequential(*layers)

    def forward(self, input):
        scores = self.main(input)
        # Flatten (N, 1, 1, 1) to (N,) so it lines up with 1-D label tensors.
        return scores.view(-1)

latentDim = 100
netG = Generator(latentDim).to(device)
netD = Discriminator().to(device)

# Smoke test: push one latent vector through the generator, then hand the
# generator's output to the discriminator, printing the shapes on the way.
testNoise = torch.randn(1, latentDim, 1, 1, device=device)
print("Generator input shape (latent noise):", tuple(testNoise.shape))

testOutput = netG(testNoise)
print(f"Generator output shape: {testOutput.shape}")

dOutput = netD(testOutput)
print(f"Discriminator output shape (should be 1): {dOutput.shape}")
print(f"Discriminator output value (should be between 0-1): {dOutput.item()}")

print("Model build and shape test completed!\n")

# Model summaries
for heading, network in (
    ("Generator Architecture:", netG),
    ("\nDiscriminator Architecture:", netD),
):
    print(heading)
    print(network)

Training Loop

In [None]:
from torch.utils.data import DataLoader, Dataset
import numpy as np
import os
import torch
import torch.nn.functional as F

class SpectrogramDataset(Dataset):
    """Dataset over the `.npy` spectrogram files found directly in a folder.

    Args:
        folderPath: Directory that is listed (non-recursively) for `.npy` files.
        targetSize: (height, width) every spectrogram is resized to.

    Each item is a float tensor of shape (1, *targetSize).
    """

    def __init__(self, folderPath, targetSize=(64, 64)):
        self.folderPath = folderPath
        self.targetSize = targetSize

        self.filePaths = []
        for entry in os.listdir(folderPath):
            if entry.endswith('.npy'):
                self.filePaths.append(os.path.join(folderPath, entry))

    def __len__(self):
        return len(self.filePaths)

    def __getitem__(self, idx):
        spec = np.load(self.filePaths[idx])
        # F.interpolate needs batch and channel axes in front of the two
        # spatial ones (assumes the stored array is 2-D).
        batched = torch.FloatTensor(spec)[None, None]
        resized = F.interpolate(
            batched,
            size=self.targetSize,
            mode='bilinear',
            align_corners=False,
        )
        # Drop the temporary batch axis, keep the channel axis.
        return resized[0]

datasetPath = '/content/drive/MyDrive/ForestSounds/training_spectrograms'
dataset = SpectrogramDataset(datasetPath, targetSize=(64, 64))

# Fail early with an actionable message: with no .npy files the error would
# otherwise come from inside the DataLoader / its iterator and say nothing
# about the missing spectrograms.
if len(dataset) == 0:
    raise ValueError(
        f"No .npy spectrograms found in {datasetPath}. "
        "Run the spectrogram conversion cell first."
    )

dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Pull one batch to confirm the tensors have the shape the networks expect.
firstBatch = next(iter(dataloader))
print(f"Batch shape: {firstBatch.shape}")

print(f"Loaded {len(dataset)} spectrogram samples for training.")

# Clean slate: build fresh, untrained networks for this training run.
latentDim = 100
netG, netD = Generator(latentDim).to(device), Discriminator().to(device)

# Both optimizers use the same Adam settings.
adamSettings = {"lr": 0.0002, "betas": (0.5, 0.999)}
optimizerG = torch.optim.Adam(netG.parameters(), **adamSettings)
optimizerD = torch.optim.Adam(netD.parameters(), **adamSettings)
criterion = nn.BCELoss()

# Loss history, one entry per training iteration.
gLosses, dLosses = [], []

# Fixed latent vectors so the periodic sample grids are comparable across epochs.
fixedNoise = torch.randn(16, latentDim, 1, 1, device=device)

os.makedirs('training_samples', exist_ok=True)

numEpochs = 500
print("Starting training...")

for epoch in range(numEpochs):
    for realData in dataloader:
        # ---- Discriminator update: real batch + generated batch ----
        netD.zero_grad()

        realData = realData.to(device)
        batchSize = realData.size(0)
        realLabels = torch.ones(batchSize, device=device)
        fakeLabels = torch.zeros(batchSize, device=device)

        outputReal = netD(realData)
        lossDReal = criterion(outputReal, realLabels)

        # Fake data
        noise = torch.randn(batchSize, latentDim, 1, 1, device=device)
        fakeData = netG(noise)

        # detach(): the discriminator loss must not back-propagate into G.
        outputFake = netD(fakeData.detach())
        lossDFake = criterion(outputFake, fakeLabels)

        lossD = lossDReal + lossDFake
        lossD.backward()
        optimizerD.step()

        # ---- Generator update: try to make D label the fakes as real ----
        netG.zero_grad()
        output = netD(fakeData)
        lossG = criterion(output, realLabels)
        lossG.backward()
        optimizerG.step()

        gLosses.append(lossG.item())
        dLosses.append(lossD.item())

    # Losses reported here are those of the last batch of the epoch.
    print(f'Epoch [{epoch+1}/{numEpochs}]\tLoss_D: {lossD.item():.4f}\tLoss_G: {lossG.item():.4f}')

    # Every 10 epochs (and after the first), save a sample of generated spectrograms
    if (epoch + 1) % 10 == 0 or epoch == 0:
        with torch.no_grad():
            fakeSamples = netG(fixedNoise).detach().cpu()

        # Plot samples: the 2x4 grid shows the first 8 fixed-noise outputs.
        # zip() stops at the 8 axes, so no index guard (and no reuse of the
        # batch-loop variable) is needed.
        fig, axes = plt.subplots(2, 4, figsize=(10, 5))
        for sample, ax in zip(fakeSamples, axes.flat):
            ax.imshow(sample.squeeze().numpy(), aspect='auto', origin='lower', cmap='viridis')
            ax.axis('off')
        fig.suptitle(f'Generated Spectrograms - Epoch {epoch+1}')
        fig.tight_layout()
        fig.savefig(f'training_samples/epoch_{epoch+1:03d}.png')
        plt.show()
        # Release the figure explicitly so figures do not pile up over the
        # run on backends where show() does not close them.
        plt.close(fig)

        # Save the model checkpoint (generator weights only)
        torch.save(netG.state_dict(), f'generator_epoch_{epoch+1}.pth')
        print(f'Saved sample images and model checkpoint at epoch {epoch+1}')

# Loss curves for both networks, one point per training iteration.
lossFig, lossAx = plt.subplots(figsize=(10, 5))
lossAx.set_title("Generator and Discriminator Loss During Training")
for history, tag in ((gLosses, "G"), (dLosses, "D")):
    lossAx.plot(history, label=tag)
lossAx.set_xlabel("Iterations")
lossAx.set_ylabel("Loss")
lossAx.legend()
lossFig.savefig('training_loss.png')
plt.show()

print("Training finished!")

New Sound Generation

In [None]:
# Inference mode for the trained generator (affects its BatchNorm layers).
netG.eval()


def _toInt16Pcm(samples):
    """Peak-normalise a float waveform and convert it to 16-bit PCM.

    A silent (all-zero) or empty waveform is returned as zeros instead of
    being divided by a zero peak.
    """
    samples = np.nan_to_num(np.asarray(samples, dtype=np.float64))
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples = samples / peak
    return np.int16(samples * 32767)


def generateCustomSound(model, totalDuration=30, clipDuration=4, outputPrefix="ai_forest", latentDim=100):
    """
    Generates a sound of any duration by stitching multiple clips together.

    For every clip a fresh latent vector is sampled and passed through
    `model`. The 64x64 output is resized back to (N_MELS, frames) - undoing
    the down-sampling SpectrogramDataset applies before training - mapped
    from [-1, 1] back to [-80, 0] dB, and inverted to audio with
    librosa's mel_to_audio.

    Args:
        model: Trained Generator. It is used as passed in; put it in eval()
            mode beforehand.
        totalDuration: Total length of the final sound in seconds.
        clipDuration: Length of each individual generated clip in seconds.
            The notebook trains on DURATION-second clips, so other values
            stretch or compress the generated spectrogram in time.
        outputPrefix: Base name for the output files.
        latentDim: Size of the latent vector expected by `model`.

    Returns:
        (longFilename, individualClips): path of the stitched WAV file and
        the list of per-clip WAV paths, in generation order.

    Raises:
        ValueError: If totalDuration or clipDuration is not positive.
    """
    import math
    from scipy.io import wavfile
    import torch.nn.functional as F

    if totalDuration <= 0 or clipDuration <= 0:
        raise ValueError("totalDuration and clipDuration must be positive")

    numClips = math.ceil(totalDuration / clipDuration)
    actualDuration = numClips * clipDuration
    print(f" {numClips} clips ({clipDuration}s each) for a {actualDuration}s total sound")

    samplesPerClip = int(round(SAMPLE_RATE * clipDuration))
    # Ask for enough frames that the inverted audio is at least one clip long
    # (roughly HOP_LENGTH samples per frame); it is cut to size afterwards.
    framesPerClip = 1 + math.ceil(samplesPerClip / HOP_LENGTH)

    # Sample the noise on whatever device the model lives on.
    firstParam = next(model.parameters(), None)
    modelDevice = firstParam.device if firstParam is not None else device

    clipWaveforms = []
    individualClips = []

    for i in range(numClips):
        with torch.no_grad():
            # Generate a new random spectrogram for each clip
            noise = torch.randn(1, latentDim, 1, 1, device=modelDevice)
            fakeSpec = model(noise)
            # Back to the mel/time resolution the spectrograms were created at.
            fakeSpec = F.interpolate(fakeSpec, size=(N_MELS, framesPerClip),
                                     mode='bilinear', align_corners=False)
        fakeSpec = fakeSpec.squeeze(0).squeeze(0).cpu().numpy()

        # Convert the spectrogram back to audio.
        # Inverse of the training normalisation (dB / 80 * 2 + 1): [-1, 1] -> [-80, 0] dB.
        specDb = (np.clip(fakeSpec, -1.0, 1.0) - 1) / 2 * 80
        specPower = librosa.db_to_power(specDb)
        audio = librosa.feature.inverse.mel_to_audio(specPower, sr=SAMPLE_RATE,
                                                    n_fft=N_FFT, hop_length=HOP_LENGTH)

        # Force the clip to exactly clipDuration seconds.
        if len(audio) < samplesPerClip:
            audio = np.pad(audio, (0, samplesPerClip - len(audio)))
        else:
            audio = audio[:samplesPerClip]

        # Save individual clip
        clipFilename = f"{outputPrefix}_clip_{i+1:03d}.wav"
        wavfile.write(clipFilename, SAMPLE_RATE, _toInt16Pcm(audio))
        individualClips.append(clipFilename)

        # Collect now, concatenate once at the end (avoids re-copying the
        # growing array on every iteration).
        clipWaveforms.append(audio)
        print(f"  Generated clip {i+1}/{numClips} -> {clipFilename}")

    # Save long, combined audio file
    allAudio = np.concatenate(clipWaveforms)
    longFilename = f"{outputPrefix}_full_{actualDuration}s.wav"
    wavfile.write(longFilename, SAMPLE_RATE, _toInt16Pcm(allAudio))

    print(f"\n Generation complete!")
    print(f"   Full sound: {longFilename} ({actualDuration}s)")
    print(f"   Individual clips: {numClips} files")

    return longFilename, individualClips

print("\n" + "="*50)
print("Generating long ambient soundscape")
print("="*50)
# 1200 s at 15 s per clip with prefix "ambient" produces
# "ambient_full_1200s.wav", the file name the post-processing cell reads.
full_sound, clips = generateCustomSound(netG, totalDuration=1200, clipDuration=15, outputPrefix="ambient")

print("\n All sounds generated!")

Post-processing

In [None]:
import numpy as np
from scipy.io import wavfile
from scipy import signal
import soundfile as sf
import os
import glob

def postprocessAudioFile(inputFilename, outputFilename=None,
                          fadeDuration=3.0, noiseIntensity=0.01,
                          cutoffFreq=5000):
    """Polish a mono WAV file: fade in/out, add background noise, low-pass, normalise.

    Args:
        inputFilename: Path of the WAV file to read.
        outputFilename: Where to write the result. Defaults to the input
            path with "_polished" inserted before the extension.
        fadeDuration: Length in seconds of the linear fade-in and fade-out.
            The fade is skipped when it is zero or when the file is not
            longer than two fades.
        noiseIntensity: Amplitude of the added random-walk (cumulative-sum)
            noise, relative to full scale.
        cutoffFreq: Cut-off in Hz of the 4th-order Butterworth low-pass. The
            filter is skipped when the cut-off is at or above the Nyquist
            frequency of the file.

    Returns:
        (audioFloat, sampleRate, outputFilename): the processed waveform
        (peak-normalised unless it is silent), its sample rate, and the path
        it was written to.

    Raises:
        ValueError: If the file is empty or has more than one channel.
    """
    if outputFilename is None:
        base, ext = os.path.splitext(inputFilename)
        outputFilename = f"{base}_polished{ext}"

    print(f" Processing: {os.path.basename(inputFilename)}")

    sampleRate, audioData = wavfile.read(inputFilename)
    if audioData.ndim != 1:
        raise ValueError(f"{inputFilename}: only mono audio is supported")
    if audioData.size == 0:
        raise ValueError(f"{inputFilename}: file contains no samples")

    # Scale to [-1, 1) according to the sample format actually stored in the
    # file (int16 still divides by 32768, as before).
    if np.issubdtype(audioData.dtype, np.floating):
        audioFloat = audioData.astype(np.float32)
    elif audioData.dtype == np.uint8:
        # 8-bit WAV data is unsigned with its midpoint at 128.
        audioFloat = (audioData.astype(np.float32) - 128.0) / 128.0
    else:
        fullScale = float(np.iinfo(audioData.dtype).max) + 1.0
        audioFloat = audioData.astype(np.float32) / fullScale

    # Fade in/out. fadeLength must be > 0: with 0, audioFloat[-0:] is the
    # whole array and the multiplication by an empty ramp fails.
    fadeLength = int(sampleRate * fadeDuration)
    if fadeLength > 0 and len(audioFloat) > fadeLength * 2:
        fadeIn = np.linspace(0, 1, fadeLength)
        fadeOut = np.linspace(1, 0, fadeLength)
        audioFloat[:fadeLength] = audioFloat[:fadeLength] * fadeIn
        audioFloat[-fadeLength:] = audioFloat[-fadeLength:] * fadeOut

    # Background ambience
    brownNoise = np.cumsum(np.random.randn(len(audioFloat)))
    noisePeak = np.max(np.abs(brownNoise))
    if noisePeak > 0:
        brownNoise = brownNoise / noisePeak
    audioFloat = audioFloat + (brownNoise * noiseIntensity)

    # Gentle EQ (low-pass filter). butter() rejects cut-offs at or above
    # Nyquist; there is nothing to remove in that case, so skip the filter.
    if cutoffFreq < sampleRate / 2:
        sos = signal.butter(4, cutoffFreq, 'lowpass', fs=sampleRate, output='sos')
        audioFloat = signal.sosfilt(sos, audioFloat)

    # Normalize and save (a silent signal is left as zeros rather than
    # divided by a zero peak).
    peak = np.max(np.abs(audioFloat))
    if peak > 0:
        audioFloat = audioFloat / peak
    sf.write(outputFilename, audioFloat, sampleRate)

    return audioFloat, sampleRate, outputFilename

def postprocessDirectory(filePattern, outputSuffix="_polished", **kwargs):
    """Run postprocessAudioFile over every file matching a glob pattern.

    Files whose name (without extension) already ends with `outputSuffix`
    are skipped, so running this twice with a pattern such as "clip_*.wav"
    does not re-process earlier outputs into "*_polished_polished.wav".
    Matches are handled in sorted order.

    Args:
        filePattern: Glob pattern selecting the input files.
        outputSuffix: Text inserted before the extension of each output file.
        **kwargs: Forwarded unchanged to postprocessAudioFile.

    Returns:
        List of output paths, one per processed file.
    """
    audioFiles = sorted(
        candidate for candidate in glob.glob(filePattern)
        # An empty suffix would match every name, so only filter when set.
        if not (outputSuffix and os.path.splitext(candidate)[0].endswith(outputSuffix))
    )
    print(f"Found {len(audioFiles)} files to process matching '{filePattern}'")

    results = []
    for audioFile in audioFiles:
        base, ext = os.path.splitext(audioFile)
        outputFile = f"{base}{outputSuffix}{ext}"

        # Only the written path is needed here; audio and sample rate are discarded.
        _, _, outputPath = postprocessAudioFile(audioFile, outputFile, **kwargs)
        results.append(outputPath)

    return results

banner = "=" * 50
print("Audio Post-Processing")
print(banner)

# Result holders; only the long file is processed in this cell, the other
# two stay empty.
longResult, clipResults, allResults = [], [], []

print("-" * 30)
longResult = postprocessAudioFile(
    "ambient_full_1200s.wav",
    "ambient_polished.wav",
    fadeDuration=5.0,
    noiseIntensity=0.008,
    cutoffFreq=4000,
)
print(f"Created: {longResult[2]}")

print("\n" + banner)
print("Post-processing completed")
print("Generated files:")

# Gather every output path: the long file first, then any clip/batch results.
allProcessedFiles = [longResult[2]] if (longResult and len(longResult) > 2) else []
allProcessedFiles += clipResults
allProcessedFiles += allResults

if allProcessedFiles:
    # Print the results
    for file in allProcessedFiles:
        print(f"  - {file}")

    # Inline audio player for the first processed file
    print(f"\n Playing preview of 1st processed file: {allProcessedFiles[0]}")
    from IPython.display import Audio
    display(Audio(allProcessedFiles[0]))
else:
    print("  No files were processed.")
    print("\n No files available for preview")