In [1]:
import os
import torch
import librosa
import torch.nn as nn
import torch.optim as optim
from audiosr import build_model

# Folders
LOW_FOLDER = "low"
HIGH_FOLDER = "high"

# Device (CPU/GPU)
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load pretrained model
model = build_model(model_name="basic", device=device).to(device)

# Loss + optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Load training pairs
pairs = []
for file in os.listdir(LOW_FOLDER):
    if file.endswith(".wav"):
        low_path = os.path.join(LOW_FOLDER, file)
        high_path = os.path.join(HIGH_FOLDER, file)
        if os.path.exists(high_path):
            # Load audio
            y_low, sr_low = librosa.load(low_path, sr=None)
            y_high, sr_high = librosa.load(high_path, sr=None)

            # Resample to same rate
            target_sr = 12000
            y_low = librosa.resample(y_low, sr_low, target_sr)
            y_high = librosa.resample(y_high, sr_high, target_sr)

            # Convert to tensors
            low_tensor = torch.tensor(y_low, dtype=torch.float32).unsqueeze(0).to(device)
            high_tensor = torch.tensor(y_high, dtype=torch.float32).unsqueeze(0).to(device)

            pairs.append((low_tensor, high_tensor))

print(f"Loaded {len(pairs)} training pairs")

# Fine-tune
for epoch in range(3):  # small number of epochs
    total_loss = 0.0
    for low, high in pairs:
        output = model(low)
        loss = criterion(output, high)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
    print(f"Epoch {epoch+1} | Loss: {total_loss:.4f}")

# Save model
torch.save(model.state_dict(), "fine_tuned_audiosr.pth")
print("✅ Saved fine_tuned_audiosr.pth")


  from pkg_resources import resource_filename


Loading AudioSR: basic
Loading model on cpu


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


DiffusionWrapper has 258.20 M params.


  WeightNorm.apply(module, name, dim)


Loaded 0 training pairs
Epoch 1 | Loss: 0.0000
Epoch 2 | Loss: 0.0000
Epoch 3 | Loss: 0.0000
✅ Saved fine_tuned_audiosr.pth


In [7]:
import torch
import soundfile as sf
from audiosr import build_model, super_resolution

# Load model
device = "cuda" if torch.cuda.is_available() else "cpu"
model = build_model(model_name="basic", device=device)

# Load fine-tuned weights
model.load_state_dict(torch.load("fine_tuned_audiosr.pth", map_location=device))

# Run super-resolution
audios = super_resolution(
    model,
    "song.wav",   # your low-quality 12kHz file
    seed=42,
    guidance_scale=3.5,
    ddim_steps=50,
    latent_t_per_second=12.8
)

# Convert tensor -> numpy
enhanced = audios[0]  # first sample
if isinstance(enhanced, torch.Tensor):
    enhanced = enhanced.cpu().numpy()

# Fix shape (make it [samples, channels] if needed)
if enhanced.ndim == 2 and enhanced.shape[0] < enhanced.shape[1]:
    enhanced = enhanced.T  # transpose to (samples, channels)

# Save at 48kHz
sf.write("enhanced_output.wav", enhanced, 48000)
print("✅ Enhanced audio saved as enhanced_output.wav (48 kHz)")


Loading AudioSR: basic
Loading model on cpu
DiffusionWrapper has 258.20 M params.
Running DDIM Sampling with 50 timesteps


DDIM Sampler: 100%|██████████| 50/50 [15:53<00:00, 19.06s/it] 


✅ Enhanced audio saved as enhanced_output.wav (48 kHz)


In [None]:
import soundfile as sf

# Load the audio file
data, samplerate = sf.read('audio.wav')

print("Sample Rate:", samplerate)
