In [1]:
import os
import json
import random
import numpy as np
import torch
from pathlib import Path

In [2]:
# Fix the seeds of Python's `random`, NumPy and PyTorch from one shared constant.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Last expression of the cell: its return value (a torch Generator) is what the notebook echoes.
torch.manual_seed(SEED)

<torch._C.Generator at 0x2ccfffd4490>

In [3]:
# Base dirs
# NOTE(review): absolute, machine-specific location; adjust BASE_DIR when running elsewhere.
BASE_DIR = Path(r"C:/Users/ADMIN/Downloads/SSP/VoiceProject/processed")
TRAIN_META = BASE_DIR / "train_metadata.csv"
TEST_META  = BASE_DIR / "test_metadata.csv"
NORM_MEL_DIR = BASE_DIR / "normalized"   # mel .npy files
CHECKPOINT_DIR = BASE_DIR / "checkpoints/hifigan"
# Create the checkpoint directory (and any missing parents) up front; a no-op if it already exists.
CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)

# Torch setup
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Mixed precision is only requested when a CUDA device was actually selected;
# previously this was hard-coded to True even on the CPU fallback.
USE_AMP = DEVICE.type == "cuda"
print("Device:", DEVICE)

Device: cuda


In [5]:
# Load the HiFi-GAN JSON config and pull out the audio / mel parameters used by later cells.
# NOTE(review): `json` is already imported in the first cell; this re-import is redundant but harmless.
import json

# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
config_path = r"C:\Users\ADMIN\Downloads\SSP\VoiceProject\hifi-gan\config_v1.json"

# At this point `config` is a plain dict; a later cell rebinds it to an AttrDict.
with open(config_path, "r") as f:
    config = json.load(f)

# Audio / spectrogram parameters read directly from the config file.
SR = config["sampling_rate"]
N_MELS = config["num_mels"]
HOP_LENGTH = config["hop_size"]
WIN_LENGTH = config["win_size"]

# Stats for de-normalization: applied as `mel * MEL_STD + MEL_MEAN` when mels are loaded.
MEL_MEAN = np.load(BASE_DIR / "mel_mean.npy")
MEL_STD  = np.load(BASE_DIR / "mel_std.npy")

### Dataset

In [6]:
import pandas as pd
import torchaudio
from torch.utils.data import Dataset, DataLoader

def find_mel_file(utt_id: str, mel_dir=None):
    """Locate the mel-spectrogram ``.npy`` file for an utterance.

    Two file-naming schemes are tried, in this order:
    ``{utt_id}_mel.npy`` and then ``{utt_id}.npy``.

    Args:
        utt_id: Utterance identifier used to build the candidate file names.
        mel_dir: Directory to search. When omitted (``None``) the notebook-level
            ``NORM_MEL_DIR`` is used, which is the original behaviour.

    Returns:
        The path of the first candidate that exists, or ``None`` if neither does.
    """
    # Resolve the global lazily so callers that pass an explicit directory
    # do not depend on NORM_MEL_DIR at all.
    root = NORM_MEL_DIR if mel_dir is None else Path(mel_dir)
    for file_name in (f"{utt_id}_mel.npy", f"{utt_id}.npy"):
        candidate = root / file_name
        if candidate.exists():
            return candidate
    return None

In [7]:
def load_mel_for_vocoder(utt_id: str):
    """Load an utterance's stored mel and undo its normalization.

    The array found by ``find_mel_file`` is read as float32 and mapped back
    with ``mel * MEL_STD + MEL_MEAN``.

    Raises:
        FileNotFoundError: if no mel file can be located for ``utt_id``.
    """
    mel_path = find_mel_file(utt_id)
    if mel_path is not None:
        normalized = np.load(mel_path).astype(np.float32)
        return normalized * MEL_STD + MEL_MEAN
    raise FileNotFoundError(f"Mel not found for utt {utt_id}")

In [8]:
def load_wav(path: str, sr=SR):
    """Read an audio file and return it as a 1-D float32 NumPy array at rate ``sr``.

    Args:
        path: Audio file to read with ``torchaudio.load``.
        sr: Target sampling rate; the signal is resampled only when the
            file's own rate differs.
    """
    waveform, native_sr = torchaudio.load(path)

    # Average over dim 0 (presumably the channel axis) and keep that axis for now.
    if waveform.ndim > 1:
        waveform = waveform.mean(dim=0, keepdim=True)
    # Drop the leading singleton axis.
    mono = waveform.squeeze(0)

    needs_resample = native_sr != sr
    if needs_resample:
        mono = torchaudio.functional.resample(mono, native_sr, sr)

    samples = mono.numpy()
    return samples.astype(np.float32)


In [9]:
class HiFiGANDataset(Dataset):
    """Paired (mel, waveform, utt_id) examples for vocoder training.

    A metadata row is kept only when BOTH a mel ``.npy`` file for its
    ``utt_id`` exists in ``mel_dir`` and the audio file at its ``path`` exists.

    Args:
        metadata_csv: CSV file with at least the columns ``utt_id`` and ``path``.
        mel_dir: Directory searched for ``{utt_id}_mel.npy`` / ``{utt_id}.npy``.
            Previously this argument was stored but ignored (lookups always went
            through the global ``NORM_MEL_DIR``); it is now honoured.
        sr: Sampling rate handed to ``load_wav`` for every item.
    """

    def __init__(self, metadata_csv, mel_dir, sr=SR):
        self.mel_dir = Path(mel_dir)
        self.sr = sr

        meta = pd.read_csv(metadata_csv)
        rows = []
        mel_paths = []
        for _, r in meta.iterrows():
            mel_path = self._find_mel(str(r['utt_id']))
            if mel_path is not None and Path(r['path']).exists():
                rows.append(r)
                mel_paths.append(mel_path)
        self.df = pd.DataFrame(rows).reset_index(drop=True)
        # Mel path per retained row, index-aligned with self.df, so
        # __getitem__ does not have to probe the filesystem again.
        self.mel_paths = mel_paths
        print(f"HiFiGANDataset: {len(self.df)} examples from {metadata_csv}")

    def _find_mel(self, utt_id):
        """Return the first existing mel file for ``utt_id`` in ``self.mel_dir``, else None."""
        for file_name in (f"{utt_id}_mel.npy", f"{utt_id}.npy"):
            candidate = self.mel_dir / file_name
            if candidate.exists():
                return candidate
        return None

    def __len__(self):
        """Number of retained examples."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(mel, wav, utt)`` for row ``idx``.

        ``mel`` is the de-normalized mel, transposed before being wrapped as a
        float tensor; ``wav`` is the float waveform from ``load_wav``; ``utt``
        is the utterance id as a string.
        """
        r = self.df.iloc[idx]
        utt = str(r['utt_id'])
        # Same de-normalization as load_mel_for_vocoder, applied to the cached path.
        mel_np = np.load(self.mel_paths[idx]).astype(np.float32)
        mel_np = mel_np * MEL_STD + MEL_MEAN
        # Transposed so dim 1 is the axis hifigan_collate pads along
        # (presumably stored as (frames, n_mels) on disk; confirm).
        mel = torch.from_numpy(mel_np.T).float()
        wav = torch.from_numpy(load_wav(r['path'], sr=self.sr)).float()
        return mel, wav, utt

In [10]:
def hifigan_collate(batch):
    """Zero-pad a list of ``(mel, wav, utt)`` items into batch tensors.

    Returns:
        A 5-tuple ``(mel_pad, mel_lens, wav_pad, wav_lens, utts)`` where
        ``mel_pad`` is ``(batch, N_MELS, longest mel)``, ``wav_pad`` is
        ``(batch, longest wav)``, the two ``*_lens`` tensors hold the
        unpadded lengths, and ``utts`` is a list of the utterance ids.
    """
    mels, wavs, utts = zip(*batch)

    # Unpadded lengths: dim 1 of each mel, dim 0 of each waveform.
    mel_lens = [mel.shape[1] for mel in mels]
    wav_lens = [wav.shape[0] for wav in wavs]

    mel_pad = torch.zeros(len(mels), N_MELS, max(mel_lens))
    wav_pad = torch.zeros(len(wavs), max(wav_lens))

    # Copy every item into the leading part of its row; the tail stays zero.
    for row, mel in enumerate(mels):
        mel_pad[row, :, :mel.shape[1]] = mel
    for row, wav in enumerate(wavs):
        wav_pad[row, :wav.shape[0]] = wav

    mel_lens_t = torch.tensor(mel_lens)
    wav_lens_t = torch.tensor(wav_lens)
    return mel_pad, mel_lens_t, wav_pad, wav_lens_t, list(utts)


### Import HiFi-GAN Models from Repo

In [12]:
# Switch the working directory to the HiFi-GAN repo checkout; the next cell imports
# `meldataset` and `models`, presumably resolved from this directory (confirm).
# NOTE(review): this changes the process-wide cwd, so relative paths used after this
# cell resolve against the repo directory. `os` is already imported in the first cell.
import os
os.chdir(r"C:\Users\ADMIN\Downloads\SSP\VoiceProject\hifi-gan")

In [13]:
from meldataset import mel_spectrogram
from models import Generator, MultiPeriodDiscriminator, MultiScaleDiscriminator


### Initialize Models & Optimizers

In [15]:
class AttrDict(dict):
    """A dict whose entries can also be read and written as attributes.

    The instance's ``__dict__`` is pointed at the dict itself, so attribute
    storage and item storage are one and the same mapping.
    """

    def __init__(self, *args, **kwargs):
        # Accepts exactly what the built-in dict constructor accepts.
        super().__init__(*args, **kwargs)
        # Alias attribute storage to the mapping: d.key <-> d["key"].
        self.__dict__ = self

In [16]:
# Re-read the same HiFi-GAN config file, this time wrapped in AttrDict so its keys
# are also available as attributes; the result is what gets passed to Generator(config).
# This rebinds `config`, which an earlier cell loaded as a plain dict.
# NOTE(review): `json` is already imported, and the path literal duplicates `config_path`.
import json

with open(r"C:\Users\ADMIN\Downloads\SSP\VoiceProject\hifi-gan\config_v1.json") as f:
    config = AttrDict(json.load(f))

In [17]:
# Instantiate the generator and both discriminators on the selected device.
gen = Generator(config).to(DEVICE)
mpd = MultiPeriodDiscriminator().to(DEVICE)
msd = MultiScaleDiscriminator().to(DEVICE)

# One optimizer for the generator and one shared by the two discriminators.
opt_g = torch.optim.AdamW(gen.parameters(), lr=2e-4, betas=(0.8, 0.99))
opt_d = torch.optim.AdamW(list(mpd.parameters()) + list(msd.parameters()), lr=2e-4, betas=(0.8, 0.99))

# `torch.cuda.amp.GradScaler(...)` is deprecated in recent PyTorch in favour of the
# device-generic `torch.amp.GradScaler("cuda", ...)`. Fall back to the old entry
# point on PyTorch versions that do not provide the new one yet.
if hasattr(torch.amp, "GradScaler"):
    scaler = torch.amp.GradScaler("cuda", enabled=USE_AMP)
else:
    scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)

print("Generator params (M):", sum(p.numel() for p in gen.parameters())/1e6)

  WeightNorm.apply(module, name, dim)


Generator params (M): 13.93613


  scaler = torch.cuda.amp.GradScaler(enabled=USE_AMP)


### DataLoaders

In [18]:
# Batch size shared by the training and validation loaders.
BATCH_SIZE = 4
# Both datasets are given the same normalized-mel directory; the validation set is
# built from the test metadata file.
train_ds = HiFiGANDataset(TRAIN_META, NORM_MEL_DIR)
val_ds   = HiFiGANDataset(TEST_META, NORM_MEL_DIR)

# Only the training loader shuffles. Both use hifigan_collate to zero-pad items to a common length.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, collate_fn=hifigan_collate)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, collate_fn=hifigan_collate)

HiFiGANDataset: 29102 examples from C:\Users\ADMIN\Downloads\SSP\VoiceProject\processed\train_metadata.csv
HiFiGANDataset: 4134 examples from C:\Users\ADMIN\Downloads\SSP\VoiceProject\processed\test_metadata.csv
