<a href="https://www.kaggle.com/code/nishchalamukku/pitch-estimation?scriptVersionId=248975508" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import torch
from torch import nn 
from torch.nn import functional as F
from torch.autograd import Variable
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor
import re
from torch.nn.utils.rnn import pad_sequence
import matplotlib.pyplot as plt
import numpy as np
from librosa.filters import mel
from librosa.util import pad_center
from scipy.signal import get_window
import os

In [26]:
PAD_IDX=0
BATCH_SIZE = 16
N_CLASS=360

In [3]:
TRAINING_DEVICE = ("cuda" if torch.cuda.is_available() else "cpu")
#TRAINING_DEVICE = "cpu"
print(f"Using {TRAINING_DEVICE} device")

Using cuda device


In [5]:
# data_path = "/speech/nishanth/clean_research/ptdb_full_data/mudit_full_final_data_1.008"
data_path="/kaggle/input/pitch-data/data_1.008"

In [6]:
train_X = os.listdir(f"{data_path}/train/gd_1.008")
train_X = [os.path.join(f"{data_path}/train/gd_1.008", x) for x in train_X]
train_y = os.listdir(f"{data_path}/train/labels")
train_y = [os.path.join(f"{data_path}/train/labels", y) for y in train_y]

valid_X = os.listdir(f"{data_path}/valid/gd_1.008")
valid_X = [os.path.join(f"{data_path}/valid/gd_1.008", x) for x in valid_X]
valid_y = os.listdir(f"{data_path}/valid/labels")
valid_y = [os.path.join(f"{data_path}/valid/labels", y) for y in valid_y]

In [7]:
train_data = list(zip(train_X, train_y))
valid_data = list(zip(valid_X, valid_y))

In [8]:
class ModelCheckpointStore:
    def __init__(self, dump_dir):
        self.dump_dir = dump_dir
        self.best_param_epoch = None
        self.last_save_step = 0

    def __call__(self, model, training_metrics, metric, current_step):
        param = "acc" if metric.endswith("acc") else "loss"
        model_save_dir = os.path.join(self.dump_dir, "checkpoints")
        if not os.path.exists(model_save_dir):
            os.makedirs(model_save_dir)

        new_valid = os.path.join(model_save_dir, f"step_{current_step}_{param}_latest.pth")
        old_valid = os.path.join(model_save_dir, f"step_{self.last_save_step}_{param}_latest.pth")
        torch.save(model, new_valid)
        self.last_save_step = current_step

        if len(training_metrics[metric]) == 1:
            best_checkpoint = os.path.join(model_save_dir, f"step_{current_step}_{param}_best.pth")
            torch.save(model.state_dict(), best_checkpoint)
            self.best_param_epoch = current_step
        else:
            is_best = False
            if param == "acc" and training_metrics[metric][-1] > training_metrics[metric][-2]:
                is_best = True
            elif param == "loss" and training_metrics[metric][-1] < training_metrics[metric][-2]:
                is_best = True

            if is_best:
                best_checkpoint = os.path.join(model_save_dir, f"step_{current_step}_{param}_best.pth")
                torch.save(model, best_checkpoint)
                old_best = os.path.join(model_save_dir, f"step_{self.best_param_epoch}_{param}_best.pth")
                if os.path.exists(old_best):
                    os.remove(old_best)
                self.best_param_epoch = current_step

In [9]:
def display(step, training_metrics):
    to_print = f"Epoch : {step}\n" + "\n".join([f"{key} : {value[-1]}" for key, value in training_metrics.items()])        
    print(to_print)

In [10]:
def plot(dump_dir, model_name, **kwargs):
    plot_save_path = os.path.join(dump_dir, f"{model_name}_plots")
    if not os.path.exists(plot_save_path):
        os.makedirs(plot_save_path)
    acc_plot = plt.figure(figsize=(15, 8))
    loss_plot = plt.figure(figsize=(15, 8))

    for key, value in kwargs.items():
        if key.endswith('acc'):
            plt.figure(acc_plot.number)
            plt.plot(value, label=key)
            plt.xlabel('steps')
            plt.ylabel('Accuracy')
            plt.legend()
            plt.title('Accuracy')

        elif key.endswith('loss'):
            plt.figure(loss_plot.number)
            plt.plot(value, label=key)
            plt.xlabel('steps')
            plt.ylabel('Loss')
            plt.legend()
            plt.title('Loss')

    plt.figure(acc_plot.number)
    plt.savefig(os.path.join(plot_save_path, 'acc.png'))
    plt.close(acc_plot)

    plt.figure(loss_plot.number)
    plt.savefig(os.path.join(plot_save_path, 'loss.png'))
    plt.close(loss_plot)

In [11]:
def hz_to_bin(f):
    mask = np.where(f==0.0)
    
    cent = 1200 * np.log2((f / 10) + 1e-9)
    cent -= (1997.37 - 20)
    cent[mask] = 0.0
    bin_ = np.floor(cent / 20)

    return np.minimum(bin_, 300)

In [35]:
def collate_fn(batch, device=TRAINING_DEVICE):
    feats, labels = [], []
    for gd_path, lab_path in batch:
        gd   = np.load(gd_path)         # (1, T, 512)
        if gd.ndim == 3 and gd.shape[0] == 1 and gd.shape[2] == 512:
            # feats.append(torch.tensor(gd.squeeze(0), dtype=torch.float32))   # (T,512)
            # labels.append(torch.tensor(hz_to_bin(np.load(lab_path)), dtype=torch.long))
            feats.append(torch.tensor(gd.squeeze(0), dtype=torch.float32))   # (T,512)
            labels.append(torch.tensor(hz_to_bin(np.load(lab_path)), dtype=torch.long))

    if len(feats) == 0:
        raise RuntimeError("All items in batch were invalid.")

    feats  = pad_sequence(feats,  padding_value=PAD_IDX, batch_first=False)  # (Tmax,B,512)
    labels = pad_sequence(labels, padding_value=PAD_IDX, batch_first=False)  # (Tmax,B)

    feats  = feats.permute(1,0,2).unsqueeze(1)   # (B,1,T,512)
    labels = labels.permute(1,0)                                 # (B,Tmax)

    return feats.to(device), labels.to(device)

In [36]:
train_loader = DataLoader(train_data, batch_size=BATCH_SIZE,shuffle=True,  collate_fn=collate_fn)
valid_loader = DataLoader(valid_data, batch_size=BATCH_SIZE,shuffle=False, collate_fn=collate_fn)

In [37]:
for x, y in train_loader:
    print("Input shape:", x.shape)   # e.g., (B, 3, T, 512)
    print("Label shape:", y.shape)   # e.g., (B, T)
    break  # Just inspect the first batch

Input shape: torch.Size([16, 1, 886, 512])
Label shape: torch.Size([16, 886])


Model architecture

In [15]:
class ConvBlockRes(nn.Module):
    def __init__(self, in_channels, out_channels, momentum=0.01):
        super(ConvBlockRes, self).__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(in_channels=in_channels,
                      out_channels=out_channels,
                      kernel_size=(3, 3),
                      stride=(1, 1),
                      padding=(1, 1),
                      bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),

            nn.Conv2d(in_channels=out_channels,
                      out_channels=out_channels,
                      kernel_size=(3, 3),
                      stride=(1, 1),
                      padding=(1, 1),
                      bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        if in_channels != out_channels:
            self.shortcut = nn.Conv2d(in_channels, out_channels, (1, 1))
            self.is_shortcut = True
        else:
            self.is_shortcut = False

    def forward(self, x):
        if self.is_shortcut:
            return self.conv(x) + self.shortcut(x)
        else:
            return self.conv(x) + x

In [16]:
class ResEncoderBlock(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, n_blocks=1, momentum=0.01):
        super(ResEncoderBlock, self).__init__()
        self.n_blocks = n_blocks
        self.conv = nn.ModuleList()
        self.conv.append(ConvBlockRes(in_channels, out_channels, momentum))
        for i in range(n_blocks - 1):
            self.conv.append(ConvBlockRes(out_channels, out_channels, momentum))
        self.kernel_size = kernel_size
        if self.kernel_size is not None:
            self.pool = nn.AvgPool2d(kernel_size=kernel_size)

    def forward(self, x):
        for i in range(self.n_blocks):
            x = self.conv[i](x)
        if self.kernel_size is not None:
            return x, self.pool(x)
        else:
            return x

In [17]:
class ResDecoderBlock(nn.Module):
    def __init__(self, in_channels, out_channels, stride, n_blocks=1, momentum=0.01):
        super(ResDecoderBlock, self).__init__()
        out_padding = (0, 1) if stride == (1, 2) else (1, 1)
        self.n_blocks = n_blocks
        self.conv1 = nn.Sequential(
            nn.ConvTranspose2d(in_channels=in_channels,
                               out_channels=out_channels,
                               kernel_size=(3, 3),
                               stride=stride,
                               padding=(1, 1),
                               output_padding=out_padding,
                               bias=False),
            nn.BatchNorm2d(out_channels, momentum=momentum),
            nn.ReLU(),
        )
        self.conv2 = nn.ModuleList()
        self.conv2.append(ConvBlockRes(out_channels * 2, out_channels, momentum))
        for i in range(n_blocks-1):
            self.conv2.append(ConvBlockRes(out_channels, out_channels, momentum))

    def forward(self, x, concat_tensor):
        x = self.conv1(x)
        if x.shape[2:] != concat_tensor.shape[2:]:
            min_t = min(x.size(2), concat_tensor.size(2))
            min_f = min(x.size(3), concat_tensor.size(3))
            x = x[:, :, :min_t, :min_f]
            concat_tensor = concat_tensor[:, :, :min_t, :min_f]
        x = torch.cat((x, concat_tensor), dim=1)
        for i in range(self.n_blocks):
            x = self.conv2[i](x)
        return x

In [18]:
## ENCODER, DECODER, TIMBRE_FILTER AND INTERMEDIATE

class Encoder(nn.Module):
    def __init__(self, in_channels, in_size, n_encoders, kernel_size, n_blocks, out_channels=16, momentum=0.01):
        super(Encoder, self).__init__()
        self.n_encoders = n_encoders
        self.bn = nn.BatchNorm2d(in_channels, momentum=momentum)
        self.layers = nn.ModuleList()
        self.latent_channels = []
        for i in range(self.n_encoders):
            self.layers.append(ResEncoderBlock(in_channels, out_channels, kernel_size, n_blocks, momentum=momentum))
            self.latent_channels.append([out_channels, in_size])
            in_channels = out_channels
            out_channels *= 2
            in_size //= 2
        self.out_size = in_size
        self.out_channel = out_channels

    def forward(self, x):
        concat_tensors = []
        x = self.bn(x)
        for i in range(self.n_encoders):
            _, x = self.layers[i](x)
            concat_tensors.append(_)
        return x, concat_tensors

class Intermediate(nn.Module):
    def __init__(self, in_channels, out_channels, n_inters, n_blocks, momentum=0.01):
        super(Intermediate, self).__init__()
        self.n_inters = n_inters
        self.layers = nn.ModuleList()
        self.layers.append(ResEncoderBlock(in_channels, out_channels, None, n_blocks, momentum))
        for i in range(self.n_inters-1):
            self.layers.append(ResEncoderBlock(out_channels, out_channels, None, n_blocks, momentum))

    def forward(self, x):
        for i in range(self.n_inters):
            x = self.layers[i](x)
        return x

class Decoder(nn.Module):
    def __init__(self, in_channels, n_decoders, stride, n_blocks, momentum=0.01):
        super(Decoder, self).__init__()
        self.layers = nn.ModuleList()
        self.n_decoders = n_decoders
        for i in range(self.n_decoders):
            out_channels = in_channels // 2
            self.layers.append(ResDecoderBlock(in_channels, out_channels, stride, n_blocks, momentum))
            in_channels = out_channels

    def forward(self, x, concat_tensors):
        for i in range(self.n_decoders):
            x = self.layers[i](x, concat_tensors[-1-i])
        return x

class TimbreFilter(nn.Module):
    def __init__(self, latent_rep_channels):
        super(TimbreFilter, self).__init__()
        self.layers = nn.ModuleList()
        for latent_rep in latent_rep_channels:
            self.layers.append(ConvBlockRes(latent_rep[0], latent_rep[0]))

    def forward(self, x_tensors):
        out_tensors = []
        for i, layer in enumerate(self.layers):
            out_tensors.append(layer(x_tensors[i]))
        return out_tensors

In [19]:
class DeepUnet(nn.Module):
    def __init__(self, kernel_size, n_blocks, en_de_layers=5, inter_layers=4, in_channels=1, en_out_channels=16):
        super(DeepUnet, self).__init__()
        self.encoder = Encoder(in_channels, 512, en_de_layers, kernel_size, n_blocks, en_out_channels)
        self.intermediate = Intermediate(self.encoder.out_channel // 2, self.encoder.out_channel, inter_layers, n_blocks)
        self.tf = TimbreFilter(self.encoder.latent_channels)
        self.decoder = Decoder(self.encoder.out_channel, en_de_layers, kernel_size, n_blocks)

    def forward(self, x):
        x, concat_tensors = self.encoder(x)
        x = self.intermediate(x)
        concat_tensors = self.tf(concat_tensors)
        x = self.decoder(x, concat_tensors)
        return x

class BiGRU(nn.Module):
    def __init__(self, input_features, hidden_features, num_layers):
        super(BiGRU, self).__init__()
        self.gru = nn.GRU(input_features, hidden_features, num_layers=num_layers, batch_first=True, bidirectional=True)

    def forward(self, x):
        self.gru.flatten_parameters()
        return self.gru(x)[0]

In [20]:
## E2E Class

class E2E(nn.Module):
    def __init__(self,
                 n_blocks, n_gru, kernel_size,
                 en_de_layers=5, inter_layers=4,
                 in_channels=1,          # 1 → RRCGD channels
                 en_out_channels=16):
        super().__init__()
        self.unet = DeepUnet(kernel_size, n_blocks,
                             en_de_layers, inter_layers,
                             in_channels, en_out_channels)

        self.cnn = nn.Conv2d(en_out_channels, 1,kernel_size=(3, 3), padding=(1, 1))

        feat_dim = 512                  # 512 RRCGD features
        if n_gru:
            self.fc = nn.Sequential(
                BiGRU(feat_dim, 256, n_gru),
                nn.Linear(512, N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid()
            )
        else:
            self.fc = nn.Sequential(
                nn.Linear(feat_dim, N_CLASS),
                nn.Dropout(0.25),
                nn.Sigmoid()
            )

    def forward(self, feats):             # feats : (B, 1, T, 512)
        x = self.cnn(self.unet(feats))    # (B, 1, T, 512)
        x = x.transpose(1, 2).flatten(-2) # (B, T, 512)  ← now matches GRU

        hidden_vec = None
        for i, layer in enumerate(self.fc):
            x = layer(x)
            if i == 0:                    # BiGRU output
                hidden_vec = x            # (B, T, 512)

        return hidden_vec, x              # (B, T, 512)

In [21]:
def train_and_eval(model, train_loader, valid_loader, optimizer, criterion, max_steps, TRAINING_DEVICE, dump_dir, model_name, metric, last_epoch=0):
    
    checkpint_store = ModelCheckpointStore(dump_dir)
    
    training_metrics = {"train_avg_loss":[],
                        "train_avg_acc":[],
                        "valid_avg_loss":[],
                        "valid_avg_acc":[]}
    for step in range(last_epoch, max_steps+last_epoch):
        train_step_loss = 0
        train_step_correct_pred = 0
        valid_step_loss = 0
        valid_step_correct_pred = 0
        model.train()
        for batch_idx, (x, y) in enumerate(train_loader):
            print(f"At {batch_idx} in training loop")
            # x = torch.transpose(x, 0, 1)
            # y = torch.transpose(y, 0, 1)
            x, y = x.to(TRAINING_DEVICE), y.to(TRAINING_DEVICE)

            optimizer.zero_grad()
            _,train_out = model(x)
            # print("train_out shape:", train_out.shape)
            # print("y shape:", y.shape)
            # train_out:  (B, T_feat, 360)
            # y         : (B, T_lab)
            T = train_out.size(1)
            if y.size(1) != T:          # crop labels if longer
                y = y[:, :T]
            # print('new shape:',y.shape)

            loss = criterion(train_out.reshape(-1, N_CLASS), y.flatten())
            loss.backward()
            optimizer.step()
            train_step_loss += loss.item()
            _, train_correct_outputs = torch.max(train_out.reshape(-1, N_CLASS), dim=1)
            train_step_correct_pred += (train_correct_outputs == y.flatten()).sum().item()

        training_metrics["train_avg_loss"].append(train_step_loss / len(train_loader))
        training_metrics["train_avg_acc"].append(train_step_correct_pred / len(train_loader))
        
        model.eval()
        with torch.no_grad():
            for batch_idx, (x_val, y_val) in enumerate(valid_loader):
                print(f"At {batch_idx} in validation loop")

                x_val, y_val = x_val.to(TRAINING_DEVICE), y_val.to(TRAINING_DEVICE)
                _, valid_out = model(x_val)           # (B, T_feat, N_CLASS)

                # ---------- align lengths ------------
                T = min(valid_out.size(1), y_val.size(1))
                valid_out = valid_out[:, :T, :]       # <-- fixed
                y_val     = y_val[:, :T]
                # -------------------------------------

                # loss & accuracy
                valid_loss = criterion(valid_out.reshape(-1, N_CLASS), y_val.flatten())
                valid_step_loss += valid_loss.item()

                _, valid_pred = torch.max(valid_out.reshape(-1, N_CLASS), dim=1)
                valid_step_correct_pred += (valid_pred == y_val.flatten()).sum().item()

            training_metrics["valid_avg_loss"].append(valid_step_loss / len(valid_loader))
            training_metrics["valid_avg_acc"].append(valid_step_correct_pred / len(valid_loader))
        torch.cuda.empty_cache()
        display(step, training_metrics)
        checkpint_store(model, training_metrics, metric, step)
        plot(dump_dir, model_name, **training_metrics)

In [None]:
import torch
import gc

# Clear unused memory
torch.cuda.empty_cache()
gc.collect()

In [23]:
torch.cuda.empty_cache()

In [38]:
print("After clearing:")
t = torch.cuda.get_device_properties(0).total_memory
r = torch.cuda.memory_reserved(0)
a = torch.cuda.memory_allocated(0)
f = r - a
print(f"Total: {t}, Reserved: {r}, Allocated: {a}, Free inside reserved: {f}")

After clearing:
Total: 17059545088, Reserved: 517996544, Allocated: 451589632, Free inside reserved: 66406912


In [27]:
# DEFINING MODEL AND PARAMETER SIZE
model = nn.DataParallel(E2E(n_blocks=4,n_gru=1,kernel_size=(2, 2),en_de_layers=5,inter_layers=4,in_channels=1)).to(TRAINING_DEVICE)

In [28]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Parameter Count: {count_parameters(model)}")

Parameter Count: 92192795


In [29]:
cross_entropy_loss_weights = torch.full((N_CLASS,), 1.5)
cross_entropy_loss_weights[0] = 1
cross_entropy_loss_weights = cross_entropy_loss_weights.float().to(TRAINING_DEVICE)

In [30]:
loss_fn = nn.CrossEntropyLoss(weight=cross_entropy_loss_weights)
optimizer = torch.optim.Adam(model.parameters(), lr = 1e-4)

In [31]:
# dump_dir = "/speech/nishanth/clean_research/nishchala/ptdb_full_data/mudit_4_layer_pos_gd_1.008"
dump_dir = "/kaggle/working/pitch-estimate/trail1"
os.makedirs(dump_dir, exist_ok=True)

In [32]:
last_model_save_epoch = 0
model_name = "new_rrcgd_rmvpe_1"
epochs = 30

In [33]:
print(f"Allocated: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
print(f"Reserved:  {torch.cuda.memory_reserved() / 1e9:.2f} GB")

Allocated: 0.39 GB
Reserved:  0.44 GB


In [39]:
train_and_eval(model, train_loader, valid_loader, optimizer, loss_fn, epochs, TRAINING_DEVICE, dump_dir, model_name, "valid_avg_loss", last_epoch=0)

At 0 in training loop


OutOfMemoryError: CUDA out of memory. Tried to allocate 52.00 MiB. GPU 0 has a total capacity of 15.89 GiB of which 27.12 MiB is free. Process 3470 has 15.86 GiB memory in use. Of the allocated memory 15.45 GiB is allocated by PyTorch, and 130.67 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)