In [3]:
import os
os.environ["KERAS_BACKEND"] = "torch"

import torch
import torch.nn as nn
from torchsummary import summary
import math
import keras
import keras.models
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Input, GlobalAveragePooling2D, Dropout, Reshape
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
import torchaudio
from torch.utils.tensorboard import SummaryWriter
import torchaudio.transforms as T
import sklearn.metrics

In [4]:
def conv3x3(in_planes, out_planes, stride=1):
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False)

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, base_width=64):
        super(BasicBlock, self).__init__()
        if base_width != 64:
            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, base_width=64):
        super(Bottleneck, self).__init__()
        width = int(planes * (base_width / 64.0))
        self.conv1 = nn.Conv2d(inplanes, width, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(width)
        self.conv2 = nn.Conv2d(width, width, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(width)
        self.conv3 = nn.Conv2d(width, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        return out


class ResNet(nn.Module):

    def __init__(self, block, layers, in_channel=1, dropout=None, width_per_group=64):
        self.inplanes = 64
        super(ResNet, self).__init__()
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(in_channel, 64, kernel_size=7, stride=2, padding=3, bias=False) 
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) 
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
        self.avgpool = nn.AvgPool2d(7, stride=1)

        self.use_dropout = True if dropout else False
        if self.use_dropout:
            print(f'Using dropout: {dropout}')
            self.dropout = nn.Dropout(p=dropout)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1):
        downsample = None
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion),
            )
        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.base_width))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, base_width=self.base_width))

        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)

        return x


def resnet18(**kwargs):
    return ResNet(BasicBlock, [2, 2, 2, 2], **kwargs)

model_dict = {
    'resnet18': [resnet18, 512]
}

class Encoder(nn.Module):
    def __init__(self, name='resnet18'):
        super(Encoder, self).__init__()
        model_fun, dim_in = model_dict[name]
        self.encoder = model_fun()

    def forward(self, x):
        feat = self.encoder(x)
        return feat

model = Encoder()
model.cuda()
summary(model, (1,216,216))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 108, 108]           3,136
       BatchNorm2d-2         [-1, 64, 108, 108]             128
              ReLU-3         [-1, 64, 108, 108]               0
         MaxPool2d-4           [-1, 64, 54, 54]               0
            Conv2d-5           [-1, 64, 54, 54]          36,864
       BatchNorm2d-6           [-1, 64, 54, 54]             128
              ReLU-7           [-1, 64, 54, 54]               0
            Conv2d-8           [-1, 64, 54, 54]          36,864
       BatchNorm2d-9           [-1, 64, 54, 54]             128
             ReLU-10           [-1, 64, 54, 54]               0
       BasicBlock-11           [-1, 64, 54, 54]               0
           Conv2d-12           [-1, 64, 54, 54]          36,864
      BatchNorm2d-13           [-1, 64, 54, 54]             128
             ReLU-14           [-1, 64,

In [5]:
batch_size = 128
spectrograms_array = np.load('spectrograms.npy')
labels_array = np.load('labels.npy')

spectrograms_array = spectrograms_array/80 + 1
spectrograms_array = np.expand_dims(spectrograms_array, axis=1)

dataset = torch.utils.data.TensorDataset(
    torch.from_numpy(spectrograms_array), torch.from_numpy(labels_array)
)

train_size = int(0.8 * len(dataset))  
val_size = int(0.1 * len(dataset))    
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size, test_size]
)

train_dataloader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

val_dataloader = torch.utils.data.DataLoader(
    val_dataset, batch_size=batch_size, shuffle=False
    
)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)


In [6]:
def jaccard_distance(x1, x2):
    
    x1 = x1[x1 !=0 ]
    x2 = x2[x2 !=0 ]
    
    x1_instruments = x1[0::3]
    x2_instruments = x2[0::3]

    x1_pitches = x1[2::3]
    x2_pitches = x2[2::3]

    ipx1 = set([(x1[i], x1[i+2]) for i in range(0, len(x1), 3)])
    ipx2 = set([(x2[i], x2[i+2]) for i in range(0, len(x2), 3)])

    shared_dist = len(ipx1.intersection(ipx2)) / len(ipx1.union(ipx2))
    

    instrument_dist = np.intersect1d(x1_instruments, x2_instruments).size / np.union1d(x1_instruments, x2_instruments).size
    pitches_dist = np.intersect1d(x1_pitches, x2_pitches).size / np.union1d(x1_pitches, x2_pitches).size
    
    
    return 1 - (0.5*pitches_dist + 0.5*instrument_dist + 0*shared_dist)

In [7]:
class LabelDifference(nn.Module):
    def __init__(self, distance_type='jaccard'):
        super(LabelDifference, self).__init__()
        self.distance_type = distance_type

    def forward(self, labels):
        #labels: [bs, label_dim]
        #output: [bs, bs]

        labels = labels.cpu()
        x = labels.shape[0]

        matrix = np.zeros((x,x))

        for i in range(x):
            for j in range(x):
                matrix[i][j] = jaccard_distance(labels[i],labels[j])

        return torch.from_numpy(matrix).to('cuda')

class FeatureSimilarity(nn.Module):
    def __init__(self, similarity_type='l2'):
        super(FeatureSimilarity, self).__init__()
        self.similarity_type = similarity_type

    def forward(self, features):
        # labels: [bs, feat_dim]
        # output: [bs, bs]
        if self.similarity_type == 'l2':
            return -(features[:, None, :] - features[None, :, :]).norm(2, dim=-1)
        else:
            raise ValueError(self.similarity_type)


class RnCLoss(nn.Module):
    def __init__(self, temperature=2, label_diff='jaccard', feature_sim='l2'):
        super(RnCLoss, self).__init__()
        self.t = temperature
        self.label_diff_fn = LabelDifference(label_diff)
        self.feature_sim_fn = FeatureSimilarity(feature_sim)

    def forward(self, features, labels):
        # features: [bs, 2, feat_dim]
        # labels: [bs, label_dim]

        features = torch.cat([features[:, 0], features[:, 1]], dim=0)  # [2bs, feat_dim]
        labels = labels.repeat(2, 1)  # [2bs, label_dim]

        label_diffs = self.label_diff_fn(labels)
        logits = self.feature_sim_fn(features).div(self.t)
        logits_max, _ = torch.max(logits, dim=1, keepdim=True)
        logits -= logits_max.detach()
        exp_logits = logits.exp()

        n = logits.shape[0]  # n = 2bs

        # remove diagonal
        logits = logits.masked_select((1 - torch.eye(n).to(logits.device)).bool()).view(n, n - 1)
        exp_logits = exp_logits.masked_select((1 - torch.eye(n).to(logits.device)).bool()).view(n, n - 1)
        label_diffs = label_diffs.masked_select((1 - torch.eye(n).to(logits.device)).bool()).view(n, n - 1)

        loss = 0.
        for k in range(n - 1):
            pos_logits = logits[:, k]  # 2bs
            pos_label_diffs = label_diffs[:, k]  # 2bs
            neg_mask = (label_diffs >= pos_label_diffs.view(-1, 1)).float()  # [2bs, 2bs - 1]
            pos_log_probs = pos_logits - torch.log((neg_mask * exp_logits).sum(dim=-1))  # 2bs
            loss += - (pos_log_probs / (n * (n - 1))).sum()

        return loss

In [8]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.5, momentum=0.9, weight_decay=1e-4)
loss_fn = RnCLoss()

In [9]:
epochs = 300
best_vloss = 1_000_000.
writer = SummaryWriter()

for epoch in range(epochs):
    
    running_loss = 0.
    last_loss = 0.
    model.train(True)
    
    for step, (inputs, targets) in enumerate(train_dataloader):#

        inputs, targets = inputs.cuda(), targets.cuda()
        
        #augmentation
        time_masking = T.TimeMasking(time_mask_param=0)
        freq_masking = T.FrequencyMasking(freq_mask_param=0)

        inputs2 = inputs.clone()

        time_masked1 = time_masking(inputs)
        aug_inputs1 = freq_masking(time_masked1)

        time_masked2 = time_masking(inputs2)
        aug_inputs2 = freq_masking(time_masked2)
        
        logits1 = model(inputs)
        logits2 = model(inputs2)

        features = torch.cat((logits1.unsqueeze(1), logits2.unsqueeze(1)), dim=1)
        
        loss = loss_fn(features, targets)

        writer.add_scalar("Loss/train", loss, epoch)

        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        
        # Optimizer variable updates
        optimizer.step()
        
        if step % 50 == 0:
            print(
                f"Training loss (for 1 batch) at step {step}: {float(loss):.4f}"
            )
            print(f"Seen so far: {(step + 1) * batch_size} samples")
 
    running_vloss = 0.0
    model.eval()
    num_batches = 0
    
    with torch.no_grad():
        for i, vdata in enumerate(val_dataloader):
            vinputs, vlabels = vdata
            vinputs, vlabels = vinputs.cuda(), vlabels.cuda()
            voutputs = model(vinputs)
            val_features = torch.cat((voutputs.unsqueeze(1), voutputs.unsqueeze(1)), dim=1)
            vloss = loss_fn(val_features, vlabels)
            running_vloss += vloss
            num_batches += 1
    

    avg_vloss = running_vloss / num_batches
    writer.add_scalar("Loss/val", avg_vloss, epoch)

    print("vloss: ", avg_vloss)
    
    if avg_vloss < best_vloss:
        best_vloss = avg_vloss
        #torch.save(model.state_dict(), f"models/epoch_{epoch + 1}_val_loss_{best_vloss:.4f}.pth")

torch.save(model.state_dict(), 'models/final.pth')

Training loss (for 1 batch) at step 0: 5.6259
Seen so far: 128 samples
vloss:  tensor(6.2773, device='cuda:0')
Training loss (for 1 batch) at step 0: 5.3325
Seen so far: 128 samples
vloss:  tensor(9.4614, device='cuda:0')


KeyboardInterrupt: 