In [56]:
import torch
from torch import nn
import torchvision
import torch.nn.functional as F
from collections import OrderedDict
import math

In [57]:
cqt = torch.randn((2, 1, 71, 500))

In [62]:
conv1 = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=(12,75),stride=(1,1), bias=False)
x = conv1(cqt)
x.shape

torch.Size([2, 64, 60, 426])

In [63]:
conv2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(12,3),stride=(12,1), bias=False)
x2 = conv2(x)
x2.shape

torch.Size([2, 64, 5, 424])

In [14]:
pool = nn.AdaptiveMaxPool2d((1,None))
pool(x2).shape

torch.Size([2, 64, 1, 424])

# CQT Model is here

In [60]:
def SPP(x, pool_size):
    N, C, H, W = x.size()
    for i in range(len(pool_size)):
        maxpool = nn.AdaptiveMaxPool2d((H, pool_size[i]))
        if i==0: spp = maxpool(x).view(N, -1)
        else: spp = torch.cat((spp, maxpool(x).view(N, -1)),1)
    return spp


class CQTSPPNet(nn.Module):
    def __init__(self):
        super().__init__()
        # input N, C, 84, L
        # First convolution
        #频带卷积结构，输入nx84,经过75x12的卷积核，得到[N, C, 73, L]的特征
        #再经过12x1的池化操作得到[64, 1, n-d+1 x1 ]
        self.features = nn.Sequential(OrderedDict([
            ('conv0', nn.Conv2d(in_channels=1, out_channels= 64,kernel_size=(12, 75),
                                stride=(1, 1), bias=False)),
            ('norm0', nn.BatchNorm2d(64)),
            ('relu0', nn.ReLU(inplace=True)),
            ('conv1', nn.Conv2d(in_channels=64, out_channels=128, kernel_size=(12,3), 
                                stride=(12,1), bias=False)),
            ('norm1', nn.BatchNorm2d(128)),
            ('relu1', nn.ReLU(inplace=True)),
            ('pool0', nn.AdaptiveMaxPool2d((1,None))),
        ]))
        self.conv = nn.Sequential(OrderedDict([
            ('conv1', nn.Conv2d(in_channels=128, out_channels=256,
                                kernel_size=(1,3), stride=1,bias=False)),
            ('norm1', nn.BatchNorm2d(256)),
            ('relu1', nn.ReLU(inplace=True)),
        ]))
        #self.spp = SPP([32,16,10,8,6,4,2,1])
        self.fc0 = nn.Linear(20224, 300)
        self.fc1 = nn.Linear(300, 10000)
    def forward(self, x):
        # input [N, C, H, W] (W = 396)
        N = x.size()[0]
        print(x.shape)
        x = self.features(x) # [N, 128, 1, W - 75 + 1]
        print(x.shape)
        x = self.conv(x) #  [N, 256, 1, W - 75 +1 - 3 + 1]
        print(x.shape)
        x = SPP(x, [32,16,10,8,6,4,2,1]) # [N, 256, 1, sum()=79]
        print(x.shape)
        x = x.view(N,-1)
        feature = self.fc0(x)

        x = self.fc1(feature)

        return x, feature

In [61]:
net = CQTSPPNet()
x, f = net(cqt)

torch.Size([2, 1, 71, 500])
torch.Size([2, 128, 1, 424])
torch.Size([2, 256, 1, 422])
torch.Size([2, 20224])


In [1]:
import numpy as np
a = np.load('../data/youtube_cqt_npy/8352_1.npy')
a.shape

(84, 520)

In [52]:
    import numpy as np
    dis2d = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
    R_m = np.mean(dis2d, axis=1, keepdims=True)
    R = dis2d - R_m
    B = R / (np.linalg.norm(R, axis=1, keepdims=True) + 1e-20)
    dis2d = 1 - np.matmul(B, B.T)

In [53]:
R_m

array([[2.],
       [5.],
       [8.]])

In [54]:
R

array([[-1.,  0.,  1.],
       [-1.,  0.,  1.],
       [-1.,  0.,  1.]])

In [49]:
B

array([[-0.57735027, -0.57735027, -0.57735027],
       [ 0.        ,  0.        ,  0.        ],
       [ 0.57735027,  0.57735027,  0.57735027]])

In [45]:
np.sqrt(1 / 3)

0.5773502691896257