# Pitch提取
使用 Thickstun 模型估计泛音音级的步骤

In [1]:
import sys
import numpy as np
from tqdm import tqdm
sys.path.append('./thickstun/lib/') 
sys.path.insert(0,'lib/')

import thickstun.lib.base_model as base_model
import tensorflow as tf
import os,mmap
import librosa
# Expose only CUDA device 0 to this process.
# NOTE(review): this runs after `import tensorflow`; confirm it still takes effect in your TF version.
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

# Location of the WAV dataset (walked recursively by the batch-prediction cell)
filePath = "./musicnet"

# Output location for the predicted pitch arrays; used as a string prefix, so keep the trailing slash
outputfile = './resultPitch/'

In [2]:
# Filterbank
def create_filters(d,k,start_freq=50.,end_freq=6000.,sample_rate=44100.):
    """Build a bank of windowed sine/cosine filters with log-spaced frequencies.

    Filter ``ind`` (0 <= ind < k) oscillates at
    ``start_freq * (end_freq/start_freq) ** (ind/k)`` Hz, so the bank covers
    ``start_freq`` (inclusive) up to ``end_freq`` (exclusive). Every filter is
    multiplied by the raised-cosine window ``1 - cos(x)`` over its ``d`` taps.

    Args:
        d: filter length in samples (receptive field of each filter).
        k: number of filters in the bank.
        start_freq: frequency of the first filter in Hz (default 50).
        end_freq: upper bound of the frequency range in Hz (default 6000).
        sample_rate: sampling rate of the audio the filters are applied to,
            in Hz (default 44100).

    Returns:
        ``(wsin, wcos)``: two float32 arrays of shape ``(1, d, 1, k)`` holding
        the sine and cosine filters; the shape matches what ``tf.nn.conv2d``
        expects for a filter argument (height, width, in_channels, out_channels).
    """
    x = np.linspace(0, 2*np.pi, d, endpoint=False)
    wsin = np.empty((1,d,1,k), dtype=np.float32)
    wcos = np.empty((1,d,1,k), dtype=np.float32)
    # Number of cycles the lowest-frequency filter completes within d samples.
    num_cycles = start_freq*d/sample_rate
    # Per-filter increment of log-frequency.
    scaling_ind = np.log(end_freq/start_freq)/k
    window_mask = 1.0-1.0*np.cos(x)
    for ind in range(k):
        phase = np.exp(ind*scaling_ind)*num_cycles*x
        wsin[0,:,0,ind] = window_mask*np.sin(phase)
        wcos[0,:,0,ind] = window_mask*np.cos(phase)

    return wsin,wcos


In [3]:
# Model definition
class Spectrograms(base_model.Model):
    """Three-level convolutional model on top of a fixed sine/cosine filterbank.

    Level 1 convolves the input with the fixed filters from ``create_filters``
    and sums the squared sine and cosine responses; level 2 and level 3 are
    learned ReLU convolutions applied to the log of that response; a final
    linear layer (``beta``) maps the flattened level-3 output to ``self.m``
    outputs. The graph is built twice: once on ``self.xq``/``self.yq`` with the
    raw variables ('queued_model') and once on ``self.xd``/``self.yd`` with the
    tensors returned by ``register_weights`` ('direct_model').

    NOTE(review): ``self.window``, ``self.stride``, ``self.m``,
    ``self.batch_size``, the ``xq/yq/xd/yd`` tensors and ``register_weights``
    come from ``base_model.Model``, which is not visible here; they are assumed
    to be set before ``define_graph`` runs.
    """
    def __init__(self, *args, **kwargs):
        super(Spectrograms, self).__init__(*args, **kwargs)

    def define_graph(self):
        """Define weights, the queued (training) graph and the direct graph."""
        super(Spectrograms, self).define_graph()

        # lvl1 convolutions are shared between regions
        self.k = 512                # lvl1 nodes
        self.d = 4096               # lvl1 receptive field

        d2_x = 1          # lvl2 input dims_x
        d2_y = 128          # lvl2 input dims_y
        k2 = 128        # num lvl2 filters
        stride_y = 2    # lvl2 stride

        d3_x = 25 # lvl3 input dims_x
        d3_y = 1 # lvl3 input dims_y (fully connected)
        k3 = 4096 # num lvl3 filters

        # Output sizes of the VALID convolutions below: 1 + floor((in - filter) / stride).
        # Floor division is required: under Python 3, `/` yields a float that is
        # only correct when the sizes happen to divide evenly.
        num_regions  = 1 + (self.window-self.d)//self.stride
        num_regions2_x  = 1 + (num_regions-d2_x)//1
        num_regions2_y = 1 + (self.k-d2_y)//stride_y
        num_regions3_x = 1 + (num_regions2_x - d3_x)//1
        num_regions3_y = 1 + (num_regions2_y - d3_y)//1
        # Number of features fed to the final linear layer (flattened lvl3 output).
        flat_dim = int(num_regions3_x*num_regions3_y*k3)

        wsin,wcos = create_filters(self.d,self.k)

        print ('---- Weights ----')
        wscale = .0001
        with tf.compat.v1.variable_scope('parameters'):
            w = tf.Variable(wscale*tf.random.normal([d2_x,d2_y,1,k2],seed=999))
            print ('w',w)
            # presumably returns an averaged copy of the weights for the direct model -- confirm in base_model
            wavg = self.register_weights(w,'w',average=.9998)
            w2 = tf.Variable(wscale*tf.random.normal([d3_x,d3_y,k2,k3],seed=999))
            print ('w2',w2)
            w2avg = self.register_weights(w2,'w2',average=.9998)
            beta = tf.Variable(wscale*tf.random.normal([flat_dim,self.m],seed=999))
            print ('beta',beta)
            betaavg = self.register_weights(beta,'beta',average=.9998)

        print ('---- Layers ----')
        with tf.compat.v1.variable_scope('queued_model'):
            # Squared sine response + squared cosine response of the fixed filterbank.
            zx = tf.square(tf.nn.conv2d(self.xq,wsin,strides=[1,1,self.stride,1],padding='VALID')) \
               + tf.square(tf.nn.conv2d(self.xq,wcos,strides=[1,1,self.stride,1],padding='VALID'))
            print ('zx',zx)
            # Note: 10e-15 equals 1e-14. Left unchanged so the result matches
            # whatever an existing checkpoint was trained with.
            z2 = tf.nn.relu(tf.nn.conv2d(tf.math.log(zx+10e-15),w,strides=[1,1,1,stride_y],padding='VALID',data_format='NCHW'))
            print ('z2',z2)
            z3 = tf.nn.relu(tf.nn.conv2d(z2,w2,strides=[1,1,1,1],padding='VALID',data_format='NCHW'))
            print ('z3',z3)
            y = tf.matmul(tf.reshape(z3,[self.batch_size,flat_dim]),beta)
            print ('y',y)
            self.loss = tf.reduce_mean(tf.nn.l2_loss(y-tf.reshape(self.yq,[self.batch_size,self.m])))

        with tf.compat.v1.variable_scope('direct_model'):
            # Same architecture as above, fed from self.xd and using the registered
            # (wavg / w2avg / betaavg) weights; the batch size is taken from the input.
            self.zx = tf.square(tf.nn.conv2d(self.xd,wsin,strides=[1,1,self.stride,1],padding='VALID')) \
                    + tf.square(tf.nn.conv2d(self.xd,wcos,strides=[1,1,self.stride,1],padding='VALID'))
            self.z2 = tf.nn.relu(tf.nn.conv2d(tf.math.log(self.zx+10e-15),wavg,strides=[1,1,1,stride_y],padding='VALID',data_format='NCHW'))
            self.z3 = tf.nn.relu(tf.nn.conv2d(self.z2,w2avg,strides=[1,1,1,1],padding='VALID',data_format='NCHW'))
            self.y_direct = tf.matmul(tf.reshape(self.z3,[tf.shape(self.xd)[0],flat_dim]),betaavg)
            self.loss_direct = tf.reduce_mean(tf.nn.l2_loss(self.y_direct-self.yd))

预测函数：对 `./thickstun/data/records/` 中的每条记录运行模型，并将预测结果保存到以 `path` 为前缀的位置（已存在的结果文件会被跳过）。

In [4]:
def predict(path):
    """Run the model over every record and save the predictions.

    Builds a ``Spectrograms`` model, then for each entry ``f`` of
    ``./thickstun/data/records/`` whose output ``path + f`` does not exist yet,
    calls ``model.sample_records(int(f[:-4]), 10000, fixed_stride=512)`` and
    saves the transposed prediction ``Yhat.T`` with ``np.save(path + f, ...)``.
    A failure on one record is reported and the remaining records are still
    processed.

    Args:
        path: output prefix. It is concatenated directly with the record file
            name, so a directory must end with a path separator.
    """
    labels = None
    # The original started with `try: model.stop()` / `except NameError: pass`.
    # Inside this function `model` is a local that is not yet bound, so the call
    # always raised UnboundLocalError (a NameError subclass) and was ignored;
    # the dead guard has been removed.
    # NOTE(review): this checkpoint path differs from the one used by the
    # module-level model-loading cell ('./thickstun/Model_Data/...') -- confirm which is current.
    model = Spectrograms(labels,checkpoint_path='./thickstun/convnet_experimental2_morelvl3/', outputs=1, window=16384, mmap=True,
                         normalize=True, extended_test_set=False, use_mirex=True, init=False, pitch_transforms=5, jitter=.1,
                         restrict=False,isTest=False)
    print ('finish model loading...')
    for f in os.listdir('./thickstun/data/records/'):
        if os.path.isfile(path+f):
            print ('exist')
            continue
        try:
            # The record id is the file name without its 4-character extension.
            _, Yhat, _, _, _ = model.sample_records(int(f[:-4]), 10000, fixed_stride=512)
            # np.save appends '.npy' when the name lacks it, so the skip check
            # above only matches if record names already end in '.npy'.
            np.save(path+f,Yhat.T)
            # Report success only after the prediction has actually been written.
            print(f + ' complete!')
        except Exception as e:
            # Best effort: report which record failed and keep going.
            print (f + ' failed: ' + str(e))



In [5]:
'''
    labels = None
    try: model.stop()
    except NameError: pass
    model = Spectrograms(labels,checkpoint_path='./thickstun/convnet_experimental2_morelvl3/', outputs=1, window=16384, mmap=True,
                         normalize=True, extended_test_set=False, use_mirex=True, init=False, pitch_transforms=5, jitter=.1,
                         restrict=False)
    print ('finish model loading...')
    print(name)
    data, y = librosa.load('./mp3/'+name, sr=44100)
    np.save('./thickstun/tmp/test.npy',data)
    fd = open('./thickstun/tmp/test.npy', 'r+b')
    buff = mmap.mmap(fd.fileno(), 0, access=mmap.ACCESS_DEFAULT)
    mse_test, Yhat, Y, mse_breakdown, avp_breakdown = model.sample_records(buff, 10000, fixed_stride=512)

    np.save('./thickstun/pitch/'+name[:-4]+'.npy', Yhat)
'''

# Load the model that the later cells use to predict every audio file in the dataset
# directory (the batch cell saves those predictions into the resultPitch folder).
labels = None
# First release any model that may already be in memory (usually there is none);
# on a fresh kernel `model` is undefined and the NameError is ignored.
try: model.stop()
except NameError: pass
# Load the existing model from its checkpoint directory
model = Spectrograms(labels,checkpoint_path='./thickstun/Model_Data/convnet_experimental2_morelvl3/', outputs=1, window=16384, mmap=True,
                     normalize=True, extended_test_set=False, use_mirex=True, init=False, pitch_transforms=5, jitter=.1,
                     restrict=False)
print ('finish model loading...')



---- Weights ----
w <tf.Variable 'parameters/Variable:0' shape=(1, 128, 1, 128) dtype=float32>
Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
w2 <tf.Variable 'parameters/Variable_2:0' shape=(25, 1, 128, 4096) dtype=float32>
beta <tf.Variable 'parameters/Variable_4:0' shape=(790528, 128) dtype=float32>
---- Layers ----
zx Tensor("queued_model/add:0", shape=(150, 1, 25, 512), dtype=float32)
z2 Tensor("queued_model/Relu:0", shape=(150, 128, 25, 193), dtype=float32)
z3 Tensor("queued_model/Relu_1:0", shape=(150, 4096, 1, 193), dtype=float32)
y Tensor("queued_model/MatMul:0", shape=(150, 128), dtype=float32)
finish model loading...


In [8]:
# Predict pitch for a single file and save the result next to the notebook.
# The input is resolved relative to the configured dataset directory instead of a
# machine-specific absolute path, so the cell also runs on other machines.
# NOTE(review): assumes Predict.wav lives directly inside `filePath`.
predict_wav = os.path.join(filePath, "Predict.wav")
# librosa.load returns (samples, sampling_rate); `y` therefore holds the sampling rate.
data, y = librosa.load(predict_wav, sr=44100)
mse_test, Yhat, Y, mse_breakdown, avp_breakdown = model.sample_records(data, fixed_stride=512)
np.save('Predict_pitch.npy', Yhat)

In [6]:
# Collect every audio file under filePath once, so the progress-bar total and the
# processing loop always agree (previously only .wav files were counted while
# .mp3 files were processed as well).
AUDIO_EXTENSIONS = ('.wav', '.mp3')
audio_paths = []
for root, dirs, files in os.walk(filePath):
    for file in files:
        # Match on the real file extension rather than a substring of the whole
        # path, so directory names and names such as 'x.wav.bak' do not match.
        if os.path.splitext(file)[1].lower() in AUDIO_EXTENSIONS:
            audio_paths.append(os.path.join(root, file))

# np.save does not create missing directories.
os.makedirs(outputfile, exist_ok=True)

with tqdm(total=len(audio_paths)) as pbar:
    for path in audio_paths:
        # Output name = input file name without its extension.
        name = os.path.splitext(os.path.basename(path))[0]
        # librosa.load returns (samples, sampling_rate); `y` holds the sampling rate.
        data, y = librosa.load(path, sr=44100)
        # The whole signal is passed in memory. An mmap-backed variant (meant to
        # save memory once the model is loaded) was tried but had errors and is
        # not used.
        mse_test, Yhat, Y, mse_breakdown, avp_breakdown = model.sample_records(data, fixed_stride=512)

        np.save(os.path.join(outputfile, name + '.npy'), Yhat)

        pbar.update(1)

100%|████████████████████████████████████████████████████████████████████████████████| 330/330 [43:11<00:00,  7.85s/it]


In [None]:
'''1759.wav
8584704 True
4149
512'''