In [1]:
import torch
from torch import optim
import torch.nn as nn
import matplotlib.pyplot as plt 
import numpy as np
import random, pdb, math

class VoiceTechniqueClassifier:
    def __init__(self, config, spmel_params):
        """ initialise configurations"""
        self.config = config
        self.device = torch.device(f'cuda:{self.config.which_cuda}' if torch.cuda.is_available() else 'cpu')
        melsteps_per_second = spmel_params['sr'] / spmel_params['hop_size']
        self.window_size = math.ceil(config.chunk_seconds * melsteps_per_second)

        if config.is_wilkins == True:
            self.model = models.WilkinsAudioCNN(config)
        else:
            self.model = models.Luo2019AsIs(config, spmel_params)

        self.optimizer = optim.Adam(self.model.parameters(), lr=config.lr, weight_decay=config.reg)
        if self.config.load_ckpt != '': 
            g_checkpoint = torch.load(self.config.load_ckpt)
            self.model.load_state_dict(g_checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(g_checkpoint['self.optimizer_state_dict'])
            # pdb.set_trace()
            # fixes tensors on different devices error
            # https://github.com/pytorch/pytorch/issues/2830
            for state in self.optimizer.state.values():
                for k, v in state.items():
                    if isinstance(v, torch.Tensor):
                        state[k] = v.cuda(self.config.which_cuda)
            self.previous_ckpt_iters = g_checkpoint['epoch']
        else:
            self.previous_ckpt_iters = 0 
        self.model.to(self.device)

    def infer(self, epoch, loader, history_list, writer, examples_per_epoch, mode):

        def batch_iterate():

            print(f'=====> {split_name}: ')
            accum_loss = 0 
            accum_corrects = 0 
            for batch_num, (x_data, y_data, singer_id)  in enumerate(loader):

                x_data = x_data.to(self.device, dtype=torch.float)
                y_data = y_data.to(self.device)

                np.save('x_data_numpy', x_data.cpu().detach().numpy())
                np.save('y_data_numpy', y_data.cpu().detach().numpy())

                #######################
    
                #Split up each example in subchunks
    
                chunk_nums = []
                for i, example in enumerate(x_data):
                    chunk_num = math.ceil(example.shape[0] / self.window_size)
                    chunk_nums.append(chunk_num)

                new_x_data_batch = []
                for i in range(len(x_data)):
                    for j in range(chunk_num):
                        offset = j * self.window_size
                        batch = x_data[j][offset : offset+self.window_size]
                        new_x_data_batch.append(batch)
                        pdb.set_trace()
                pdb.set_trace()
                new_x_data_batch = new_x_data_batch.to(self.device, dtype=torch.float)

                ######################

                tester = [1,2,3]
                prediction = self.model(new_x_data_batch, chunk_nums)
                loss = nn.functional.cross_entropy(prediction, y_data)
                _, predicted = torch.max(prediction.data, 1)
                corrects = (predicted == y_data).sum().item()
                accum_corrects += corrects
                accuracy = corrects / y_data.shape[0]
                accum_loss += loss.item()

                if mode == 'train':
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()

                print('Epoch {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}\tAcc: {:.6f}'.format(
                    # inaccurate reading here if on last batch and drop_last=False
                    epoch,
                    batch_num * self.config.batch_size,
                    examples_per_epoch,
                    100. * batch_num / len(loader),
                    loss.item(),
                    accuracy)) # calculates average loss per example

                y_data = np.expand_dims(y_data.cpu(),1)
                singer_id = np.expand_dims(singer_id.cpu(),1)
                if batch_num == 0:
                    labels = np.hstack((y_data,singer_id))
                else:
                    tmp =  np.hstack((y_data,singer_id))
                    labels = np.vstack((labels, tmp))

            return labels, accum_loss, accum_corrects

        if mode == 'train':
            self.model.train()
            loss_hist=history_list[0]
            acc_hist=history_list[1]
            split_name = 'train'
            labels, accum_loss, accum_corrects = batch_iterate()
        elif mode == 'eval':
            self.model.eval()
            loss_hist=history_list[2]
            acc_hist=history_list[3]
            split_name = 'test'
            with torch.no_grad():
                labels, accum_loss, accum_corrects = batch_iterate()

        epoch_loss = accum_loss / len(loader)
        epoch_accuracy = accum_corrects / examples_per_epoch
        if self.config.is_wilkins:
            writer.add_scalar(f"Number Correct/{split_name}", accum_corrects, epoch)
            writer.add_scalar(f"Accuracy/{split_name}", epoch_accuracy, epoch)
            writer.add_scalar(f"Loss/{split_name}", epoch_loss, epoch)
            writer.add_histogram(f"layer_seq1.bias", self.model.layer_seq1[0].bias, epoch)
            writer.add_histogram(f"layer_seq1.weight", self.model.layer_seq1[0].weight, epoch)
            writer.add_histogram(f"layer_seq1.weight.grad", self.model.layer_seq1[0].weight.grad, epoch)
        else:
            writer.add_scalar(f"Number Correct/{split_name}", accum_corrects, epoch)
            writer.add_scalar(f"Accuracy/{split_name}", epoch_accuracy, epoch)
            writer.add_scalar(f"Loss/{split_name}", epoch_loss, epoch)
#            writer.add_histogram(f"enc_convs_conv_layer1", self.model.enc_convs[0][0].bias, epoch)
#            writer.add_histogram(f"enc_convs_conv_layer1.weight", self.model.enc_convs[0][0].weight, epoch)
#            writer.add_histogram(f"enc_convs_conv_layer1.weight.grad", self.model.enc_convs[0][0].weight.grad, epoch) 
        print()
        print('Epoch {} Loss: {:.4f}, Acc: {:.4f}'.format(epoch, epoch_loss, epoch_accuracy))
        loss_hist.append(epoch_loss)
        acc_hist.append(epoch_accuracy)
        save_path = './results/' +self.config.file_name +'/' +str(epoch) +'Epoch_checkpoint.pth.tar'
        self.save_checkpoints(epoch, epoch_loss, epoch_accuracy, save_path)

        return labels

    def save_checkpoints(self, epoch, loss, accuracy, save_path):
        if epoch == self.config.epochs or epoch % self.config.ckpt_freq == 0:
            print('saving model')
            checkpoint = {'model_state_dict' : self.model.state_dict(),
                'optimizer_state_dict': self.optimizer.state_dict(),
                'epoch': epoch,
                'loss': loss,
                'accuracy': accuracy}
            torch.save(checkpoint, save_path)

In [2]:
import models
from utils import saveHistory
from data import pathSpecDataset, audioSnippetDataset
import pickle, argparse, re, pdb, json, yaml, random, time, os, csv 
import numpy as np
from tqdm import tqdm
import torch
from torch import optim
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler

In [3]:
config = pickle.load(open('config_object','rb'))

In [4]:
torch.manual_seed(1)
device = torch.device(f"cuda:{config.which_cuda}" if config.cuda else "cpu")
seconds = time.time()

results_dir = './results'
if not os.path.exists(results_dir):
    os.mkdir(results_dir)
    
file_name_dir = './results/' +config.file_name
if not os.path.exists(file_name_dir):
    os.mkdir(file_name_dir)
results_csv='./results/' +config.file_name +'/RandomSearchReport.csv'


history_list=[[], [], [], []]

"""For classification technique we can do a train/test split like in Wilkins, of 0.75"""
# look at the amount of files in folder
# randomly choose which singers will be used in training set
# sort directory melspec by name and generate indices for training set and test set
#if config.split_by == 'singer':
print('here1', time.time() - seconds)
m_list = ['m1_','m2_','m3_','m4_','m5_','m6_','m7_','m8_','m9_','m10_','m11_']
f_list = ['f1_','f2_','f3_','f4_','f5_','f6_','f7_','f8_','f9_']
random.shuffle(m_list)
random.shuffle(f_list)
train_m_list, test_m_list = (m_list[:-3],m_list[-3:])
train_f_list, test_f_list = (f_list[:-2],f_list[-2:])

train_list = train_m_list + train_f_list
test_list = test_m_list + test_f_list
#print('test_list', test_list)
""" its too complex to write a universal specPathDataset with different subfolder structures.
More ignostic to automatically upload from one shallow directory, and sort from there using the filename analysis.
Make sure the dataset is fed data in same order as sorted fileList"""

with open('spmel_desilenced/spmel_params.yaml') as File:
    spmel_params = yaml.load(File, Loader=yaml.FullLoader)

here1 0.04800152778625488


In [5]:
from torch.utils.data import Dataset, DataLoader
import soundfile as sf
import numpy as np
import os, pdb, pickle, random, math, torch

from multiprocessing import Process, Manager


class pathSpecDataset(Dataset):
    """Dataset class for using a path to spec folders,
        path for labels,
        generates random windowed subspec examples,
        associated labels,
        optional conditioning."""
    def __init__(self, config, spmel_params):
        """Initialize and preprocess the dataset."""
        self.spmel_dir = config.data_dir
        melsteps_per_second = spmel_params['sr'] / spmel_params['hop_size']
        self.window_size = math.ceil(config.chunk_seconds * melsteps_per_second)
    
        style_names = ['belt','lip_trill','straight','vocal_fry','vibrato','breathy']
        singer_names = ['m1_','m2_','m3_','m4_','m5_','m6_','m7_','m8_','m9_','m10_','m11_','f1_','f2_','f3_','f4_','f5_','f6_','f7_','f8_','f9_']
        #self.one_hot_array = np.eye(len(class_names))[np.arange(len(class_names))]
        dir_name, _, fileList = next(os.walk(self.spmel_dir))
        fileList = sorted(fileList)
        dataset = []
        for file_name in fileList:
            if file_name.endswith('.npy'):
                file_path = os.path.join(dir_name, file_name)
                spmel = np.load(file_path)
                print(file_path, spmel)
                for style_idx, style_name in enumerate(style_names):
                    if style_name in file_name:
                        for singer_idx, singer_name in enumerate(singer_names):
                            if singer_name in file_name:
                                dataset.append((spmel, style_idx, singer_idx))
                                break
                        break

        self.dataset = dataset
        self.num_specs = len(dataset)

    """__getitem__ selects a speaker and chooses a random subset of data (in this case
    an utterance) and randomly crops that data. It also selects the corresponding speaker
    embedding and loads that up. It will now also get corresponding pitch contour for such a file"""
    def __getitem__(self, index):
        # pick a random speaker
        dataset = self.dataset
        # spkr_data is literally a list of skpr_id, emb, and utterances from a single speaker
        spmel, style_idx, singer_idx  = dataset[index]
        # pick random spmel_chunk with random crop


        """old incorrect way of feeding data to network"""
############################################################################
#
#        if spmel.shape[0] < self.window_size:
#            len_pad = self.window_size - spmel.shape[0]
#            spmel_chunk = np.pad(spmel, ((0,len_pad),(0,0)), 'constant')
#            #pitch = np.pad(pitch_info, ((0,len_pad),(0,0)), 'constant')
#        elif spmel.shape[0] > self.window_size:
#            left = np.random.randint(spmel.shape[0]-self.window_size)
#            spmel_chunk = spmel[left:left+self.window_size, :]
#            #pitch = pitch_info[left:left+self.window_size, :]
#        else:
#            spmel_chunk = spmel
#            #pitch = pitch_info
# 
#        return spmel_chunk, style_idx, singer_idx
############################################################################

        return spmel, style_idx, singer_idx

    def __len__(self):
        """Return the number of spkrs."""
        return self.num_specs

In [9]:
if config.is_wilkins == True:
    dataset = audioSnippetDataset(config)
    model = models.WilkinsAudioCNN(config)
    fileList = pickle.load(open(config.data_dir, 'rb'))
else:
    _, _, fileList = next(os.walk('./spmel_desilenced'))
    dataset = pathSpecDataset(config, spmel_params)
    model = models.Luo2019AsIs(config, spmel_params)

./spmel_desilenced/f1_arpeggios_belt_c_a.npy [[0.29959017 0.1493526  0.32010865 ... 0.         0.         0.        ]
 [0.28951624 0.18382694 0.30707982 ... 0.         0.         0.        ]
 [0.24534573 0.1824132  0.22762547 ... 0.         0.         0.        ]
 ...
 [0.25090325 0.26602906 0.34207055 ... 0.11491679 0.12848753 0.00342309]
 [0.29453263 0.31976104 0.3282418  ... 0.12637556 0.09548488 0.        ]
 [0.30461282 0.2854759  0.28093994 ... 0.14514104 0.096545   0.01303203]]
./spmel_desilenced/f1_arpeggios_belt_c_e.npy [[2.5329652e-01 2.2147758e-01 2.8205168e-01 ... 0.0000000e+00
  7.6246755e-03 0.0000000e+00]
 [3.3521706e-01 3.1734410e-01 2.5910199e-01 ... 0.0000000e+00
  6.3396022e-02 0.0000000e+00]
 [3.3309507e-01 3.2818154e-01 2.4708140e-01 ... 9.7090388e-03
  9.2109762e-02 4.5766219e-04]
 ...
 [1.7456688e-01 1.3918921e-01 3.3258632e-01 ... 1.1929559e-01
  1.2559254e-01 3.1871054e-02]
 [1.3321258e-01 2.3114456e-01 4.2542762e-01 ... 1.0131822e-01
  7.7184275e-02 0.0000000e+

./spmel_desilenced/f2_arpeggios_straight_i.npy [[0.40391967 0.38389343 0.3511994  ... 0.0665544  0.01469094 0.        ]
 [0.34527045 0.3263387  0.32375005 ... 0.09520954 0.03042493 0.        ]
 [0.20785424 0.16790755 0.20780842 ... 0.12502374 0.03664133 0.        ]
 ...
 [0.28034154 0.20223781 0.26926804 ... 0.12515658 0.03158337 0.        ]
 [0.27031413 0.19031708 0.24518621 ... 0.11440247 0.01704982 0.        ]
 [0.3955557  0.39304942 0.42410886 ... 0.08661994 0.         0.        ]]
./spmel_desilenced/f2_arpeggios_straight_o.npy [[0.36921188 0.33250922 0.38492167 ... 0.         0.         0.        ]
 [0.3524403  0.31294665 0.32901517 ... 0.         0.         0.        ]
 [0.19716436 0.19716164 0.24938996 ... 0.         0.         0.        ]
 ...
 [0.22162585 0.20981218 0.22220589 ... 0.07141799 0.04549874 0.        ]
 [0.26206452 0.23067245 0.30920607 ... 0.04768779 0.03948347 0.        ]
 [0.41115168 0.40609846 0.41544294 ... 0.03541121 0.02258547 0.        ]]
./spmel_desilenced

./spmel_desilenced/f4_arpeggios_belt_o.npy [[0.32296145 0.32811385 0.36373624 ... 0.         0.         0.        ]
 [0.2615108  0.2698956  0.29887888 ... 0.05524438 0.         0.        ]
 [0.1861012  0.04687087 0.13200341 ... 0.0810057  0.02497311 0.        ]
 ...
 [0.13720112 0.00977504 0.10553104 ... 0.         0.         0.        ]
 [0.23610152 0.19825836 0.25355694 ... 0.         0.         0.        ]
 [0.35049948 0.35452285 0.3735579  ... 0.         0.         0.        ]]
./spmel_desilenced/f4_arpeggios_belt_u.npy [[0.28662387 0.33706355 0.3982979  ... 0.         0.         0.        ]
 [0.2599016  0.2775806  0.33936137 ... 0.         0.         0.        ]
 [0.09214168 0.09837021 0.16957839 ... 0.         0.         0.        ]
 ...
 [0.12854096 0.         0.27460817 ... 0.03532206 0.         0.        ]
 [0.22590543 0.27056456 0.3730615  ... 0.         0.         0.        ]
 [0.23779944 0.3151846  0.4230614  ... 0.         0.         0.        ]]
./spmel_desilenced/f4_arpe

./spmel_desilenced/f5_scales_lip_trill_i.npy [[0.22647206 0.3033033  0.4641353  ... 0.0615642  0.01394496 0.02075207]
 [0.30500397 0.33308277 0.4432204  ... 0.03719877 0.         0.        ]
 [0.36659285 0.3966152  0.45396107 ... 0.0723511  0.0211297  0.        ]
 ...
 [0.2810237  0.29641244 0.28002176 ... 0.         0.         0.        ]
 [0.28684914 0.2597662  0.28889433 ... 0.00684977 0.         0.        ]
 [0.30524045 0.28684235 0.3078101  ... 0.         0.         0.        ]]
./spmel_desilenced/f5_scales_lip_trill_o.npy [[0.4352266  0.39287555 0.39637476 ... 0.01467708 0.07369453 0.        ]
 [0.3969656  0.3699185  0.4503544  ... 0.03612532 0.06527162 0.        ]
 [0.43633765 0.38733745 0.44929725 ... 0.07496508 0.07873051 0.        ]
 ...
 [0.25045177 0.31781504 0.3468592  ... 0.         0.01130365 0.        ]
 [0.31386    0.32523715 0.3476322  ... 0.         0.         0.        ]
 [0.32460493 0.3266382  0.3469097  ... 0.         0.         0.        ]]
./spmel_desilenced/f5_

./spmel_desilenced/f7_arpeggios_belt_a.npy [[0.18115498 0.23729952 0.23784624 ... 0.         0.         0.        ]
 [0.29022852 0.30293617 0.271499   ... 0.         0.         0.        ]
 [0.33656108 0.35164014 0.27200216 ... 0.         0.         0.        ]
 ...
 [0.23870125 0.15706345 0.295961   ... 0.02967938 0.01441102 0.        ]
 [0.38903064 0.37938285 0.41899335 ... 0.01477897 0.01039355 0.        ]
 [0.44491997 0.44616798 0.47918695 ... 0.00850374 0.00951441 0.        ]]
./spmel_desilenced/f7_arpeggios_belt_e.npy [[0.22800307 0.21493988 0.24972074 ... 0.03709858 0.         0.        ]
 [0.23772186 0.18306734 0.3188458  ... 0.10105643 0.0391267  0.        ]
 [0.23512028 0.2364736  0.2993796  ... 0.147538   0.08670549 0.        ]
 ...
 [0.23885188 0.1425752  0.2591407  ... 0.07670709 0.05075603 0.        ]
 [0.23664798 0.18594006 0.3579087  ... 0.05608937 0.03624995 0.        ]
 [0.3438057  0.3377319  0.42867175 ... 0.01165585 0.00825067 0.        ]]
./spmel_desilenced/f7_arpe

./spmel_desilenced/f8_scales_breathy_a.npy [[0.27846643 0.17663652 0.23819482 ... 0.         0.         0.        ]
 [0.24632019 0.13965838 0.24858132 ... 0.         0.         0.        ]
 [0.20299165 0.12331256 0.19935115 ... 0.         0.         0.        ]
 ...
 [0.15868929 0.1032104  0.17754453 ... 0.00810905 0.         0.        ]
 [0.21196783 0.10146102 0.15235047 ... 0.         0.         0.        ]
 [0.269366   0.26102257 0.3240651  ... 0.         0.         0.        ]]
./spmel_desilenced/f8_scales_breathy_e.npy [[0.37408912 0.3798525  0.41610202 ... 0.02881193 0.         0.        ]
 [0.30725065 0.3225452  0.34896427 ... 0.03945404 0.         0.        ]
 [0.14745115 0.1239041  0.1891227  ... 0.06019022 0.         0.        ]
 ...
 [0.18784873 0.1074045  0.16792026 ... 0.05619746 0.         0.        ]
 [0.18315765 0.09471008 0.16457021 ... 0.04602976 0.         0.        ]
 [0.28711772 0.31901997 0.36126766 ... 0.02151834 0.         0.        ]]
./spmel_desilenced/f8_scal

./spmel_desilenced/m10_arpeggios_straight_e.npy [[0.54860526 0.40505242 0.35023278 ... 0.         0.         0.03839037]
 [0.5549526  0.43269774 0.35369557 ... 0.         0.         0.        ]
 [0.56974167 0.46015766 0.32999304 ... 0.         0.         0.        ]
 ...
 [0.51216114 0.521805   0.34400815 ... 0.05707501 0.         0.        ]
 [0.51634514 0.5091465  0.3288529  ... 0.03672112 0.         0.        ]
 [0.52068126 0.47328275 0.36242056 ... 0.00477505 0.         0.        ]]
./spmel_desilenced/m10_arpeggios_straight_i.npy [[0.545503   0.45021975 0.42631492 ... 0.         0.01129768 0.        ]
 [0.5495534  0.41689667 0.4129799  ... 0.01104232 0.01628792 0.        ]
 [0.55957615 0.45218202 0.37066615 ... 0.04837112 0.04656932 0.        ]
 ...
 [0.595086   0.49950412 0.38983178 ... 0.         0.         0.        ]
 [0.5648366  0.47181442 0.35662338 ... 0.         0.         0.        ]
 [0.5290536  0.3987812  0.28441042 ... 0.         0.         0.        ]]
./spmel_desilenc

./spmel_desilenced/m11_scales_vocal_fry_i.npy [[0.51023334 0.4830598  0.43613735 ... 0.10854769 0.01262871 0.02659195]
 [0.44764423 0.4352517  0.37328202 ... 0.07578301 0.         0.        ]
 [0.2995957  0.34832755 0.31058624 ... 0.         0.         0.        ]
 ...
 [0.5416696  0.48165312 0.38488477 ... 0.07230919 0.11323049 0.        ]
 [0.5579126  0.46312258 0.40829375 ... 0.06486326 0.10901599 0.        ]
 [0.5263023  0.46050993 0.4097306  ... 0.06683011 0.09068031 0.        ]]
./spmel_desilenced/m11_scales_vocal_fry_o.npy [[0.48325878 0.43987045 0.41343138 ... 0.         0.         0.        ]
 [0.45085835 0.45507503 0.3995458  ... 0.         0.         0.        ]
 [0.44377267 0.4866743  0.45582917 ... 0.         0.         0.        ]
 ...
 [0.44508034 0.48322827 0.42692557 ... 0.01854822 0.         0.        ]
 [0.5188222  0.48971045 0.4633886  ... 0.06359421 0.04252215 0.        ]
 [0.5294335  0.39799845 0.41427842 ... 0.03116214 0.02112528 0.        ]]
./spmel_desilenced/m

./spmel_desilenced/m2_scales_vibrato_o.npy [[0.50924325 0.50557315 0.47373274 ... 0.         0.         0.        ]
 [0.55463123 0.53252995 0.42304638 ... 0.00741959 0.         0.        ]
 [0.5755317  0.524406   0.3282568  ... 0.03644187 0.         0.        ]
 ...
 [0.5981462  0.63418317 0.49696726 ... 0.         0.         0.        ]
 [0.59996396 0.6237487  0.4797718  ... 0.         0.         0.        ]
 [0.56878185 0.58496976 0.46193406 ... 0.         0.         0.        ]]
./spmel_desilenced/m2_scales_vibrato_u.npy [[0.5878442  0.51360756 0.46572012 ... 0.02375646 0.         0.00591828]
 [0.5871374  0.49565655 0.5047212  ... 0.03722971 0.00481921 0.        ]
 [0.61261046 0.5473333  0.4814993  ... 0.04160604 0.01545153 0.        ]
 ...
 [0.5356887  0.56825    0.41749963 ... 0.         0.         0.        ]
 [0.56345946 0.5653758  0.40758705 ... 0.         0.         0.        ]
 [0.5539221  0.5376699  0.3810674  ... 0.         0.         0.        ]]
./spmel_desilenced/m2_scal

./spmel_desilenced/m4_arpeggios_vocal_fry_u.npy [[0.30776784 0.4111583  0.45251    ... 0.         0.         0.        ]
 [0.43046466 0.38651484 0.38955733 ... 0.         0.         0.        ]
 [0.4488518  0.32659835 0.31015298 ... 0.         0.         0.        ]
 ...
 [0.45519412 0.33137906 0.35434783 ... 0.         0.         0.        ]
 [0.42878714 0.30514053 0.3736439  ... 0.         0.         0.        ]
 [0.42764938 0.37888196 0.39806914 ... 0.         0.         0.        ]]
./spmel_desilenced/m4_scales_belt_a.npy [[0.42524907 0.38635147 0.32242116 ... 0.         0.         0.        ]
 [0.47053978 0.3929807  0.2813473  ... 0.0418027  0.03038478 0.        ]
 [0.48895445 0.41676566 0.20584919 ... 0.06531461 0.03793444 0.        ]
 ...
 [0.50920355 0.4813895  0.28005365 ... 0.0238581  0.         0.        ]
 [0.51430607 0.47232723 0.2844825  ... 0.00170359 0.         0.        ]
 [0.4860827  0.43707752 0.3171024  ... 0.         0.         0.        ]]
./spmel_desilenced/m4_sc

./spmel_desilenced/m6_arpeggios_lip_trill_e.npy [[0.53045774 0.5041673  0.4598034  ... 0.02066801 0.00222068 0.        ]
 [0.5662747  0.5298255  0.46965548 ... 0.093264   0.06680348 0.        ]
 [0.58664453 0.5203989  0.43794787 ... 0.11142044 0.06980813 0.        ]
 ...
 [0.5643808  0.53221864 0.36846575 ... 0.         0.         0.        ]
 [0.5516031  0.49897587 0.35722497 ... 0.         0.         0.        ]
 [0.5228616  0.48693845 0.42401472 ... 0.         0.         0.        ]]
./spmel_desilenced/m6_arpeggios_lip_trill_i.npy [[0.53211415 0.47045574 0.44050178 ... 0.18003641 0.12921408 0.03544629]
 [0.5650859  0.5703532  0.47333583 ... 0.19793291 0.1324444  0.02997442]
 [0.58682775 0.5668445  0.4888357  ... 0.13262777 0.06144188 0.        ]
 ...
 [0.5360594  0.5660263  0.43219584 ... 0.00449412 0.         0.        ]
 [0.5414134  0.5510497  0.42610878 ... 0.00392435 0.         0.        ]
 [0.5017337  0.5244452  0.41060507 ... 0.         0.         0.        ]]
./spmel_desilenc

./spmel_desilenced/m7_scales_vibrato_e.npy [[0.5365273  0.55540174 0.49108398 ... 0.07144883 0.03891568 0.01463058]
 [0.6048405  0.58851796 0.5134188  ... 0.05249888 0.06088996 0.        ]
 [0.63871676 0.593679   0.41357142 ... 0.08818957 0.11688679 0.        ]
 ...
 [0.6159213  0.5924633  0.3897157  ... 0.14674795 0.10175133 0.        ]
 [0.5831121  0.5906124  0.42180362 ... 0.16246364 0.08300646 0.        ]
 [0.55239224 0.5658417  0.4450941  ... 0.08204623 0.00965229 0.        ]]
./spmel_desilenced/m7_scales_vibrato_i.npy [[0.60241616 0.5975912  0.47995892 ... 0.07143284 0.03915206 0.02035237]
 [0.6392596  0.57788646 0.4370845  ... 0.06636854 0.00821977 0.        ]
 [0.65601176 0.59555656 0.4575905  ... 0.07734005 0.01655182 0.        ]
 ...
 [0.61061585 0.5581403  0.38285533 ... 0.         0.07246874 0.        ]
 [0.57965744 0.49849594 0.4001667  ... 0.         0.04475231 0.        ]
 [0.533768   0.40697396 0.3531048  ... 0.         0.01996271 0.        ]]
./spmel_desilenced/m7_scal

./spmel_desilenced/m9_scales_belt_u.npy [[0.5516883  0.425971   0.37418666 ... 0.07157373 0.11106243 0.04563925]
 [0.5659776  0.4938423  0.410591   ... 0.07075157 0.07452945 0.        ]
 [0.57328844 0.4941422  0.3698542  ... 0.06042698 0.00719933 0.        ]
 ...
 [0.6005216  0.5000272  0.41407698 ... 0.         0.         0.        ]
 [0.58348596 0.46376947 0.36567318 ... 0.         0.         0.        ]
 [0.54192555 0.47263765 0.3453141  ... 0.         0.         0.        ]]
./spmel_desilenced/m9_scales_breathy_a.npy [[0.40429503 0.38745713 0.40475494 ... 0.         0.         0.        ]
 [0.34665376 0.32304513 0.34843582 ... 0.         0.         0.        ]
 [0.22365735 0.12973087 0.2034375  ... 0.         0.         0.        ]
 ...
 [0.26684704 0.15759178 0.19978693 ... 0.         0.         0.        ]
 [0.29089534 0.13756914 0.18196931 ... 0.         0.         0.        ]
 [0.26824647 0.2600234  0.2792483  ... 0.         0.         0.        ]]
./spmel_desilenced/m9_scales_

In [10]:
len(dataset)

1182

In [11]:
import matplotlib.pyplot as plt

for i in range(300, 330):
    print(dataset[i])
#     spmel, style_idx, singer_idx = dataset[i]
#     spmel = np.rot90(spmel)
#     plt.imshow(spmel, cmap='gray')
#     plt.show()

(array([[0.31437698, 0.3109059 , 0.3758287 , ..., 0.03468204, 0.061722  ,
        0.04981634],
       [0.2696787 , 0.2557307 , 0.33660832, ..., 0.08457197, 0.08997739,
        0.02715467],
       [0.28922784, 0.27464887, 0.25309232, ..., 0.1134089 , 0.11415967,
        0.02747736],
       ...,
       [0.40383855, 0.36857477, 0.46687073, ..., 0.20071225, 0.19498213,
        0.09243079],
       [0.35930568, 0.33234033, 0.50640625, ..., 0.22182497, 0.1809126 ,
        0.06895987],
       [0.32490703, 0.29686552, 0.47484055, ..., 0.2013507 , 0.16020489,
        0.03028215]], dtype=float32), 0, 16)
(array([[0.47130412, 0.43765637, 0.48287433, ..., 0.12018529, 0.12304845,
        0.08129006],
       [0.42288345, 0.36718762, 0.4376165 , ..., 0.15552287, 0.16589521,
        0.08602896],
       [0.30436292, 0.32568854, 0.2906685 , ..., 0.17925352, 0.18859184,
        0.11517309],
       ...,
       [0.4236338 , 0.40818825, 0.40753156, ..., 0.33015165, 0.23229504,
        0.15368904],
       [0.

In [15]:
file_path_list = sorted(fileList)
fileList = [os.path.basename(x) for x in file_path_list]
train_indices_list = []
test_indices_list = []

for fileName_idx, fileName in enumerate(fileList):
    for substring in train_list:
        if substring in fileName:
            train_indices_list.append(fileName_idx)
    for substring in test_list:
        if substring in fileName:
            test_indices_list.append(fileName_idx)
if config.short==True:
    train_indices_list = train_indices_list[:8]
    test_indices_list = test_indices_list[:4]
    config.batch_size = 2

writer = SummaryWriter(comment = '_' +config.file_name)

epoch_labels = []
"""https://stackoverflow.com/questions/50544730/how-do-i-split-a-custom-dataset-into-training-and-test-datasets"""
train_sampler = SubsetRandomSampler(train_indices_list)
test_sampler = SubsetRandomSampler(test_indices_list)
train_loader = DataLoader(dataset, batch_size=config.batch_size, sampler=train_sampler, shuffle=False, drop_last=False)
test_loader = DataLoader(dataset, batch_size=config.batch_size, sampler=test_sampler, shuffle=False, drop_last=False)
vt_classer = VoiceTechniqueClassifier(config, spmel_params)

#example_data, example_targets, ex_singer_ids = iter(test_loader).next()
#writer.add_graph(model, example_data.float())

# if config.load_ckpt != '':

# if config.load_ckpt != '':
#     previous_epochs = int(re.findall('\d+', config.load_ckpt)[0])
# else: previous_epochs = 0


In [16]:
previous_epochs = 0
for epoch in range(previous_epochs+1, previous_epochs+config.epochs+1):
    # history_list gets extended while inside these functions
    train_labels = vt_classer.infer(epoch, train_loader, history_list, writer, len(train_indices_list), 'train')
    val_labels = vt_classer.infer(epoch, test_loader, history_list, writer, len(test_indices_list), 'eval')
    epoch_labels.append((train_labels, val_labels))
#     writer.flush()

# writer.close()
# model_file_name = os.path.basename(__file__)[:-3]

saveHistory(history_list, file_name_dir, string_config, epoch_labels)
#invalid terms just
bestLoss = history_list[0][0]

for idx, loss in enumerate(history_list[0]):
    if loss <= bestLoss:
        bestLoss=loss
        bestEpoch=idx

bestAcc = history_list[3][bestEpoch]


=====> train: 


RuntimeError: stack expects each tensor to be equal size, but got [893, 96] at entry 0 and [990, 96] at entry 1