## Post processing

The goal of this notebook is to run our preprocessed data through the logic of the `post_processing.py` script from cp_drums, so that we end up with the data in a similar form.

In [16]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
"""

import pickle5 as pickle #for older python versions you may need pickle5
import numpy as np
from random import shuffle
from aux_files import create_onehot_dict
from tqdm import tqdm

# Raw string: keeps every backslash literal without relying on invalid escape
# sequences such as "\." / "\d" / "\p" (a SyntaxWarning on recent Pythons).
# The resulting path is byte-identical to the previous value.
# NOTE: backslash separators are only treated as directory separators on Windows.
data_path = r'..\..\data\preprocessed_dadagp_8_1000.pickle'


In [17]:
"""1.Post process to onehot encoding dictionaries"""

with open(data_path, 'rb') as handle:
    fDict = pickle.load(handle)

#calculate occurences (vocab sizes) for each CP stream and max encoder-decoder 
#sequence lengths

max_enc_length = 0 
max_dec_length = 0 

allEnc_Occs = [] #Encoder

allDec_Occs = [] #Decoder 

for k in range(0, len(fDict['Encoder_RG'])):
    
    #get max seq_lengths
    if max_enc_length < len(fDict['Encoder_RG'][k]):
        max_enc_length = len(fDict['Encoder_RG'][k])
    if max_dec_length < len(fDict['Decoder_Bass'][k]):
        max_dec_length = len(fDict['Decoder_Bass'][k])
    
    #get allEncoder and Decoder events and store them to the lists
    allEnc_Occs.extend(list(set(fDict['Encoder_RG'][k])))
    allDec_Occs.extend(list(set(fDict['Decoder_Bass'][k])))
 
        
#Add in the vocabulories the EOS SOS flags Parallel
allEnc_Occs.extend(['sos','eos'])
allDec_Occs.extend(['sos','eos'])
#Create one-hot dictionaries
Enc_Encoder = create_onehot_dict(allEnc_Occs)
Dec_Encoder = create_onehot_dict(allDec_Occs)

#vocabulory sizes
enc_vocab = Enc_Encoder.categories_[0].shape[0]  #31
dec_vocab = Dec_Encoder.categories_[0].shape[0]  #31


#save the Encoders for the generation stage
encoders_path = '..\..\data\drums_encoders_cp.pickle'
with open(encoders_path, 'wb') as handle:
    pickle.dump([Enc_Encoder, Dec_Encoder], 
                handle, protocol=pickle.HIGHEST_PROTOCOL)


In [18]:
'''2. Transform the dictionaries to one-hot encodings and add padding'''


def _encode_and_pad(encoder, tokens, pad_length):
    """Turn a token list into 1-based category ids, right-padded with zeros.

    Parameters
    ----------
    encoder : fitted encoder whose ``transform`` result exposes ``toarray()``
        (Enc_Encoder / Dec_Encoder).
    tokens : list
        Tokens to encode, already including any 'sos' / 'eos' markers.
    pad_length : int
        Number of trailing 0 entries to append.

    Returns
    -------
    np.ndarray of int64 with ``len(tokens) + pad_length`` entries.

    Raises
    ------
    IndexError
        If a row of the one-hot matrix contains no 1 (token not encoded).
    """
    onehot = encoder.transform(np.array(tokens).reshape(-1, 1)).toarray()
    # column index of the single 1 in each row == category index
    ids = np.array([np.where(row == 1)[0][0] for row in onehot], dtype=np.int64)
    ids = ids + 1  # shift by one so that 0 is reserved for padding
    # Explicit integer padding: np.array(0 * [0]) is an empty *float64* array,
    # which silently promoted the longest sequences to float on concatenation.
    padding = np.zeros(pad_length, dtype=np.int64)
    return np.concatenate((ids, padding), axis=0)


# set sequence length encoder decoder
dec_seq_length = max_dec_length + 1  # +1 for sos or eos (738 in the recorded run)
enc_seq_length = max_enc_length + 2  # +2 for sos and eos (975 in the recorded run)


trainDict = {'All_Events': [],
             'Encoder_Input': [],
             'Decoder_Input': [],
             'Decoder_Output': []}


for t in tqdm(range(0, len(fDict['Encoder_RG']))):
    # store All_Events for later use
    trainDict['All_Events'].append(fDict['All_Events'][t])

    # prepare data for encoders decoders CP
    aEnc_seq = fDict['Encoder_RG'][t]
    aDec_seq = fDict['Decoder_Bass'][t]

    # padding needed to reach the fixed sequence lengths
    pad_lgt_enc_P = enc_seq_length - len(aEnc_seq) - 2
    pad_lgt_dec_P = dec_seq_length - len(aDec_seq) - 1  # same for both decoder streams

    # Encoder: sos + sequence + eos
    trainDict['Encoder_Input'].append(
        _encode_and_pad(Enc_Encoder, ['sos'] + aEnc_seq + ['eos'], pad_lgt_enc_P))

    # Decoder input starts with sos; decoder output is the same sequence
    # without sos and terminated by eos.
    trainDict['Decoder_Input'].append(
        _encode_and_pad(Dec_Encoder, ['sos'] + aDec_seq, pad_lgt_dec_P))
    trainDict['Decoder_Output'].append(
        _encode_and_pad(Dec_Encoder, aDec_seq + ['eos'], pad_lgt_dec_P))
    

100%|██████████| 12992/12992 [03:16<00:00, 65.99it/s]


In [24]:
# Sanity check: padded decoder / encoder sequence lengths, followed by the
# decoder / encoder vocabulary sizes.
print(
    dec_seq_length,
    enc_seq_length,
    dec_vocab,
    enc_vocab,
)

738 975 1078 1724


In [23]:
'''Split the dataset to train test 85-15'''
SPLIT_SEED = 42        # fixed seed so the split is reproducible across runs
TRAIN_FRACTION = 0.85  # share of the samples that goes to the training set
STREAM_KEYS = ('All_Events', 'Encoder_Input', 'Decoder_Input', 'Decoder_Output')

# Seeded random permutation of the sample indices (previously an unseeded
# shuffle, which produced a different split on every run).
n_samples = len(trainDict['Encoder_Input'])
index_shuf = np.random.default_rng(SPLIT_SEED).permutation(n_samples).tolist()

# the first trIDXs shuffled indices form the train set, the rest the test set
trIDXs = int(TRAIN_FRACTION * n_samples)

trainSet = {key: [trainDict[key][i] for i in index_shuf[:trIDXs]]
            for key in STREAM_KEYS}

testSet = {key: [trainDict[key][i] for i in index_shuf[trIDXs:]]
           for key in STREAM_KEYS}

# every sample must land in exactly one of the two sets
assert len(trainSet['Encoder_Input']) + len(testSet['Encoder_Input']) == n_samples


# save them
# Raw strings: same paths as before, without invalid "\." / "\d" escapes and
# without having to double only the backslash in front of the "t".
# NOTE: backslash separators are only treated as directory separators on Windows.
train_path = r"..\..\data\train_set_streams.pickle"
test_path = r'..\..\data\test_set_streams.pickle'

with open(train_path, 'wb') as handle:
    pickle.dump(trainSet, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(test_path, 'wb') as handle:
    pickle.dump(testSet, handle, protocol=pickle.HIGHEST_PROTOCOL)
